def getChunks(url):
    """Return the stripped text of the longest <div> chunk at *url*.

    Loads and parses the page, walks every <div> after the first one,
    keeps those with non-empty contents, sorts them with the project's
    ``longer`` comparator (longest first) and returns the winner with
    its tags stripped.

    Returns "" when the page cannot be loaded, when the page has no
    <div>, or when no following <div> has any content.
    """
    try:
        soup = BeautifulSoup(loadURL(url))
    except Exception:
        # Page load or parse failed, typically because the web page
        # could not be reached.
        return ""
    tags = soup.find('div')
    if tags is None:
        return ""
    tags = tags.findAllNext('div')
    # Flatten each div's contents to a single string, skipping empties.
    tags = [str(''.join(map(str, t.contents))) for t in tags if len(t.contents) > 0]
    if not tags:
        # BUGFIX: the original indexed tags[0] unconditionally and raised
        # IndexError when every following <div> was empty.
        return ""
    tags.sort(longer)
    return stripTags(tags[0])
def getPageInfo(url): info = {} try: soup = BeautifulSoup(loadURL(url)) except: # print "Page load failed, probably because the web page could not be reached." return "" # info['images'] = soup.findAll('img') tags = soup.find('div') desc = "" if tags == None: tags = [] tags.append(soup.find('body')) desc = stripTags(str(soup.find('body'))) else: tags = tags.findAllNext('div') tags = [str(''.join(map(str, t.contents))) for t in tags if len(t.contents) > 0] tags.sort(longer) desc = stripTags(tags[0]) try: tmp = BlogPost(url, soup.html.head.title.string, "", "", 0, desc) except: tmp = BlogPost(url, url, "", "", 0, desc) category = getProbableCategory(tmp) print '<?xml version="1.0" encoding="UTF-8"?>' print '<category>' print '<title><![CDATA[' + tmp.title.encode('utf-8') + ']]></title>' print '<item>' print '<url><![CDATA[' + url + ']]></url>' if len(category) > 0: print '<cat>' + category + '</cat>' print '<title><![CDATA[' + tmp.title.encode('utf-8') + ']]></title>' print '<description>' print '<![CDATA[' + desc + ']]>' print '</description>' print '</item>' print '</category>'