Ejemplo n.º 1
0
def getChunks(url):
    """Return the stripped text of the longest <div> chunk on the page at *url*.

    Fetches and parses the page, collects the content of every <div>
    following the first one, orders the chunks with the project's
    ``longer`` comparator and returns the first entry with its HTML
    tags stripped.  Returns "" when the page cannot be loaded, has no
    <div>, or every candidate <div> is empty.
    """
    try:
        soup = BeautifulSoup(loadURL(url))
    except Exception:
        # Page load failed, probably because the web page could not be reached.
        return ""

    first_div = soup.find('div')
    if first_div is None:
        return ""

    divs = first_div.findAllNext('div')
    chunks = [str(''.join(map(str, t.contents))) for t in divs if len(t.contents) > 0]
    if not chunks:
        # Every following <div> was empty -- original code would have
        # raised IndexError here.
        return ""

    chunks.sort(longer)  # project cmp function -- presumably orders by length; confirm
    return stripTags(chunks[0])
Ejemplo n.º 2
0
def getPageInfo(url):
    info = {}
    try:
        soup = BeautifulSoup(loadURL(url))
    except:
        # print "Page load failed, probably because the web page could not be reached."
        return ""
    # info['images'] = soup.findAll('img')
    tags =  soup.find('div')
    desc = ""
    if tags == None: 
        tags = []
        tags.append(soup.find('body'))
        desc = stripTags(str(soup.find('body')))
    else: 
        tags = tags.findAllNext('div')
        tags = [str(''.join(map(str, t.contents))) for t in tags if len(t.contents) > 0]
        tags.sort(longer)
        desc = stripTags(tags[0])

    try:
        tmp = BlogPost(url, soup.html.head.title.string, "", "", 0, desc)
    except:
        tmp = BlogPost(url, url, "", "", 0, desc)
    
    category = getProbableCategory(tmp)

    print '<?xml version="1.0" encoding="UTF-8"?>'
    print '<category>'
    print '<title><![CDATA[' + tmp.title.encode('utf-8') + ']]></title>'
    print '<item>'
    print '<url><![CDATA[' + url + ']]></url>'
    if len(category) > 0: print '<cat>' + category + '</cat>'
    print '<title><![CDATA[' + tmp.title.encode('utf-8') + ']]></title>'
    print '<description>'
    print '<![CDATA[' + desc + ']]>'
    print '</description>'
    print '</item>'
    print '</category>'