Beispiel #1
0
def parse(url, html, webInfo):
    """Extract the configured fields of a page using XPath expressions.

    Args:
        url: Page URL, used as the base when making links absolute.
        html: Raw page content; decoded by ``html_util.get_unicode_str``.
        webInfo: Mapping that provides the XPath expressions under the keys
            ``imgXpath``, ``titleXpath``, ``contentXpath``, ``timeXpath`` and
            ``myposXpath``; it is also passed on to ``getPageNum``.

    Returns:
        A 7-tuple ``(encoding, picUrl, title, text, publishTime, mypos,
        detailFy)``.  When the encoding cannot be detected, or the document
        cannot be parsed at all, every element is the empty string.
    """
    # Same arity as the success path, so callers can always unpack 7 values.
    failed = ('',) * 7

    encoding, html = html_util.get_unicode_str(html)
    if encoding == '':
        return failed

    doc = None
    try:
        doc = lxml.html.document_fromstring(html)
        # Best effort: if this fails we still use the tree with the links
        # left as they were.
        doc.make_links_absolute(url)
    except Exception:
        traceback.print_exc()
    if doc is None:
        # Without a parsed tree none of the XPath lookups below can run.
        return failed

    picUrl = getDataByXpath(doc, webInfo['imgXpath'])
    title = getDataByXpath(doc, webInfo['titleXpath'])
    text = getDataByXpath(doc, webInfo['contentXpath'], 1)
    publishTime = strtotime(getDataByXpath(doc, webInfo['timeXpath']))
    mypos = getDataByXpath(doc, webInfo['myposXpath'], 1)
    detailFy = getPageNum(doc, webInfo)
    return encoding, picUrl, title, text, publishTime, mypos, detailFy
Beispiel #2
0
def parse(url, html):
    """Decode a page and extract its title, time and main text.

    Args:
        url: Page URL, used as the base when making links absolute.
        html: Raw page content; decoded by ``html_util.get_unicode_str``.

    Returns:
        A 4-tuple ``(encoding, time, title, text)``.  ``title`` comes from
        ``get_raw_info`` and ``time``/``text`` from ``get_main_content``.
        All four elements are empty strings when no encoding is detected.
    """
    encoding, html = html_util.get_unicode_str(html)
    if encoding == '':
        return '', '', '', ''
    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except Exception:
        # Best effort: if normalisation fails, extraction continues on the
        # decoded markup as-is.  Only ``Exception`` is caught so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass
    title, text = get_raw_info(html)

    # Named ``publish_time`` to avoid shadowing the stdlib ``time`` module.
    publish_time, text = get_main_content(text)
    return encoding, publish_time, title, text
Beispiel #3
0
def parse(url, html):
    """Return the encoding, time, title and main text of an HTML page.

    Args:
        url: Base URL against which the page's links are made absolute.
        html: Raw page content, handed to ``html_util.get_unicode_str``.

    Returns:
        ``(encoding, time, title, text)``; four empty strings when
        ``html_util.get_unicode_str`` reports an empty encoding.
    """
    encoding, html = html_util.get_unicode_str(html)
    if encoding == '':
        return '', '', '', ''

    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except Exception:
        # Deliberately tolerant: on failure the un-normalised markup is used.
        # A bare ``except:`` would also hide KeyboardInterrupt/SystemExit,
        # so only ordinary exceptions are swallowed.
        pass

    title, text = get_raw_info(html)
    # Local renamed from ``time`` so the stdlib module name is not shadowed.
    publish_time, text = get_main_content(text)
    return encoding, publish_time, title, text
def pageNumExtractBak(html):
    """Guess a page number from the first matching "next page" line.

    The decoded markup is scanned line by line.  Each line containing the
    text '下一页' ("next page") is printed and then checked for '4.htm',
    '3.htm' and '2.htm', in that order; the first hit decides the result.

    Returns:
        4, 3 or 2 according to the matching suffix, or 0 when no line
        contains both the marker text and one of the suffixes.
    """
    _, decoded = html_util.get_unicode_str(html)
    # Checked in this order; the first suffix found in the line wins.
    suffix_to_page = (('4.htm', 4), ('3.htm', 3), ('2.htm', 2))
    for raw_line in decoded.split('\n'):
        candidate = raw_line.strip().encode('utf-8')
        if '下一页' not in candidate:
            continue
        print(candidate)
        for suffix, page_number in suffix_to_page:
            if suffix in candidate:
                return page_number
    return 0
Beispiel #5
0
def parse(url, html, webInfo):
    """Extract title, text, time, images and position from a page.

    Values first come from ``get_main_content``.  Any ``*Pattern`` keys in
    ``webInfo`` then override them, and any ``*Xpath`` keys override those in
    turn.

    Args:
        url: Page URL, used as the base when making links absolute.
        html: Raw page content; decoded by ``html_util.get_unicode_str``.
        webInfo: Mapping with the required key ``bodyPattern`` and the
            optional keys ``textPattern``, ``titlePattern``,
            ``myposPattern``, ``textXpath``, ``titleXpath``, ``myposXpath``,
            ``publishTimeXpath`` and ``imgReplace`` (a sequence of pairs
            applied to each image URL with ``str.replace``).

    Returns:
        A 6-tuple ``(encoding, title, text, time, imgList, mypos)``.  Every
        element is the empty string when the encoding is empty or contains
        ``'ISO'``, or when ``getBody`` finds nothing.
    """
    failed = ('', '', '', '', '', '')
    encoding, html = html_util.get_unicode_str(html)
    if encoding == '' or 'ISO' in encoding:
        return failed

    # Lines holding several <img> tags are split so that each tag starts its
    # own line (presumably for later line-based extraction -- TODO confirm).
    # The leading "\n" mirrors the result of the previous per-line
    # concatenation, which was quadratic.
    lines = [
        line.replace(u'<img', u'\n<img') if line.count('<img') > 1 else line
        for line in html.split('\n')
    ]
    html = "\n" + "\n".join(lines)

    # ``doc`` stays None when parsing fails; the XPath overrides below are
    # then skipped instead of raising NameError.
    doc = None
    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except Exception:
        traceback.print_exc()

    body = getBody(html, webInfo['bodyPattern'])
    if body == "":
        return failed

    title, publish_time, text, images = get_main_content(html, body, webInfo)
    mypos = ''

    # Pattern-based overrides.
    if 'textPattern' in webInfo:
        text = getText(html, webInfo['textPattern'])
    if 'titlePattern' in webInfo:
        title = getText(html, webInfo['titlePattern'])
    if 'myposPattern' in webInfo:
        mypos = getText(html, webInfo['myposPattern'])

    # XPath-based overrides take precedence, but need a parsed tree.
    if doc is not None:
        if 'textXpath' in webInfo:
            text = getDataByXpath(doc, webInfo['textXpath'])
        if 'titleXpath' in webInfo:
            title = getDataByXpath(doc, webInfo['titleXpath'])
        if 'myposXpath' in webInfo:
            mypos = getDataByXpath(doc, webInfo['myposXpath'])
        if 'publishTimeXpath' in webInfo:
            publish_time = getDataByXpath(doc, webInfo['publishTimeXpath'])
            publish_time = strtotime(publish_time, '')

    if 'imgReplace' in webInfo:
        imgList = []
        for picUrl in images:
            for pattern in webInfo['imgReplace']:
                picUrl = picUrl.replace(pattern[0], pattern[1])
            imgList.append(picUrl)
    else:
        imgList = images
    return encoding, title, text, publish_time, imgList, mypos
Beispiel #6
0
def parse(url, html):
    encoding, html = html_util.get_unicode_str(html)
    if encoding == '': return '', []
    doc = lxml.html.document_fromstring(html)
    doc.make_links_absolute(url)
    links = [(lxml.etree.tounicode(node,method='text'), node.xpath('@href'))\
            for node in doc.xpath('//a')]
    links = [(re.sub(ur'\s+', ' ', x), y[0].strip()) for x, y in links
             if is_useful_link(x, y)]
    return encoding, links


if __name__ == "__main__":
    url = 'http://www.sohu.com/'
    html = open('list.html').read()
    encoding, links = parse(url, html)
    for x, y in links:
        print x, y
def parse(url, html):
    #encoding
    encoding, html = html_util.get_unicode_str(html)
    originhtml = html
    if encoding == '': return '', '', '', '', '', []
    try:
        doc = lxml.html.document_fromstring(html)
        doc.make_links_absolute(url)
        html = lxml.etree.tounicode(doc, method='html')
    except:
        pass

    #title
    title, text = get_raw_info(html)

    #time,块密度text
    time, text = get_main_content(text)
    try:
        print "text:\n%s" % text
    except Exception, e:
        print e
        pass