コード例 #1
0
def htm2txt(inf):
    """Extract the plain text of the element whose id is "contents".

    The first matching element is serialized back to HTML, ``<br>`` and
    ``<p>`` tags are turned into newlines, entities are decoded with
    ``unescape``, runs of newlines are collapsed, and the result is parsed
    again so the remaining markup can be stripped.

    Args:
        inf: HTML source of the page, in any form accepted by
            ``html.document_fromstring``.

    Returns:
        The text content of the first element matching the XPath.

    Raises:
        IndexError: if the document has no element with id "contents".
    """
    doc = html.document_fromstring(inf)
    content = doc.xpath('//*[@id="contents"]')
    # NOTE: assumes `html` is lxml.html, whose tostring() emits ASCII bytes
    # by default (non-ASCII characters become numeric character references).
    # Decode to text first: on Python 3, bytes.replace() rejects str arguments.
    htmls = html.tostring(content[0], pretty_print=False).decode('ascii')
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    # `unescape` is provided elsewhere in the file.
    htmls = unescape(htmls)
    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub(r'\n{2,}', '\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
コード例 #2
0
def htm2txt(inf):
    """Extract the plain text of a table row nested under id "bgdiv".

    The first matching row is serialized back to HTML, ``<br>`` and ``<p>``
    tags are turned into newlines, ``&#160;`` (non-breaking space)
    references become ordinary spaces, runs of newlines are collapsed, and
    the result is parsed again so the remaining markup can be stripped.

    Args:
        inf: HTML source of the page, in any form accepted by
            ``html.document_fromstring``.

    Returns:
        The text content of the first element matching the XPath.

    Raises:
        IndexError: if nothing in the document matches the XPath.
    """
    doc = html.document_fromstring(inf)
    # NOTE(review): the explicit `tbody` steps only match when the source
    # markup really contains <tbody> elements -- confirm against real pages.
    content = doc.xpath(
        '//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr')
    # NOTE: assumes `html` is lxml.html, whose tostring() emits ASCII bytes
    # by default (non-ASCII characters become numeric character references).
    # Decode to text first: on Python 3, bytes.replace() rejects str arguments.
    htmls = html.tostring(content[0], pretty_print=False).decode('ascii')
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&#160;', ' ')
    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub(r'\n{2,}', '\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
コード例 #3
0
def htm2txt(inf):
    """Extract the plain text of the element whose id is "content".

    The first matching element is serialized back to HTML, ``<br>`` and
    ``<p>`` tags are turned into newlines, ``&#160;`` (non-breaking space)
    references become ordinary spaces, runs of newlines are collapsed, and
    the result is parsed again so the remaining markup can be stripped.

    Args:
        inf: HTML source of the page, in any form accepted by
            ``html.document_fromstring``.

    Returns:
        The text content of the first element matching the XPath.

    Raises:
        IndexError: if the document has no element with id "content".
    """
    doc = html.document_fromstring(inf)
    content = doc.xpath('//*[@id="content"]')
    # NOTE: assumes `html` is lxml.html, whose tostring() emits ASCII bytes
    # by default (non-ASCII characters become numeric character references).
    # Decode to text first: on Python 3, bytes.replace() rejects str arguments.
    htmls = html.tostring(content[0], pretty_print=False).decode('ascii')
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')
    htmls = htmls.replace('&#160;', ' ')
    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub(r'\n{2,}', '\n', htmls)
    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()