def htm2txt(inf):
    """Extract the plain text of the ``id="contents"`` element of an HTML page.

    The element is serialised back to markup, literal ``<br>`` and ``<p>``
    tags are turned into newlines, the markup is passed through ``unescape``,
    runs of newlines are collapsed, and the result is re-parsed so that all
    remaining tags are dropped.

    Args:
        inf: HTML markup, passed straight to ``html.document_fromstring``.

    Returns:
        The text content of the first element whose ``id`` is ``contents``.

    Raises:
        IndexError: if the document has no element with ``id="contents"``.
    """
    xpath = '//*[@id="contents"]'
    doc = html.document_fromstring(inf)
    matches = doc.xpath(xpath)
    if not matches:
        # Same exception type as the bare ``[0]`` lookup, but with a message
        # that says what was actually missing.
        raise IndexError('no element matches %s' % xpath)

    # Serialise to text rather than the default ASCII bytes: on Python 3 the
    # str-based replace() calls below would raise TypeError on a bytes object.
    htmls = html.tostring(matches[0], pretty_print=False, encoding='unicode')

    # Only the exact strings '<br>' and '<p>' are replaced; tags carrying
    # attributes are left for the re-parse below to strip.
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')

    # NOTE(review): ``unescape`` is defined outside this block. If it decodes
    # '&lt;'/'&gt;', escaped markup becomes real tags and is removed by the
    # re-parse below -- confirm that this is intended.
    htmls = unescape(htmls)

    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub('\n{2,}', '\n', htmls)

    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the plain text of a nested table row below ``id="bgdiv"``.

    The first node matched by the XPath is serialised back to markup, literal
    ``<br>`` and ``<p>`` tags are turned into newlines, runs of newlines are
    collapsed, and the result is re-parsed so that all remaining tags are
    dropped.

    Args:
        inf: HTML markup, passed straight to ``html.document_fromstring``.

    Returns:
        The text content of the first matching ``tr`` element.

    Raises:
        IndexError: if the XPath matches nothing in the document.
    """
    # NOTE(review): the path contains ``tbody`` steps. Browsers insert
    # <tbody> automatically, but the lxml HTML parser is not expected to, so
    # this presumably only matches pages whose markup spells <tbody> out
    # explicitly -- confirm against a real page.
    xpath = '//*[@id="bgdiv"]/table[2]/tbody/tr[1]/td/table/tbody/tr'
    doc = html.document_fromstring(inf)
    matches = doc.xpath(xpath)
    if not matches:
        # Same exception type as the bare ``[0]`` lookup, but with a message
        # that says what was actually missing.
        raise IndexError('no element matches %s' % xpath)

    # Serialise to text rather than the default ASCII bytes: on Python 3 the
    # str-based replace() calls below would raise TypeError on a bytes object.
    htmls = html.tostring(matches[0], pretty_print=False, encoding='unicode')

    # Only the exact strings '<br>' and '<p>' are replaced; tags carrying
    # attributes are left for the re-parse below to strip.
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')

    # NOTE(review): as written this swaps a space for a space (a no-op); it
    # presumably targeted a non-breaking space -- confirm the intended literal.
    htmls = htmls.replace(' ', ' ')

    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub('\n{2,}', '\n', htmls)

    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()
def htm2txt(inf):
    """Extract the plain text of the ``id="content"`` element of an HTML page.

    The element is serialised back to markup, literal ``<br>`` and ``<p>``
    tags are turned into newlines, runs of newlines are collapsed, and the
    result is re-parsed so that all remaining tags are dropped.

    Args:
        inf: HTML markup, passed straight to ``html.document_fromstring``.

    Returns:
        The text content of the first element whose ``id`` is ``content``.

    Raises:
        IndexError: if the document has no element with ``id="content"``.
    """
    xpath = '//*[@id="content"]'
    doc = html.document_fromstring(inf)
    matches = doc.xpath(xpath)
    if not matches:
        # Same exception type as the bare ``[0]`` lookup, but with a message
        # that says what was actually missing.
        raise IndexError('no element matches %s' % xpath)

    # Serialise to text rather than the default ASCII bytes: on Python 3 the
    # str-based replace() calls below would raise TypeError on a bytes object.
    htmls = html.tostring(matches[0], pretty_print=False, encoding='unicode')

    # Only the exact strings '<br>' and '<p>' are replaced; tags carrying
    # attributes are left for the re-parse below to strip.
    htmls = htmls.replace('<br>', '\n')
    htmls = htmls.replace('<p>', '\n')

    # NOTE(review): as written this swaps a space for a space (a no-op); it
    # presumably targeted a non-breaking space -- confirm the intended literal.
    htmls = htmls.replace(' ', ' ')

    # Collapse every run of two or more newlines into a single newline.
    htmls = re.sub('\n{2,}', '\n', htmls)

    newdoc = html.document_fromstring(htmls)
    return newdoc.text_content()