def convert_to_text(top_node):
    """Render the direct children of ``top_node`` as plain text.

    The text of each child is obtained through ``parser.get_text``; children
    that yield no text are skipped. HTML character references in the
    remaining text are decoded, the result is passed through ``inner_trim``,
    and the pieces are joined with a blank line between them.

    Args:
        top_node: An iterable of child nodes (presumably an lxml or
            ElementTree element -- confirm against callers).

    Returns:
        The collected text with one blank line between consecutive pieces,
        or an empty string when no child produced any text.
    """
    try:
        # HTMLParser.unescape() was deprecated in Python 3.4 and removed in
        # 3.9; html.unescape() is the documented replacement.
        from html import unescape
    except ImportError:
        # Interpreters without html.unescape: keep the legacy call.
        unescape = HTMLParser().unescape

    paragraphs = []
    # list() takes a snapshot of the children before iterating.
    for node in list(top_node):
        text = parser.get_text(node)
        if not text:
            continue
        paragraphs.append(inner_trim(unescape(text)))
    return '\n\n'.join(paragraphs)
def get_text(node):
    """Return the text content of ``node`` as a single string.

    Every fragment yielded by ``node.itertext()`` is joined with a single
    space, leading and trailing whitespace is stripped, and the result is
    passed through ``inner_trim``.
    """
    fragments = node.itertext()
    joined = u' '.join(fragments)
    return inner_trim(joined.strip())