def convertToText(self, article):
    """Return the formatted text of ``self.topNode`` with blank lines removed.

    The text produced by ``Parser.getFormattedText`` is split on newlines.
    A line is kept only if it contains at least one character other than
    a plain space or the 0xA0 (non-breaking space) character; every kept
    line is stripped of surrounding whitespace and the survivors are
    re-joined with single newlines.

    ``Parser.adjustTopNode(article)`` is invoked after the text has been
    assembled and before it is returned.
    NOTE(review): what that call changes is not visible here -- confirm
    against the Parser implementation.
    """
    raw = Parser.getFormattedText(self.topNode)
    kept = [
        row.strip()
        for row in raw.split(u'\n')
        # Keep only rows with something besides spaces / 0xA0 in them.
        if re.search('[^ \xa0]', row)
    ]
    result = u'\n'.join(kept)
    Parser.adjustTopNode(article)
    return result
def convertToText(self, article):
    """Build the plain-text body from the nodes yielded by ``getTopNode``.

    Steps, in order:

    1. For each node, take ``Parser.getFormattedText(node)``; nodes whose
       text is falsy are skipped. The rest is HTML-unescaped and passed
       through ``innerTrim``.
    2. The pieces are joined with newlines and every U+FFFC character is
       replaced by a newline.
    3. If the result has more than four lines, the first of the leading
       four lines that equals ``article.h1`` or ``article.title`` is
       removed (at most one line is removed).
    4. Lines containing nothing but spaces, tabs or carriage returns
       (including empty lines) are dropped.

    Returns the remaining lines, each terminated by a newline; an empty
    string when no line survives.
    """
    pieces = []
    for node in list(self.getTopNode()):
        formatted = Parser.getFormattedText(node)
        if not formatted:
            continue
        pieces.append(innerTrim(HTMLParser().unescape(formatted)))

    joined = re.sub(u'[\ufffc]', '\n', '\n'.join(pieces))
    rows = joined.split('\n')

    # Cut the title from the article text if it shows up in the first
    # 4 rows; only attempted when there are more than 4 rows in total.
    if len(rows) > 4:
        for idx, row in enumerate(rows[:4]):
            if row == article.h1 or row == article.title:
                del rows[idx]
                break

    # Keep rows that contain something other than space, tab or CR.
    return ''.join(
        row + '\n' for row in rows if re.search('[^ \t\r]', row)
    )