def convertToText(self):
    """Render the top node and its direct children as plain text.

    The top node's own ``.text`` is emitted first, but only when it contains
    at least one character other than space, tab, CR or LF.  Then each direct
    child of the top node contributes the text returned by
    ``Parser.getText`` (children yielding a falsy value are skipped).
    Every fragment is passed through ``HTMLParser().unescape`` and then
    ``innerTrim``.

    Returns:
        The collected fragments joined by two newline characters.
    """
    pieces = []

    # Leading text that sits directly inside the top node, before any child.
    root = self.getTopNode()
    lead = root.text
    if lead and re.search('[^ \t\r\n]', lead):
        pieces.append(innerTrim(HTMLParser().unescape(lead)))

    # list(...) snapshots the children before iterating over them.
    for child in list(self.getTopNode()):
        raw = Parser.getText(child)
        if not raw:
            continue
        pieces.append(innerTrim(HTMLParser().unescape(raw)))

    return '\n\n'.join(pieces)
def getTextAndWriteToFile(self, node):
    """Return the trimmed text of *node* and dump its raw fragments to a file.

    All text fragments yielded by ``node.itertext()`` are written, in order
    and without any separator, to ``log.txt`` (relative path, opened in
    write mode with UTF-8 encoding, so any previous content is replaced).

    Args:
        node: An object exposing ``itertext()`` that yields text fragments.

    Returns:
        The fragments joined by single spaces, stripped, then passed through
        ``innerTrim``.
    """
    txts = list(node.itertext())
    # The context manager guarantees the handle is closed even if a write
    # fails (e.g. on an encoding error); the original leaked it in that case.
    with codecs.open("log.txt", 'wb', 'utf-8') as f:
        for line in txts:
            f.write(line)
    return innerTrim(u' '.join(txts).strip())
def convertToText(self):
    """Render the direct children of the top node as plain text.

    For each direct child of ``self.getTopNode()`` the text returned by
    ``Parser.getText`` is taken; falsy results are skipped.  Each remaining
    fragment is passed through ``HTMLParser().unescape`` and ``innerTrim``.

    Returns:
        The processed fragments joined by two newline characters.
    """
    # Lazy pipeline: each child is fetched, unescaped and trimmed in turn,
    # in the same order as an explicit loop would do it.
    raw_texts = (Parser.getText(child) for child in list(self.getTopNode()))
    cleaned = (
        innerTrim(HTMLParser().unescape(raw))
        for raw in raw_texts
        if raw
    )
    return '\n\n'.join(cleaned)
def convert_to_text(self):
    """Render the direct children of the top node as blank-line separated text.

    Each direct child of ``self.get_top_node()`` contributes the text
    returned by ``self.parser.getText`` (falsy results are skipped).  The
    text is HTML-unescaped, passed through ``innerTrim`` and then split into
    segments on a marker sequence (see the note in the body).

    Returns:
        All segments joined by two newline characters.
    """
    segments = []
    for child in list(self.get_top_node()):
        raw = self.parser.getText(child)
        if not raw:
            continue
        trimmed = innerTrim(HTMLParser().unescape(raw))
        # NOTE(review): r'\n' is the literal two-character sequence
        # backslash + "n", not a newline. Presumably a line-break marker
        # inserted upstream -- confirm before changing it.
        segments += trimmed.split(r'\n')
    return '\n\n'.join(segments)
def convert_to_text(self):
    """Render the direct children of the top node, separated by ``<br/>``.

    Each direct child of ``self.get_top_node()`` contributes the text
    returned by ``self.parser.getText`` (falsy results are skipped).  The
    text is HTML-unescaped, passed through ``innerTrim`` and then split into
    segments on a marker sequence (see the note in the body).

    Returns:
        All segments joined by the string ``<br/>``.
    """
    collected = []
    children = list(self.get_top_node())
    for child in children:
        node_text = self.parser.getText(child)
        if not node_text:
            continue
        unescaped = HTMLParser().unescape(node_text)
        # NOTE(review): the separator is the literal backslash + "n" pair
        # (raw string), not a newline character. Presumably a marker placed
        # upstream -- verify against the code that produces it.
        collected.extend(innerTrim(unescaped).split(r'\n'))
    return '<br/>'.join(collected)
def convertToText(self, article):
    """Render the top node's children as newline-terminated lines of text.

    Steps, in order:

    1. For every direct child of ``self.getTopNode()`` take
       ``Parser.getFormattedText``; skip falsy results; HTML-unescape and
       ``innerTrim`` the rest; join them with a newline.
    2. Replace every U+FFFC character with a newline and split into lines.
    3. If there are more than four lines, look at the first four and delete
       the first one equal to ``article.h1`` or ``article.title``.
    4. Drop lines made up only of spaces, tabs and carriage returns.

    Args:
        article: Object whose ``h1`` and ``title`` attributes are compared
            against the leading lines.

    Returns:
        The kept lines concatenated, each followed by a newline (an empty
        string when no line is kept).
    """
    fragments = []
    for child in list(self.getTopNode()):
        raw = Parser.getFormattedText(child)
        if raw:
            fragments.append(innerTrim(HTMLParser().unescape(raw)))

    joined = re.sub(u'[\ufffc]', '\n', '\n'.join(fragments))
    rows = joined.split('\n')

    # Cut the title from the article text if found in the first 4 rows.
    if len(rows) > 4:
        for idx in range(4):
            candidate = rows[idx]
            if candidate == article.h1 or candidate == article.title:
                # Only the first match is removed; stop right after deleting.
                del rows[idx]
                break

    return ''.join(
        row + '\n' for row in rows if re.search('[^ \t\r]', row)
    )
def getText(self, node):
    """Return the text content of *node* as a single trimmed string.

    Args:
        node: An object exposing ``itertext()`` that yields text fragments.

    Returns:
        The fragments joined by single spaces, stripped of surrounding
        whitespace, then passed through ``innerTrim``.
    """
    fragments = list(node.itertext())
    joined = u' '.join(fragments)
    return innerTrim(joined.strip())
def clean(self, node):
    """Serialize *node* to HTML, clean the markup and trim the result.

    Args:
        node: The node handed to ``self.parser.nodeToString``.

    Returns:
        The output of ``self.clean_html`` applied to the serialized node,
        passed through ``innerTrim``.
    """
    serialized = self.parser.nodeToString(node, method='html')
    return innerTrim(self.clean_html(serialized))
def getTextAndShowInConsole(self, node):
    """Return the trimmed text of *node*, echoing its raw fragments to stdout.

    Args:
        node: An object exposing ``itertext()`` that yields text fragments.

    Returns:
        The fragments joined by single spaces, stripped, then passed through
        ``innerTrim``.
    """
    txts = list(node.itertext())
    # Parenthesized form prints the same list repr under the Python 2 print
    # statement and is also valid as a Python 3 function call; the bare
    # ``print txts`` statement is a syntax error on Python 3.
    print(txts)
    return innerTrim(u' '.join(txts).strip())
def getText(self, node):
    """Return the text content of *node* as a single trimmed string.

    Args:
        node: An object exposing ``itertext()`` that yields text fragments.

    Returns:
        The fragments joined by single spaces and passed through
        ``innerTrim`` (no separate strip is applied before that call).
    """
    joined = u" ".join(node.itertext())
    return innerTrim(joined)