def extract(self):
    """Collect the distinct author names found in the article document.

    Every pattern in ``self.config.known_author_patterns`` is looked up in
    ``self.article.doc``.  For each matching element the candidate name is
    taken from exactly one source:

    * the text of the first element found under it for
      ``pattern.subpattern``, when a subpattern is configured;
    * the element's own stripped text content, when ``pattern.content``
      is ``None``;
    * the attribute named by ``pattern.content`` otherwise.

    Candidates are passed through ``innerTrim``.  Blank candidates are
    discarded on every path, so an empty string is never reported as an
    author.

    Returns:
        list: the unique author names, in no particular order (they are
        accumulated in a set).
    """
    authors = set()

    for known_tag in self.config.known_author_patterns:
        meta_tags = self.parser.getElementsByTag(
            self.article.doc,
            attr=known_tag.attr,
            value=known_tag.value,
            tag=known_tag.tag,
        )
        if not meta_tags:
            continue

        subpattern = known_tag.subpattern
        for meta_tag in meta_tags:
            if subpattern:
                name_nodes = self.parser.getElementsByTag(
                    meta_tag,
                    attr=subpattern.attr,
                    value=subpattern.value,
                    tag=subpattern.tag,
                )
                if not name_nodes:
                    continue
                # Only the first sub-match is considered.
                name = self.parser.getText(name_nodes[0])
            elif known_tag.content is None:
                name = meta_tag.text_content().strip()
            else:
                name = self.parser.getAttribute(meta_tag, known_tag.content)

            # Nothing usable (missing attribute, empty text): skip early.
            if not name:
                continue

            # Trimming can itself leave nothing behind, so check again.
            name = innerTrim(name)
            if name:
                authors.add(name)

    return list(authors)
def convert_to_text(self):
    """Render the nodes of the top node as plain text.

    Each node obtained by iterating ``self.get_top_node()`` is turned into
    text with ``self.parser.getText``; nodes without text are skipped.
    HTML entities are unescaped and the result is passed through
    ``innerTrim``.

    A ``blockquote`` whose ``class`` attribute is exactly
    ``twitter-tweet`` gets special treatment: ``http(s)://`` links and
    ``pic.twitter`` links are removed and the remaining text is wrapped
    in ``$tweet_begin$`` / ``$tweet_end$`` markers.

    The collected pieces are joined with a blank line between them.  When
    ``self.config.parse_lists`` is set, bullet items are then re-flowed
    one per line, keeping the bullet only if ``self.config.pretty_lists``
    is set.

    Returns:
        str: the formatted text (empty when no node produced any text).
    """
    # Compiled once rather than per node; each pattern consumes everything
    # up to the next space.
    url_pattern = re.compile(r'https?:\/\/[^ ]*')
    pic_pattern = re.compile(r'pic\.twitter[^ ]*')

    txts = []
    for node in list(self.get_top_node()):
        txt = self.parser.getText(node)
        if not txt:
            continue
        txt = html.unescape(txt)

        if (node.tag == 'blockquote'
                and node.attrib.get('class') == 'twitter-tweet'):
            # Remove links *before* adding the markers: the patterns run
            # to the next space, so a link at the very end of the tweet
            # would otherwise swallow '$tweet_end$' along with it.
            txt = pic_pattern.sub('', url_pattern.sub('', txt))
            txt = '$tweet_begin$' + txt + '$tweet_end$'

        # NOTE: r'\n' is the two-character sequence backslash + "n",
        # not a newline character.
        txts.extend(innerTrim(txt).split(r'\n'))

    text = '\n\n'.join(txts)

    if self.config.parse_lists:
        # Drop the newline in front of each bullet, split on the bullets
        # and strip every item so none carries stray whitespace.
        items = [item.strip()
                 for item in text.replace('\n•', '•').split('• ')]
        if self.config.pretty_lists:
            text = '\n• '.join(items)
        else:
            text = '\n'.join(items)

    return text
def convert_to_text(self):
    """Build the plain-text rendering of the top node.

    Every node obtained by iterating ``self.get_top_node()`` is converted
    with ``self.parser.getText``.  Nodes that yield no text are ignored;
    the others are unescaped through ``self._text_parser``, passed through
    ``innerTrim`` and split into pieces.

    Returns:
        str: all pieces joined with a blank line between them.
    """
    paragraphs = []
    for element in list(self.get_top_node()):
        raw = self.parser.getText(element)
        if not raw:
            continue
        unescaped = self._text_parser.unescape(raw)
        # NOTE: r'\n' is the two-character sequence backslash + "n",
        # not a newline character.
        paragraphs += innerTrim(unescaped).split(r'\n')
    return '\n\n'.join(paragraphs)
def convert_to_text(self):
    """Build the plain-text rendering of the top node, re-flowing lists.

    Every node obtained by iterating ``self.get_top_node()`` is converted
    with ``self.parser.getText``.  Nodes that yield no text are ignored;
    the others are HTML-unescaped, passed through ``innerTrim`` and split
    into pieces, which are then joined with a blank line between them.

    When ``self.config.parse_lists`` is set, the joined text is split on
    its bullet markers, every item is stripped, and the items are put
    back one per line, with the bullet restored only if
    ``self.config.pretty_lists`` is set.

    Returns:
        str: the formatted text.
    """
    chunks = []
    for element in list(self.get_top_node()):
        raw = self.parser.getText(element)
        if not raw:
            continue
        # NOTE: r'\n' is the two-character sequence backslash + "n",
        # not a newline character.
        chunks += innerTrim(html.unescape(raw)).split(r'\n')

    joined = '\n\n'.join(chunks)
    if not self.config.parse_lists:
        return joined

    # Pull each bullet back onto the end of the previous line, then cut
    # the text at the bullets and trim whatever surrounds each item.
    pieces = joined.replace('\n•', '•').split('• ')
    items = [piece.strip() for piece in pieces]
    separator = '\n• ' if self.config.pretty_lists else '\n'
    return separator.join(items)
def getText(cls, node):
    """Return the text fragments of ``node`` as a single string.

    The fragments yielded by ``node.itertext()`` are joined with single
    spaces, stripped of surrounding whitespace, and the result is passed
    through ``innerTrim``.
    """
    joined = ' '.join(node.itertext())
    return innerTrim(joined.strip())