Ejemplo n.º 1
0
    def extract(self):
        """Collect author names from the article document.

        Every entry of ``self.config.known_author_patterns`` is used to look
        up candidate elements in ``self.article.doc``.  For each matching
        element the name is read from one of three places:

        * the first element matching ``known_tag.subpattern`` inside the
          match, when the pattern defines a subpattern;
        * the attribute named by ``known_tag.content``, when that is not
          ``None``;
        * otherwise the matched element's own text content.

        Returns:
            list: Unique author names (each passed through ``innerTrim``)
            in the order they were first encountered.  Empty names are
            skipped.
        """
        authors = []
        seen = set()

        def remember(raw_name):
            # De-duplicate while preserving first-seen order: iterating a
            # set of strings gives no stable order between runs, so the
            # result is built as a list with a companion membership set.
            if not raw_name:
                return
            name = innerTrim(raw_name)
            if name and name not in seen:
                seen.add(name)
                authors.append(name)

        for known_tag in self.config.known_author_patterns:
            meta_tags = self.parser.getElementsByTag(self.article.doc,
                                                     attr=known_tag.attr,
                                                     value=known_tag.value,
                                                     tag=known_tag.tag)
            if not meta_tags:
                continue

            for meta_tag in meta_tags:
                if known_tag.subpattern:
                    name_nodes = self.parser.getElementsByTag(
                        meta_tag,
                        attr=known_tag.subpattern.attr,
                        value=known_tag.subpattern.value,
                        tag=known_tag.subpattern.tag)
                    # Only the first nested match is used as the name.
                    if name_nodes:
                        remember(self.parser.getText(name_nodes[0]))
                elif known_tag.content is None:
                    remember(meta_tag.text_content().strip())
                else:
                    remember(self.parser.getAttribute(
                        meta_tag, known_tag.content))
        return authors
Ejemplo n.º 2
0
    def convert_to_text(self):
        """Render the nodes under the top node as plain text.

        The text of each node obtained by iterating ``self.get_top_node()``
        is HTML-unescaped, cleaned with ``innerTrim`` and split on the
        literal ``\\n`` marker; the resulting pieces are joined with blank
        lines.  ``<blockquote class="twitter-tweet">`` nodes have their
        links removed and are wrapped in ``$tweet_begin$`` /
        ``$tweet_end$`` markers.

        When ``self.config.parse_lists`` is set, bullet items (``•``) are
        put on their own lines; ``self.config.pretty_lists`` decides
        whether the bullet character is kept.

        Returns:
            str: The formatted article text.
        """
        txts = []
        for node in list(self.get_top_node()):
            txt = self.parser.getText(node)
            if not txt:
                continue
            txt = html.unescape(txt)
            if (node.tag == 'blockquote'
                    and node.attrib.get('class') == 'twitter-tweet'):
                # Strip links BEFORE adding the markers: the patterns
                # consume everything up to the next space, so a tweet
                # ending in a URL would otherwise swallow '$tweet_end$'.
                txt = re.sub(r'https?:\/\/[^ ]*', '', txt)
                txt = re.sub(r'pic\.twitter[^ ]*', '', txt)
                txt = '$tweet_begin$' + txt + '$tweet_end$'
            # r'\n' is the two-character sequence backslash + 'n', not a
            # newline character.
            txts.extend(innerTrim(txt).split(r'\n'))
        text = '\n\n'.join(txts)
        # ensure no double newlines at the beginning of lists
        if self.config.parse_lists:
            # Split out the list items and strip surrounding whitespace
            # from each of them.
            items = [x.strip() for x in text.replace('\n•', '•').split('• ')]

            if self.config.pretty_lists:
                text = '\n• '.join(items)
            else:
                text = '\n'.join(items)
        return text
Ejemplo n.º 3
0
 def convert_to_text(self):
     """Return the text of the top node's items as one string.

     Each node obtained by iterating ``self.get_top_node()`` contributes
     its text (via ``self.parser.getText``); nodes without text are
     skipped.  The text is unescaped, cleaned with ``innerTrim`` and split
     on the literal ``\\n`` marker, and all pieces are joined with blank
     lines.
     """
     # NOTE(review): if ``_text_parser`` is an ``html.parser.HTMLParser``,
     # its ``unescape`` method no longer exists on Python 3.9+ -- confirm.
     pieces = []
     for child in list(self.get_top_node()):
         raw = self.parser.getText(child)
         if not raw:
             continue
         cleaned = innerTrim(self._text_parser.unescape(raw))
         # r'\n' is backslash + 'n' (two characters), not a newline.
         pieces += cleaned.split(r'\n')
     return '\n\n'.join(pieces)
Ejemplo n.º 4
0
    def convert_to_text(self):
        """Render the nodes under the top node as plain text.

        The text of every node obtained by iterating
        ``self.get_top_node()`` is HTML-unescaped, cleaned with
        ``innerTrim`` and split on the literal ``\\n`` marker; the pieces
        are joined with blank lines.  When ``self.config.parse_lists`` is
        set, bullet items (``•``) are placed on their own lines, keeping
        the bullet only if ``self.config.pretty_lists`` is set.

        Returns:
            str: The formatted text.
        """
        fragments = []
        for child in list(self.get_top_node()):
            raw = self.parser.getText(child)
            if not raw:
                continue
            # r'\n' is backslash + 'n' (two characters), not a newline.
            fragments += innerTrim(html.unescape(raw)).split(r'\n')
        text = '\n\n'.join(fragments)

        if not self.config.parse_lists:
            return text

        # Avoid double newlines in front of list items, then strip each
        # item so no stray whitespace surrounds it.
        items = [
            item.strip()
            for item in text.replace('\n•', '•').split('• ')
        ]
        separator = '\n• ' if self.config.pretty_lists else '\n'
        return separator.join(items)
Ejemplo n.º 5
0
 def getText(cls, node):
     """Return the text of ``node`` as a single cleaned string.

     The fragments yielded by ``node.itertext()`` are joined with single
     spaces, stripped of surrounding whitespace, and passed through
     ``innerTrim``.
     """
     joined = ' '.join(node.itertext())
     return innerTrim(joined.strip())