def test_title_parse(self):
        """Check parse_title_if_wikipedia on a table of link targets."""
        parse_title = html_anchor_parser.parse_title_if_wikipedia

        # Each entry pairs a link target with the expected (language, title)
        # tuple. Entries are checked in order and the first mismatch fails.
        expectations = [
            # Missing w/wikipedia identifier.
            (":woosh:nomatch", (None, None)),
            # Empty title.
            ("w:", (None, None)),
            # Accepted prefix spellings: short form, mixed case, leading colon.
            ("w:APage", (None, "APage")),
            ("wikiPEDIA:APage", (None, "APage")),
            (":wikipedia:APage", (None, "APage")),
            # Non-existent language identifier stays part of the title.
            ("w:xxxx:APage", (None, "xxxx:APage")),
            # Valid language identifier.
            ("w:es:APage", ("es", "APage")),
            # Implicitly just a title even though it matches language.
            ("w:es:", (None, "es:")),
        ]

        for link_target, expected in expectations:
            self.assertTupleEqual(expected, parse_title(link_target))
Esempio n. 2
0
    def parse_doc(self, wiki_doc, docid):
        """Parse wiki_doc to produce a text document and a set of mention spans.

        The document is first passed through `self._truncate_end` and then fed
        to a `WikiExtractorHTMLParser`. Only mentions whose link target is
        recognized by `parse_title_if_wikipedia` (i.e. yields a title) are kept.

        Args:
          wiki_doc: Markup of a single page, as accepted by the parser's `feed`.
          docid: Identifier of the page; stored on every kept mention and used
            in log messages.

        Returns:
          A tuple `(output_text, mentions, tags)` where `output_text` is the
          parser's output (empty string if parsing raised ValueError),
          `mentions` is a `pd.DataFrame` built from the kept mention dicts
          (each extended with "docid" and "url", with "target" removed), and
          `tags` is the parser's `_tags` attribute.
        """
        output_text = ""
        logging.debug("Parsing doc [%s]", docid)
        wiki_doc = self._truncate_end(wiki_doc)

        markup_parser = html_anchor_parser.WikiExtractorHTMLParser()
        raw_mentions = []
        try:
            markup_parser.feed(wiki_doc)
            output_text = markup_parser.output
            raw_mentions = markup_parser.mentions
        except ValueError:
            # Ignore all mentions from these pages. `raw_mentions` is only
            # assigned by the last statement of the try body, so it is still
            # empty here; report what the parser had collected before failing
            # instead of a count that would always be zero.
            num_dropped = len(getattr(markup_parser, "mentions", ()))
            logging.warning("Ignore %d mentions due to parse fail [%s]",
                            num_dropped, docid)

        # Filter raw mentions to ones that land on Wikipedia pages and add
        # page-level metadata.
        final_mentions = []
        for mention_dict in raw_mentions:
            # The raw link target is replaced by the resolved "url" below.
            link_target = mention_dict.pop("target")

            # Limit mentions to links starting with wikipedia prefix.
            lang, title = html_anchor_parser.parse_title_if_wikipedia(
                link_target)
            if not title:  # need at least title
                logging.debug("Skip anchor with target: %s", link_target)
                continue

            # Default to the wiki's language when the link target itself does not
            # provide a language code.
            lang = lang or self._language

            mention_dict["docid"] = docid
            mention_dict["url"] = title_to_url(title, lang)  # entity target
            final_mentions.append(mention_dict)

        return output_text, pd.DataFrame(final_mentions), markup_parser._tags