Esempio n. 1
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Create ``WikiAnchor`` entries in ``pack`` from NIF link statements.

        The statements are first grouped by the character range they refer
        to. For every range, the ``type`` info is checked against the known
        anchor types (``Phrase`` and ``Word``), and the ``taIdentRef`` info is
        resolved, through ``self._redirects`` when applicable, into the target
        page of a newly created anchor.

        Args:
            pack: The data pack whose text the anchor offsets refer to.
            statements: An iterable of ``(nif_range, rel, info)`` triples.
        """
        link_grouped: DefaultDict[
            str, Dict[str, rdflib.term.Node]
        ] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, "char")
            r = get_resource_fragment(rel)
            # Statements without a usable range or relation cannot be
            # attached to any anchor, so they are ignored.
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        for range_, link_infos in link_grouped.items():
            # The range key has the form "<begin>,<end>".
            begin, end = [int(d) for d in range_.split(",")]

            text_length = len(pack.text)
            if end > text_length:
                # Some NIF datasets are off by a bit, mostly when there are
                # new line characters. We cannot correct them, but we need to
                # make sure the anchors do not extend past the text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.",
                    end,
                )
                end = text_length

            if end <= begin:
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            for info_key, info_value in link_infos.items():
                info_value = str(info_value)
                if info_key == "type":
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ("Phrase", "Word"):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == "taIdentRef":
                    target_page_name = get_resource_name(info_value)
                    if (
                        target_page_name is not None
                        and target_page_name in self._redirects
                    ):
                        target_page_name = self._redirects[target_page_name]

                    if target_page_name is not None:
                        # Only create anchor with proper link.
                        # NOTE(review): there is no explicit add_entry call
                        # here; presumably constructing the entry registers
                        # it with the pack - confirm.
                        anchor = WikiAnchor(pack, begin, end)
                        anchor.target_page_name = target_page_name
                        # If it is a DBpedia resource, the domain will be
                        # truncated, otherwise it will stay the same, meaning
                        # it is an external link.
                        anchor.is_external = target_page_name == info_value
Esempio n. 2
0
    def add_wiki_info(self, pack: DataPack, statements: List):
        """Create ``WikiAnchor`` entries in ``pack`` from NIF link statements.

        The statements are first grouped by the character range they refer
        to. One anchor is created for every valid range; its
        ``target_page_name`` is assigned only when the range carries a
        ``taIdentRef`` info, after resolving it through ``self._redirects``.

        Args:
            pack: The data pack whose text the anchor offsets refer to.
            statements: An iterable of ``(nif_range, rel, info)`` triples.
        """
        link_grouped: DefaultDict[str,
                                  Dict[str,
                                       rdflib.term.Node]] = defaultdict(dict)
        for nif_range, rel, info in statements:
            range_ = get_resource_attribute(nif_range, 'char')
            r = get_resource_fragment(rel)
            # Statements without a usable range or relation cannot be
            # attached to any anchor, so they are ignored.
            if range_ is not None and r is not None:
                link_grouped[range_][r] = info

        for range_, link_infos in link_grouped.items():
            # The range key has the form "<begin>,<end>".
            begin, end = [int(d) for d in range_.split(',')]

            if end > len(pack.text):
                # Some NIF datasets are off by a bit, mostly when there are
                # new line characters. We cannot correct them, but we need to
                # make sure the anchors do not extend past the text.
                logging.info(
                    "Provided anchor end is %d, "
                    "clipped to fit with the text.", end)
                end = len(pack.text)

            if end <= begin:
                logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
                continue

            # NOTE(review): there is no explicit add_entry call here;
            # presumably constructing the entry registers it with the pack -
            # confirm.
            anchor = WikiAnchor(pack, begin, end)
            for info_key, info_value in link_infos.items():
                if info_key == 'type':
                    anchor_type = get_resource_fragment(info_value)
                    if anchor_type not in ('Phrase', 'Word'):
                        logging.warning("Unknown anchor type: %s", info_value)
                elif info_key == 'taIdentRef':
                    target_page_name = get_resource_name(info_value)
                    if (target_page_name is not None
                            and target_page_name in self._redirects):
                        target_page_name = self._redirects[target_page_name]
                    # May assign None when no page name could be extracted.
                    anchor.target_page_name = target_page_name
Esempio n. 3
0
    def _process(self, input_pack: DataPack):
        """Propagate existing wiki anchors to other matching text spans.

        The text of every ``WikiAnchor`` already in ``input_pack`` is
        registered as a case-sensitive keyword. Every occurrence of such a
        keyword in the pack text that is not already an anchor span gets a
        new ``WikiAnchor`` whose ``target_page_name`` and ``is_external`` are
        copied from an existing anchor with the same text.

        Args:
            input_pack: The data pack to read anchors from and add new
                anchors to.
        """
        matcher = KeywordProcessor(case_sensitive=True)
        anchors_by_text = {}
        known_spans = set()

        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            matcher.add_keyword(anchor.text)
            known_spans.add((anchor.span.begin, anchor.span.end))
            # Group the anchors sharing the same surface text.
            anchors_by_text.setdefault(anchor.text, []).append(anchor)

        matches = matcher.extract_keywords(input_pack.text, span_info=True)
        for keyword, start, stop in matches:
            candidates = anchors_by_text[keyword]

            # Ignore existing anchors.
            if (start, stop) in known_spans:
                continue

            if not candidates:
                raise RuntimeError(
                    f"Unknown target length {len(candidates)}")

            # Default to the first candidate; with several candidates, the
            # last one (in iteration order) that begins before this match
            # takes precedence.
            source: WikiAnchor = candidates[0]
            if len(candidates) > 1:
                for candidate in candidates:
                    if candidate.begin < start:
                        source = candidate

            propagated = WikiAnchor(input_pack, start, stop)
            propagated.target_page_name = source.target_page_name
            propagated.is_external = source.is_external
            input_pack.add_entry(propagated)
Esempio n. 4
0
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Add one ``WikiAnchor`` to ``pack`` per character range in the links.

    The statements are grouped by the character range they refer to. For each
    range an anchor is created, its ``type`` info is checked against the
    known anchor types (``Phrase`` and ``Word``), and its ``taIdentRef`` info
    is resolved through ``redirects`` and stored as the target page name.

    Args:
        pack: The data pack the anchors are added to.
        text_link_statements: An iterable of ``(nif_range, rel, info)``
            triples.
        redirects: Mapping from a page name to the page it redirects to.
    """
    link_grouped: DefaultDict[str, Dict[str,
                                        rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        if range_ is None:
            # Without a character range there is nothing to anchor, and a
            # None key would make the range parsing below fail.
            continue
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        # The range key has the form "<begin>,<end>".
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)