def add_wiki_info(self, pack: DataPack, statements: List):
    """Build ``WikiAnchor`` objects over ``pack`` from link statements.

    Each statement is unpacked as a ``(nif_range, relation, value)`` triple.
    Statements are grouped by the ``char`` attribute of their range, which
    is parsed as ``"begin,end"``. For every group a ``WikiAnchor`` is
    constructed per ``taIdentRef`` entry that resolves to a page name
    (after applying ``self._redirects``).

    Args:
        pack: The data pack whose text the anchors refer to.
        statements: Iterable of ``(nif_range, relation, value)`` triples.

    Raises:
        ValueError: If a range string does not consist of exactly two
            comma separated integers.
    """
    # Gather relation -> value pairs under the character range they describe.
    infos_by_range: DefaultDict[
        str, Dict[str, rdflib.term.Node]
    ] = defaultdict(dict)
    for nif_range, rel, info in statements:
        char_range = get_resource_attribute(nif_range, "char")
        relation = get_resource_fragment(rel)
        if char_range is None or relation is None:
            continue
        infos_by_range[char_range][relation] = info

    for char_range, infos in infos_by_range.items():
        begin, end = map(int, char_range.split(","))

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are off by a bit, mostly around new line
            # characters. They cannot be corrected here, but the anchor
            # must never extend past the end of the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.",
                end,
            )
            end = text_length

        if begin >= end:
            logging.info("Provided anchor [%d:%d is invalid.]", begin, end)
            continue

        for key, raw_value in infos.items():
            value = str(raw_value)

            if key == "type":
                anchor_type = get_resource_fragment(value)
                is_known_type = (
                    anchor_type == "Phrase" or anchor_type == "Word"
                )
                if not is_known_type:
                    logging.warning("Unknown anchor type: %s", value)
            elif key == "taIdentRef":
                page_name = get_resource_name(value)
                if page_name is not None and page_name in self._redirects:
                    page_name = self._redirects[page_name]

                if page_name is None:
                    # Only create an anchor when there is a proper link.
                    continue

                anchor = WikiAnchor(pack, begin, end)
                anchor.target_page_name = page_name
                # For a DBpedia resource the domain gets truncated by the
                # name extraction; if the name is unchanged from the raw
                # value, the link is treated as external.
                anchor.is_external = page_name == value
def add_wiki_info(self, pack: DataPack, statements: List):
    """Build one ``WikiAnchor`` per valid character range in ``statements``.

    Each statement is unpacked as a ``(nif_range, relation, value)`` triple
    and grouped by the ``char`` attribute of its range (``"begin,end"``).
    An anchor is constructed for every range that is non-empty after
    clipping to the text length; a ``taIdentRef`` entry then sets the
    anchor's ``target_page_name`` (redirected through ``self._redirects``
    when applicable, and possibly ``None``).

    Args:
        pack: The data pack whose text the anchors refer to.
        statements: Iterable of ``(nif_range, relation, value)`` triples.

    Raises:
        ValueError: If a range string does not consist of exactly two
            comma separated integers.
    """
    grouped: DefaultDict[str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in statements:
        span_key = get_resource_attribute(nif_range, 'char')
        relation = get_resource_fragment(rel)
        if span_key is None or relation is None:
            # Statements without a usable range or relation are ignored.
            continue
        grouped[span_key][relation] = info

    for span_key, infos in grouped.items():
        begin, end = map(int, span_key.split(','))

        text_length = len(pack.text)
        if end > text_length:
            # Some NIF datasets are off by a bit, mostly around new line
            # characters. They cannot be corrected here, but the anchor
            # must never extend past the end of the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = text_length

        if begin >= end:
            logging.info("Provided anchor [%d:%d is invalid.]", begin, end)
            continue

        # The anchor is created up front, whether or not a link follows.
        anchor = WikiAnchor(pack, begin, end)
        for key, value in infos.items():
            if key == 'type':
                anchor_type = get_resource_fragment(value)
                is_known_type = (
                    anchor_type == 'Phrase' or anchor_type == 'Word'
                )
                if not is_known_type:
                    logging.warning("Unknown anchor type: %s", value)
            elif key == 'taIdentRef':
                page_name = get_resource_name(value)
                if page_name is not None and page_name in self._redirects:
                    page_name = self._redirects[page_name]
                anchor.target_page_name = page_name
def _process(self, input_pack: DataPack):
    """Propagate existing anchor links to other mentions of the same text.

    Every ``WikiAnchor`` already in ``input_pack`` contributes its text as
    a case-sensitive keyword. Each keyword match in the pack text whose
    span is not one of the original anchor spans gets a new ``WikiAnchor``
    that copies ``target_page_name`` and ``is_external`` from an existing
    anchor with the same text, and is added to the pack.

    When several anchors share the text, the source is the last one (in
    iteration order) that begins before the match, falling back to the
    first one.

    Args:
        input_pack: The data pack to read anchors from and add anchors to.

    Raises:
        RuntimeError: If a matched keyword has an empty anchor list.
    """
    keyword_finder = KeywordProcessor(case_sensitive=True)
    anchors_by_text = {}
    known_spans = set()

    for anchor in input_pack.get(WikiAnchor):
        surface = anchor.text
        keyword_finder.add_keyword(surface)
        known_spans.add((anchor.span.begin, anchor.span.end))
        anchors_by_text.setdefault(surface, []).append(anchor)

    matches = keyword_finder.extract_keywords(input_pack.text, span_info=True)
    for keyword, begin, end in matches:
        candidates = anchors_by_text[keyword]

        if (begin, end) in known_spans:
            # This span is already an anchor; nothing to add.
            continue

        candidate_count = len(candidates)
        if candidate_count == 0:
            raise RuntimeError(f"Unknown target length {len(targets)}"
                               if False else
                               f"Unknown target length {candidate_count}")

        source = candidates[0]
        if candidate_count > 1:
            # Prefer the last listed anchor that starts before this match.
            for candidate in candidates:
                if candidate.begin < begin:
                    source = candidate

        new_anchor = WikiAnchor(input_pack, begin, end)
        new_anchor.target_page_name = source.target_page_name
        new_anchor.is_external = source.is_external
        input_pack.add_entry(new_anchor)
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Add a ``WikiAnchor`` to ``pack`` for each linked character range.

    Each statement is unpacked as a ``(nif_range, relation, value)`` triple
    and grouped by the ``char`` attribute of its range, which is parsed as
    ``"begin,end"``. One anchor is created and added per range; a
    ``taIdentRef`` entry sets its target page name, replaced by the entry
    in ``redirects`` when the name is a key there.

    Statements for which ``get_resource_attribute`` returns ``None`` are
    skipped, so that one range-less statement does not abort the whole
    pack with an ``AttributeError``.

    Args:
        pack: The data pack the anchors are added to.
        text_link_statements: Iterable of ``(nif_range, relation, value)``
            triples.
        redirects: Mapping from a page name to the page name to use instead.

    Raises:
        ValueError: If a range string does not consist of exactly two
            comma separated integers.
    """
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]
    ] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        if range_ is None:
            # Without a character range there is nothing to anchor, and
            # ``range_.split`` below would fail on ``None``.
            continue
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)

        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)

        pack.add_entry(anchor)