def add_wiki_info(self, pack: DataPack, statements: List):
    """Add Wikipedia structure annotations (section/paragraph/title).

    Only statements whose relation fragment is ``type`` are used. The
    span is read from the ``char`` attribute of the NIF range, formatted
    as ``"begin,end"``.

    Args:
        pack: The data pack receiving the annotations; ``pack.text``
            bounds the accepted spans.
        statements: Iterable of ``(nif_range, relation, struct_type)``
            triples.
    """
    for nif_range, rel, struct_type in statements:
        if get_resource_fragment(rel) != 'type':
            continue

        range_ = get_resource_attribute(nif_range, 'char')
        if range_ is None:
            # No character range on this resource, so there is no span
            # to annotate. Skip it instead of failing on ``None.split``.
            continue

        begin, end = [int(d) for d in range_.split(',')]

        if end > len(pack.text):
            # Some NIF datasets are off by a few characters, mostly
            # around new line characters. We cannot correct them, but we
            # must make sure the span does not run past the text.
            logging.info("NIF Structure end is %d by %s, "
                         "clipped to fit with the text.", end, nif_range)
            end = len(pack.text)

        if end <= begin:
            logging.info(
                "Provided struct [%d:%d] is invalid.", begin, end)
            continue

        struct_ = get_resource_fragment(struct_type)

        if struct_ == 'Section':
            WikiSection(pack, begin, end)
        elif struct_ == 'Paragraph':
            WikiParagraph(pack, begin, end)
        elif struct_ == 'Title':
            WikiTitle(pack, begin, end)
        else:
            logging.warning("Unknown struct type: %s", struct_type)
def add_wiki_info(self, pack: DataPack, statements: List):
    """Add ``WikiAnchor`` annotations for the links described by statements.

    Statements are first grouped by the ``char`` range of their subject,
    so that all relations about one span are processed together. An
    anchor is only created when a ``taIdentRef`` relation resolves to a
    page name.

    Args:
        pack: The data pack receiving the anchors; ``pack.text`` bounds
            the accepted spans.
        statements: Iterable of ``(nif_range, relation, info)`` triples.
    """
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]
    ] = defaultdict(dict)
    for nif_range, rel, info in statements:
        range_ = get_resource_attribute(nif_range, "char")
        r = get_resource_fragment(rel)
        if range_ is not None and r is not None:
            link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(",")]

        if end > len(pack.text):
            # Some NIF datasets are off by a few characters, mostly
            # around new line characters. We cannot correct them, but we
            # must make sure the span does not run past the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.",
                end,
            )
            end = len(pack.text)

        if end <= begin:
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        for info_key, info_value in link_infos.items():
            info_value = str(info_value)
            if info_key == "type":
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ("Phrase", "Word"):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == "taIdentRef":
                target_page_name = get_resource_name(info_value)
                if (
                    target_page_name is not None
                    and target_page_name in self._redirects
                ):
                    target_page_name = self._redirects[target_page_name]

                if target_page_name is not None:
                    # Only create an anchor when there is a proper link.
                    anchor = WikiAnchor(pack, begin, end)
                    anchor.target_page_name = target_page_name
                    # If it is a DBpedia resource, the domain will be
                    # truncated; otherwise the name stays the same,
                    # meaning it is an external link.
                    anchor.is_external = target_page_name == info_value
def test_grouped_nif_reader(self):
    """Check the relation fragments read per resource, in file order."""
    p = os.path.join(self.data_dir, "nif_context.tql")

    # Maps each resource name to the relation fragments seen for it.
    parsed = {}
    for _, statements in ContextGroupedNIFReader(p):
        for s, v, _ in statements:
            fragment = get_resource_fragment(v)
            parsed.setdefault(get_resource_name(s), []).append(fragment)

    expected = {
        "Animalia_(book)": [
            "type",
            "beginIndex",
            "endIndex",
            "sourceUrl",
            "isString",
            "predLang",
        ],
        "List_of_Atlas_Shrugged_characters": [
            "type",
            "beginIndex",
            "endIndex",
            "sourceUrl",
        ],
    }
    self.assertEqual(parsed, expected)
def _collect(  # type: ignore
    self, nif_context: str
) -> Iterator[Dict[str, str]]:
    """Yield one record per document found in the NIF context file.

    A record is produced for each statement whose subject has the
    ``nif=context`` attribute and whose relation fragment is
    ``isString``, provided both the document name and the ``oldid`` of
    the statement's context can be resolved.

    Args:
        nif_context: Source handed to ``NIFParser``.

    Yields:
        A new dict with the keys ``text``, ``doc_name`` and ``oldid``.
        Each yielded dict is independent, so callers may keep references
        to earlier items without them being overwritten.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

            fragment = get_resource_fragment(v)
            if nif_type != "context" or fragment != "isString":
                continue

            text = o.toPython()
            doc_name: Optional[str] = get_resource_name(s)
            old_id: Optional[str] = get_resource_attribute(
                c.identifier, "oldid"
            )
            if doc_name is None or old_id is None:
                # Incomplete identification: do not emit a record.
                continue

            yield {"text": text, "doc_name": doc_name, "oldid": old_id}
def add_wiki_info(self, pack: DataPack, statements: List):
    """Add ``WikiAnchor`` annotations for the links described by statements.

    Statements are first grouped by the ``char`` range of their subject,
    so that all relations about one span are processed together. One
    anchor is created per valid span.

    Args:
        pack: The data pack receiving the anchors; ``pack.text`` bounds
            the accepted spans.
        statements: Iterable of ``(nif_range, relation, info)`` triples.
    """
    link_grouped: DefaultDict[str,
                              Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        if range_ is not None and r is not None:
            link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]

        if end > len(pack.text):
            # Some NIF datasets are off by a few characters, mostly
            # around new line characters. We cannot correct them, but we
            # must make sure the span does not run past the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = len(pack.text)

        if end <= begin:
            logging.info("Provided anchor [%d:%d] is invalid.", begin, end)
            continue

        # NOTE(review): the anchor is created before any relation is
        # inspected, so a span without a ``taIdentRef`` still gets an
        # anchor whose target is never set here - confirm this is intended.
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            elif info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if (target_page_name is not None
                        and target_page_name in self._redirects):
                    target_page_name = self._redirects[target_page_name]
                anchor.target_page_name = target_page_name
def _collect(self, nif_context: str  # type: ignore
             ) -> Iterator[Dict[str, str]]:
    """Yield one record per document found in the NIF context file.

    A record is produced for each statement whose subject has the
    ``nif=context`` attribute and whose relation fragment is
    ``isString``.

    Args:
        nif_context: Source handed to ``NIFParser``.

    Yields:
        A new dict with the keys ``text``, ``doc_name`` and ``oldid``.
        Each yielded dict is independent, so callers may keep references
        to earlier items without them being overwritten.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

            if nif_type == "context" and get_resource_fragment(
                    v) == 'isString':
                yield {
                    'text': o.toPython(),
                    'doc_name': get_resource_name(s),
                    'oldid': get_resource_attribute(c.identifier, 'oldid'),
                }
def test_grouped_nif_reader(self):
    """Check the relation fragments read per resource, in file order."""
    nif_path = os.path.join(self.data_dir, 'nif_context.tql')

    # Resource name -> list of relation fragments, in reading order.
    fragments_by_resource = {}
    for _context, statements in ContextGroupedNIFReader(nif_path):
        for subject, verb, _obj in statements:
            fragment = get_resource_fragment(verb)
            resource = get_resource_name(subject)
            fragments_by_resource.setdefault(resource, []).append(fragment)

    shared_fragments = ['type', 'beginIndex', 'endIndex', 'sourceUrl']
    expected = {
        'Animalia_(book)': shared_fragments + ['isString', 'predLang'],
        'List_of_Atlas_Shrugged_characters': list(shared_fragments),
    }
    self.assertEqual(fragments_by_resource, expected)
def load_from_nif(link_file, output_file):
    """Find bidirectional page links in a NIF link file and write them out.

    A pair is a "bi-link" when page A links to page B and page B links
    back to A. Each detected pair is written to ``output_file`` as one
    ``source<TAB>target`` line and flushed immediately. Progress is
    reported after every article.

    Args:
        link_file: Source handed to ``ContextGroupedNIFReader``.
        output_file: Path of the tab-separated output file; it is
            overwritten and written as UTF-8.
    """
    # Pending one-directional links: source page name -> target page name.
    linkings = {}

    num_articles = 0
    num_bilinks = 0

    start_time = timeit.default_timer()
    # Page names may contain non-ASCII characters, so the encoding is set
    # explicitly instead of depending on the platform default.
    with open(output_file, "w", encoding="utf-8") as out:
        for _, statements in ContextGroupedNIFReader(link_file):
            num_articles += 1

            for nif_range, rel, info in statements:
                if get_resource_fragment(rel) != "taIdentRef":
                    continue

                src_name = get_resource_name(nif_range)
                target_name = get_resource_name(info)

                if src_name == target_name:
                    # Self links can never form a pair.
                    continue

                if linkings.get(target_name) == src_name:
                    # The reverse link was seen earlier: this is a pair.
                    linkings.pop(target_name)
                    num_bilinks += 1
                    out.write(f"{src_name}\t{target_name}\n")
                    out.flush()
                else:
                    linkings[src_name] = target_name

            elapsed = timeit.default_timer() - start_time
            print_progress(
                f"{num_bilinks} bi-links found in {num_articles} after "
                f"{datetime.timedelta(seconds=elapsed)}, speed is "
                f"{num_articles / elapsed:.2f} (packs/second)."
            )
def test_nif_parser(self):
    """Check the statements parsed from the page structure sample file."""
    nif_path = os.path.join(self.data_dir, "nif_page_structure.tql")

    parsed = [
        (
            context_base(c),
            get_resource_fragment(v),
            get_resource_name(s),
            strip_url_params(s),
        )
        for statements in NIFParser(nif_path)
        for s, v, _, c in statements
    ]

    # Every statement in the sample concerns the same page, so only the
    # relation fragment varies between the expected tuples.
    wiki_url = "http://en.wikipedia.org/wiki/Animalia_(book)"
    page_name = "Animalia_(book)"
    dbpedia_url = "http://dbpedia.org/resource/Animalia_(book)"
    fragments = [
        "type",
        "notation",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasSection",
        "firstSection",
        "lastSection",
        "type",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasParagraph",
        "lastParagraph",
        "type",
        "notation",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasSection",
        "firstSection",
        "type",
        "referenceContext",
        "beginIndex",
        "endIndex",
    ]
    expected = [
        (wiki_url, fragment, page_name, dbpedia_url)
        for fragment in fragments
    ]

    self.assertEqual(parsed, expected)
def test_nif_parser(self):
    """Check the statements parsed from the page structure sample file."""
    nif_path = os.path.join(self.data_dir, 'nif_page_structure.tql')

    parsed = []
    for statements in NIFParser(nif_path):
        parsed.extend(
            (context_base(ctx), get_resource_fragment(verb),
             get_resource_name(subj), strip_url_params(subj))
            for subj, verb, _obj, ctx in statements)

    # All statements in the sample refer to one page; the expected tuples
    # differ only in the relation fragment, listed here in file order.
    expected_fragments = (
        'type notation beginIndex endIndex referenceContext superString '
        'hasSection firstSection lastSection '
        'type beginIndex endIndex referenceContext superString '
        'hasParagraph lastParagraph '
        'type notation beginIndex endIndex referenceContext superString '
        'hasSection firstSection '
        'type referenceContext beginIndex endIndex'
    ).split()

    expected = []
    for fragment in expected_fragments:
        expected.append((
            'http://en.wikipedia.org/wiki/Animalia_(book)',
            fragment,
            'Animalia_(book)',
            'http://dbpedia.org/resource/Animalia_(book)',
        ))

    self.assertEqual(parsed, expected)