def _collect(  # type: ignore
    self, nif_context: str
) -> Iterator[Dict[str, str]]:
    """Yield one record for each NIF context statement that holds text.

    Every statement produced by ``NIFParser(nif_context)`` is inspected. A
    statement is kept when the ``"nif"`` attribute of its subject is
    ``"context"`` and the fragment of its predicate is ``"isString"``.

    Args:
        nif_context: Location of the NIF context data; it is passed
            unchanged to ``NIFParser``.

    Yields:
        A new dictionary per kept statement with the keys ``"text"`` (the
        object converted with ``toPython()``), ``"doc_name"`` (the resource
        name of the subject) and ``"oldid"`` (the ``"oldid"`` attribute of
        the statement's context identifier). Statements whose name or
        ``oldid`` cannot be resolved are skipped.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f"Collecting DBpedia resource: [{c.identifier}]")

            fragment = get_resource_fragment(v)
            if nif_type != "context" or fragment != "isString":
                continue

            text = o.toPython()
            doc_name: Optional[str] = get_resource_name(s)
            old_id: Optional[str] = get_resource_attribute(
                c.identifier, "oldid"
            )
            if doc_name is None or old_id is None:
                continue

            # Build a fresh dict for every record: yielding one shared,
            # mutated dict would make all collected items alias the
            # latest record.
            yield {"text": text, "doc_name": doc_name, "oldid": old_id}
def _collect(self, nif_context: str  # type: ignore
             ) -> Iterator[Dict[str, str]]:
    """Yield one record for each NIF context statement that holds text.

    Every statement produced by ``NIFParser(nif_context)`` is inspected. A
    statement is kept when the ``'nif'`` attribute of its subject is
    ``'context'`` and the fragment of its predicate is ``'isString'``.

    Args:
        nif_context: Location of the NIF context data; it is passed
            unchanged to ``NIFParser``.

    Yields:
        A new dictionary per kept statement with the keys ``'text'`` (the
        object converted with ``toPython()``), ``'doc_name'`` (the resource
        name of the subject) and ``'oldid'`` (the ``'oldid'`` attribute of
        the statement's context identifier). Statements for which either
        helper returns ``None`` are skipped so that the declared
        ``Dict[str, str]`` value type holds.
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia resource: [{c.identifier}]')

            if nif_type != "context" or get_resource_fragment(v) != 'isString':
                continue

            text = o.toPython()
            doc_name = get_resource_name(s)
            old_id = get_resource_attribute(c.identifier, 'oldid')
            if doc_name is None or old_id is None:
                # An incomplete record would put None into a Dict[str, str].
                continue

            # Build a fresh dict for every record: yielding one shared,
            # mutated dict would make all collected items alias the
            # latest record.
            yield {'text': text, 'doc_name': doc_name, 'oldid': old_id}
def test_nif_parser(self):
    """Parse ``nif_page_structure.tql`` and compare a summary of each statement.

    Each parsed statement is reduced to the tuple (context base, predicate
    fragment, subject resource name, subject URL without parameters). The
    resulting list must equal the expected list, order included.
    """
    tql_path = os.path.join(self.data_dir, "nif_page_structure.tql")
    parsed = [
        (
            context_base(c),
            get_resource_fragment(v),
            get_resource_name(s),
            strip_url_params(s),
        )
        for statements in NIFParser(tql_path)
        for s, v, _, c in statements
    ]

    # All expected tuples share these three values; only the predicate
    # fragment varies from statement to statement.
    wiki_page = "http://en.wikipedia.org/wiki/Animalia_(book)"
    resource_name = "Animalia_(book)"
    resource_url = "http://dbpedia.org/resource/Animalia_(book)"

    # Expected predicate fragments, in the order they must be parsed.
    fragments = [
        "type",
        "notation",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasSection",
        "firstSection",
        "lastSection",
        "type",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasParagraph",
        "lastParagraph",
        "type",
        "notation",
        "beginIndex",
        "endIndex",
        "referenceContext",
        "superString",
        "hasSection",
        "firstSection",
        "type",
        "referenceContext",
        "beginIndex",
        "endIndex",
    ]
    expected = [
        (wiki_page, fragment, resource_name, resource_url)
        for fragment in fragments
    ]

    self.assertEqual(parsed, expected)
def test_nif_parser(self):
    """Verify the statements NIFParser reads from ``nif_page_structure.tql``.

    For every statement the test records the context base, the fragment of
    the predicate, the resource name of the subject and the subject URL
    with its parameters stripped, then compares the ordered result with the
    expected sequence.
    """
    def summarize(statement):
        # The object of the statement is not part of the comparison.
        s, v, _, c = statement
        return (context_base(c), get_resource_fragment(v),
                get_resource_name(s), strip_url_params(s))

    source = os.path.join(self.data_dir, 'nif_page_structure.tql')
    parsed = []
    for statements in NIFParser(source):
        parsed.extend(summarize(statement) for statement in statements)

    # Only the predicate fragment differs between the expected tuples.
    context = 'http://en.wikipedia.org/wiki/Animalia_(book)'
    name = 'Animalia_(book)'
    resource = 'http://dbpedia.org/resource/Animalia_(book)'
    expected_fragments = (
        'type', 'notation', 'beginIndex', 'endIndex', 'referenceContext',
        'superString', 'hasSection', 'firstSection', 'lastSection',
        'type', 'beginIndex', 'endIndex', 'referenceContext',
        'superString', 'hasParagraph', 'lastParagraph',
        'type', 'notation', 'beginIndex', 'endIndex', 'referenceContext',
        'superString', 'hasSection', 'firstSection',
        'type', 'referenceContext', 'beginIndex', 'endIndex',
    )
    expected = [(context, fragment, name, resource)
                for fragment in expected_fragments]

    self.assertEqual(parsed, expected)