def add_property(pack: DataPack, statements: List):
    """Record each (predicate, object) statement as a ``WikiInfoBoxProperty``.

    Args:
        pack: The data pack the property entries are attached to.
        statements: Triples of (subject, predicate, object); the subject
            is not used here.
    """
    for _, pred, obj in statements:
        name = pred.toPython()
        value = get_resource_name(obj)
        prop = WikiInfoBoxProperty(pack)
        prop.key = name
        prop.value = value
        # NOTE(review): no pack.add_entry call here — confirm entries
        # self-register on construction in this API version.
def add_info_boxes(pack: DataPack, info_box_statements: List):
    """Record each (predicate, object) statement as a ``WikiInfoBoxMapped``.

    Args:
        pack: The data pack the info-box entries are attached to.
        info_box_statements: Triples of (subject, predicate, object); the
            subject is not used here.
    """
    for _, pred, obj in info_box_statements:
        slot = pred.toPython()
        target = get_resource_name(obj)
        box = WikiInfoBoxMapped(pack)
        box.key = slot
        box.value = target
        # NOTE(review): no pack.add_entry call here — confirm entries
        # self-register on construction in this API version.
def add_info_boxes(pack: DataPack, info_box_statements: List,
                   info_type: str):
    """Add one ``WikiInfoBoxMapped`` entry per statement, tagged with a type.

    Args:
        pack: The data pack the info-box entries are added to.
        info_box_statements: Triples of (subject, predicate, object); the
            subject is not used here.
        info_type: The info-box type label stored on every created entry.
    """
    for _, pred, obj in info_box_statements:
        slot_key = pred.toPython()
        slot_target = get_resource_name(obj)
        box = WikiInfoBoxMapped(pack)
        box.set_key(slot_key)
        box.set_value(slot_target)
        box.set_infobox_type(info_type)
        pack.add_entry(box)
def add_info_boxes(pack: DataPack, info_box_statements: List):
    """Add one ``WikiInfoBoxMapped`` entry per (predicate, object) statement.

    Args:
        pack: The data pack the info-box entries are added to.
        info_box_statements: Triples of (subject, predicate, object); the
            subject is not used here.
    """
    for _, pred, obj in info_box_statements:
        box = WikiInfoBoxMapped(pack)
        box.set_key(pred.toPython())
        box.set_value(get_resource_name(obj))
        pack.add_entry(box)
def _collect(self, info_box_raw: str  # type: ignore
             ) -> Iterator[Tuple[str, Dict[str, List[state_type]]]]:
    """Iterate the raw info-box input grouped by context.

    For every context produced by ``ContextGroupedNIFReader``, yields the
    resource name of the first statement's subject together with a dict
    holding the raw property statements plus the literal and object
    statements looked up for the same context.

    Args:
        info_box_raw: Path/data handed to ``ContextGroupedNIFReader``.

    Yields:
        Tuples of (resource name, {'properties', 'literals', 'objects'}).
    """
    for context, statements in ContextGroupedNIFReader(info_box_raw):
        resource = get_resource_name(statements[0][0])
        grouped = {
            'properties': statements,
            'literals': self.literal_info_reader.get(context),
            'objects': self.object_info_reader.get(context),
        }
        yield resource, grouped
def add_property(pack: DataPack, statements: List):
    """Add one ``WikiInfoBoxProperty`` entry per (predicate, object) statement.

    Args:
        pack: The data pack the property entries are added to.
        statements: Triples of (subject, predicate, object); the subject
            is not used here.
    """
    for _, pred, obj in statements:
        prop = WikiInfoBoxProperty(pack)
        prop.set_key(pred.toPython())
        prop.set_value(get_resource_name(obj))
        pack.add_entry(prop)
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Create ``WikiAnchor`` entries for the text-link statements.

    Statements are first grouped by their character range, then one anchor
    is created per range. Redirected target pages are resolved through
    ``redirects`` before being stored on the anchor.

    Args:
        pack: The data pack (whose text the anchors index into).
        text_link_statements: Triples of (NIF range node, relation, value).
        redirects: Mapping from page name to its redirect target.
    """
    link_grouped: DefaultDict[str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        if end > len(pack.text):
            # Some NIF datasets are off by a bit, mostly around newline
            # characters; we cannot correct them, but we must keep the
            # span inside the pack's text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = len(pack.text)
        if end <= begin:
            # Clipping (or bad input) can produce an empty/negative span.
            logging.info("Provided anchor [%d:%d is invalid.]", begin, end)
            continue
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    """Create ``WikiAnchor`` entries for the text-link statements.

    Statements are grouped by character range first; one anchor is built
    per range, with its end offset clipped to the pack text and invalid
    (empty or inverted) spans skipped. Redirected targets are resolved
    through ``redirects`` before being stored on the anchor.

    Args:
        pack: The data pack (whose text the anchors index into).
        text_link_statements: Triples of (NIF range node, relation, value).
        redirects: Mapping from page name to its redirect target.
    """
    # Bucket every statement under its character-range key.
    grouped: DefaultDict[str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        span_key = get_resource_attribute(nif_range, 'char')
        fragment = get_resource_fragment(rel)
        grouped[span_key][fragment] = info

    for span_key, infos in grouped.items():
        begin, end = (int(part) for part in span_key.split(','))
        if end > len(pack.text):
            # Some nif dataset are off by a bit, mostly when there are
            # new line characters, we cannot correct them.
            # but we need to make sure they don't go longer than the text.
            logging.info(
                "Provided anchor end is %d, "
                "clipped to fit with the text.", end)
            end = len(pack.text)
        if end <= begin:
            logging.info("Provided anchor [%d:%d is invalid.]", begin, end)
            continue
        anchor = WikiAnchor(pack, begin, end)
        for key, value in infos.items():
            if key == 'type':
                kind = get_resource_fragment(value)
                if kind != 'Phrase' and kind != 'Word':
                    logging.warning("Unknown anchor type: %s", value)
            if key == 'taIdentRef':
                target = get_resource_name(value)
                if target in redirects:
                    target = redirects[target]
                anchor.target_page_name = target
def _collect(
        self, nif_context: str  # type: ignore
) -> Iterator[Tuple[Dict[str, str], Dict[str, List[state_type]]]]:
    """Iterate the NIF context input, yielding one document per context.

    For every ``isString`` context statement, yields a pair of:
    string data ('text', 'doc_name', 'oldid') and node data ('struct',
    'links' looked up from the struct/link readers for the same context).

    Fix: the original created ``str_data``/``node_data`` once and mutated
    them on every yield, so every previously yielded pair aliased the same
    two dicts and silently ended up holding the last context's values.
    Fresh dicts are now built per yield.

    Args:
        nif_context: Input handed to ``NIFParser``.

    Yields:
        Tuples of (string data dict, node data dict).
    """
    for context_statements in NIFParser(nif_context):
        for s, v, o, c in context_statements:
            nif_type = get_resource_attribute(s, "nif")
            print_progress(f'Collecting DBpedia context: [{c.identifier}]')

            # `nif_type == "context"` implies nif_type is truthy, so the
            # original's extra `nif_type and` guard was redundant.
            if nif_type == "context" and get_resource_fragment(
                    v) == 'isString':
                str_data: Dict[str, str] = {
                    'text': o.toPython(),
                    'doc_name': get_resource_name(s),
                    'oldid': get_resource_attribute(c.identifier, 'oldid'),
                }
                node_data: Dict[str, List[state_type]] = {
                    'struct': self.struct_reader.get(c),
                    'links': self.link_reader.get(c),
                }
                yield str_data, node_data
    print(' ..Done')