def __transform_wordnet_csv(
    self, *, wordnet_csv_file_path: Path, yielded_words: _KgNodeSet
) -> Generator[Union[KgNode, KgEdge], None, None]:
    """
    Yield WN_SYNSET edges linking previously yielded word nodes to WordNet synset ids.

    The file is read as tab-delimited text with a header row and no quoting.
    A row is skipped when ``yielded_words`` has no entry for the row's node id,
    or when that entry's first label differs (case-insensitively) from the row's word.

    :param wordnet_csv_file_path: path of the tab-delimited WordNet mapping file
    :param yielded_words: set of word nodes yielded so far; an entry is deleted
        from it once its mapping edge has been yielded
    """
    self._logger.info("transforming wordnet mappings from %s", wordnet_csv_file_path)
    with open(wordnet_csv_file_path) as mappings_file:
        for mapping in csv.DictReader(mappings_file, delimiter="\t", quoting=csv.QUOTE_NONE):
            word_nid = self.__webchild_nid(mapping["WordNet-synsetid"])
            word = mapping["#word"]

            # A missing entry means the word was never yielded, or it already
            # received its mapping (entries are deleted below once mapped).
            known_word = yielded_words.get(word_nid)
            label_matches = (
                known_word is not None
                and known_word.labels[0].lower() == word.lower()
            )
            if not label_matches:
                continue

            # Multi-word lemmas are joined with underscores, e.g. "ice cream" -> "ice_cream".
            lemma = "_".join(word.split())
            sense_number = int(mapping["sense-number"])
            yield KgEdge.legacy(
                datasource=self.__DATASOURCE_ID,
                object=f"wn:{lemma}.n.{sense_number:02d}",
                predicate=WN_SYNSET,
                subject=word_nid,
            )

            # Deleting from the shared set (rather than tracking mapped ids in
            # a second set) saves memory.
            yielded_words.delete(word_nid)
def test_swow_edge():
    """swow_edge should produce the expected weight and per-response-type details."""
    actual_edge = swow_edge(
        cue="test",
        response="test response",
        cue_response_counts=Counter(R1=2, R3=4),
        response_counts=Counter(R1=1, R3=1),
    )

    # R2 never occurs in the inputs above, so both its count and strength are 0.
    expected_other = {
        "response_counts": {"R1": 1, "R2": 0, "R3": 1},
        "response_strengths": {"R1": 1 / 2, "R2": 0, "R3": 1 / 4},
    }
    expected_edge = KgEdge.legacy(
        datasource=SWOW_DATASOURCE_ID,
        subject=f"{SWOW_NAMESPACE}:test",
        object=f'{SWOW_NAMESPACE}:{quote("test response")}',
        predicate=RELATED_TO,
        weight=2 / 6,
        other=expected_other,
    )

    assert actual_edge == expected_edge
def __yield_same_as_edges(
    self,
    *,
    arg1_node: KgNode,
    arg1_object,
    arg2_node: KgNode,
    arg2_object,
    same_as_edges_yielded: Dict[str, Set[str]]
) -> Generator[KgEdge, None, None]:
    """
    Yield SAME_AS edges from each arg node to its WordNet synset and Wikipedia page.

    For each (node, object) pair, the object's "metadata" mapping is consulted:
    a "synset" entry (which must start with "wn.") produces an edge to a
    "wn:"-prefixed node id, and a "wikipedia_primary_page" entry produces an
    edge to a "wikipedia:"-prefixed, URL-quoted node id.

    :param arg1_node: node created for the first argument
    :param arg1_object: object supporting ``.get("metadata")`` for the first argument
    :param arg2_node: node created for the second argument
    :param arg2_object: object supporting ``.get("metadata")`` for the second argument
    :param same_as_edges_yielded: node id -> set of same-as target ids already
        yielded for that node; updated in place so that each edge is only
        yielded once across calls
    """
    for arg_node, arg_object in (
        (arg1_node, arg1_object),
        (arg2_node, arg2_object),
    ):
        metadata = arg_object.get("metadata")
        if metadata is None:
            continue

        node_same_as_edges_yielded = same_as_edges_yielded.setdefault(arg_node.id, set())

        if "synset" in metadata:
            synset = metadata["synset"]
            assert synset.startswith("wn.")
            wn_node_id = "wn:" + synset[len("wn."):]
            # Do not `continue` when the synset edge is a duplicate: that would
            # also skip the independent Wikipedia edge below for this arg.
            if wn_node_id not in node_same_as_edges_yielded:
                yield KgEdge.legacy(
                    datasource=self.__DATASOURCE,
                    object=wn_node_id,
                    predicate=SAME_AS,
                    subject=arg_node.id,
                )
                node_same_as_edges_yielded.add(wn_node_id)

        if "wikipedia_primary_page" in metadata:
            wikipedia_primary_page = metadata["wikipedia_primary_page"]
            wikipedia_node_id = "wikipedia:" + quote(wikipedia_primary_page)
            if wikipedia_node_id not in node_same_as_edges_yielded:
                yield KgEdge.legacy(
                    datasource=self.__DATASOURCE,
                    object=wikipedia_node_id,
                    predicate=SAME_AS,
                    subject=arg_node.id,
                )
                node_same_as_edges_yielded.add(wikipedia_node_id)
def test_mixed_datasource(pipeline_storage):
    """Running with an edge whose datasource differs from the nodes' must raise ValueError."""
    try:
        foreign_edge = KgEdge.legacy(
            subject=SUBJECT_NODE.id,
            object="externalnode",
            predicate=DATASOURCE,
            datasource="otherdatasource",
        )
        graph = (SUBJECT_NODE, OBJECT_NODE, foreign_edge)
        run(graph, pipeline_storage)
        # Reaching this line means no ValueError was raised.
        fail()
    except ValueError:
        pass
def __yield_has_part_edges(
    self, *, arg1_node: KgNode, arg2_node: KgNode, average_score: float
) -> Generator[KgEdge, None, None]:
    """
    Yield the HAS_A edge from arg1 to arg2, followed by the inverse PART_OF edge.

    :param arg1_node: node on the HasA side of the relation
    :param arg2_node: node on the PartOf side of the relation
    :param average_score: weight assigned to both edges
    """
    directed_relations = (
        # arg1 HasA arg2
        (arg1_node, HAS_A, arg2_node),
        # Inverse: arg2 PartOf arg1
        (arg2_node, PART_OF, arg1_node),
    )
    for subject_node, predicate, object_node in directed_relations:
        yield KgEdge.legacy(
            datasource=self.__DATASOURCE,
            subject=subject_node.id,
            object=object_node.id,
            predicate=predicate,
            weight=average_score,
        )
def __create_type_edge(self, *, arg_node: KgNode, type_word_net_id: WordNetId) -> KgEdge:
    """
    Build the IS_A edge from an arg node to the WordNet node of its type.

    The caller is expected to emit this edge once, at the time the arg node
    itself is yielded.

    :param arg_node: node whose type is being described
    :param type_word_net_id: WordNet id of the type; its ``str()`` form is
        prefixed with "wn:" to make the object node id
    """
    type_node_id = f"wn:{type_word_net_id!s}"
    type_edge = KgEdge.legacy(
        datasource=self.__DATASOURCE,
        object=type_node_id,
        predicate=IS_A,
        subject=arg_node.id,
    )
    return type_edge
def _generator():
    """Endlessly yield two sequentially numbered test nodes, then an edge from the first to the second."""
    node_numbers = count(1)

    def make_node():
        return KgNode.legacy(
            datasource='test_datasource',
            id=f'test_node_{next(node_numbers)}',
            label='test node',
        )

    while True:
        # Both nodes are created before either one is yielded.
        subject_node = make_node()
        object_node = make_node()
        yield subject_node
        yield object_node
        yield KgEdge.legacy(
            datasource='test_datasource',
            object=object_node.id,
            predicate='test_predicate',
            subject=subject_node.id,
        )
def transform(self, food_on_owl_file_path: Path):
    """
    Parse the FoodOn OWL file and yield class nodes plus IS_A edges between them.

    Only OWL classes whose URI starts with the FoodOn class URI prefix and that
    have at least one label and at least one rdfs:subClassOf value are kept.
    A node is yielded only when it takes part in an edge, and only once.

    :param food_on_owl_file_path: path of the FoodOn OWL file to parse
    """
    graph = Graph()
    self._logger.info("parsing FoodOn OWL")
    graph.parse(source=str(food_on_owl_file_path))
    self._logger.info("parsed FoodOn OWL")

    self._logger.info("parsing FoodOn classes")
    uri_prefix = self.__FoodOnClass._URI_PREFIX
    classes_by_uri = {}
    for class_uri in graph.subjects(RDF.type, OWL.Class):
        if not str(class_uri).startswith(uri_prefix):
            continue

        labels = tuple(graph.objects(class_uri, RDFS.label))
        if not labels:
            continue
        # Only the first label is used.
        first_label = labels[0]
        assert first_label, class_uri

        parent_uris = tuple(graph.objects(class_uri, RDFS.subClassOf))
        if not parent_uris:
            continue

        food_on_class = self.__FoodOnClass(
            label=first_label,
            sub_class_of=parent_uris,
            uri=class_uri,
        )
        assert food_on_class.uri not in classes_by_uri
        classes_by_uri[food_on_class.uri] = food_on_class
    self._logger.info("parsed %d classes from FoodOn", len(classes_by_uri))

    for child_class in classes_by_uri.values():
        for parent_uri in child_class.sub_class_of:
            parent_class = classes_by_uri.get(parent_uri)
            if not parent_class:
                # The parent was filtered out above (or is not a FoodOn class).
                continue

            # Only yield nodes that are part of an edge, child first.
            for endpoint in (child_class, parent_class):
                if endpoint.node_yielded:
                    continue
                yield endpoint.node
                endpoint.node_yielded = True

            yield KgEdge.legacy(
                datasource=self._DATASOURCE,
                subject=child_class.node.id,
                predicate=concept_net_predicates.IS_A,
                object=parent_class.node.id,
            )
def usf_edge(*, cue: Union[KgNode, str], response: Union[KgNode, str], strength: float) -> KgEdge:
    """
    Create a RELATED_TO edge in the USF datasource from a cue to a response.

    :param cue: cue node, or a cue string passed through ``usf_node``
    :param response: response node, or a response string passed through ``usf_node``
    :param strength: weight of the edge
    """
    if isinstance(cue, KgNode):
        subject = cue.id
    else:
        # NOTE(review): the node branch uses an id, but this branch passes the
        # return value of usf_node() unchanged -- confirm whether `.id` is missing.
        subject = usf_node(cue, "")

    if isinstance(response, KgNode):
        object_ = response.id
    else:
        # NOTE(review): same question as for the cue above.
        object_ = usf_node(response, "")

    return KgEdge.legacy(
        datasource=USF_DATASOURCE_ID,
        subject=subject,
        object=object_,
        predicate=RELATED_TO,
        weight=strength,
    )
def sentic_edge(*, subject: str, object_: str, weight: Optional[float] = None) -> KgEdge:
    """
    Create a RELATED_TO edge in the Sentic datasource.

    :param subject: id of the subject node
    :param object_: id of the object node
    :param weight: optional edge weight, passed through unchanged
    """
    related_to_edge = KgEdge.legacy(
        datasource=SENTIC_DATASOURCE_ID,
        subject=subject,
        object=object_,
        predicate=RELATED_TO,
        weight=weight,
    )
    return related_to_edge
def __transform(self, edges_csv_file: TextIO) -> Generator[KgEdge, None, None]:
    """
    Yield one edge per row of a tab-delimited edges file with a header row.

    The "datasource", "object", "predicate", "subject" and "weight" columns are
    fetched through ``_get_required_column``; "other" is optional. Edges may
    refer to nodes outside of the ones created by this pipeline, e.g. WordNet.

    :param edges_csv_file: open text file to read the rows from
    """
    required = self._get_required_column
    for csv_row in csv.DictReader(edges_csv_file, delimiter="\t", quoting=csv.QUOTE_NONE):
        datasource = required(csv_row, "datasource")
        object_ = required(csv_row, "object")
        other = csv_row.get("other")
        predicate = required(csv_row, "predicate")
        subject = required(csv_row, "subject")
        weight = float(required(csv_row, "weight"))
        yield KgEdge.legacy(
            datasource=datasource,
            object=object_,
            other=other,
            predicate=predicate,
            subject=subject,
            weight=weight,
        )
def swow_edge(
    *,
    cue: Union[KgNode, str],
    response: Union[KgNode, str],
    cue_response_counts: Counter,
    response_counts: Counter,
) -> KgEdge:
    """
    Create a cskg edge from a SWOW cue and response, weighted by response strength.

    The edge weight is the total count of this response divided by the total
    count of all responses to the cue. Per-response-type counts and strengths
    are stored under "response_counts" and "response_strengths" in ``other``;
    a strength is 0 when the cue has no responses of that type.

    :param cue: cue phrase
    :param response: response to the cue phrase
    :param cue_response_counts: total response counts for the cue
    :param response_counts: counts of this response to the cue
    :return: RELATED_TO edge from the cue to the response
    """
    response_types = SwowResponseType.__members__
    assert all(k in response_types for k in cue_response_counts.keys())
    assert all(k in response_types for k in response_counts.keys())

    strength_r123 = sum(response_counts.values()) / sum(cue_response_counts.values())

    counts_by_type = {}
    strengths_by_type = {}
    for rt in response_types.keys():
        counts_by_type[rt] = response_counts[rt]
        cue_total = cue_response_counts[rt]
        if cue_total > 0:
            strengths_by_type[rt] = response_counts[rt] / cue_total
        else:
            strengths_by_type[rt] = 0

    subject = cue.id if isinstance(cue, KgNode) else swow_node_id(cue)
    object_ = response.id if isinstance(response, KgNode) else swow_node_id(response)
    return KgEdge.legacy(
        datasource=SWOW_DATASOURCE_ID,
        subject=subject,
        object=object_,
        predicate=RELATED_TO,
        weight=strength_r123,
        other={
            "response_counts": counts_by_type,
            "response_strengths": strengths_by_type,
        },
    )
def test_write_edge(pipeline_storage):
    """Loading a single edge writes one row to edges.csv and only the header to nodes.csv."""
    test_edge = KgEdge.legacy(
        datasource='test_datasource',
        object='test_obj',
        predicate='test_rel',
        subject='test_subject',
        weight=0.999,
    )

    # The edge is loaded only once: per the 20200310 MG note, duplicate removal
    # was moved to the PipelineWrapper, so the loader no longer handles redundant edges.
    with CskgCsvLoader().open(pipeline_storage) as loader:
        loader.load_kg_edge(test_edge)

    expected_edge_text = (
        _EXPECTED_EDGE_HEADER + '\n' +
        'test_subject\ttest_rel\ttest_obj\ttest_datasource\t0.999\t\n')
    expected_node_text = _EXPECTED_NODE_HEADER + '\n'

    loaded_data_dir_path = pipeline_storage.loaded_data_dir_path
    for file_name, expected_text in (
        ("edges.csv", expected_edge_text),
        ("nodes.csv", expected_node_text),
    ):
        with open(loaded_data_dir_path / file_name) as loaded_file:
            assert loaded_file.read() == expected_text
def __read_webchild_csv_row(self, row: dict) -> Tuple[KgNode, KgNode, KgEdge]:
    """
    Convert one WebChild CSV row into its subject node, object node and edge.

    By default the "to_*" columns describe the subject and the "from_*" columns
    the object; the two are exchanged when the row's relation is flagged as
    inverted in the relation dictionary.

    :param row: CSV row with the to_ss, to_word, from_ss, from_word, relation,
        isvisual, cardinality and score columns
    :return: (subject node, object node, edge from subject to object)
    """
    to_node = self.__webchild_node(ssid=row["to_ss"], word=row["to_word"])
    from_node = self.__webchild_node(ssid=row["from_ss"], word=row["from_word"])

    relation, inverted = self.__RELATION_DICT[row["relation"]]
    if inverted:
        subject_node, object_node = from_node, to_node
    else:
        subject_node, object_node = to_node, from_node

    is_visual = row["isvisual"] == "v"
    cardinality = row["cardinality"].strip()
    score = float(row["score"])

    edge = KgEdge.legacy(
        datasource=self.__DATASOURCE_ID,
        object=object_node.id,
        predicate=relation,
        subject=subject_node.id,
        other={"isvisual": is_visual, "cardinality": cardinality},
        weight=score,
    )
    return subject_node, object_node, edge
def test_eat_tranform():
    """Transforming the sample EAT XML yields exactly the expected nodes and edges."""
    test_file_dir = pathlib.Path(__file__).parent.absolute()
    test_file_path = os.path.join(test_file_dir, 'sample_eat100.xml')

    nodes, edges = set(), set()
    for result in EatTransformer().transform(xml_file_path=test_file_path):
        if isinstance(result, KgNode):
            nodes.add(result)
        elif isinstance(result, KgEdge):
            edges.add(result)

    # stimulus word -> (response word, weight) pairs expected from the sample file
    expected_associations = {
        'SPECIAL': (
            ('TRAIN', 0.07),
            ('PARTICULAR', 0.05),
            ('EXTRA', 0.04),
            ('ORDINARY', 0.04),
            ('CASE', 0.03),
            ('PERSON', 0.03),
            ('BEER', 0.02),
            ('CAR', 0.02),
            ('CONSTABLE', 0.02),
        ),
        'SET': (
            ('TELEVISION', 0.06),
            ('UP', 0.05),
            ('OUT', 0.04),
            ('TO', 0.04),
            ('DOWN', 0.03),
            ('GAME', 0.03),
            ('GROUP', 0.03),
            ('T.V.', 0.03),
            ('TEA', 0.03),
        ),
    }

    expected_words = set(expected_associations)
    for responses in expected_associations.values():
        expected_words.update(response_word for response_word, _ in responses)
    expected_nodes = {
        KgNode.legacy(datasource="eat", id="eat:" + word, label=word)
        for word in expected_words
    }

    # Each edge runs from the response (subject) to the stimulus (object).
    expected_edges = {
        KgEdge.legacy(
            datasource="eat",
            object="eat:" + stim_word,
            predicate="cn:RelatedTo",
            subject="eat:" + response_word,
            weight=response_weight,
        )
        for stim_word, responses in expected_associations.items()
        for response_word, response_weight in responses
    }

    assert nodes == expected_nodes
    assert edges == expected_edges
pos="n", datasource=DATASOURCE) EXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid", label="test label", pos="n", datasource=DATASOURCE) INEXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid", label="test label variation", pos="n", datasource=DATASOURCE) OBJECT_NODE = KgNode.legacy(id="testobject", label="test object", pos="n", datasource=DATASOURCE) EDGE = KgEdge.legacy(subject=SUBJECT_NODE.id, object=OBJECT_NODE.id, predicate=DATASOURCE, datasource=DATASOURCE) def test_exact_duplicate_node(pipeline_storage): # Exact duplicates are ignored run((SUBJECT_NODE, OBJECT_NODE, EDGE, EXACT_DUPLICATE_SUBJECT_NODE), pipeline_storage) def test_inexact_duplicate_node(pipeline_storage): try: run((SUBJECT_NODE, OBJECT_NODE, EDGE, INEXACT_DUPLICATE_SUBJECT_NODE), pipeline_storage) fail() except ValueError:
def transform(self, combined_kb_tsv_file_path: Path):
    """
    Yield nodes and edges read from the tab-delimited combined KB file.

    Columns of the file, as described by its publisher:

    - QStrength: quantification strength of the triple (0-1), where 1 = applies
      to most members of Arg1 and 0 = applies to just a few members of Arg1.
      The scale is purely a ranking scale (it has no probabilistic meaning) and
      may be rescaled as required by the application.
    - Quantifier: simple qualitative quantifier; "most" if QStrength > 0.5,
      otherwise "some".
    - Arg1, Pred, Arg2: the parts of the (Arg1 Pred Arg2) triple.
    - Sentence: expression of the tuple as an English sentence.
    - Score: now redundant (superseded by QStrength); was the Turk-derived or
      model-derived quality of the tuple (range 0-1).
    - Inferred?: "n" = the tuple was directly extracted from text, WordNet, or
      produced by KBCompletion (source sentence ids are listed in Provenance);
      "y" = the tuple was inferred using schema mapping rule(s) from other
      tuple(s) (source tuples are listed in Provenance);
      "m" = mixed, the tuple was both extracted from text and inferred.
    - Multiword?: "y" if Arg1 or Arg2 include a multiword, else "n".
    - Canonical?: the tuple is in its canonical (normalized) form. Both the
      original and canonical forms are retained in the database, and
      non-canonical tuples are also transformed to a canonical form elsewhere.
    - Domain / Range: the general type of Arg1 / Arg2. 49 types were selected
      (with help from WordNet), plus "Thing" for the remainder.
    - Provenance: "KBCompletion" (inferred by KB completion methods),
      "WordNet3.0" (tuple comes from WordNet v3.0), a tuple such as
      ("cat","eat","food") (inferred from that tuple using a schema mapping
      rule, see the TACL paper), or a number such as 12413 (extracted from that
      sentence; source sentences are available on request).

    Rows whose Pred has no ConceptNet predicate mapping are skipped and counted.

    :param combined_kb_tsv_file_path: path of the combined KB TSV file
    """
    # (subject node id, object node id) -> predicates already yielded for that pair
    yielded_predicates_by_pair = {}
    yielded_node_ids = set()
    unmapped_preds = Counter()

    stripped_column_names = (
        "Quantifier", "Arg1", "Pred", "Arg2", "Domain", "Range", "Provenance",
    )

    with open(combined_kb_tsv_file_path, "r") as combined_kb_tsv_file:
        for row in csv.DictReader(combined_kb_tsv_file, delimiter='\t'):
            # Pull the columns out into typed variables; every stripped column
            # must be non-empty.
            qstrength = float(row["QStrength"])
            columns = {}
            for column_name in stripped_column_names:
                column_value = row[column_name].strip()
                assert column_value
                columns[column_name] = column_value
            arg1 = columns["Arg1"]
            pred = columns["Pred"]
            arg2 = columns["Arg2"]
            domain_type = columns["Domain"]
            range_type = columns["Range"]
            provenance = columns["Provenance"]

            # Mapping multiple preds to a single edge type may lead to duplicate edges
            predicate_mapping = self.__PRED_TO_CONCEPT_NET_PREDICATE_MAPPINGS.get(pred)
            if predicate_mapping is None:
                # Log each unmapped pred the first time it is seen, count every occurrence.
                if pred not in unmapped_preds:
                    self._logger.debug(
                        "ignoring unmapped pred %s: %s %s %s", pred, arg1, pred, arg2)
                unmapped_preds[pred] += 1
                continue
            concept_net_predicate = predicate_mapping.concept_net_predicate
            reverse_args = predicate_mapping.reverse_args

            # Convert arg1 and arg2 into (node, type WordNet id) pairs
            parsed_args = [
                self.__parse_arg(arg=arg, provenance=provenance, type_=type_)
                for arg, type_ in ((arg1, domain_type), (arg2, range_type))
            ]

            for arg_node, type_word_net_id in parsed_args:
                if arg_node.id in yielded_node_ids:
                    continue
                # arg_node has not been yielded yet
                yield arg_node
                yielded_node_ids.add(arg_node.id)

                # The domain or range (type) is a WordNet synset, or "Thing" if unknown.
                # Only yield this edge once, along with the node.
                if type_word_net_id is not None:
                    yield self.__create_type_edge(
                        arg_node=arg_node, type_word_net_id=type_word_net_id)

            (subject_node, _), (object_node, _) = parsed_args
            if reverse_args:
                # The pred -> predicate mapping says the object should be the
                # subject and vice versa. ConceptNet has few symmetric relations:
                # it has "CreatedBy" but not "Creates", so "produce" is mapped
                # to "CreatedBy" with the args reversed.
                subject_node, object_node = object_node, subject_node

            # Yield the tuple as a KgEdge unless an equivalent edge was yielded before
            pair_predicates = yielded_predicates_by_pair.setdefault(
                (subject_node.id, object_node.id), set())
            if concept_net_predicate in pair_predicates:
                continue
            yield KgEdge.legacy(
                datasource=self.__DATASOURCE,
                predicate=concept_net_predicate,
                subject=subject_node.id,
                object=object_node.id,
                weight=qstrength,
            )
            pair_predicates.add(concept_net_predicate)

    self._logger.info("top unmapped preds: %s", unmapped_preds.most_common(20))
def edge():
    """Build a sample legacy edge that carries an ``other`` payload."""
    sample_edge = KgEdge.legacy(
        subject="testsubject",
        predicate="testrelation",
        object="testobject",
        datasource="test",
        other={"test": 1},
    )
    return sample_edge
def edge() -> KgEdge:
    """Build a sample legacy edge with only the subject, predicate, object and datasource set."""
    sample_edge = KgEdge.legacy(
        subject="testsubject",
        predicate="testrelation",
        object="testobject",
        datasource="test",
    )
    return sample_edge