def __transform_wordnet_csv(
    self, *, wordnet_csv_file_path: Path, yielded_words: _KgNodeSet
) -> Generator[Union[KgNode, KgEdge], None, None]:
    self._logger.info("transforming wordnet mappings from %s",
                      wordnet_csv_file_path)
    with open(wordnet_csv_file_path) as csv_file:
        csv_reader = csv.DictReader(csv_file,
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            word_nid = self.__webchild_nid(row["WordNet-synsetid"])
            word = row["#word"]
            # Skip edge generation if the word node already has a wn mapping,
            # or if the word is not represented in the yielded nodes.
            yielded_word = yielded_words.get(word_nid)
            if (yielded_word is None
                    or yielded_word.labels[0].lower() != word.lower()):
                continue
            lemma = "_".join(word.split())
            sense_num = row["sense-number"]
            synset_nid = f"wn:{lemma}.n.{int(sense_num):02d}"
            yield KgEdge.legacy(
                datasource=self.__DATASOURCE_ID,
                object=synset_nid,
                predicate=WN_SYNSET,
                subject=word_nid,
            )
            # For tracking which nodes have mappings already.
            # Deleting from yielded instead of tracking in a new set to save memory.
            yielded_words.delete(word_nid)
Example #2
def __find_predicate(self, product1, product2):
    """
    Find the appropriate predicate for two WdcProductSize objects
    """
    # Placeholder predicates
    # MUCH_SMALLER_THAN = "/r/MuchSmallerThan"
    SMALLER_THAN = "/r/SmallerThan"
    EQUIVALENT_TO = "/r/EquivalentTo"
    LARGER_THAN = "/r/LargerThan"
    # MUCH_LARGER_THAN = "/r/MuchLargerThan"
    CANT_COMPARE = "/r/Can'tCompare"

    pred = None
    if not product1.bucket or not product2.bucket:
        pred = CANT_COMPARE
    elif product1.bucket > product2.bucket:
        pred = LARGER_THAN
    elif product1.bucket < product2.bucket:
        pred = SMALLER_THAN
    else:
        pred = EQUIVALENT_TO
    return KgEdge.with_generated_id(
        subject=product1.name,
        object=product2.name,
        predicate=pred,
        source_ids=(WDC_DATASOURCE_ID, ),
    )
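A minimal usage sketch for the bucket comparison above, using a hypothetical namedtuple in place of WdcProductSize (the real class is not shown here; only the name and bucket fields that __find_predicate touches are modeled, and the ids are made up):

from collections import namedtuple

# Hypothetical stand-in for WdcProductSize, for illustration only.
Product = namedtuple("Product", ["name", "bucket"])

thimble = Product(name="wdc:thimble", bucket=1)
bathtub = Product(name="wdc:bathtub", bucket=5)
# __find_predicate(thimble, bathtub) would produce a /r/SmallerThan edge,
# and a product whose bucket is None or 0 would produce /r/Can'tCompare.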
Example #3
def test_swow_edge():
    edge = swow_edge(
        cue="test",
        response="test response",
        cue_response_counts=Counter(R1=2, R3=4),
        response_counts=Counter(R1=1, R3=1),
    )
    expected_edge = KgEdge.legacy(
        datasource=SWOW_DATASOURCE_ID,
        subject=f"{SWOW_NAMESPACE}:test",
        object=f'{SWOW_NAMESPACE}:{quote("test response")}',
        predicate=RELATED_TO,
        weight=2 / 6,
        other={
            "response_counts": {
                "R1": 1,
                "R2": 0,
                "R3": 1
            },
            "response_strengths": {
                "R1": 1 / 2,
                "R2": 0,
                "R3": 1 / 4
            },
        },
    )
    assert edge == expected_edge
Example #4
    def __yield_same_as_edges(
        self, *, arg1_node: KgNode, arg1_object, arg2_node: KgNode,
        arg2_object, same_as_edges_yielded: Dict[str, Set[str]]
    ) -> Generator[KgEdge, None, None]:
        for arg_node, arg_object in (
            (arg1_node, arg1_object),
            (arg2_node, arg2_object),
        ):
            metadata = arg_object.get("metadata")
            if metadata is None:
                continue

            node_same_as_edges_yielded = same_as_edges_yielded.get(arg_node.id)
            if node_same_as_edges_yielded is None:
                same_as_edges_yielded[
                    arg_node.id] = node_same_as_edges_yielded = set()

            if "synset" in metadata:
                synset = metadata["synset"]
                assert synset.startswith("wn.")
                wn_node_id = "wn:" + synset[len("wn."):]
                if wn_node_id in node_same_as_edges_yielded:
                    continue
                yield KgEdge.legacy(
                    datasource=self.__DATASOURCE,
                    object=wn_node_id,
                    predicate=SAME_AS,
                    subject=arg_node.id,
                )
                node_same_as_edges_yielded.add(wn_node_id)

            if "wikipedia_primary_page" in metadata:
                wikipedia_primary_page = metadata["wikipedia_primary_page"]
                wikipedia_node_id = "wikipedia:" + quote(
                    wikipedia_primary_page)
                if wikipedia_node_id in node_same_as_edges_yielded:
                    continue
                yield KgEdge.legacy(
                    datasource=self.__DATASOURCE,
                    object=wikipedia_node_id,
                    predicate=SAME_AS,
                    subject=arg_node.id,
                )
                node_same_as_edges_yielded.add(wikipedia_node_id)
Example #5
def test_mixed_datasource(pipeline_storage):
    try:
        run((SUBJECT_NODE, OBJECT_NODE,
             KgEdge.legacy(subject=SUBJECT_NODE.id,
                           object="externalnode",
                           predicate=DATASOURCE,
                           datasource="otherdatasource")), pipeline_storage)
        fail()
    except ValueError:
        pass
Example #6
def __create_type_edge(self, *, arg_node: KgNode,
                       type_word_net_id: WordNetId) -> KgEdge:
    # arg node IsA WordNet node
    # Only yield this once, when the arg is yielded.
    return KgEdge.legacy(
        datasource=self.__DATASOURCE,
        object="wn:" + str(type_word_net_id),
        predicate=IS_A,
        subject=arg_node.id,
    )
Example #7
    def __yield_has_part_edges(
            self, *, arg1_node: KgNode, arg2_node: KgNode,
            average_score: float) -> Generator[KgEdge, None, None]:
        # arg1 HasA arg2
        yield KgEdge.legacy(
            datasource=self.__DATASOURCE,
            subject=arg1_node.id,
            object=arg2_node.id,
            predicate=HAS_A,
            weight=average_score,
        )

        # Inverse, arg2 PartOf arg1
        yield KgEdge.legacy(
            datasource=self.__DATASOURCE,
            subject=arg2_node.id,
            object=arg1_node.id,
            predicate=PART_OF,
            weight=average_score,
        )
Example #8
def _generator():
    nid_counter = count(1)
    while True:
        nodes = tuple(
            KgNode.legacy(datasource='test_datasource',
                          id=f'test_node_{next(nid_counter)}',
                          label='test node') for _ in range(2))
        yield from nodes
        yield KgEdge.legacy(datasource='test_datasource',
                            object=nodes[1].id,
                            predicate='test_predicate',
                            subject=nodes[0].id)
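Because _generator loops forever, a consumer has to bound it explicitly. A small sketch, assuming the generator above is in scope:

from itertools import islice

# Each loop iteration yields two nodes followed by one edge, so six items
# correspond to two complete node/node/edge triples.
sample = list(islice(_generator(), 6))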
Example #9
    def transform(self, food_on_owl_file_path: Path):
        graph = Graph()
        self._logger.info("parsing FoodOn OWL")
        graph.parse(source=str(food_on_owl_file_path))
        self._logger.info("parsed FoodOn OWL")

        self._logger.info("parsing FoodOn classes")

        classes_by_uri = {}
        for class_uri in graph.subjects(RDF.type, OWL.Class):
            if not str(class_uri).startswith(self.__FoodOnClass._URI_PREFIX):
                continue
            labels = tuple(graph.objects(class_uri, RDFS.label))
            if not labels:
                continue
            # Just use the first label
            label = labels[0]
            assert label, class_uri

            sub_class_of = tuple(graph.objects(class_uri, RDFS.subClassOf))
            if not sub_class_of:
                continue

            class_ = self.__FoodOnClass(
                label=label,
                sub_class_of=sub_class_of,
                uri=class_uri
            )
            assert class_.uri not in classes_by_uri
            classes_by_uri[class_.uri] = class_
        self._logger.info("parsed %d classes from FoodOn", len(classes_by_uri))

        for class_ in classes_by_uri.values():
            for sub_class_of in class_.sub_class_of:
                parent_class = classes_by_uri.get(sub_class_of)
                if not parent_class:
                    continue
                # Only yield nodes that are part of an edge.
                if not class_.node_yielded:
                    yield class_.node
                    class_.node_yielded = True
                if not parent_class.node_yielded:
                    yield parent_class.node
                    parent_class.node_yielded = True
                edge = \
                    KgEdge.legacy(
                        datasource=self._DATASOURCE,
                        subject=class_.node.id,
                        predicate=concept_net_predicates.IS_A,
                        object=parent_class.node.id
                    )
                yield edge
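A hedged usage sketch for the FoodOn transform above; the class name FoodOnTransformer and the local file path are assumptions, not confirmed by this snippet:

from pathlib import Path

transformer = FoodOnTransformer()  # assumed class name
results = list(transformer.transform(Path("foodon.owl")))  # assumed local path
# results interleaves the class/parent KgNode objects with the IS_A KgEdge
# objects yielded above; nodes appear only when they participate in an edge.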
Example #10
def usf_edge(*, cue: Union[KgNode, str], response: Union[KgNode, str],
             strength: float) -> KgEdge:

    return KgEdge.legacy(
        datasource=USF_DATASOURCE_ID,
        subject=cue.id if isinstance(cue, KgNode) else usf_node(
            cue,
            "",
        ),
        object=response.id if isinstance(response, KgNode) else usf_node(
            response, ""),
        predicate=RELATED_TO,
        weight=strength)
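A short usage sketch for usf_edge; the cue, response, and strength values below are illustrative only:

# Plain strings are routed through usf_node above; KgNode arguments contribute their id.
edge = usf_edge(cue="lost", response="found", strength=0.28)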
Example #11
def sentic_edge(
    *,
    subject: str,
    object_: str,
    weight: Optional[float] = None,
) -> KgEdge:

    return KgEdge.legacy(
        datasource=SENTIC_DATASOURCE_ID,
        subject=subject,
        object=object_,
        predicate=RELATED_TO,
        weight=weight,
    )
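And similarly for sentic_edge; the subject and object ids here are made-up examples:

edge = sentic_edge(
    subject="sentic:joy",
    object_="sentic:happiness",
    weight=0.9,
)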
Example #12
def __transform(self,
                edges_csv_file: TextIO) -> Generator[KgEdge, None, None]:
    csv_reader = csv.DictReader(edges_csv_file,
                                delimiter="\t",
                                quoting=csv.QUOTE_NONE)
    for csv_row in csv_reader:
        # Edges may refer to nodes that are outside of the ones we've created e.g., WordNet.
        yield \
            KgEdge.legacy(
                datasource=self._get_required_column(csv_row, "datasource"),
                object=self._get_required_column(csv_row, "object"),
                other=csv_row.get("other"),
                predicate=self._get_required_column(csv_row, "predicate"),
                subject=self._get_required_column(csv_row, "subject"),
                weight=float(self._get_required_column(csv_row, "weight"))
            )
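For reference, a sketch of a tab-separated edges file the reader above would accept, based on the columns it pulls out (subject, predicate, object, datasource, weight, plus the optional other); the sample row values are illustrative:

import io

sample_edges_csv = io.StringIO(
    "subject\tpredicate\tobject\tdatasource\tweight\tother\n"
    "eat:SPECIAL\tcn:RelatedTo\teat:TRAIN\teat\t0.07\t\n")
# Fed in as edges_csv_file, this would yield a single KgEdge with weight 0.07.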
Example #13
def map(self, node: KgNode) -> Generator[KgEdge, None, None]:
    """
    Given a node from another data source, generate a sequence of edges mapping that node to ConceptNet concepts.
    """
    for node_label in node.labels:
        concept_net_id = self.__concept_net_index.get(
            label=node_label, pos=node.pos
        )
        if concept_net_id is None:
            continue
        yield KgEdge.with_generated_id(
            object=concept_net_id,
            predicate=mowgli_predicates.SAME_AS,
            source_ids=node.source_ids,
            subject=node.id,
        )
        return
Example #14
def swow_edge(
    *,
    cue: Union[KgNode, str],
    response: Union[KgNode, str],
    cue_response_counts: Counter,
    response_counts: Counter,
) -> KgEdge:
    """
    Create a CSKG edge from a SWOW cue and response, weighted by the response strength derived from the counts.
    :param cue: cue phrase
    :param response: response to the cue phrase
    :param cue_response_counts: total response counts for the cue
    :param response_counts: counts of this response to the cue
    """
    assert all(k in SwowResponseType.__members__ for k in cue_response_counts.keys())
    assert all(k in SwowResponseType.__members__ for k in response_counts.keys())
    strength_r123 = sum(response_counts.values()) / sum(cue_response_counts.values())
    other = {
        "response_counts": {
            rt: response_counts[rt] for rt in SwowResponseType.__members__.keys()
        },
        "response_strengths": {
            rt: (
                response_counts[rt] / cue_response_counts[rt]
                if cue_response_counts[rt] > 0
                else 0
            )
            for rt in SwowResponseType.__members__.keys()
        },
    }
    return KgEdge.legacy(
        datasource=SWOW_DATASOURCE_ID,
        subject=cue.id if isinstance(cue, KgNode) else swow_node_id(cue),
        object=response.id if isinstance(response, KgNode) else swow_node_id(response),
        predicate=RELATED_TO,
        weight=strength_r123,
        other=other,
    )
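Worked against the counters in test_swow_edge above: response_counts sums to 1 + 1 = 2 and cue_response_counts to 2 + 4 = 6, so strength_r123 = 2/6, while the per-type strengths come out to R1 = 1/2, R2 = 0, and R3 = 1/4, matching the expected_edge built in Example #3.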
Example #15
def test_write_edge(pipeline_storage):
    test_edge = KgEdge.legacy(
        datasource='test_datasource',
        object='test_obj',
        predicate='test_rel',
        subject='test_subject',
        # other={'datasets': ['test_dataset', 'other_test_dataset']},
        weight=0.999)

    with CskgCsvLoader().open(pipeline_storage) as loader:
        loader.load_kg_edge(test_edge)
        # Load twice to test handling of redundant edges
        # 20200310 MG: duplicate removal has been moved to the PipelineWrapper
        # loader.load_kg_edge(test_edge)

    expected_edge_text = (
        _EXPECTED_EDGE_HEADER + '\n' +
        'test_subject\ttest_rel\ttest_obj\ttest_datasource\t0.999\t\n')

    with open(pipeline_storage.loaded_data_dir_path / "edges.csv") as f:
        assert f.read() == expected_edge_text

    with open(pipeline_storage.loaded_data_dir_path / "nodes.csv") as f:
        assert f.read() == _EXPECTED_NODE_HEADER + '\n'
Example #16
    def __read_webchild_csv_row(self,
                                row: dict) -> Tuple[KgNode, KgNode, KgEdge]:
        subject_node = self.__webchild_node(ssid=row["to_ss"],
                                            word=row["to_word"])
        object_node = self.__webchild_node(ssid=row["from_ss"],
                                           word=row["from_word"])

        relation, inverted = self.__RELATION_DICT[row["relation"]]
        if inverted:
            subject_node, object_node = object_node, subject_node
        other = {
            "isvisual": row["isvisual"] == "v",
            "cardinality": row["cardinality"].strip(),
        }
        score = float(row["score"])
        edge = KgEdge.legacy(
            datasource=self.__DATASOURCE_ID,
            object=object_node.id,
            predicate=relation,
            subject=subject_node.id,
            other=other,
            weight=score,
        )
        return subject_node, object_node, edge
Example #17
def test_eat_tranform():
    test_file_dir = pathlib.Path(__file__).parent.absolute()
    test_file_path = os.path.join(test_file_dir, 'sample_eat100.xml')
    transformer = EatTransformer()

    nodes, edges = set(), set()
    for result in transformer.transform(xml_file_path=test_file_path):
        if isinstance(result, KgNode):
            nodes.add(result)
        elif isinstance(result, KgEdge):
            edges.add(result)

    expected_stimulus_nodes = set(KgNode.legacy(datasource="eat", id="eat:" + stim_word, label=stim_word) for stim_word in [
        'SPECIAL',
        'SET'
    ])

    expected_response_nodes = set(KgNode.legacy(datasource="eat", id="eat:" + response_word, label=response_word) for response_word in [
        'TRAIN',
        'PARTICULAR',
        'EXTRA',
        'ORDINARY',
        'CASE',
        'PERSON',
        'BEER',
        'CAR',
        'CONSTABLE',
        'TELEVISION',
        'UP',
        'OUT',
        'TO',
        'DOWN',
        'GAME',
        'GROUP',
        'T.V.',
        'TEA'
    ])

    expected_nodes = expected_stimulus_nodes | expected_response_nodes

    expected_edges = set(
        KgEdge.legacy(datasource="eat",
                      object="eat:" + stim_node,
                      predicate="cn:RelatedTo",
                      subject="eat:" + response_node,
                      weight=response_weight)
        for (stim_node, response_node, response_weight) in [
            ('SPECIAL', 'TRAIN', 0.07),
            ('SPECIAL', 'PARTICULAR', 0.05),
            ('SPECIAL', 'EXTRA', 0.04),
            ('SPECIAL', 'ORDINARY', 0.04),
            ('SPECIAL', 'CASE', 0.03),
            ('SPECIAL', 'PERSON', 0.03),
            ('SPECIAL', 'BEER', 0.02),
            ('SPECIAL', 'CAR', 0.02),
            ('SPECIAL', 'CONSTABLE', 0.02),
            ('SET', 'TELEVISION', 0.06),
            ('SET', 'UP', 0.05),
            ('SET', 'OUT', 0.04),
            ('SET', 'TO', 0.04),
            ('SET', 'DOWN', 0.03),
            ('SET', 'GAME', 0.03),
            ('SET', 'GROUP', 0.03),
            ('SET', 'T.V.', 0.03),
            ('SET', 'TEA', 0.03)
        ])

    assert nodes == expected_nodes
    assert edges == expected_edges
Example #18
                             pos="n",
                             datasource=DATASOURCE)
EXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                             label="test label",
                                             pos="n",
                                             datasource=DATASOURCE)
INEXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                               label="test label variation",
                                               pos="n",
                                               datasource=DATASOURCE)
OBJECT_NODE = KgNode.legacy(id="testobject",
                            label="test object",
                            pos="n",
                            datasource=DATASOURCE)
EDGE = KgEdge.legacy(subject=SUBJECT_NODE.id,
                     object=OBJECT_NODE.id,
                     predicate=DATASOURCE,
                     datasource=DATASOURCE)


def test_exact_duplicate_node(pipeline_storage):
    # Exact duplicates are ignored
    run((SUBJECT_NODE, OBJECT_NODE, EDGE, EXACT_DUPLICATE_SUBJECT_NODE),
        pipeline_storage)


def test_inexact_duplicate_node(pipeline_storage):
    try:
        run((SUBJECT_NODE, OBJECT_NODE, EDGE, INEXACT_DUPLICATE_SUBJECT_NODE),
            pipeline_storage)
        fail()
    except ValueError:
        pass
Example #19
    def transform(self, combined_kb_tsv_file_path: Path):
        yielded_edges_tree = {}
        yielded_node_ids = set()
        unmapped_preds = Counter()
        with open(combined_kb_tsv_file_path, "r") as combined_kb_tsv_file:
            for row in csv.DictReader(combined_kb_tsv_file, delimiter='\t'):
                # QStrength	- The quantification strength of the triple (0-1), where 1 = applies to most members of Arg1, 0 = applies to just a few members of Arg1.
                # The scale is purely a ranking scale (has no probabilistic meaning) - feel free to rescale it as required for your application.
                # Quantifier	- Simple qualitative quantifier: if QStrength > 0.5 it is "most", otherwise it is "some".
                # Arg1		- in (Arg1 Pred Arg2)
                # Pred		- in (Arg1 Pred Arg2)
                # Arg2		- in (Arg1 Pred Arg2)
                # Sentence	- Expression of this tuple as an English sentence.
                # Score	- This score is now redundant (superseded by QStrength), but was either the Turk-derived or model-derived quality of the tuple (range 0-1)
                # Inferred?	- "n": The tuple was directly extracted from text, WordNet, or produced by KBCompletion. The source sentence(s) id(s) are listed in the Provenance field.
                # "y": The tuple was inferred using schema mapping rule(s) from other tuple(s). The source tuple(s) are listed in the Provenance field.
                # "m": Mixed - the tuple was both extracted from text and inferred. The source sentences and tuples are listed in the Provenance field.
                # Multiword?	- If the Arg1 or Arg2 include a multiword, this is "y", else "n"
                # Canonical?	- The tuple is in its canonical (normalized) form. (We retain both the original and canonical forms in this database).
                # Non-canonical tuples are also transformed to a canonical form, elsewhere in the database.
                # Domain	- The general type of Arg1
                # Range	- The general type of Arg2
                # Provenance	- KBCompletion - inferred by KB Completion methods.
                # WordNet3.0 - tuple comes from WordNet v3.0.
                # ("cat","eat","food") - tuple was inferred from this tuple using a schema mapping rule (see TACL paper)
                # 12413 - tuple was extracted from sentence 12413. Source sentences are available on request.

                # Pull the columns out into typed variables
                qstrength = float(row["QStrength"])
                quantifier = row["Quantifier"].strip()
                assert quantifier
                arg1 = row["Arg1"].strip()
                assert arg1
                pred = row["Pred"].strip()
                assert pred
                arg2 = row["Arg2"].strip()
                assert arg2
                # we selected 49 types (with help from WordNet) to mark the domain/range of triples (see below), plus "Thing" for the remainder.
                domain = row["Domain"].strip()
                assert domain
                range = row["Range"].strip()
                assert range
                provenance = row["Provenance"].strip()
                assert provenance

                # Mapping multiple preds to a single edge type may lead to duplicate edges
                concept_net_predicate_mapping = self.__PRED_TO_CONCEPT_NET_PREDICATE_MAPPINGS.get(
                    pred)
                if concept_net_predicate_mapping is None:
                    if pred not in unmapped_preds:
                        self._logger.debug(
                            "ignoring unmapped pred %s: %s %s %s", pred, arg1,
                            pred, arg2)
                    unmapped_preds[pred] += 1
                    continue

                concept_net_predicate = concept_net_predicate_mapping.concept_net_predicate
                reverse_args = concept_net_predicate_mapping.reverse_args

                # Convert arg1 and arg2 into nodes
                subject_node, subject_type_word_net_id = self.__parse_arg(
                    arg=arg1, provenance=provenance, type_=domain)
                object_node, object_type_word_net_id = self.__parse_arg(
                    arg=arg2, provenance=provenance, type_=range)

                for arg_node, type_word_net_id in ((subject_node,
                                                    subject_type_word_net_id),
                                                   (object_node,
                                                    object_type_word_net_id)):
                    if arg_node.id in yielded_node_ids:
                        continue
                    # arg_node has not been yielded yet

                    yield arg_node
                    yielded_node_ids.add(arg_node.id)

                    # The domain or range (type) is a WordNet synset, or "Thing" if unknown
                    # Only yield this edge once, along with the node.
                    if type_word_net_id is not None:
                        yield self.__create_type_edge(
                            arg_node=arg_node,
                            type_word_net_id=type_word_net_id)

                # Yield the tuple as a KgEdge if an equivalent edge hasn't been yielded before
                if reverse_args:
                    # The pred -> predicate mapping above told us that the object should be the subject and the subject the object
                    # ConceptNet usually defines only one direction of a relation. For example, it has "CreatedBy" but not "Creates".
                    # So we map "produce" to "CreatedBy" and reverse the args.
                    subject_node, object_node = object_node, subject_node
                object_edges = yielded_edges_tree.setdefault(
                    subject_node.id, {}).setdefault(object_node.id, set())
                if concept_net_predicate in object_edges:
                    continue

                yield \
                    KgEdge.legacy(
                        datasource=self.__DATASOURCE,
                        predicate=concept_net_predicate,
                        subject=subject_node.id,
                        object=object_node.id,
                        weight=qstrength,
                    )
                object_edges.add(concept_net_predicate)

            self._logger.info("top unmapped preds: %s",
                              unmapped_preds.most_common(20))
Example #20
def edge():
    return KgEdge.legacy(subject="testsubject",
                         predicate="testrelation",
                         object="testobject",
                         datasource="test",
                         other={"test": 1})
Example #21
def edge() -> KgEdge:
    return KgEdge.legacy(subject="testsubject",
                         predicate="testrelation",
                         object="testobject",
                         datasource="test")