def usf_node(cueOrResponse: str, pos: str, other: Optional[dict] = None) -> KgNode:
    return KgNode.legacy(
        datasource=USF_DATASOURCE_ID,
        id=f'{USF_NAMESPACE}:{quote(f"{cueOrResponse}-{pos}")}',
        label=cueOrResponse,
        pos=pos,
        other=other if other else None)
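A minimal usage sketch, assuming USF_NAMESPACE and USF_DATASOURCE_ID are constants defined in the same module and quote is urllib.parse.quote; the cue "dog" and part of speech "n" are illustrative values, not taken from the USF data.

node = usf_node("dog", "n")  # hypothetical cue/POS pair
assert node.id == f"{USF_NAMESPACE}:{quote('dog-n')}"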
    def __convert_normalized_arg_to_node(self, normalized_arg):
        # Create nodes in a custom namespace.
        # sameAs edges to WordNet or Wikipedia nodes are added in the transform instead of reusing their ids here.
        # Don't include metadata as "other": the data set contains multiple normalized args with different metadata,
        # which would violate our duplicate node id checks.
        metadata = normalized_arg.get("metadata", {})
        if "synset" in metadata:
            synset = metadata["synset"]
            assert synset.startswith("wn.")
            word_net_id = WordNetId.parse(synset[len("wn."):])
            pos = word_net_id.pos
        else:
            pos = None

        label = normalized_arg["normalized"]
        id_ = f"{self.__DATASOURCE}:{quote(label)}"
        if pos is not None:
            id_ += ":" + pos

        return \
            KgNode.legacy(
                datasource=self.__DATASOURCE,
                id=id_,
                label=label,
                pos=pos,
                # other=normalized_arg.get("metadata")
            )
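For illustration, a hypothetical normalized arg in the shape this method expects; the synset string is an assumed example, and only the "wn." prefix handling and id construction are taken from the code above.

# Hypothetical input: WordNetId.parse receives "dog.n.01" after the "wn." prefix
# is stripped; assuming its pos is "n", the resulting node id would be
# f"{self.__DATASOURCE}:dog:n" with label "dog" and no "other" payload.
normalized_arg = {"normalized": "dog", "metadata": {"synset": "wn.dog.n.01"}}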
    def __webchild_node(self, *, ssid: str, word: str) -> KgNode:
        return KgNode.legacy(
            datasource=self.__DATASOURCE_ID,
            id=self.__webchild_nid(ssid),
            label=word,
            # All subjects/objects are nouns in WebChild part-whole
            pos="n",
        )
Example #4
def sentic_node(*, id: str, label: Optional[str] = None, sentic_type: str) -> KgNode:
    if label is None:
        label = id
    return KgNode.legacy(
        datasource=SENTIC_DATASOURCE_ID,
        id=sentic_id(id, sentic_type),
        label=label,
        # other={SENTIC_TYPE_KEY: sentic_type},
    )
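A hedged usage sketch in the style of the tests in this listing; "joy" and the sentic_type value are illustrative, while sentic_id and SENTIC_DATASOURCE_ID come from the same module as sentic_node.

node = sentic_node(id="joy", sentic_type="sentic")  # hypothetical values
expected = KgNode.legacy(
    datasource=SENTIC_DATASOURCE_ID,
    id=sentic_id("joy", "sentic"),
    label="joy",  # label defaults to the id when omitted
)
assert node == expected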
def test_map_unqualified_node(concept_net_mapper):
    edges = tuple(
        concept_net_mapper.map(
            KgNode.legacy(id="a", datasource="test", label="a")))
    assert len(edges) == 1
    edge = edges[0]
    assert edge.subject == "a"
    assert edge.object == "/c/en/a"
    assert edge.predicate == mowgli_predicates.SAME_AS
    assert edge.source_ids == ("test", )
Example #6
        def __init__(self, *, label: str, sub_class_of: Optional[Tuple[URIRef, ...]], uri: URIRef):
            self.label = label
            self.sub_class_of = sub_class_of
            self.uri = uri

            self.node = \
                KgNode.legacy(
                    datasource=FoodOnTransformer._DATASOURCE,
                    id="foodon:" + str(uri)[len(self._URI_PREFIX):],
                    label=label
                )
            self.node_yielded = False
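For illustration only: assuming FoodOnTransformer._URI_PREFIX is the OBO URI prefix (an assumption, not shown in this snippet), stripping it leaves the local FOODON identifier that becomes the node id suffix.

# Hypothetical values: with _URI_PREFIX == "http://purl.obolibrary.org/obo/",
# uri = URIRef("http://purl.obolibrary.org/obo/FOODON_00001002")
# would yield id "foodon:FOODON_00001002".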
def _generator():
    nid_counter = count(1)
    while True:
        nodes = tuple(
            KgNode.legacy(datasource='test_datasource',
                          id=f'test_node_{next(nid_counter)}',
                          label='test node') for _ in range(2))
        yield from nodes
        yield KgEdge.legacy(datasource='test_datasource',
                            object=nodes[1].id,
                            predicate='test_predicate',
                            subject=nodes[0].id)
def test_map_node_with_pos(concept_net_mapper):
    edges = tuple(
        concept_net_mapper.map(
            KgNode.legacy(id="nid30",
                          datasource="test",
                          label="30",
                          pos="a")))
    assert len(edges) == 1
    edge = edges[0]
    assert edge.subject == "nid30"
    assert edge.object == "/c/en/30/a/wn"
    assert edge.predicate == mowgli_predicates.SAME_AS
    assert edge.source_ids == ("test", )
Example #9
def test_swow_node():
    node = swow_node(word="test response",
                     response_counts=Counter(R1=3, R2=2, R3=0))
    expected_node = KgNode.legacy(
        datasource=SWOW_DATASOURCE_ID,
        id=f'{SWOW_NAMESPACE}:{quote("test response")}',
        label="test response",
        other={"response_counts": {
            "R1": 3,
            "R2": 2,
            "R3": 0
        }},
    )
    assert node == expected_node
Example #10
def swow_node(*, word: str, response_counts: Counter) -> KgNode:
    """
    Create a cskg node from a SWOW cue or response.
    :param word: a SWOW cue or response
    :param response_counts: counts of responses to this word
    """
    assert all(k in SwowResponseType.__members__ for k in response_counts.keys())
    return KgNode.legacy(
        datasource=SWOW_DATASOURCE_ID,
        id=swow_node_id(word),
        label=word,
        other={
            "response_counts": {
                rt: response_counts[rt] for rt in SwowResponseType.__members__.keys()
            }
        },
    )
Example #11
    def __transform(self, *,
                    nodes_csv_file: TextIO) -> Generator[KgNode, None, None]:
        csv_reader = csv.DictReader(nodes_csv_file,
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE)
        for csv_row_i, csv_row in enumerate(csv_reader):
            try:
                yield \
                    KgNode.legacy(
                        aliases=self._get_optional_column(csv_row, "aliases"),
                        datasource=self._get_required_column(csv_row, "datasource"),
                        id=self._get_required_column(csv_row, "id"),
                        label=self._get_optional_column(csv_row, "label"),
                        other=self._get_optional_column(csv_row, "other"),
                        pos=self._get_optional_column(csv_row, "pos"),
                    )
            except ValueError as e:
                self._logger.warning("CSKG nodes CSV row %d %s: %s", csv_row_i,
                                     e, csv_row)
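A sketch of the tab-separated input this reader consumes; the column order is an assumption, only the column names ("id", "label", "aliases", "pos", "datasource", "other") are taken from the lookups above.

# Hypothetical two-line nodes TSV (header plus one row); quoting is disabled,
# so field values must not contain tabs.
sample_nodes_tsv = (
    "id\tlabel\taliases\tpos\tdatasource\tother\n"
    "test:nid1\ttest label\t\tn\ttest\t\n"
)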
Example #12
    def __parse_arg(self, *, arg: str, provenance: str, type_: str) -> Tuple[KgNode, Optional[WordNetId]]:
        # Put the type in the id in case words are reused
        if type_ != "Thing":
            word_net_type = type_.rsplit('_', 1)
            assert len(word_net_type[1]) >= 2
            type_word_net_id = WordNetId(word=word_net_type[0],
                                         pos=word_net_type[1][0],
                                         offset=int(word_net_type[1][1:]))
        else:
            type_word_net_id = None

        node = \
            KgNode.legacy(
                datasource=self.__DATASOURCE,
                id=f"{self.__DATASOURCE}:{type_}:{quote(arg)}",
                label=arg,
                # Assume the part of speech of the arg is the same as the part of speech of the type
                pos=type_word_net_id.pos if type_word_net_id is not None else None,
                other={"provenance": provenance, "type": type_}
            )
        return node, type_word_net_id
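A worked example of the type handling above; the type string is hypothetical and only illustrates the rsplit and WordNetId construction.

# Hypothetical type_: "dog_n12345" rsplits into ("dog", "n12345"), giving
# WordNetId(word="dog", pos="n", offset=12345); a type_ of "Thing" yields no
# WordNetId and the node gets pos=None.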
def test_write_node(pipeline_storage):
    test_node = KgNode.legacy(
        datasource='test_datasource',
        id='test_nid',
        label='Test KgNode',
        aliases=('t-node', 'KgNode Test'),
        # other={'datasets': ['test_dataset', 'other_test_dataset']},
        pos='N')

    with CskgCsvLoader().open(pipeline_storage) as loader:
        loader.load_kg_node(test_node)
        # 20200310 MG: duplicate removal has been moved to the PipelineWrapper
        # loader.load_kg_node(test_node)

    expected_node_text = (
        _EXPECTED_NODE_HEADER + '\n' +
        'test_nid\tTest KgNode\tt-node KgNode Test\tN\ttest_datasource\t\n')

    with open(pipeline_storage.loaded_data_dir_path / "edges.csv") as f:
        assert f.read() == _EXPECTED_EDGE_HEADER + '\n'

    with open(pipeline_storage.loaded_data_dir_path / "nodes.csv") as f:
        assert f.read() == expected_node_text
def test_eat_transform():
    test_file_dir = pathlib.Path(__file__).parent.absolute()
    test_file_path = os.path.join(test_file_dir, 'sample_eat100.xml')
    transformer = EatTransformer()

    nodes, edges = set(), set()
    for result in transformer.transform(xml_file_path=test_file_path):
        if isinstance(result, KgNode):
            nodes.add(result)
        elif isinstance(result, KgEdge):
            edges.add(result)

    expected_stimulus_nodes = set(
        KgNode.legacy(datasource="eat", id="eat:" + stim_word, label=stim_word)
        for stim_word in [
        'SPECIAL',
        'SET'
    ])

    expected_response_nodes = set(
        KgNode.legacy(datasource="eat",
                      id="eat:" + response_word,
                      label=response_word)
        for response_word in [
        'TRAIN',
        'PARTICULAR',
        'EXTRA',
        'ORDINARY',
        'CASE',
        'PERSON',
        'BEER',
        'CAR',
        'CONSTABLE',
        'TELEVISION',
        'UP',
        'OUT',
        'TO',
        'DOWN',
        'GAME',
        'GROUP',
        'T.V.',
        'TEA'
    ])

    expected_nodes = expected_stimulus_nodes | expected_response_nodes

    expected_edges = set(
        KgEdge.legacy(datasource="eat",
                      object="eat:" + stim_node,
                      predicate="cn:RelatedTo",
                      subject="eat:" + response_node,
                      weight=response_weight)
        for (stim_node, response_node, response_weight) in [
            ('SPECIAL', 'TRAIN', 0.07),
            ('SPECIAL', 'PARTICULAR', 0.05),
            ('SPECIAL', 'EXTRA', 0.04),
            ('SPECIAL', 'ORDINARY', 0.04),
            ('SPECIAL', 'CASE', 0.03),
            ('SPECIAL', 'PERSON', 0.03),
            ('SPECIAL', 'BEER', 0.02),
            ('SPECIAL', 'CAR', 0.02),
            ('SPECIAL', 'CONSTABLE', 0.02),
            ('SET', 'TELEVISION', 0.06),
            ('SET', 'UP', 0.05),
            ('SET', 'OUT', 0.04),
            ('SET', 'TO', 0.04),
            ('SET', 'DOWN', 0.03),
            ('SET', 'GAME', 0.03),
            ('SET', 'GROUP', 0.03),
            ('SET', 'T.V.', 0.03),
            ('SET', 'TEA', 0.03)
    ])

    assert nodes == expected_nodes
    assert edges == expected_edges
Example #15
def node():
    return KgNode.legacy(id="testid",
                         label="test label",
                         pos="n",
                         datasource="test",
                         other={"test": 1})
Example #16
def node() -> KgNode:
    return KgNode.legacy(id="testid",
                         label="test label",
                         pos="n",
                         datasource="test")
Example #17
class MockPipeline(_Pipeline):
    def __init__(self, node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...]):
        _Pipeline.__init__(self,
                           extractor=NopExtractor(),
                           id=DATASOURCE,
                           transformer=MockTransformer(node_edge_sequence))


def run(node_edge_sequence: Tuple[Union[KgNode, KgEdge], ...],
        pipeline_storage: PipelineStorage):
    return PipelineWrapper(MockPipeline(node_edge_sequence),
                           pipeline_storage).run()


SUBJECT_NODE = KgNode.legacy(id="testid",
                             label="test label",
                             pos="n",
                             datasource=DATASOURCE)
EXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                             label="test label",
                                             pos="n",
                                             datasource=DATASOURCE)
INEXACT_DUPLICATE_SUBJECT_NODE = KgNode.legacy(id="testid",
                                               label="test label variation",
                                               pos="n",
                                               datasource=DATASOURCE)
OBJECT_NODE = KgNode.legacy(id="testobject",
                            label="test object",
                            pos="n",
                            datasource=DATASOURCE)
EDGE = KgEdge.legacy(subject=SUBJECT_NODE.id,
                     object=OBJECT_NODE.id,