Example #1
def test_meta_knowledge_graph_of_complex_graph_data():
    """
    Test the generate meta knowledge graph operation.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "complex_graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "complex_graph_edges.tsv"),
        ],
        "format":
        "tsv",
    }

    transformer = Transformer()

    transformer.transform(input_args)

    output_filename = os.path.join(TARGET_DIR,
                                   "test_meta_knowledge_graph-1.json")

    generate_meta_knowledge_graph(
        graph=transformer.store.graph,
        name="Complex Test Graph",
        filename=output_filename,
        edge_facet_properties=["aggregator_knowledge_source"])

    with open(output_filename) as f:
        data = json.load(f)
    assert data["name"] == "Complex Test Graph"
    print(f"\n{json.dumps(data, indent=4)}")
Example #2
def test_summarize_graph_inspector():
    """
    Test graph stats sourced from an Inspector, comparing the resulting stats.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'graph_edges.tsv'),
        ],
        'format': 'tsv',
    }

    transformer = Transformer(stream=True)

    inspector = GraphSummary('Test Graph Summary - Streamed')

    transformer.transform(input_args=input_args, inspector=inspector)

    output_filename = os.path.join(TARGET_DIR,
                                   'test_graph-summary-from-inspection.json')

    with open(output_filename, 'w') as gsh:
        inspector.save(gsh)

    with open(output_filename) as f:
        data = json.load(f)
    assert data['name'] == 'Test Graph Summary - Streamed'
    assert 'NCBIGene' in data['nodes']['biolink:Gene']['id_prefixes']
    assert 'REACT' in data['nodes']['biolink:Pathway']['id_prefixes']
    assert 'HP' in data['nodes']['biolink:PhenotypicFeature']['id_prefixes']
    assert data['nodes']['biolink:Gene']['count'] == 178
    assert len(data['nodes']) == 8
    assert len(data['edges']) == 13
Example #3
def test_validate_by_stream_inspector():
    """
    Test the validate function by streaming graph
    data through a Transformer.process() Inspector.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
        ],
        "format": "tsv",
        "aggregator_knowledge_source": True,
    }

    Validator.set_biolink_model("1.8.2")

    # Validator assumes the currently set Biolink Release
    validator = Validator()

    transformer = Transformer(stream=True)

    transformer.transform(
        input_args=input_args,
        output_args={"format": "null"},  # streaming processing discards the graph data
        # We inject the Inspector into the transform() call,
        # for the underlying Transformer.process() to use.
        inspector=validator,
    )

    validator.write_report()

    e = validator.get_errors()
    assert len(e) == 0
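
An inspector here is just a callable that the streaming Transformer invokes once
per record. Below is a minimal sketch of a custom inspector, assuming the same
(entity_type, record) call convention the KGX inspectors above rely on; the
RecordCounter class itself is illustrative, not part of the KGX API:

class RecordCounter:
    """
    Hypothetical inspector: counts every record streamed through transform().
    """
    def __init__(self):
        self.count = 0

    def __call__(self, entity_type, record):
        # Invoked by the underlying Transformer.process() for each record.
        self.count += 1

counter = RecordCounter()
transformer = Transformer(stream=True)
transformer.transform(
    input_args=input_args,
    output_args={"format": "null"},  # discard the streamed graph data
    inspector=counter,
)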
Example #4
def test_clique_merge():
    """
    Test for clique merge.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'cm_nodes.csv'),
            os.path.join(RESOURCE_DIR, 'cm_edges.csv'),
        ],
        'format': 'csv',
    }
    t = Transformer()
    t.transform(input_args)
    updated_graph, clique_graph = clique_merge(
        target_graph=t.store.graph,
        prefix_prioritization_map=prefix_prioritization_map)
    leaders = NxGraph.get_node_attributes(updated_graph, 'clique_leader')
    leader_list = list(leaders.keys())
    leader_list.sort()
    assert len(leader_list) == 2
    n1 = updated_graph.nodes()[leader_list[0]]
    assert n1['election_strategy'] == 'PREFIX_PRIORITIZATION'
    assert 'NCBIGene:100302240' in n1['same_as']
    assert 'ENSEMBL:ENSG00000284458' in n1['same_as']
    n2 = updated_graph.nodes()[leader_list[1]]
    assert n2['election_strategy'] == 'PREFIX_PRIORITIZATION'
    assert 'NCBIGene:8202' in n2['same_as']
    assert 'OMIM:601937' in n2['same_as']
    assert 'ENSEMBL:ENSG00000124151' not in n2['same_as']
Example #5
def _stream_transform(query):
    """
    Transform an input to an output via Transformer where streaming is enabled.
    """
    t1 = Transformer(stream=True)
    t1.transform(query[0], query[1])

    output = query[1]
    if output["format"] in {"tsv", "csv", "jsonl"}:
        input_args = {
            "filename": [
                f"{output['filename']}_nodes.{output['format']}",
                f"{output['filename']}_edges.{output['format']}",
            ],
            "format":
            output["format"],
        }
    elif output["format"] in {"neo4j"}:
        input_args = {
            "uri": DEFAULT_NEO4J_URL,
            "username": DEFAULT_NEO4J_USERNAME,
            "password": DEFAULT_NEO4J_PASSWORD,
            "format": "neo4j",
        }
    else:
        input_args = {
            "filename": [f"{output['filename']}"],
            "format": output["format"]
        }

    t2 = Transformer()
    t2.transform(input_args)

    assert t2.store.graph.number_of_nodes() == query[2]
    assert t2.store.graph.number_of_edges() == query[3]
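
For context, the query tuple unpacked above carries the input args, the output
args, and the expected node and edge counts, in that order. A hypothetical
invocation follows; the fixture values are illustrative, and only the
positional layout is implied by the helper itself:

query = (
    {   # query[0]: input arguments for the streaming transform
        "filename": [
            os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
        ],
        "format": "tsv",
    },
    {   # query[1]: output arguments; the output is re-read for verification
        "filename": os.path.join(TARGET_DIR, "streamed_graph"),
        "format": "tsv",
    },
    512,  # query[2]: expected node count (assumed)
    540,  # query[3]: expected edge count (assumed)
)
_stream_transform(query)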
Example #6
def test_transformer_infores_parser_prefix_rewrite():
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "test_infores_coercion_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "test_infores_coercion_edges.tsv"),
        ],
        "format":
        "tsv",
        "provided_by": (r"\(.+\)", "", "Monarch"),
        "aggregator_knowledge_source": (r"\(.+\)", "", "Monarch"),
    }

    t = Transformer()
    t.transform(input_args=input_args)

    n1 = t.store.graph.nodes()["FlyBase:FBgn0000008"]
    assert "provided_by" in n1
    assert len(n1["provided_by"]) == 1
    assert "infores:monarch-flybase" in n1["provided_by"]

    n2 = t.store.graph.nodes()["GO:0005912"]
    assert "provided_by" in n2
    assert len(n2["provided_by"]) == 1
    assert "infores:monarch-gene-ontology" in n2["provided_by"]

    et = list(
        t.store.graph.get_edge("FlyBase:FBgn0000008",
                               "GO:0005912").values())[0]
    assert "infores:monarch-gene-ontology" in et["aggregator_knowledge_source"]

    irc = t.get_infores_catalog()
    assert len(irc) == 2
    assert "Gene Ontology (Monarch version 202012)" in irc
    assert ("infores:monarch-gene-ontology"
            in irc["Gene Ontology (Monarch version 202012)"])
Example #7
def test_transformer_infores_basic_formatting():
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "test_infores_coercion_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "test_infores_coercion_edges.tsv"),
        ],
        "format":
        "tsv",
        "provided_by":
        True,
        "aggregator_knowledge_source":
        "true",
    }

    t = Transformer()
    t.transform(input_args=input_args)

    n1 = t.store.graph.nodes()["FlyBase:FBgn0000008"]
    assert "provided_by" in n1
    assert len(n1["provided_by"]) == 1
    assert "infores:flybase-monarch-version-202012" in n1["provided_by"]

    n2 = t.store.graph.nodes()["GO:0005912"]
    assert "provided_by" in n2
    assert len(n2["provided_by"]) == 1
    assert "infores:gene-ontology-monarch-version-202012" in n2["provided_by"]

    et = list(
        t.store.graph.get_edge("FlyBase:FBgn0000008",
                               "GO:0005912").values())[0]
    assert ("infores:gene-ontology-monarch-version-202012"
            in et["aggregator_knowledge_source"])
Example #8
def test_transformer_infores_suppression():
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "test_infores_coercion_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "test_infores_coercion_edges.tsv"),
        ],
        "format":
        "tsv",
        "provided_by":
        "False",
        "aggregator_knowledge_source":
        False,
    }

    t = Transformer()
    t.transform(input_args=input_args)

    n1 = t.store.graph.nodes()["FlyBase:FBgn0000008"]
    assert "provided_by" not in n1

    n2 = t.store.graph.nodes()["GO:0005912"]
    assert "provided_by" not in n2

    et = list(
        t.store.graph.get_edge("FlyBase:FBgn0000008",
                               "GO:0005912").values())[0]
    assert "aggregator_knowledge_source" not in et
Example #9
def test_generate_classical_meta_knowledge_graph():
    """
    Test the generate meta knowledge graph operation.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'graph_edges.tsv'),
        ],
        'format': 'tsv',
    }

    transformer = Transformer()

    transformer.transform(input_args)

    output_filename = os.path.join(TARGET_DIR,
                                   'test_meta_knowledge_graph-1.json')

    generate_meta_knowledge_graph(transformer.store.graph, 'Test Graph',
                                  output_filename)

    with open(output_filename) as f:
        data = json.load(f)
    assert data['name'] == 'Test Graph'
    assert 'NCBIGene' in data['nodes']['biolink:Gene']['id_prefixes']
    assert 'REACT' in data['nodes']['biolink:Pathway']['id_prefixes']
    assert 'HP' in data['nodes']['biolink:PhenotypicFeature']['id_prefixes']
    assert data['nodes']['biolink:Gene']['count'] == 178
    assert len(data['nodes']) == 8
    assert len(data['edges']) == 13
Example #10
def test_generate_streaming_meta_knowledge_graph_direct():
    """
    Test the generate meta knowledge graph operation, using
    MetaKnowledgeGraph as a direct Transformer.transform() Inspector.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'graph_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'graph_edges.tsv'),
        ],
        'format': 'tsv',
    }

    transformer = Transformer(stream=True)

    mkg = MetaKnowledgeGraph('Test Graph - Streamed')

    transformer.transform(input_args=input_args, inspector=mkg)

    assert mkg.get_name() == 'Test Graph - Streamed'
    assert mkg.get_total_nodes_count() == 512
    assert mkg.get_number_of_categories() == 8
    assert mkg.get_total_edges_count() == 540
    assert mkg.get_edge_mapping_count() == 13
    assert 'NCBIGene' in mkg.get_category('biolink:Gene').get_id_prefixes()
    assert 'REACT' in mkg.get_category('biolink:Pathway').get_id_prefixes()
    assert 'HP' in mkg.get_category(
        'biolink:PhenotypicFeature').get_id_prefixes()
    assert mkg.get_category('biolink:Gene').get_count() == 178
Example #11
def test_clique_merge():
    """
    Test for clique merge.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "cm_nodes.csv"),
            os.path.join(RESOURCE_DIR, "cm_edges.csv"),
        ],
        "format": "csv",
    }
    t = Transformer()
    t.transform(input_args)
    updated_graph, clique_graph = clique_merge(
        target_graph=t.store.graph, prefix_prioritization_map=prefix_prioritization_map
    )
    leaders = NxGraph.get_node_attributes(updated_graph, "clique_leader")
    leader_list = list(leaders.keys())
    leader_list.sort()
    assert len(leader_list) == 2
    n1 = updated_graph.nodes()[leader_list[0]]
    assert n1["election_strategy"] == "PREFIX_PRIORITIZATION"
    assert "NCBIGene:100302240" in n1["same_as"]
    assert "ENSEMBL:ENSG00000284458" in n1["same_as"]
    n2 = updated_graph.nodes()[leader_list[1]]
    assert n2["election_strategy"] == "PREFIX_PRIORITIZATION"
    assert "NCBIGene:8202" in n2["same_as"]
    assert "OMIM:601937" in n2["same_as"]
    assert "ENSEMBL:ENSG00000124151" not in n2["same_as"]
Example #12
def test_generate_classical_meta_knowledge_graph():
    """
    Test the generate meta knowledge graph operation.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
        ],
        "format":
        "tsv",
    }

    transformer = Transformer()

    transformer.transform(input_args)

    output_filename = os.path.join(TARGET_DIR,
                                   "test_meta_knowledge_graph-1.json")

    generate_meta_knowledge_graph(
        graph=transformer.store.graph,
        name="Test Graph",
        filename=output_filename,
        edge_facet_properties=["aggregator_knowledge_source"])

    with open(output_filename) as f:
        data = json.load(f)
    assert data["name"] == "Test Graph"
    _check_mkg_json_contents(data)
Example #13
def _stream_transform(query):
    """
    Transform an input to an output via Transformer where streaming is enabled.
    """
    t1 = Transformer(stream=True)
    t1.transform(query[0], query[1])

    output = query[1]
    if output['format'] in {'tsv', 'csv', 'jsonl'}:
        input_args = {
            'filename': [
                f"{output['filename']}_nodes.{output['format']}",
                f"{output['filename']}_edges.{output['format']}",
            ],
            'format': output['format'],
        }
    elif output['format'] in {'neo4j'}:
        input_args = {
            'uri': DEFAULT_NEO4J_URL,
            'username': DEFAULT_NEO4J_USERNAME,
            'password': DEFAULT_NEO4J_PASSWORD,
            'format': 'neo4j',
        }
    else:
        input_args = {
            'filename': [f"{output['filename']}"],
            'format': output['format']
        }

    t2 = Transformer()
    t2.transform(input_args)

    assert t2.store.graph.number_of_nodes() == query[2]
    assert t2.store.graph.number_of_edges() == query[3]
Example #14
def parse_source(
    key: str,
    source: dict,
    output_directory: str,
    prefix_map: Dict[str, str] = None,
    node_property_predicates: Set[str] = None,
    predicate_mappings: Dict[str, str] = None,
    checkpoint: bool = False,
) -> Sink:
    """
    Parse a source from a merge config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Dict[str, str]
        Non-canonical CURIE mappings
    node_property_predicates: Set[str]
        A set of predicates that ought to be treated as node properties (applicable to RDF input)
    predicate_mappings: Dict[str, str]
        A mapping of predicate IRIs to property names (applicable to RDF input)
    checkpoint: bool
        Whether to serialize each individual source to a TSV

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")
    if not key:
        key = os.path.basename(source["input"]["filename"][0])
    input_args = prepare_input_args(
        key,
        source,
        output_directory,
        prefix_map,
        node_property_predicates,
        predicate_mappings,
    )
    transformer = Transformer(stream=True)
    transformer.transform(input_args)
    transformer.store.graph.name = key
    if checkpoint:
        log.info(f"Writing checkpoint for source '{key}'")
        checkpoint_output = f"{output_directory}/{key}" if output_directory else key
        transformer.save({"filename": checkpoint_output, "format": "tsv"})

    # The current "Callable" metadata is not needed at this point,
    # but it causes peculiar problems downstream, so we clear it.
    transformer.store.clear_graph_metadata()

    return transformer.store
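
A sketch of how parse_source might be driven directly. The nested
source["input"]["filename"] shape is implied by the function body above, but
this particular source entry, its file names, and the output directory are
assumptions:

source = {
    "input": {  # hypothetical merge-config source entry
        "filename": [
            "data/example_nodes.tsv",
            "data/example_edges.tsv",
        ],
        "format": "tsv",
    }
}
sink = parse_source(
    key="example-source",
    source=source,
    output_directory="output",
    checkpoint=True,  # also serialize this parsed source to TSV
)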
Example #15
def neo4j_download(
    uri: str,
    username: str,
    password: str,
    output: str,
    output_format: str,
    output_compression: Optional[str],
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Download nodes and edges from a Neo4j database.

    Parameters
    ----------
    uri: str
        Neo4j URI. For example, https://localhost:7474
    username: str
        Username for authentication
    password: str
        Password for authentication
    output: str
        Where to write the output (stdout, by default)
    output_format: str
        The output type (``tsv``, by default)
    output_compression: Optional[str]
        The output compression type
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The Transformer used for the download

    """
    transformer = Transformer(stream=stream)
    transformer.transform({
        "uri": uri,
        "username": username,
        "password": password,
        "format": "neo4j",
        "node_filters": node_filters,
        "edge_filters": edge_filters,
    })

    if not output_format:
        output_format = "tsv"
    transformer.save({
        "filename": output,
        "format": output_format,
        "compression": output_compression
    })
    return transformer
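
For illustration, a hedged invocation of neo4j_download; the connection
details and output path below are placeholders, not a real deployment:

transformer = neo4j_download(
    uri="http://localhost:7474",  # placeholder Neo4j address
    username="neo4j",             # placeholder credentials
    password="secret",
    output="neo_download",        # output file stem
    output_format="tsv",
    output_compression=None,
    stream=True,
)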
Example #16
def neo4j_upload(
    inputs: List[str],
    input_format: str,
    input_compression: Optional[str],
    uri: str,
    username: str,
    password: str,
    stream: bool,
    node_filters: Optional[Tuple] = None,
    edge_filters: Optional[Tuple] = None,
) -> Transformer:
    """
    Upload a set of nodes/edges to a Neo4j database.

    Parameters
    ----------
    inputs: List[str]
        A list of files that contains nodes/edges
    input_format: str
        The input format
    input_compression: Optional[str]
        The input compression type
    uri: str
        The full HTTP address for Neo4j database
    username: str
        Username for authentication
    password: str
        Password for authentication
    stream: bool
        Whether to parse input as a stream
    node_filters: Optional[Tuple]
        Node filters
    edge_filters: Optional[Tuple]
        Edge filters

    Returns
    -------
    kgx.Transformer
        The Transformer used for the upload

    """
    transformer = Transformer(stream=stream)
    transformer.transform({
        "filename": inputs,
        "format": input_format,
        "compression": input_compression,
        "node_filters": node_filters,
        "edge_filters": edge_filters,
    })
    transformer.save({
        "uri": uri,
        "username": username,
        "password": password,
        "format": "neo4j"
    })
    return transformer
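
And the corresponding hedged invocation of neo4j_upload, again with
placeholder file names and credentials:

neo4j_upload(
    inputs=["example_nodes.tsv", "example_edges.tsv"],  # placeholder files
    input_format="tsv",
    input_compression=None,
    uri="http://localhost:7474",  # placeholder Neo4j address
    username="neo4j",             # placeholder credentials
    password="secret",
    stream=False,
)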
Example #17
def test_validate_json():
    """
    Validate against a valid representative Biolink Model compliant JSON.
    """
    input_args = {
        "filename": [os.path.join(RESOURCE_DIR, "valid.json")],
        "format": "json",
    }
    t = Transformer()
    t.transform(input_args)
    validator = Validator()
    validator.validate(t.store.graph)
    assert len(validator.get_errors()) == 0
Example #18
def test_validate_json():
    """
    Validate against a valid representative Biolink Model compliant JSON.
    """
    input_args = {
        'filename': [os.path.join(RESOURCE_DIR, 'valid.json')],
        'format': 'json'
    }
    t = Transformer()
    t.transform(input_args)
    validator = Validator()
    e = validator.validate(t.store.graph)
    assert len(e) == 0
Example #19
def test_meta_knowledge_graph_multiple_category_and_predicate_parsing():
    """
    Test meta knowledge graph parsing of multiple node categories and predicates.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'graph_multi_category_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'graph_multi_category_edges.tsv'),
        ],
        'format': 'tsv',
    }

    t = Transformer(stream=True)

    mkg = MetaKnowledgeGraph(name='Test Graph - Multiple Node Categories')

    t.transform(input_args=input_args, inspector=mkg)

    assert mkg.get_name() == 'Test Graph - Multiple Node Categories'

    assert mkg.get_total_nodes_count() == 10

    # unique set, including (shared) parent
    # classes (not including category 'unknown' )
    assert mkg.get_number_of_categories() == 7

    assert mkg.get_node_count_by_category("biolink:Disease") == 1
    assert mkg.get_node_count_by_category("biolink:BiologicalEntity") == 5
    assert mkg.get_node_count_by_category(
        "biolink:AnatomicalEntityEntity") == 0

    # sums up all the counts of node mappings across
    # all categories (not including category 'unknown')
    assert mkg.get_total_node_counts_across_categories() == 35

    # only counts 'valid' edges for which
    # subject and object nodes are in the nodes file
    assert mkg.get_total_edges_count() == 8

    # total number of distinct predicates
    assert mkg.get_predicate_count() == 2

    # counts edges with a given predicate
    # (ignoring edges with unknown subject or object identifiers)
    assert mkg.get_edge_count_by_predicate("biolink:has_phenotype") == 4
    assert mkg.get_edge_count_by_predicate("biolink:involved_in") == 0

    assert mkg.get_edge_mapping_count() == 25

    assert mkg.get_total_edge_counts_across_mappings() == 100
Example #20
def test_rdf_transform_with_filters1(query):
    """
    Test RDF transform with filters.
    """
    input_args = {
        "filename": [os.path.join(RESOURCE_DIR, "rdf", "test3.nt")],
        "format": "nt",
        "node_filters": query[0],
        "edge_filters": query[1],
    }
    t = Transformer()
    t.transform(input_args)

    assert t.store.graph.number_of_edges() == query[3]
Example #21
def test_rdf_transform3():
    """
    Test parsing an RDF N-Triples file and round-tripping it.
    """
    input_args1 = {
        'filename': [os.path.join(RESOURCE_DIR, 'rdf', 'test1.nt')],
        'format': 'nt'
    }
    t1 = Transformer()
    t1.transform(input_args1)
    assert t1.store.graph.number_of_nodes() == 2
    assert t1.store.graph.number_of_edges() == 1

    output_args1 = {
        'filename': os.path.join(TARGET_DIR, 'test1-export.nt'),
        'format': 'nt'
    }
    t1.save(output_args1)

    input_args2 = {
        'filename': [os.path.join(TARGET_DIR, 'test1-export.nt')],
        'format': 'nt'
    }
    t2 = Transformer()
    t2.transform(input_args2)
    assert t2.store.graph.number_of_nodes() == 2
    assert t2.store.graph.number_of_edges() == 1

    n1t1 = t1.store.graph.nodes()['ENSEMBL:ENSG0000000000001']
    n1t2 = t2.store.graph.nodes()['ENSEMBL:ENSG0000000000001']
    n1t3 = t2.store.graph.nodes()['ENSEMBL:ENSG0000000000001']

    assert n1t1['type'] == n1t2['type'] == n1t3['type'] == 'SO:0000704'
    assert len(n1t1['category']) == len(n1t2['category']) == len(
        n1t3['category']) == 4
    assert ('biolink:Gene' in n1t1['category']
            and 'biolink:Gene' in n1t2['category']
            and 'biolink:Gene' in n1t3['category'])
    assert ('biolink:GenomicEntity' in n1t1['category']
            and 'biolink:GenomicEntity' in n1t2['category']
            and 'biolink:GenomicEntity' in n1t3['category'])
    assert ('biolink:NamedThing' in n1t1['category']
            and 'biolink:NamedThing' in n1t2['category']
            and 'biolink:NamedThing' in n1t3['category'])
    assert n1t1['name'] == n1t2['name'] == n1t3['name'] == 'Test Gene 123'
    assert (n1t1['description'] == n1t2['description'] == n1t3['description']
            == 'This is a Test Gene 123')
    assert ('Test Dataset' in n1t1['provided_by']
            and 'Test Dataset' in n1t2['provided_by']
            and 'Test Dataset' in n1t3['provided_by'])
Example #22
def test_rdf_transform3():
    """
    Test parsing an RDF N-Triples file and round-tripping it.
    """
    input_args1 = {
        "filename": [os.path.join(RESOURCE_DIR, "rdf", "test1.nt")],
        "format": "nt",
    }
    t1 = Transformer()
    t1.transform(input_args1)
    assert t1.store.graph.number_of_nodes() == 2
    assert t1.store.graph.number_of_edges() == 1

    output_args1 = {
        "filename": os.path.join(TARGET_DIR, "test1-export.nt"),
        "format": "nt",
    }
    t1.save(output_args1)

    input_args2 = {
        "filename": [os.path.join(TARGET_DIR, "test1-export.nt")],
        "format": "nt",
    }
    t2 = Transformer()
    t2.transform(input_args2)
    assert t2.store.graph.number_of_nodes() == 2
    assert t2.store.graph.number_of_edges() == 1

    n1t1 = t1.store.graph.nodes()["ENSEMBL:ENSG0000000000001"]
    n1t2 = t2.store.graph.nodes()["ENSEMBL:ENSG0000000000001"]
    n1t3 = t2.store.graph.nodes()["ENSEMBL:ENSG0000000000001"]

    assert n1t1["type"] == n1t2["type"] == n1t3["type"] == "SO:0000704"
    assert len(n1t1["category"]) == len(n1t2["category"]) == len(
        n1t3["category"]) == 4
    assert ("biolink:Gene" in n1t1["category"]
            and "biolink:Gene" in n1t2["category"]
            and "biolink:Gene" in n1t3["category"])
    assert ("biolink:GenomicEntity" in n1t1["category"]
            and "biolink:GenomicEntity" in n1t2["category"]
            and "biolink:GenomicEntity" in n1t3["category"])
    assert ("biolink:NamedThing" in n1t1["category"]
            and "biolink:NamedThing" in n1t2["category"]
            and "biolink:NamedThing" in n1t3["category"])
    assert n1t1["name"] == n1t2["name"] == n1t3["name"] == "Test Gene 123"
    assert (n1t1["description"] == n1t2["description"] == n1t3["description"]
            == "This is a Test Gene 123")
    assert ("Test Dataset" in n1t1["provided_by"]
            and "Test Dataset" in n1t2["provided_by"]
            and "Test Dataset" in n1t3["provided_by"])
Example #23
def parse_source(
    key: str,
    source: dict,
    output_directory: str,
    prefix_map: Dict[str, str] = None,
    node_property_predicates: Set[str] = None,
    predicate_mappings: Dict[str, str] = None,
    checkpoint: bool = False,
) -> Sink:
    """
    Parse a source from a merge config YAML.

    Parameters
    ----------
    key: str
        Source key
    source: Dict
        Source configuration
    output_directory: str
        Location to write output to
    prefix_map: Dict[str, str]
        Non-canonical CURIE mappings
    node_property_predicates: Set[str]
        A set of predicates that ought to be treated as node properties (applicable to RDF input)
    predicate_mappings: Dict[str, str]
        A mapping of predicate IRIs to property names (applicable to RDF input)
    checkpoint: bool
        Whether to serialize each individual source to a TSV

    Returns
    -------
    kgx.sink.sink.Sink
        Returns an instance of Sink

    """
    log.info(f"Processing source '{key}'")
    if not key:
        key = os.path.basename(source['input']['filename'][0])
    input_args = prepare_input_args(key, source, output_directory, prefix_map,
                                    node_property_predicates,
                                    predicate_mappings)
    transformer = Transformer()
    transformer.transform(input_args)
    transformer.store.graph.name = key
    if checkpoint:
        log.info(f"Writing checkpoint for source '{key}'")
        checkpoint_output = f"{output_directory}/{key}" if output_directory else key
        transformer.save({'filename': checkpoint_output, 'format': 'tsv'})
    return transformer.store
Example #24
def test_rdf_transform_with_filters1(query):
    """
    Test RDF transform with filters.
    """
    input_args = {
        'filename': [os.path.join(RESOURCE_DIR, 'rdf', 'test3.nt')],
        'format': 'nt',
        'node_filters': query[0],
        'edge_filters': query[1],
    }
    t = Transformer()
    t.transform(input_args)

    assert t.store.graph.number_of_nodes() == query[2]
    assert t.store.graph.number_of_edges() == query[3]
Example #25
def test_neo_to_graph_transform():
    """
    Test to read from Neo4j and write to CSV.
    """
    input_args = {
        'uri': DEFAULT_NEO4J_URL,
        'username': DEFAULT_NEO4J_USERNAME,
        'password': DEFAULT_NEO4J_PASSWORD,
        'format': 'neo4j',
    }
    output_filename = os.path.join(TARGET_DIR, 'neo_graph')
    output_args = {'filename': output_filename, 'format': 'csv'}
    t = Transformer()
    t.transform(input_args, output_args)
    assert t.store.graph.number_of_nodes() == 10
    assert t.store.graph.number_of_edges() == 11
    assert os.path.exists(f"{output_filename}_nodes.csv")
    assert os.path.exists(f"{output_filename}_edges.csv")
Example #26
def test_clique_generation():
    """
    Test for generation of cliques.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "cm_nodes.csv"),
            os.path.join(RESOURCE_DIR, "cm_edges.csv"),
        ],
        "format": "csv",
    }
    t = Transformer()
    t.transform(input_args)
    updated_graph, clique_graph = clique_merge(
        target_graph=t.store.graph, prefix_prioritization_map=prefix_prioritization_map
    )
    cliques = list(nx.strongly_connected_components(clique_graph))
    assert len(cliques) == 2
Example #27
def test_transform_filters1(query):
    """
    Test transform with filters.
    """
    input_args = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'test2_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'test2_edges.tsv'),
        ],
        'format': 'tsv',
        'node_filters': query[0],
        'edge_filters': query[1],
    }
    t = Transformer()
    t.transform(input_args)
    assert t.store.graph.number_of_nodes() == query[2]
    assert t.store.graph.number_of_edges() == query[3]
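
The query fixture consumed here presumably pairs filter dictionaries with
expected counts. A plausible shape, assuming KGX's usual category/predicate
filter keys; the specific filters and counts are illustrative:

query = (
    {'category': {'biolink:Gene'}},             # query[0]: node_filters
    {'predicate': {'biolink:interacts_with'}},  # query[1]: edge_filters
    178,  # query[2]: expected node count after filtering (assumed)
    165,  # query[3]: expected edge count after filtering (assumed)
)
test_transform_filters1(query)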
Example #28
def test_transform_filters1(query):
    """
    Test transform with filters.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "test2_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "test2_edges.tsv"),
        ],
        "format":
        "tsv",
        "node_filters":
        query[0],
        "edge_filters":
        query[1],
    }
    t = Transformer()
    t.transform(input_args)
    assert t.store.graph.number_of_nodes() == query[2]
    assert t.store.graph.number_of_edges() == query[3]
Example #29
def test_csv_to_neo4j_load_to_graph_transform(clean_database):
    """
    Test loading a CSV KGX file into Neo4j.
    """
    logger.debug("test_csv_to_neo4j_load...")
    input_args1 = {
        "filename": [
            os.path.join(RESOURCE_DIR, "cm_nodes.csv"),
            os.path.join(RESOURCE_DIR, "cm_edges.csv"),
        ],
        "format":
        "csv",
    }
    t1 = Transformer()
    t1.transform(input_args1)

    output_args = {
        "uri": DEFAULT_NEO4J_URL,
        "username": DEFAULT_NEO4J_USERNAME,
        "password": DEFAULT_NEO4J_PASSWORD,
        "format": "neo4j",
    }
    t1.save(output_args)
    """
    Continue sequentially to test read from Neo4j to write out back to CSV.
    """
    logger.debug("test_neo4j_to_graph_transform")
    input_args = {
        "uri": DEFAULT_NEO4J_URL,
        "username": DEFAULT_NEO4J_USERNAME,
        "password": DEFAULT_NEO4J_PASSWORD,
        "format": "neo4j",
    }
    output_filename = os.path.join(TARGET_DIR, "neo_graph")
    output_args = {"filename": output_filename, "format": "csv"}
    t = Transformer()
    t.transform(input_args, output_args)
    assert t.store.graph.number_of_nodes() == 10
    assert t.store.graph.number_of_edges() == 11
    assert os.path.exists(f"{output_filename}_nodes.csv")
    assert os.path.exists(f"{output_filename}_edges.csv")
Example #30
def test_merge():
    """
    Test for merging graphs.
    """
    input_args1 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test1_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test1_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t1 = Transformer()
    t1.transform(input_args1)

    input_args2 = {
        'filename': [
            os.path.join(RESOURCE_DIR, 'merge', 'test2_nodes.tsv'),
            os.path.join(RESOURCE_DIR, 'merge', 'test2_edges.tsv'),
        ],
        'format': 'tsv',
    }
    t2 = Transformer()
    t2.transform(input_args2)

    merged_graph = merge_all_graphs([t1.store.graph, t2.store.graph],
                                    preserve=True)
    assert len(merged_graph.nodes()) == 6
    assert len(merged_graph.edges()) == 8

    x1 = merged_graph.nodes()['x1']
    assert x1['name'] == 'node x1'

    assert isinstance(x1['category'], list)
    assert 'a' in x1['p1']
    assert '1' in x1['p1']

    x10 = merged_graph.nodes()['x10']
    assert x10['id'] == 'x10'
    assert x10['name'] == 'node x10'