def test_graph_to_neo_load():
    """
    load nx graph to neo4j test
    """
    t = PandasTransformer()
    t.parse("tests/resources/x1n.csv")
    t.parse("tests/resources/x1e.csv")
    t.report()
    n = NeoTransformer(t)
    n.save()
    n.neo4j_report()
def test_neo_to_graph_upload():
    """
    load a graph from a JSON file and upload it to Neo4j
    """
    jt = JsonTransformer()
    jt.parse('resources/robodb2.json')
    nt = NeoTransformer(jt.graph, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    nt.save()
    nt.neo4j_report()
def test_csv_to_neo_load():
    """
    load csv to neo4j test
    """
    pt = PandasTransformer()
    pt.parse(os.path.join(resource_dir, "cm_nodes.csv"))
    pt.parse(os.path.join(resource_dir, "cm_edges.csv"))
    nt = NeoTransformer(pt.graph, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    nt.save()
    nt.neo4j_report()
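# A minimal, self-contained sketch (not part of the original suite) of the kind of
# node and edge CSV files the tests above parse. The column names follow the
# Biolink-style properties used elsewhere in this file (id/category on nodes,
# subject/edge_label/object on edges); the file names, identifiers, and this helper
# itself are hypothetical.
def example_csv_to_neo_load(tmpdir):
    nodes_file = os.path.join(tmpdir, "example_nodes.csv")
    edges_file = os.path.join(tmpdir, "example_edges.csv")
    with open(nodes_file, 'w') as f:
        f.write("id,name,category\n")
        f.write("HGNC:11603,TBX4,biolink:Gene\n")
        f.write("MONDO:0005002,chronic obstructive pulmonary disease,biolink:Disease\n")
    with open(edges_file, 'w') as f:
        f.write("subject,edge_label,object\n")
        f.write("HGNC:11603,biolink:related_to,MONDO:0005002\n")
    # same load-and-save pattern as test_csv_to_neo_load above
    pt = PandasTransformer()
    pt.parse(nodes_file)
    pt.parse(edges_file)
    nt = NeoTransformer(pt.graph, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    nt.save()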
def test_get_edges(clean_slate, query):
    g = get_graph('kgx-unit-test')[3]
    t = NeoTransformer(g, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    t.save()
    for k, v in query[1].items():
        t.set_edge_filter(k, v)
    edges = t.get_edges()
    edge_list = [x[1] for x in edges]
    assert len(edges) == query[2]
def test_load(clean_slate, query):
    t = NeoTransformer(query[0], uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    t.save()

    nr = t.http_driver.query("MATCH (n) RETURN count(n)")
    [node_counts] = [x for x in nr][0]
    assert node_counts == query[1]

    er = t.http_driver.query("MATCH ()-[p]->() RETURN count(p)")
    [edge_counts] = [x for x in er][0]
    assert edge_counts == query[2]
def test_get_nodes(clean_slate, query):
    g = query[0]
    t = NeoTransformer(g, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    t.save()
    for k, v in query[1].items():
        t.set_node_filter(k, v)
    nodes = t.get_nodes()
    assert len(nodes) == query[2]
    node_ids = [x['id'] for x in nodes]
    for x in query[3]:
        assert x in node_ids
def test_save_merge(clean_slate):
    g = get_graph('kgx-unit-test')[2]
    t = NeoTransformer(g, uri=DEFAULT_NEO4J_URL, username=DEFAULT_NEO4J_USERNAME, password=DEFAULT_NEO4J_PASSWORD)
    t.save()

    t.graph.add_node('B', id='B', publications=['PMID:1', 'PMID:2'], category=['biolink:NamedThing'])
    t.graph.add_node('C', id='C', source='kgx-unit-test')
    t.graph.add_edge('A', 'B', subject='A', object='B', edge_label='biolink:related_to', test_prop='VAL123')
    assert t.graph.number_of_nodes() == 3
    t.save()

    nr = t.http_driver.query("MATCH (n) RETURN n")
    for node in nr:
        data = node[0]['data']
        if data['id'] == 'B':
            assert 'category' in data and data['category'] == ['biolink:NamedThing']
            assert 'publications' in data and data['publications'] == ['PMID:1', 'PMID:2']

    er = t.http_driver.query("MATCH ()-[p]-() RETURN p", data_contents=True, returns=(Node, Relationship, Node))
    for edge in er:
        data = edge[0].properties
        # assert data['id'] == 'A-biolink:related_to-B'
        assert data['subject'] == 'A'
        assert data['object'] == 'B'
        assert data['edge_label'] == 'biolink:related_to'
        assert data['test_prop'] == 'VAL123'
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    """
    gm = GraphMerge()
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start loading
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that files exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                if not os.path.exists(f) or not os.path.isfile(f):
                    raise FileNotFoundError(
                        "File {} for transform {} in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            for f in target['filename']:
                transformer.parse(f, input_format='tsv')
            transformer.graph.name = key
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'], target['username'], target['password'])
            transformer.load()
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = gm.merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name, "merged_graph_stats.yaml")

    # write the merged graph
    if 'destination' in config:
        for _, destination in config['destination'].items():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password']
                )
                destination_transformer.save_with_unwind()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(destination['type'])(merged_graph)
                destination_transformer.save(destination['filename'], extension=destination['type'])
            else:
                logging.error("type {} not yet supported for KGX load-and-merge operation.".format(destination['type']))

    return merged_graph
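# A minimal sketch of how load_and_merge() might be driven. The YAML layout below is
# inferred from the keys the function reads ('target' entries with a 'type' and a list
# of 'filename's, plus an optional 'destination' section); the config path, file names,
# and Neo4j credentials are hypothetical.
#
#   target:
#     example-source:
#       type: tsv
#       filename:
#         - data/example_nodes.tsv
#         - data/example_edges.tsv
#   destination:
#     example-neo4j:
#       type: neo4j
#       uri: http://localhost:7474
#       username: neo4j
#       password: demo

if __name__ == '__main__':
    merged = load_and_merge('merge_config.yaml')  # hypothetical config path
    logging.info("merged graph: {} nodes, {} edges".format(
        merged.number_of_nodes(), merged.number_of_edges()))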
parser.add_argument('--edges', help='file with edges in TSV format')
parser.add_argument('--uri', help='URI/URL for Neo4j (including port)', default='localhost:7474')
parser.add_argument('--username', help='username', default='neo4j')
parser.add_argument('--password', help='password', default='demo')
args = parser.parse_args()

if args.nodes is None and args.edges is None:
    usage()
    exit()

# Initialize PandasTransformer
t = PandasTransformer()

# Load nodes and edges into graph
if args.nodes:
    t.parse(args.nodes, input_format='tsv')
if args.edges:
    t.parse(args.edges, input_format='tsv')

# Initialize NeoTransformer
n = NeoTransformer(t.graph, uri=args.uri, username=args.username, password=args.password)

# Save graph into Neo4j
n.save()
n.neo4j_report()
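# The snippet above begins partway through the argument definitions; a sketch of the
# setup it assumes is shown below. The import path, the script name, and the --nodes
# help text are assumptions inferred from how args.nodes and the transformers are used.
#
#     import argparse
#     from kgx import PandasTransformer, NeoTransformer
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--nodes', help='file with nodes in TSV format')
#
# Example invocation (file names hypothetical):
#
#     python load_tsv_to_neo4j.py --nodes nodes.tsv --edges edges.tsv \
#         --uri localhost:7474 --username neo4j --password demo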
def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
    """Load and merge sources defined in the config YAML.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        networkx.MultiDiGraph: The merged graph.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # make sure all files exist before we start loading
    for key in config['target']:
        target = config['target'][key]
        logging.info("Checking that files exist for {}".format(key))
        if target['type'] in get_file_types():
            for f in target['filename']:
                if not os.path.exists(f) or not os.path.isfile(f):
                    raise FileNotFoundError(
                        "File {} for transform {} in yaml file {} "
                        "doesn't exist! Dying.".format(f, key, yaml_file))

    # read all the sources defined in the YAML
    for key in config['target']:
        target = config['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            try:
                transformer = get_transformer(target['type'])()
                if target['type'] in {'tsv', 'neo4j'}:
                    if 'filters' in target:
                        apply_filters(target, transformer)
                for f in target['filename']:
                    transformer.parse(f, input_format='tsv')
                transformer.graph.name = key
                if 'operations' in target:
                    apply_operations(target, transformer)
                transformers.append(transformer)
            except Exception:
                logging.error("Failed loading {}".format(f))
        elif target['type'] == 'neo4j':
            transformer = NeoTransformer(None, target['uri'], target['username'], target['password'])
            if 'filters' in target:
                apply_filters(target, transformer)
            transformer.load()
            if 'operations' in target:
                apply_operations(target, transformer)
            transformers.append(transformer)
            transformer.graph.name = key
        else:
            logging.error("type {} not yet supported".format(target['type']))
        stats_filename = f"{key}_stats.yaml"
        generate_graph_stats(transformer.graph, key, stats_filename)

    # merge all subgraphs into a single graph
    merged_graph = merge_all_graphs([x.graph for x in transformers])
    merged_graph.name = 'merged_graph'
    generate_graph_stats(merged_graph, merged_graph.name, "merged_graph_stats.yaml", ['provided_by'], ['provided_by'])

    # write the merged graph
    if 'destination' in config:
        for _, destination in config['destination'].items():
            if destination['type'] == 'neo4j':
                destination_transformer = NeoTransformer(
                    merged_graph,
                    uri=destination['uri'],
                    username=destination['username'],
                    password=destination['password']
                )
                destination_transformer.save()
            elif destination['type'] in get_file_types():
                destination_transformer = get_transformer(destination['type'])(merged_graph)
                mode = 'w:gz' if destination['type'] in {'tsv'} else None
                if destination['type'] in {'nt', 'nt.gz', 'ttl'}:
                    destination_transformer.set_property_types(PROPERTY_TYPES)
                destination_transformer.save(destination['filename'], output_format=destination['type'], mode=mode)
            else:
                logging.error("type {} not yet supported for KGX load-and-merge operation.".format(destination['type']))

    return merged_graph
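# A sketch of a 'destination' entry exercising the RDF branch above: for 'nt', 'nt.gz',
# or 'ttl' outputs the transformer is given PROPERTY_TYPES before saving, while 'tsv'
# outputs are written as gzipped archives (mode 'w:gz'). Targets may also carry optional
# 'filters' and 'operations' keys, whose internal structure is defined by
# apply_filters()/apply_operations() (not shown here). The file name is hypothetical.
#
#   destination:
#     example-rdf-export:
#       type: nt.gz
#       filename: merged_graph.nt.gz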