def load_and_merge(config: dict, load_config):
    """
    Load nodes and edges from files and KGs, as defined in a config YAML,
    and merge them into a single graph.

    The merge happens in-memory. This merged graph can then be written to a
    local/remote Neo4j instance OR be serialized into a file.
    \f

    .. note::
        Everything here is driven by the ``load_config`` YAML.

    Parameters
    ----------
    config: dict
        Not read by this function.
    load_config: str
        Path to the YAML file. It must contain a ``target`` mapping (one
        entry per source, each with a ``type`` plus either ``filename`` or
        ``uri``/``username``/``password``) and a ``destination`` mapping
        with a ``type`` plus either ``filename`` or
        ``uri``/``username``/``password``.

    """
    # NOTE(review): FullLoader can construct more than plain YAML types;
    # only point this at configuration files you trust.
    with open(load_config, 'r') as yml_file:
        cfg = yaml.load(yml_file, Loader=yaml.FullLoader)

    transformers = []
    for key in cfg['target']:
        target = cfg['target'][key]
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # loading from a file
            transformer = get_transformer(target['type'])()
            transformer.parse(target['filename'])
            transformers.append(transformer)
        elif target['type'] == 'neo4j':
            transformer = kgx.NeoTransformer(
                None, target['uri'], target['username'], target['password'])
            # TODO: support filters
            transformer.load()
            transformers.append(transformer)
        else:
            # Unsupported sources are reported and skipped, not fatal.
            logging.error(
                "type {} not yet supported for KGX load-and-merge operation.".
                format(target['type']))

    merged_transformer = Transformer()
    merged_transformer.merge_graphs([x.graph for x in transformers])

    destination = cfg['destination']
    if destination['type'] in ['csv', 'tsv', 'ttl', 'json', 'tar']:
        # Hand the merged graph to the destination transformer; constructing
        # it without a graph would serialize an empty graph.
        destination_transformer = get_transformer(destination['type'])(
            merged_transformer.graph)
        destination_transformer.save(destination['filename'])
    elif destination['type'] == 'neo4j':
        destination_transformer = kgx.NeoTransformer(
            merged_transformer.graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        destination_transformer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination['type']))
def load_and_merge(merge_config, destination_uri, destination_username, destination_password):
    """
    Load nodes and edges from KGs, as defined in a config YAML, and merge
    them into a single graph.

    Parameters
    ----------
    merge_config: str
        Path to the YAML file. Each entry under ``target`` must have a
        ``neo4j`` mapping with ``host``, ``port``, ``username`` and
        ``password``. An entry may also carry ``target_filter`` (a mapping
        of filter name to filter value) and ``query_limits`` (with optional
        ``start`` and ``end``).
    destination_uri: str
        URI of the Neo4j instance that receives the merged graph.
    destination_username: str
        Username for the destination Neo4j instance.
    destination_password: str
        Password for the destination Neo4j instance.

    The merged graph is only written when all three destination arguments
    are truthy; otherwise nothing is saved.
    """
    # safe_load: the config holds only plain mappings and scalars, and a
    # bare ``yaml.load(stream)`` without a Loader fails on PyYAML >= 6.
    with open(merge_config, 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)

    transformers = []
    for key in cfg['target']:
        target = cfg['target'][key]
        # NOTE(review): this logs the whole target entry, including the
        # neo4j password read below -- consider logging only ``key``.
        logging.info("Connecting to {}".format(target))

        neo4j_cfg = target['neo4j']
        uri = "{}:{}".format(neo4j_cfg['host'], neo4j_cfg['port'])
        n = kgx.NeoTransformer(None, uri, neo4j_cfg['username'], neo4j_cfg['password'])
        transformers.append(n)

        # ``or {}`` also tolerates a key that is present but left empty.
        for filter_name, filter_value in (target.get('target_filter') or {}).items():
            n.set_filter(filter_name, filter_value)

        query_limits = target.get('query_limits') or {}
        start = query_limits.get('start', 0)
        end = query_limits.get('end')

        n.load(start=start, end=end)

    merged_transformer = Transformer()
    merged_transformer.merge([x.graph for x in transformers])

    if destination_uri and destination_username and destination_password:
        destination = kgx.NeoTransformer(
            merged_transformer.graph,
            uri=destination_uri,
            username=destination_username,
            password=destination_password)
        destination.save_with_unwind()
def test_serialization():
    """Dump a graph to JSON, restore it, and compare it with the original.

    The restored graph must be non-empty and have the same number of nodes
    and edges as the graph that was dumped.
    """
    graphs = get_graphs()
    serialized_path = os.path.join(target_dir, 'graph_serialization.json')

    t1 = Transformer(source_graph=graphs[0])
    assert t1.is_empty() is False

    Transformer.dump_to_file(t1.graph, serialized_path)
    new_graph = Transformer.restore_from_file(serialized_path)

    t2 = Transformer(source_graph=new_graph)
    # Check the restored transformer; re-checking t1 here would prove nothing.
    assert t2.is_empty() is False
    assert t2.graph.number_of_nodes() == t1.graph.number_of_nodes()
    assert t2.graph.number_of_edges() == t1.graph.number_of_edges()
def load_and_merge(yaml_file: str) -> Transformer:
    """Load every source listed in the config YAML and merge them.

    Each entry under ``target`` is loaded either from files or from Neo4j,
    all loaded graphs are merged in memory, and the result is written out
    when the config has a ``destination`` section.

    Args:
        yaml_file: A string pointing to a KGX compatible config YAML.

    Returns:
        kgx.Transformer: The merged transformer that contains the merged graph.

    """
    config = parse_load_config(yaml_file)
    transformers: List = []

    # read all the sources defined in the YAML
    for key, target in config['target'].items():
        logging.info("Loading {}".format(key))
        if target['type'] in get_file_types():
            # file-backed source: one transformer parses every listed file
            file_transformer = get_transformer(target['type'])()
            for path in target['filename']:
                # NOTE(review): input_format is fixed to 'tsv' regardless of
                # target['type'] -- confirm this is intended.
                file_transformer.parse(path, input_format='tsv')
            transformers.append(file_transformer)
        elif target['type'] == 'neo4j':
            neo_transformer = NeoTransformer(
                None, target['uri'], target['username'], target['password'])
            neo_transformer.load()
            transformers.append(neo_transformer)
        else:
            logging.error("type {} not yet supported".format(target['type']))

    # merge all subgraphs into a single graph
    merged_transformer = Transformer()
    subgraphs = [source.graph for source in transformers]
    merged_transformer.merge_graphs(subgraphs)
    merged_transformer.report()

    # nothing to write when no destination is configured
    if 'destination' not in config:
        return merged_transformer

    # write the merged graph
    destination = config['destination']
    if destination['type'] in ('csv', 'tsv', 'ttl', 'json', 'tar'):
        writer_cls = get_transformer(destination['type'])
        writer = writer_cls(merged_transformer.graph)
        writer.save(destination['filename'], extension=destination['type'])
    elif destination['type'] == 'neo4j':
        writer = NeoTransformer(
            merged_transformer.graph,
            uri=destination['uri'],
            username=destination['username'],
            password=destination['password'])
        writer.save_with_unwind()
    else:
        logging.error(
            "type {} not yet supported for KGX load-and-merge operation.".
            format(destination['type']))

    return merged_transformer
def set_transformer_filters(transformer:Transformer, labels:list, properties:list) -> None:
    """Register label and property filters on ``transformer``.

    Args:
        transformer: The object whose ``set_filter`` method is called.
        labels: Iterable of ``(location, label)`` pairs. A pair whose
            location equals ``FilterLocation.EDGE.value`` is registered
            under ``<location>_label``; any other location is registered
            under ``<location>_category``.
        properties: Iterable of ``(location, property_name, property_value)``
            triples, each registered under ``<location>_property`` with the
            ``(property_name, property_value)`` tuple as its value.

    """
    for location, label in labels:
        is_edge = location == FilterLocation.EDGE.value
        template = '{}_label' if is_edge else '{}_category'
        transformer.set_filter(target=template.format(location), value=label)

    for location, prop_name, prop_value in properties:
        transformer.set_filter(
            target='{}_property'.format(location),
            value=(prop_name, prop_value),
        )
def test_validate_correct_edge(edge):
    """``Transformer.validate_edge`` returns a non-None value for ``edge``.

    ``edge`` is presumably supplied by a fixture or parametrization that is
    not visible in this block.
    """
    assert Transformer.validate_edge(edge) is not None
def test_validate_incorrect_edge(edge):
    """``Transformer.validate_edge`` must raise ``KeyError`` for ``edge``.

    ``edge`` is presumably supplied by a fixture or parametrization that is
    not visible in this block.
    """
    with pytest.raises(KeyError):
        Transformer.validate_edge(edge)
def test_validate_correct_node(node):
    """Check what ``Transformer.validate_node`` returns for ``node``.

    The result must be non-None, must contain a ``category`` key, and the
    first category must equal ``Transformer.DEFAULT_NODE_CATEGORY``.
    ``node`` is presumably supplied by a fixture or parametrization that is
    not visible in this block.
    """
    validated = Transformer.validate_node(node)
    assert validated is not None
    assert 'category' in validated

    first_category = validated['category'][0]
    assert first_category == Transformer.DEFAULT_NODE_CATEGORY
def test_validate_incorrect_node(node):
    """``Transformer.validate_node`` must raise ``KeyError`` for ``node``.

    ``node`` is presumably supplied by a fixture or parametrization that is
    not visible in this block.
    """
    with pytest.raises(KeyError):
        Transformer.validate_node(node)
def test_transformer():
    """Exercise filter bookkeeping on a freshly constructed Transformer.

    A new Transformer must hold an empty ``Graph``. After two node filters
    on ``category``, two edge filters on ``edge_label`` and one edge filter
    on ``subject_category``, the assertions expect:

    * one node-filter key (``category``) holding three values, one of which
      is ``biolink:Drug``;
    * three edge-filter keys, with ``edge_label`` holding two values and
      ``subject_category`` / ``object_category`` each holding three values
      including ``biolink:Gene``.
    """
    transformer = Transformer()
    assert isinstance(transformer.graph, Graph)
    assert transformer.is_empty()

    for categories in ({'biolink:Gene'}, {'biolink:Disease'}):
        transformer.set_node_filter('category', categories)
    for edge_labels in ({'biolink:related_to'}, {'biolink:interacts_with'}):
        transformer.set_edge_filter('edge_label', edge_labels)
    transformer.set_edge_filter('subject_category', {'biolink:Drug'})

    assert len(transformer.node_filters.keys()) == 1
    assert len(transformer.edge_filters.keys()) == 3

    assert 'category' in transformer.node_filters
    assert len(transformer.node_filters['category']) == 3

    assert 'edge_label' in transformer.edge_filters
    assert len(transformer.edge_filters['edge_label']) == 2

    # subject and object category filters are expected to look the same
    for filter_key in ('subject_category', 'object_category'):
        assert filter_key in transformer.edge_filters
        assert len(transformer.edge_filters[filter_key]) == 3
        assert 'biolink:Gene' in transformer.edge_filters[filter_key]

    assert 'biolink:Drug' in transformer.node_filters['category']