def test_neo_to_graph_upload():
    """Load a graph from a JSON file and push it into a local Neo4j instance."""
    json_transformer = JsonTransformer()
    json_transformer.parse('resources/robodb2.json')
    neo_transformer = NeoTransformer(
        json_transformer.graph,
        host='localhost',
        port='7474',
        username='******',
        password='******',
    )
    neo_transformer.save_with_unwind()
    neo_transformer.neo4j_report()
def test_load():
    """Verify that JsonTransformer loads edges with the expected subject/object."""
    source = os.path.join(resource_dir, 'semmed/gene.json')
    transformer = JsonTransformer()
    transformer.parse(source)
    # inspect the attribute dict of the first edge in the loaded graph
    first_edge_attrs = list(transformer.graph.edges(data=True))[0][-1]
    assert first_edge_attrs['subject'] == 'UMLS:C0948075'
    assert first_edge_attrs['object'] == 'UMLS:C1290952'
def test_validate_json():
    """A representative Biolink Model compliant JSON should validate cleanly."""
    source = os.path.join(resource_dir, 'valid.json')
    transformer = JsonTransformer()
    transformer.parse(source)
    errors = Validator().validate(transformer.graph)
    assert len(errors) == 0
def test_neo_to_graph_upload():
    """Load a graph from a JSON file and upload it to Neo4j via its URI."""
    json_transformer = JsonTransformer()
    json_transformer.parse('resources/robodb2.json')
    neo_transformer = NeoTransformer(
        json_transformer.graph,
        uri=DEFAULT_NEO4J_URL,
        username=DEFAULT_NEO4J_USERNAME,
        password=DEFAULT_NEO4J_PASSWORD,
    )
    neo_transformer.save()
    neo_transformer.neo4j_report()
def test_load():
    """Parse a test TTL file and export it in CSV, GraphML and JSON forms."""
    rdf_transformer = ObanRdfTransformer()
    rdf_transformer.parse("tests/resources/monarch/biogrid_test.ttl")
    rdf_transformer.report()

    # edge and node CSV exports
    csv_writer = PandasTransformer(rdf_transformer)
    csv_writer.save('target/biogrid-e.csv', type='e')
    csv_writer.save('target/biogrid-n.csv', type='n')

    graphml_writer = GraphMLTransformer(rdf_transformer)
    graphml_writer.save("target/x1n.graphml")

    json_writer = JsonTransformer(rdf_transformer)
    json_writer.save("target/x1n.json")
def test_owl_load():
    """Parse a test OWL file (Turtle syntax) and export as CSV archive and JSON."""
    ttl_path = os.path.join(resource_dir, 'mody.ttl')
    archive_path = os.path.join(target_dir, 'mondo_test')
    json_path = os.path.join(target_dir, 'mondo_test.json')

    owl_transformer = RdfOwlTransformer()
    owl_transformer.parse(ttl_path, input_format='ttl')
    owl_transformer.report()

    PandasTransformer(owl_transformer.graph).save(archive_path)
    JsonTransformer(owl_transformer.graph).save(json_path)
def test_load():
    """Round-trip a test TTL file through ObanRdfTransformer and verify output.

    Parses the source Turtle file, saves it back out as Turtle, exports CSV
    and GraphML/JSON forms, and asserts that every OBAN association subgraph
    in the re-serialized output is isomorphic to its counterpart in the
    source; raises RuntimeError on the first mismatch.
    """
    cwd = os.path.abspath(os.path.dirname(__file__))
    src_path = os.path.join(cwd, 'resources', 'monarch', 'biogrid_test.ttl')
    tpath = os.path.join(cwd, 'target')
    os.makedirs(tpath, exist_ok=True)
    tg_path = os.path.join(tpath, "test_output.ttl")
    # execute ObanRdfTransformer's parse and save function
    t = ObanRdfTransformer()
    t.parse(src_path, input_format="turtle")
    t.save(tg_path, output_format="turtle")
    t.report()
    w1 = PandasTransformer(t.graph)
    w1.save(os.path.join(tpath, 'biogrid-e.csv'), type='e')
    w1.save(os.path.join(tpath, 'biogrid-n.csv'), type='n')
    # read again the source, test graph
    src_graph = rdflib.Graph()
    src_graph.parse(src_path, format="turtle")
    # read again the dumped target graph
    tg_graph = rdflib.Graph()
    tg_graph.parse(tg_path, format="turtle")
    # compare subgraphs from the source and the target graph.
    OBAN = Namespace('http://purl.org/oban/')
    for a in src_graph.subjects(RDF.type, OBAN.association):
        # collect all triples about this association subject from each graph
        oban_src_graph = rdflib.Graph()
        oban_src_graph += src_graph.triples((a, None, None))
        oban_tg_graph = rdflib.Graph()
        oban_tg_graph += tg_graph.triples((a, None, None))
        # see they are indeed identical (isomorphic)
        if not oban_src_graph.isomorphic(oban_tg_graph):
            raise RuntimeError('The subgraphs whose subject is ' + str(a) + ' are not isomorphic ones.')
    w2 = GraphMLTransformer(t.graph)
    w2.save(os.path.join(tpath, "x1n.graphml"))
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(tpath, "x1n.json"))
def main(path, output, model):
    """Convert a JSON knowledge graph at `path` to CSV output at `output`.

    When `model` is given, load it into the Biolink Model toolkit first.
    """
    if model is not None:
        bmt.load(model)
    json_transformer = JsonTransformer()
    json_transformer.parse(path)
    PandasTransformer(json_transformer.graph).save(output)
def test_json_load():
    """Check node/edge counts and attributes after loading valid.json."""
    transformer = JsonTransformer()
    transformer.parse(os.path.join(resource_dir, 'valid.json'))
    graph = transformer.graph

    assert graph.number_of_nodes() == 6
    assert graph.number_of_edges() == 5

    node = graph.nodes['MONDO:0017148']
    assert isinstance(node, dict)
    assert node.get('id') == 'MONDO:0017148'
    assert node['name'] == 'heritable pulmonary arterial hypertension'
    assert node['category'][0] == 'biolink:Disease'

    # exactly one edge between the gene and the disease
    edge_data = graph.get_edge_data('HGNC:11603', 'MONDO:0017148')
    assert len(edge_data.keys()) == 1
    _, attrs = edge_data.popitem()
    assert attrs['subject'] == 'HGNC:11603'
    assert attrs['object'] == 'MONDO:0017148'
    assert attrs['edge_label'] == 'biolink:related_to'
    assert attrs['relation'] == 'RO:0004013'
def test_load():
    """Parse a gzipped OWL (Turtle) file and export it as CSV and JSON.

    Fix: the gzip file handle was previously opened and never closed; it is
    now managed with a `with` block so the descriptor is released even if
    parsing raises.
    """
    cwd = os.path.abspath(os.path.dirname(__file__))
    resdir = os.path.join(cwd, 'resources')
    tdir = os.path.join(cwd, 'target')
    os.makedirs(tdir, exist_ok=True)

    t = RdfOwlTransformer()
    fn = os.path.join(resdir, "mody.ttl.gz")
    with gzip.open(fn, 'rb') as f:
        t.parse(f, input_format='ttl')
    t.report()

    # edge and node CSV exports
    w1 = PandasTransformer(t.graph)
    w1.save(os.path.join(tdir, 'mondo-e.csv'), type='e')
    w1.save(os.path.join(tdir, 'mondo-n.csv'), type='n')
    # JSON export
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(tdir, "mondo.json"))
def test_load():
    """Round-trip TTL through ObanRdfTransformer and export CSV/JSON.

    Parses the source Turtle, re-serializes it, then compares each OBAN
    association subgraph between source and output for isomorphism.
    NOTE(review): mismatches are only printed, not raised, so this check
    cannot actually fail the test — confirm that is intentional.
    """
    input_file = os.path.join(resource_dir, 'monarch/biogrid_test.ttl')
    output_file = os.path.join(target_dir, 'test_output.ttl')
    t = ObanRdfTransformer()
    t.parse(input_file, input_format="turtle")
    t.report()
    t.save(output_file, output_format="turtle")
    output_archive_file = os.path.join(target_dir, 'biogrid_test')
    pt = PandasTransformer(t.graph)
    pt.save(output_archive_file)
    # read again the source, test graph
    src_graph = rdflib.Graph()
    src_graph.parse(input_file, format="turtle")
    # read again the dumped target graph
    target_graph = rdflib.Graph()
    target_graph.parse(output_file, format="turtle")
    # compare subgraphs from the source and the target graph.
    OBAN = Namespace('http://purl.org/oban/')
    for a in src_graph.subjects(RDF.type, OBAN.association):
        # collect all triples about this association subject from each graph
        oban_src_graph = rdflib.Graph()
        oban_src_graph += src_graph.triples((a, None, None))
        oban_tg_graph = rdflib.Graph()
        oban_tg_graph += target_graph.triples((a, None, None))
        # see they are indeed identical (isomorphic)
        if not oban_src_graph.isomorphic(oban_tg_graph):
            print('The subgraphs whose subject is {} are not isomorphic'.format(a))
    # w2 = GraphMLTransformer(t.graph)
    # w2.save(os.path.join(tpath, "x1n.graphml"))
    w3 = JsonTransformer(t.graph)
    w3.save(os.path.join(target_dir, "biogrid_test.json"))
def main(path, output):
    """Infer missing node categories from edge labels.

    For every edge whose `edge_label` has the form "<predicate>_<category>",
    check both halves against the Biolink Model toolkit; when both are
    recognized, add the category to the edge's target node.

    Bug fix: the logging call referenced an undefined name `p`, raising a
    NameError the first time a category was inferred; it now logs the
    recognized predicate.

    NOTE(review): the `output` parameter is accepted but never used in this
    block — presumably the caller persists the mutated graph; confirm.
    """
    G = JsonTransformer(path).graph
    for u, v, attr_dict in G.edges(data=True):
        edge_label = attr_dict['edge_label']
        try:
            # split "<predicate>_<category>" on the last underscore
            predicate, category = edge_label.replace(' ', '_').rsplit('_', 1)
        except ValueError:
            # no underscore present: nothing to split, skip this edge
            continue
        is_predicate = bmt.get_predicate(predicate) is not None
        is_category = bmt.get_class(category) is not None
        if is_predicate and is_category:
            if 'category' not in G.node[v]:
                G.node[v]['category'] = [category]
            elif category not in G.node[v]['category']:
                G.node[v]['category'].append(category)
            logging.info('from {u} {p} {v} found {v} is a {c}'.format(
                u=u, p=predicate, v=v, c=category))
def main(path, output, model):
    """Map CURIE-valued node categories and edge predicates to Biolink labels."""
    if model is not None:
        bmt.load(model)

    transformer = JsonTransformer()
    transformer.parse(path)

    def curie_to_label(curie: str):
        """
        Uses the biolink model toolkit to look up an element (on the tree
        rooted at `named thing` and `related to`) for a given curie. If none
        can be found then returns the original curie.
        """
        if isinstance(curie, (list, tuple, set)):
            return [curie_to_label(c) for c in curie]
        if isinstance(curie, str):
            element = bmt.get_by_mapping(curie)
            return curie if element is None else element
        return None

    for _, node_attrs in transformer.graph.nodes(data=True):
        node_attrs['category'] = curie_to_label(node_attrs.get('category'))
    for _, _, edge_attrs in transformer.graph.edges(data=True):
        edge_attrs['predicate'] = curie_to_label(edge_attrs.get('predicate'))

    transformer.save(output)
def test_json_save():
    """Load valid.json and verify it can be written back out as JSON."""
    transformer = JsonTransformer()
    transformer.parse(os.path.join(resource_dir, 'valid.json'))
    assert transformer.graph.number_of_nodes() == 6
    assert transformer.graph.number_of_edges() == 5

    destination = os.path.join(target_dir, 'graph.json')
    transformer.save(destination)
    assert os.path.exists(destination)
def test_export():
    """JsonTransformer should write a parsed graph back out to disk."""
    source = os.path.join(resource_dir, 'semmed/gene.json')
    destination = os.path.join(target_dir, 'semmeddb_export.json')
    transformer = JsonTransformer()
    transformer.parse(source)
    transformer.save(destination)
    assert os.path.isfile(destination)
"""
Loads every turtle/OWL source with its required ontologies and converts it
to JSON under results/. (A companion step later loads these JSON files,
along with the semmeddb edges.csv and nodes.csv files, into a single
NetworkX graph, performs `clique_merge` on it, and saves the result as
`clique_merged.csv`.)
"""
from kgx import ObanRdfTransformer2, JsonTransformer, HgncRdfTransformer, RdfOwlTransformer2
from kgx import clique_merge, make_valid_types

# HPO: OWL -> JSON
hp_transformer = RdfOwlTransformer2()
hp_transformer.parse('data/hp.owl')
JsonTransformer(hp_transformer).save('results/hp.json')

# MONDO: OWL -> JSON
mondo_transformer = RdfOwlTransformer2()
mondo_transformer.parse('data/mondo.owl')
JsonTransformer(mondo_transformer).save('results/mondo.json')

# HGNC: TTL -> JSON
hgnc_transformer = HgncRdfTransformer()
hgnc_transformer.parse('data/hgnc.ttl')
JsonTransformer(hgnc_transformer).save('results/hgnc.json')

# Orphanet: TTL (with the MONDO and HP ontologies preloaded) -> JSON
orphanet_transformer = ObanRdfTransformer2()
orphanet_transformer.add_ontology('data/mondo.owl')
orphanet_transformer.add_ontology('data/hp.owl')
orphanet_transformer.parse('data/orphanet.ttl')
JsonTransformer(orphanet_transformer).save('results/orphanet.json')
"""
Prepares the clique_merged.json file for uploading to Neo4j:
- removes nodes that cannot be categorized into the Biolink Model
- renames edge labels that don't match the Biolink Model to "related_to"
- transforms the result into CSV format
"""
from kgx import JsonTransformer, PandasTransformer
import bmt

t = JsonTransformer()
t.parse('results/clique_merged.json')
t = PandasTransformer(t)
G = t.graph
size = len(G)

nodes = []  # nodes with no usable category and no name; dropped below
for n, data in G.nodes(data=True):
    # keep only the categories the Biolink Model recognizes
    recognized = []
    for c in data.get('category', []):
        if bmt.get_class(c) is not None:
            recognized.append(c)
    data['category'] = recognized
    if data['category'] == []:
        if 'name' in data:
            # named but uncategorized nodes fall back to the root category
            data['category'] = ['named thing']
        else:
            nodes.append(n)
G.remove_nodes_from(nodes)
from kgx import JsonTransformer from pprint import pprint from terminaltables import AsciiTable import sys, numpy if len(sys.argv) < 3: min_frequency = 100 else: min_frequency = int(sys.argv[2]) if len(sys.argv) < 2: quit('Required argument: path to json knowledge graph') else: path = sys.argv[1] t = JsonTransformer() t.parse(path) category_list = [] uncategorized_example = {} uncategorized_frequency = {} for n in t.graph.nodes(): c = t.graph.node[n].get('category') if c is None: iri = t.graph.node[n].get('iri') k = iri.split('/') if '_' in k[-1]: prefix, _ = k[-1].split('_', 1) k = tuple(k[:-1] + [prefix]) else:
from kgx import ObanRdfTransformer, JsonTransformer, HgncRdfTransformer
from kgx import clique_merge

# Accumulate every per-source JSON graph into a single transformer.
t = JsonTransformer()
for source in ('hgnc.json', 'clinvar.json', 'omim.json', 'hpoa.json', 'orphanet.json'):
    t.parse(source)

# Persist the raw merge, then collapse equivalent nodes and persist again.
t.save('merged.json')
t.graph = clique_merge(t.graph)
t.save('clique_merged.json')
def test_neo_to_graph_download():
    """Download a bounded sample of a Neo4j graph into a JsonTransformer graph.

    NOTE: the bare `return` on the first line deliberately disables this
    test; everything below it is currently dead code kept for reference.
    """
    return
    subject_label = 'gene'
    object_label = None
    edge_type = None
    stop_after = 100  # cap on the number of edges to pull
    output_transformer = JsonTransformer()
    G = output_transformer.graph
    driver = http_gdb('http://localhost:7474', username='', password='')
    # wrap each label/type in backticks for the cypher query; a non-str
    # value (e.g. None) becomes an empty string, i.e. "match any"
    subject_label = ':`{}`'.format(subject_label) if isinstance(
        subject_label, str) else ''
    object_label = ':`{}`'.format(object_label) if isinstance(
        object_label, str) else ''
    edge_type = ':`{}`'.format(edge_type) if isinstance(edge_type, str) else ''
    match = 'match (n{})-[e{}]->(m{})'.format(subject_label, edge_type, object_label)
    results = driver.query('{} return count(*)'.format(match))
    print('Using cyper query: {} return n, e, m'.format(match))
    # the count query yields a single row; grab it and stop
    for a, in results:
        size = a
        break
    if size == 0:
        print('No data available')
        quit()
    page_size = 1_000
    skip_flag = False  # ensures the "no id" warning prints only once
    # page through the results to avoid fetching everything in one query
    for i in range(0, size, page_size):
        q = '{} return n, e, m skip {} limit {}'.format(match, i, page_size)
        results = driver.query(q)
        for n, e, m in results:
            subject_attr = n['data']
            object_attr = m['data']
            edge_attr = e['data']
            # records without an id cannot be keyed into the graph; skip them
            if 'id' not in subject_attr or 'id' not in object_attr:
                if not skip_flag:
                    print('Skipping records that have no id attribute')
                    skip_flag = True
                continue
            s = subject_attr['id']
            o = object_attr['id']
            # fall back to Neo4j metadata when kgx attributes are missing
            if 'edge_label' not in edge_attr:
                edge_attr['edge_label'] = e['metadata']['type']
            if 'category' not in subject_attr:
                subject_attr['category'] = n['metadata']['labels']
            if 'category' not in object_attr:
                object_attr['category'] = m['metadata']['labels']
            if s not in G:
                G.add_node(s, **subject_attr)
            if o not in G:
                G.add_node(o, **object_attr)
            G.add_edge(s, o, key=edge_attr['edge_label'], **edge_attr)
        # stop paging once enough edges have been collected
        if stop_after is not None and G.number_of_edges() > stop_after:
            break
from kgx import JsonTransformer, clique_merge

t = JsonTransformer()
# NOTE(review): 'results/hp.owl' is parsed with JsonTransformer alongside
# *.json files — confirm this path should not be 'results/hp.json'.
for source in (
    'results/hp.owl',
    'results/mondo.json',
    'results/hgnc.json',
    'results/clinvar.json',
    'results/omim.json',
    'results/hpoa.json',
    'results/orphanet.json',
):
    t.parse(source)

# semmeddb CSV ingestion is currently disabled:
#t = PandasTransformer(t.graph)
#t.parse('data/semmeddb_edges.csv')
#t.parse('data/semmeddb_nodes.csv')

t.graph = clique_merge(t.graph)
t.save('results/clique_merged.json')
from kgx import JsonTransformer, clique_merge
import sys

# First CLI argument: path to the JSON knowledge graph to be clique-merged.
input_path = sys.argv[1]

transformer = JsonTransformer()
transformer.parse(input_path)
transformer.graph = clique_merge(transformer.graph)
transformer.save('clique_merged.json')
from kgx import ObanRdfTransformer, JsonTransformer, HgncRdfTransformer
from collections import Counter

# Ontology preloading is currently disabled.
#o = ObanRdfTransformer()
#o.add_ontology('data/mondo.owl')
#o.add_ontology('data/hp.owl')
#o.add_ontology('data/go.owl')
#o.add_ontology('data/so.owl')
#o.add_ontology('data/ordo.owl')

from rdflib import URIRef

# Convert the HGNC turtle dump to JSON.
t = HgncRdfTransformer()
t.parse('data/hgnc.ttl')
t = JsonTransformer(t)
t.save('hgnc.json')

# Everything below quit() is dead code. NOTE(review): it reads
# `o.ontologies`, but `o` is commented out above, so it would raise a
# NameError if re-enabled as-is.
quit()

t = ObanRdfTransformer()
t.ontologies = o.ontologies
t.parse('data/orphanet.ttl')
t = JsonTransformer(t)
t.save('orphanet.json')

t = ObanRdfTransformer()
t.ontologies = o.ontologies
t.parse('data/hpoa.ttl')
t = JsonTransformer(t)
t.save('hpoa.json')

t = ObanRdfTransformer()
from kgx import ObanRdfTransformer, JsonTransformer, HgncRdfTransformer
from kgx import clique_merge

t = JsonTransformer()
# Per-source inputs, currently disabled:
#t.parse('hgnc.json')
#t.parse('clinvar.json')
#t.parse('omim.json')
#t.parse('hpoa.json')
#t.parse('orphanet.json')

# Combine the semmeddb graph with the previously merged graph ...
t.parse('semmeddb.json')
t.parse('merged.json')
t.save('merged.json')

# ... then collapse equivalent cliques and save the final result.
t.graph = clique_merge(t.graph)
t.save('clique_merged.json')