def test_generate_set(rosetta):
    """Split a drug-to-disease query at position 4 and compile both halves.

    A single user query with six transitions is built, then
    ``generate_paired_query(4)`` is asked for a left/right pair. The left half
    must hold four transitions and the right half two, and each half, wrapped
    in a ``OneSidedLinearUserQuerySet``, must compile against ``rosetta``.
    """
    drug_label = 'test_drug'
    disease_label = 'test_disease'
    drug_curies = ['CTD:123']
    disease_curies = ['DOID:123']
    drug_lookup = KNode(f'{node_types.DRUG_NAME}.{drug_label}',
                        node_types.DRUG_NAME)
    disease_lookup = KNode(f'{node_types.DISEASE_NAME}.{disease_label}',
                           node_types.DISEASE_NAME)

    query = UserQuery(drug_curies, node_types.DRUG, drug_lookup)
    intermediate_hops = (
        node_types.GENE,
        node_types.PROCESS,
        node_types.CELL,
        node_types.ANATOMY,
        node_types.PHENOTYPE,
    )
    for hop in intermediate_hops:
        query.add_transition(hop)
    # The final hop pins the query to the known disease identifiers.
    query.add_transition(node_types.DISEASE, end_values=disease_curies)
    query.add_end_lookup_node(disease_lookup)

    left_half, right_half = query.definition.generate_paired_query(4)
    assert len(left_half.transitions) == 4
    assert len(right_half.transitions) == 2

    left_set = OneSidedLinearUserQuerySet(left_half)
    right_set = OneSidedLinearUserQuerySet(right_half)
    print(right_set.generate_cypher()[0])
    assert left_set.compile_query(rosetta)
    assert right_set.compile_query(rosetta)
def test_batch_sequence_variant_to_gene(myvariant):
    """Batch-annotate three variants and check each one's gene and predicate.

    For every variant the batch result must contain the expected gene curie
    among the target nodes and the expected label among the standard
    predicates of the returned relations.
    """
    # (variant curie, gene curie expected in targets, predicate label expected)
    expectations = [
        ('MYVARIANT_HG38:chr11:g.68032291C>G', 'HGNC:7715',
         'is_missense_variant_of'),
        ('MYVARIANT_HG38:chrX:g.32389644G>A', 'HGNC:2928',
         'is_nonsense_variant_of'),
        ('MYVARIANT_HG38:chr17:g.7674894G>A', 'HGNC:11998',
         'is_nonsense_variant_of'),
    ]
    variant_nodes = [
        KNode(variant_curie, type=node_types.SEQUENCE_VARIANT)
        for variant_curie, _, _ in expectations
    ]
    batch_annotations = myvariant.batch_sequence_variant_to_gene(variant_nodes)

    for variant_curie, gene_curie, predicate_label in expectations:
        relations = batch_annotations[variant_curie]
        target_ids = [target.id for _, target in relations]
        assert gene_curie in target_ids
        found_labels = [
            edge.standard_predicate.label for edge, _ in relations
        ]
        assert predicate_label in found_labels
def test_two_sided_query_compose(rosetta):
    """Assemble a two-sided query by hand from two one-sided halves.

    This mimics what a two-sided user query is supposed to do automatically:
    a drug-rooted query and a disease-rooted query are compiled individually,
    joined into a ``TwoSidedLinearUserQuery``, and finally placed into a
    ``TwoSidedLinearUserQuerySet``; every stage must compile.
    """
    drug_label, disease_label = 'test_drug', 'test_disease'
    drug_curies, disease_curies = ['CTD:123'], ['DOID:123']
    drug_lookup = KNode(f'{node_types.DRUG_NAME}.{drug_label}',
                        node_types.DRUG_NAME)
    disease_lookup = KNode(f'{node_types.DISEASE_NAME}.{disease_label}',
                           node_types.DISEASE_NAME)

    # Left half, rooted at the drug.
    queryl = UserQuery(drug_curies, node_types.DRUG, drug_lookup)
    for hop in (node_types.GENE, node_types.PROCESS, node_types.CELL,
                node_types.ANATOMY):
        queryl.add_transition(hop)

    # Right half, rooted at the disease and ending on the same node type.
    queryr = UserQuery(disease_curies, node_types.DISEASE, disease_lookup)
    for hop in (node_types.PHENOTYPE, node_types.ANATOMY):
        queryr.add_transition(hop)

    # Each half has to compile on its own ...
    assert queryl.compile_query(rosetta)
    assert queryr.compile_query(rosetta)

    # ... and so does the pair joined together ...
    two_sided = TwoSidedLinearUserQuery(queryl.query, queryr.query)
    assert two_sided.compile_query(rosetta)

    # ... and the set that holds the joined pair.
    two_sided_set = TwoSidedLinearUserQuerySet()
    two_sided_set.add_query(two_sided, rosetta)
    assert two_sided_set.compile_query(rosetta)
def test_flush_nodes_changing_node():
    """Flush a buffered node whose identifier changes during flushing.

    The node is written under ``MESH:D000096``; after ``flush_nodes`` the
    writer's synonym map must send that curie to ``CHEBI:15347``. The fake
    ``write_transaction`` checks what the writer hands to the session.
    """
    expected_props = {'a': 'some prop'}
    node = KNode('MESH:D000096', type=node_types.CHEMICAL_SUBSTANCE)
    node.properties = expected_props
    writer = BufferedWriter(rosetta_mock)
    writer.write_node(node)

    def write_transaction_mock(export_func, nodes, types):
        print(types)
        expected_types = frozenset({
            "chemical_substance", "named_thing", "biological_entity",
            "molecular_entity"
        })
        # The writer must route nodes through the node-chunk exporter.
        assert export_func == export_node_chunk
        # Our node has to be part of the chunk, with its properties intact.
        assert node.id in nodes
        flushed = nodes[node.id]
        assert flushed.properties == expected_props
        # Export labels and the chunk's type key must both be the full set.
        assert flushed.export_labels == types == expected_types

    session = Mock()
    session.write_transaction = write_transaction_mock
    # Hand the checking stand-in to the writer and trigger the flush.
    writer.flush_nodes(session)

    # The synonym map is what later edge correction relies on.
    assert 'MESH:D000096' in writer.synonym_map
    assert writer.synonym_map['MESH:D000096'] == 'CHEBI:15347'
def test_flush_nodes_non_normilizable():
    """Flush a buffered node whose curie does not get normalized.

    ``SOME:curie`` is expected to come out with no export labels, to be
    flushed under an empty type set, and to map to itself in the writer's
    synonym map.
    """
    expected_props = {'a': 'some prop'}
    node = KNode('SOME:curie', type=node_types.CHEMICAL_SUBSTANCE)
    node.properties = expected_props
    writer = BufferedWriter(rosetta_mock)
    writer.write_node(node)

    def write_transaction_mock(export_func, nodes, types):
        print(types)
        # The writer must route nodes through the node-chunk exporter.
        assert export_func == export_node_chunk
        # Our node has to be part of the chunk, with its properties intact.
        assert node.id in nodes
        flushed = nodes[node.id]
        assert flushed.properties == expected_props
        # No labels on the node and an empty type key for the chunk.
        assert flushed.export_labels == []
        assert types == frozenset()

    session = Mock()
    session.write_transaction = write_transaction_mock
    # Hand the checking stand-in to the writer and trigger the flush.
    writer.flush_nodes(session)

    # The synonym map is what later edge correction relies on.
    assert 'SOME:curie' in writer.synonym_map
    assert writer.synonym_map['SOME:curie'] == 'SOME:curie'
def a_test_gwascatalog_variant_to_phenotype(gwascatalog, rosetta):
    """Query GWAS catalog phenotype associations for three variants.

    Turned off for now (the name lacks the ``test_`` prefix) because it
    relies on gwascatalog being precached for CAIDs. ``rosetta`` is accepted
    but not used by the active checks.
    """
    lookup = gwascatalog.sequence_variant_to_disease_or_phenotypic_feature

    # First variant: a single phenotype with the has_phenotype predicate.
    relations = lookup(
        KNode('CAID:CA248392703', type=node_types.SEQUENCE_VARIANT))
    assert 'EFO:0002690' in [target.id for _, target in relations]
    predicate_labels = {
        edge.standard_predicate.label for edge, _ in relations
    }
    assert 'has_phenotype' in predicate_labels

    # Second variant: several phenotypes plus names, publications, p-value.
    relations = lookup(
        KNode('CAID:CA16058750', type=node_types.SEQUENCE_VARIANT))
    target_ids = [target.id for _, target in relations]
    for expected_id in ('EFO:0003898', 'EFO:0001359', 'ORPHANET:1572'):
        assert expected_id in target_ids
    target_names = [target.name for _, target in relations]
    for expected_name in ('ankylosing spondylitis',
                          'chronic childhood arthritis'):
        assert expected_name in target_names
    publication_lists = [edge.publications for edge, _ in relations]
    assert ['PMID:26301688'] in publication_lists
    edge_properties = [edge.properties for edge, _ in relations]
    assert edge_properties[0]['pvalue'] == 8.0e-11

    # Third variant: a dbSNP identifier that should produce nothing.
    results = lookup(
        KNode('DBSNP:rs369602258', type=node_types.SEQUENCE_VARIANT))
    assert len(results) == 0
def parse_edges(self, provided_by, limit=0):
    """Yield ``(index, edge)`` pairs built from the rows of ``edges.txt``.

    The tab-separated file is read from ``self.cord_dir``. Each row becomes
    an edge from ``Term1`` to ``Term2`` with the fixed
    ``SEMMEDDB:ASSOCIATED_WITH`` / ``related_to`` predicate, and the
    ``Effective_Pubs`` and ``Enrichment_p`` columns are stored as float
    properties.

    :param provided_by: passed through to ``create_edge``; must be truthy
    :param limit: when non-zero, stop once this many rows have been yielded
    :return: generator of ``(zero-based row index, edge)`` tuples
    :raises RuntimeError: on first iteration when ``provided_by`` is falsy
    """
    if not provided_by:
        raise RuntimeError(
            'Error edge property provided by is not specified')
    edges_path = os.path.join(self.cord_dir, 'edges.txt')
    with open(edges_path) as edges_file:
        rows = csv.DictReader(edges_file, delimiter='\t')
        for row_index, row in enumerate(rows):
            predicate = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH',
                                  label='related_to')
            source_node = KNode(row['Term1'])
            target_node = KNode(row['Term2'])
            edge_properties = {
                'num_publications': float(row['Effective_Pubs']),
                'enrichment_p': float(row['Enrichment_p'])
            }
            edge = self.create_edge(source_node=source_node,
                                    target_node=target_node,
                                    input_id=row['Term1'],
                                    provided_by=provided_by,
                                    predicate=predicate,
                                    publications=[],
                                    properties=edge_properties)
            edge.standard_predicate = predicate
            # The edge for the row just past the limit is built, then dropped.
            if limit and row_index >= limit:
                break
            yield row_index, edge
def synonymize_knowledge_graph(knowledge_graph):
    """Replace node ids in a knowledge graph with their synonymized ids.

    For every entry in ``knowledge_graph['nodes']`` one ``KNode`` is built per
    declared type and run through the default rosetta synonymizer. The
    identifiers of the resulting synonyms are merged, without duplicates, into
    the entry's ``equivalent_identifiers`` list, and the entry's ``id`` is
    replaced by the last new id the synonymizer produced (or by the cleaned
    original id when nothing changed). The graph is modified in place.

    :param knowledge_graph: dict that is expected to hold a ``'nodes'`` list
        of dicts with ``'id'`` and ``'type'`` (a single type or a list)
    :return: tuple ``(knowledge_graph, id_mappings)`` where ``id_mappings``
        maps each original node id to the id it was replaced with; the
        mapping is empty when the graph has no ``'nodes'`` key
    """
    id_mappings = {}
    if 'nodes' not in knowledge_graph:
        logger.warning('Unable to locate nodes in knowledge graph')
        return knowledge_graph, id_mappings

    rosetta = rossetta_setup_default()
    for kg_node in knowledge_graph['nodes']:
        # Drop whatever sits between the first '.' and the last ':' of the id.
        curie = ':'.join(re.split(r'\..*:', kg_node['id']))

        # Build one single-typed node per declared type.
        declared_type = kg_node['type']
        if isinstance(declared_type, list):
            type_names = declared_type
        else:
            type_names = [declared_type]
        typed_nodes = [
            KNode(id=curie, type=type_name) for type_name in type_names
        ]

        id_picks = [curie]
        for typed_node in typed_nodes:
            rosetta.synonymizer.synonymize(typed_node)
            # Track any id the synonymizer switched the node to.
            if typed_node.id not in id_picks:
                id_picks.append(typed_node.id)
            equivalents = kg_node.setdefault('equivalent_identifiers', [])
            # Check membership against a live set so the same identifier is
            # not appended twice, even within one batch of synonyms.
            already_listed = set(equivalents)
            for synonym in typed_node.synonyms:
                identifier = synonym[0]
                if identifier not in already_listed:
                    already_listed.add(identifier)
                    equivalents.append(identifier)

        final_id = id_picks[-1]
        id_mappings[kg_node['id']] = final_id
        kg_node['id'] = final_id
    return knowledge_graph, id_mappings
def test_list_returns_zero(omnicorpus):
    """A pair of nodes yields a single result entry whose value is empty."""
    anatomy_node = KNode('UBERON:0013694', type=node_types.ANATOMICAL_ENTITY)
    process_node = KNode('GO:0045892', type=node_types.BIOLOGICAL_PROCESS)
    results = omnicorpus.get_all_shared_pmids([anatomy_node, process_node])
    assert len(results) == 1
    only_value = list(results.values())[0]
    assert len(only_value) == 0
def get_edges_from_file(self, file_name, provided_by, delimiter):
    """Yield one edge per row of a delimited edge file.

    Stop-gap until kgx can merge edges: the output is shaped like that of a
    robokop service so that the writer can handle it.

    Each row must provide ``subject``, ``object``, ``relation`` and
    ``edge_label``. The predicate identifier is the ``relation`` value and its
    label is whatever follows the last ``:`` in ``edge_label``.

    :param file_name: path of the edge file; nothing is yielded when falsy
    :param provided_by: passed through to ``create_edge``
    :param delimiter: column separator handed to ``csv.DictReader``
    :return: generator of edges
    """
    if not file_name:
        return
    # Constructed as in the original flow, although the predicate identifier
    # below is taken from the 'relation' column rather than resolved with it.
    bl_resolver = BL_lookup()
    with open(file_name) as edge_file:
        for row in csv.DictReader(edge_file, delimiter=delimiter):
            label = row['edge_label'].split(':')[-1]
            predicate = LabeledID(identifier=row['relation'], label=label)
            subject_node = KNode(row['subject'])
            object_node = KNode(row['object'])
            edge = self.create_edge(
                source_node=subject_node,
                target_node=object_node,
                input_id=subject_node.id,
                provided_by=provided_by,
                predicate=predicate,
            )
            edge.standard_predicate = predicate
            yield edge
def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
    """Fetch variant-to-gene relationships and write them through ``writer``.

    The genetics service is asked for all relationships of ``variant_nodes``
    in one call. For every returned (edge, node) pair the gene node is written
    once (tracked in ``self.written_genes``), the variant node is re-written
    when ``self.recreate_sv_node`` is set, and an edge from the variant to the
    gene is written.

    :param variant_nodes: nodes handed to ``get_variant_to_gene``
    :param writer: receives the ``write_node`` / ``write_edge`` calls
    """
    relationships_by_variant = self.genetics_services.get_variant_to_gene(
        self.crawl_for_service, variant_nodes)
    for source_node_id, results in relationships_by_variant.items():
        # Turn the simple edges and nodes into graph objects and write them.
        for simple_edge, simple_node in results:
            gene_node = KNode(simple_node.id,
                              type=simple_node.type,
                              name=simple_node.name,
                              properties=simple_node.properties)

            if self.recreate_sv_node:
                variant_node = KNode(source_node_id,
                                     type=node_types.SEQUENCE_VARIANT)
                variant_node.add_export_labels(
                    [node_types.SEQUENCE_VARIANT])
                writer.write_node(variant_node)

            if gene_node.id not in self.written_genes:
                writer.write_node(gene_node)
                self.written_genes.add(gene_node.id)

            # Only original_predicate is passed; standard_predicate is not.
            original_predicate = LabeledID(
                identifier=simple_edge.predicate_id,
                label=simple_edge.predicate_label)
            writer.write_edge(
                KEdge(source_id=source_node_id,
                      target_id=gene_node.id,
                      provided_by=simple_edge.provided_by,
                      ctime=simple_edge.ctime,
                      original_predicate=original_predicate,
                      input_id=simple_edge.input_id,
                      properties=simple_edge.properties))
        logger.info(f'added {len(results)} variant relationships for {source_node_id}')
def test_get_biological_process_by_gene_family(panther):
    """Both a top-level family and a sub-family return the expected GO terms.

    Every returned node must be typed as biological process or activity, and
    the results must include one molecular activity term and one biological
    process term.
    """
    top_family_node = KNode('PTHR11003',
                            type=node_types.GENE_FAMILY,
                            name='POTASSIUM CHANNEL, SUBFAMILY K')
    sub_family_node = KNode('PTHR11003:SF241',
                            type=node_types.GENE_FAMILY,
                            name='POTASSIUM CHANNEL, SUBFAMILY K Member 5')

    for family_node in (top_family_node, sub_family_node):
        response = panther.get_biological_process_or_activity_by_gene_family(
            family_node)
        node_ids = [relation[1].id for relation in response]
        for _, go_node in response:
            assert go_node.type == node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY
        # molecular activity
        assert 'GO:0005261' in node_ids
        # biological process
        assert 'GO:0006811' in node_ids
def process_associations(self, r, predicate, target_node_type, reverse=False):
    """Convert a biolink association response into (edge, node) pairs.

    Biolink sometimes (pathway->genes, for instance) returns the queried
    entity as the object rather than the subject. Passing ``reverse=True``
    makes the subject of each association the returned node instead of the
    object.

    :param r: response dict holding an ``'associations'`` list
    :param predicate: predicate handed to every created ``KEdge``
    :param target_node_type: type given to every created ``KNode``
    :param reverse: take the association's subject instead of its object
    :return: list of ``(KEdge, KNode)`` tuples, one per association
    """
    endpoint_key = 'subject' if reverse else 'object'
    edge_nodes = []
    for association in r['associations']:
        listed_pubs = association.get('publications')
        pubs = []
        if listed_pubs is not None:
            # Entries such as "uniprotkb" show up here too; keep PMIDs only.
            pubs = [
                pub['id'] for pub in listed_pubs
                if pub['id'][:4].upper() == 'PMID'
            ]
        endpoint = association[endpoint_key]
        obj = KNode(endpoint['id'], target_node_type, endpoint['label'])
        relation = association['relation']
        props = {
            'publications': pubs,
            'relation': {
                'typeid': relation['id'],
                'label': relation['label']
            }
        }
        edge_nodes.append((KEdge('biolink', predicate, props), obj))
    return edge_nodes
def parse_nodes(self, limit=0):
    """Yield ``(index, node)`` pairs built from the rows of ``nodes.txt``.

    The tab-separated file is read from ``self.cord_dir``. The
    ``semantic_type`` column is stripped of brackets, backslashes and single
    quotes and split on commas; the first piece becomes the node type and all
    pieces are added as export labels.

    :param limit: for testing; when non-zero, stop after this many rows
    :return: generator of ``(zero-based row index, KNode)`` tuples
    """
    print('parsing nodes...')
    # One pass removes every character the chained replaces used to strip.
    strip_list_syntax = str.maketrans('', '', "[]\\'")
    nodes_path = os.path.join(self.cord_dir, 'nodes.txt')
    with open(nodes_path) as nodes_file:
        rows = csv.DictReader(nodes_file, delimiter='\t')
        for row_index, raw_node in enumerate(rows):
            # Turn the column headers into KNode attributes.
            semantic_type = raw_node.get('semantic_type')
            labels = semantic_type.translate(strip_list_syntax).split(',')
            node = KNode({
                'id': raw_node.get('normalized_curie'),
                'type': labels[0],
                'name': raw_node.get('name'),
                'properties': {
                    'input_term': raw_node.get('input_term')
                }
            })
            node.add_export_labels(labels)
            # The node for the row just past the limit is built, then dropped.
            if limit and row_index >= limit:
                break
            yield row_index, node
def test_phenotype(rosetta):
    """A MedDRA phenotype gains more than ten synonyms, some with HP prefix."""
    phenotype = KNode("MEDDRA:10014408", type=node_types.PHENOTYPIC_FEATURE)
    synonymize(phenotype, rosetta.core)
    assert len(phenotype.synonyms) > 10
    hp_synonyms = phenotype.get_synonyms_by_prefix("HP")
    assert len(hp_synonyms) > 0
    print(hp_synonyms)
def test_hgnc_label(rosetta):
    """Synonymizing an HGNC gene node must leave it with a non-empty name."""
    gene = KNode('HGNC:18729', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    # The lookup result is not inspected; the call is kept as in the original.
    gene.get_synonyms_by_prefix('HGNC')
    label = gene.name
    assert label is not None
    assert label != ''
def term_get_ancestors(self, node_type, root_iri):
    """Collect parent/child nodes and subclass edges below ``root_iri``.

    Runs ``self.query`` against the triplestore with ``root_iri`` as the
    ``root_uri`` input. Each result row becomes a child node, a parent node
    (both typed as ``node_type``) and a ``rdfs:subClassOf`` edge from child to
    parent. Rows whose parent and child resolve to the same id are skipped.

    :param node_type: type assigned to every created node
    :param root_iri: value bound to the query's ``root_uri`` input
    :return: tuple ``(nodes, edges)`` of sets
    """
    results = self.triplestore.query_template(
        template_text=self.query,
        inputs={'root_uri': root_iri},
        outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
    print('found total ', len(results), ' results.')

    nodes, edges = set(), set()
    for row in results:
        # Parent and child are given the same type as the input.
        parent = KNode(Text.obo_to_curie(row['parent_id']),
                       name=row['parent_label'],
                       type=node_type)
        child = KNode(Text.obo_to_curie(row['child_id']),
                      name=row['child_label'],
                      type=node_type)
        if parent.id != child.id:  # never link a node to itself
            subclass_of = LabeledID(identifier='rdfs:subClassOf',
                                    label='subclass of')
            edge = self.create_edge(
                source_node=child,
                target_node=parent,
                predicate=subclass_of,
                provided_by='uberongraph.term_get_ancestors',
                input_id=child.id)
            nodes.add(child)
            nodes.add(parent)
            edges.add(edge)
    return nodes, edges
def create_node(session):
    """Delete any node stored under ``TEST_ID`` and export a fresh one.

    After the delete the node must no longer be retrievable; a new disease
    node carrying ``ORIGINAL_SYNONYMS`` is then exported through ``session``.
    """
    # Start from a clean slate.
    # NOTE(review): `{id}` is the legacy Cypher parameter form; newer Neo4j
    # servers expect `$id` - confirm against the targeted server version.
    delete_statement = "MATCH (a {id:{id}}) DETACH DELETE a"
    session.run(delete_statement, {"id": TEST_ID})
    assert get_node(TEST_ID, session) is None

    fresh_node = KNode(TEST_ID, node_types.DISEASE)
    fresh_node.add_synonyms(ORIGINAL_SYNONYMS)
    export_node(fresh_node, session)
def x_test_event_to_drug(mychem):
    """A phenotype with a MedDRA synonym returns drugs from adverse events."""
    nausea = KNode('HP:0002018',
                   type=node_types.PHENOTYPIC_FEATURE,
                   name='Nausea')
    meddra_synonym = LabeledID(identifier='MedDRA:10028813', label='Nausea')
    nausea.add_synonyms({meddra_synonym})
    drugs = mychem.get_drug_from_adverse_events(nausea)
    assert len(drugs) > 0
def x_test_event_to_drug(mychem):
    """A disease with a MedDRA synonym returns drugs from adverse events."""
    depression = KNode('MONDO:0002050',
                       type=node_types.DISEASE,
                       name='Mental Depression')
    meddra_synonym = LabeledID(identifier='MedDRA:10002855',
                               label='Depression')
    depression.add_synonyms({meddra_synonym})
    drugs = mychem.get_drug_from_adverse_events(depression)
    assert len(drugs) > 0
def test_list_with_bad_curie(omnicorpus):
    """Four nodes, one with a made-up prefix, still give six result entries.

    Six is presumably one entry per unordered pair of the four nodes.
    """
    cell = KNode('CL:0000097', type=node_types.CELL)
    disease = KNode('MONDO:0004979', type=node_types.DISEASE)
    drug = KNode('CHEBI:45783', type=node_types.CHEMICAL_SUBSTANCE)
    # The prefix of this curie is deliberately bogus.
    stinker = KNode('FAKEO:102830', type=node_types.CHEMICAL_SUBSTANCE)
    results = omnicorpus.get_all_shared_pmids([cell, disease, drug, stinker])
    assert len(results) == 6
def future_test_disease_normalization(rosetta):
    """A DOID disease should pick up MONDO synonyms and a MONDO-prefixed id.

    Not collected yet (the name lacks the ``test_`` prefix).
    """
    disease = KNode('DOID:4325', type=node_types.DISEASE)
    found_synonyms = synonymize(disease, rosetta.core)
    print(found_synonyms)
    disease.add_synonyms(found_synonyms)
    mondo_synonyms = disease.get_synonyms_by_prefix('MONDO')
    assert len(mondo_synonyms) > 0
    id_prefix = Text.get_curie(disease.id)
    assert id_prefix == 'MONDO'
def test2():
    """Construct chemotext support and two CTD chemical nodes.

    Nothing is asserted; the function only checks that the objects can be
    built without raising.
    """
    from greent.rosetta import Rosetta
    from greent.graph_components import KNode

    gt = Rosetta().core
    support = ChemotextSupport(gt)
    chemical_names = ('1,2-linoleoylphosphatidylcholine', 'Hydrogen Peroxide')
    node_a, node_b = (
        KNode(f'CTD:{chemical_name}',
              type=node_types.CHEMICAL_SUBSTANCE,
              name=chemical_name)
        for chemical_name in chemical_names
    )
def xxtest_go(rosetta):
    """A synonymized HGNC gene returns processes or functions from biolink.

    Not collected (the name lacks the ``test_`` prefix).
    """
    # NOTE(review): other call sites pass `name=` rather than `label=` to
    # KNode - confirm this keyword is still accepted.
    gene = KNode("HGNC:10593", label="SCN5A", type=node_types.GENE)
    # The cached value is fetched but not inspected.
    cached_synonyms = rosetta.cache.get('synonymize(HGNC:10593)')
    rosetta.synonymizer.synonymize(gene)
    print(gene.get_synonyms_by_prefix('UNIPROTKB'))
    processes = rosetta.core.biolink.gene_get_process_or_function(gene)
    assert len(processes) > 0
def test_gene_to_drug_synonym(ctd):
    """CTD must find the usable gene id among the synonyms.

    The node's main identifier is a fake one; the NCBIGene synonym is what
    the lookup has to pick up.
    """
    gene = KNode("DB:FakeID", type=node_types.GENE)
    gene.add_synonyms({"NCBIGene:5743"})
    results = ctd.gene_to_drug(gene)
    for _, drug in results:
        assert drug.type == node_types.DRUG
    drug_ids = [drug.id for _, drug in results]
    # Cox2 for a cox2 inhibitor
    assert 'MESH:D000068579' in drug_ids
def test_complicated(rosetta):
    """A deeply nested cast must resolve to an operation and return results."""
    op_name = ('caster.output_filter('
               'input_filter('
               'upcast(hetio~disease_to_phenotype,'
               'disease_or_phenotypic_feature),'
               'disease,typecheck~is_disease),'
               'disease,typecheck~is_disease)')
    operation = rosetta.get_ops(op_name)
    assert operation is not None

    phenotype = KNode('HP:0007354', type=node_types.PHENOTYPIC_FEATURE)
    phenotype.add_synonyms({LabeledID(identifier='DOID:332', label='ALS')})
    outcome = operation(phenotype)
    assert outcome is not None
def test_smdb_id_normalizer(hmdb):
    """SMPDB pathway ids returned for a metabolite have 10-character locals."""
    metabolite = KNode('HMDB:HMDB0112245', type=node_types.CHEMICAL_SUBSTANCE)
    metabolite.add_synonyms(['HMDB:HMDB0112245'])
    for relation in hmdb.metabolite_to_pathway(metabolite):
        pathway_id = relation[1].id
        if not pathway_id.startswith('SMPDB'):
            continue
        assert len(Text.un_curie(pathway_id)) == 10
def test_uniprot(rosetta):
    """A gene known only by UniProtKB:O75381 resolves to HGNC:8856 / PEX14."""
    gene = KNode('UniProtKB:O75381', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    hgnc_synonyms = gene.get_synonyms_by_prefix('HGNC')
    assert len(hgnc_synonyms) == 1
    only_hgnc = hgnc_synonyms.pop()
    assert only_hgnc == 'HGNC:8856'
    assert gene.id == 'HGNC:8856'
    assert gene.name == 'PEX14'
def test_crappy_uniprot(rosetta):
    """UniProtKB:A0A024QZH5 alone must resolve to HGNC:18859 / SPHK2."""
    gene = KNode('UniProtKB:A0A024QZH5', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    hgnc_synonyms = gene.get_synonyms_by_prefix('HGNC')
    assert len(hgnc_synonyms) == 1
    only_hgnc = hgnc_synonyms.pop()
    assert only_hgnc == 'HGNC:18859'
    assert gene.id == 'HGNC:18859'
    assert gene.name == 'SPHK2'
def test_failing_uniprot_2(rosetta):
    """UniProtKB:P14416 with an empty name must resolve to HGNC:3023 / DRD2."""
    gene = KNode('UniProtKB:P14416', type=node_types.GENE, name='')
    rosetta.synonymizer.synonymize(gene)
    hgnc_synonyms = gene.get_synonyms_by_prefix('HGNC')
    assert len(hgnc_synonyms) == 1
    only_hgnc = hgnc_synonyms.pop()
    assert only_hgnc == 'HGNC:3023'
    assert gene.id == 'HGNC:3023'
    assert gene.name == 'DRD2'