def test_generate_set(rosetta):
    """Split a drug-to-disease query into a left/right pair and compile both.

    The query has six transitions; pairing at 4 is expected to leave four
    transitions on the left side and two on the right side.
    """
    drug_name = 'test_drug'
    disease_name = 'test_disease'
    start_lookup = KNode(f'{node_types.DRUG_NAME}.{drug_name}',
                         node_types.DRUG_NAME)
    end_lookup = KNode(f'{node_types.DISEASE_NAME}.{disease_name}',
                       node_types.DISEASE_NAME)
    start_ids = ['CTD:123']
    end_ids = ['DOID:123']

    query = UserQuery(start_ids, node_types.DRUG, start_lookup)
    # Intermediate hops between the drug and the disease.
    for hop in (node_types.GENE, node_types.PROCESS, node_types.CELL,
                node_types.ANATOMY, node_types.PHENOTYPE):
        query.add_transition(hop)
    query.add_transition(node_types.DISEASE, end_values=end_ids)
    query.add_end_lookup_node(end_lookup)

    left, right = query.definition.generate_paired_query(4)
    assert len(left.transitions) == 4
    assert len(right.transitions) == 2

    left_set = OneSidedLinearUserQuerySet(left)
    right_set = OneSidedLinearUserQuerySet(right)
    # Emit the right-hand cypher so it appears in the test output.
    print(right_set.generate_cypher()[0])
    assert left_set.compile_query(rosetta)
    assert right_set.compile_query(rosetta)
def test_batch_sequence_variant_to_gene(myvariant):
    """Batch-annotate three variants and check each one's gene and predicate."""
    # (variant curie, expected gene curie, expected predicate label)
    expectations = [
        ('MYVARIANT_HG38:chr11:g.68032291C>G', 'HGNC:7715',
         'is_missense_variant_of'),
        ('MYVARIANT_HG38:chrX:g.32389644G>A', 'HGNC:2928',
         'is_nonsense_variant_of'),
        ('MYVARIANT_HG38:chr17:g.7674894G>A', 'HGNC:11998',
         'is_nonsense_variant_of'),
    ]
    variant_nodes = [
        KNode(curie, type=node_types.SEQUENCE_VARIANT)
        for curie, _, _ in expectations
    ]

    batch_annotations = myvariant.batch_sequence_variant_to_gene(variant_nodes)

    # The batch result is keyed by the variant curie that was sent in.
    for curie, gene_id, predicate_label in expectations:
        relations = batch_annotations[curie]
        gene_ids = [node.id for _, node in relations]
        assert gene_id in gene_ids
        labels = [
            relation.standard_predicate.label for relation, _ in relations
        ]
        assert predicate_label in labels
def test_two_sided_query_compose(rosetta):
    """Assemble a two-sided query by hand from two one-sided queries.

    Mimics what should happen automatically in a two-sided user query and
    checks that each stage compiles.
    """
    drug_name = 'test_drug'
    disease_name = 'test_disease'
    drug_lookup = KNode(f'{node_types.DRUG_NAME}.{drug_name}',
                        node_types.DRUG_NAME)
    disease_lookup = KNode(f'{node_types.DISEASE_NAME}.{disease_name}',
                           node_types.DISEASE_NAME)
    drug_ids = ['CTD:123']
    disease_ids = ['DOID:123']

    # Left side: starts at the drug and walks to anatomy.
    left_query = UserQuery(drug_ids, node_types.DRUG, drug_lookup)
    for hop in (node_types.GENE, node_types.PROCESS, node_types.CELL,
                node_types.ANATOMY):
        left_query.add_transition(hop)

    # Right side: starts at the disease and also ends on anatomy.
    right_query = UserQuery(disease_ids, node_types.DISEASE, disease_lookup)
    for hop in (node_types.PHENOTYPE, node_types.ANATOMY):
        right_query.add_transition(hop)

    # Each one-sided query compiles on its own.
    assert left_query.compile_query(rosetta)
    assert right_query.compile_query(rosetta)

    # The composed two-sided query compiles.
    two_sided = TwoSidedLinearUserQuery(left_query.query, right_query.query)
    assert two_sided.compile_query(rosetta)

    # And so does a set wrapping it.
    two_sided_set = TwoSidedLinearUserQuerySet()
    two_sided_set.add_query(two_sided, rosetta)
    assert two_sided_set.compile_query(rosetta)
Exemple #4
0
def test_flush_nodes_changing_node():
    """Flush one buffered node whose id maps to a different id afterwards.

    Same checks as the non-changing-node test; only the synonym map left on
    the writer after flush_nodes is expected to differ.
    """
    expected_props = {'a': 'some prop'}
    expected_labels = frozenset([
        "chemical_substance", "named_thing", "biological_entity",
        "molecular_entity"
    ])
    node = KNode('MESH:D000096', type=node_types.CHEMICAL_SUBSTANCE)
    node.properties = expected_props

    writer = BufferedWriter(rosetta_mock)
    writer.write_node(node)

    def write_transaction_mock(export_func, nodes, types):
        print(types)
        # The writer must hand over the node-chunk exporter.
        assert export_func == export_node_chunk
        # Our node must be in the flushed batch under its current id.
        assert node.id in nodes
        flushed = nodes[node.id]
        # Properties survive the trip through the buffer.
        assert flushed.properties == expected_props
        # Labels on the node and the batch type set agree and are as expected.
        assert flushed.export_labels == types
        assert types == expected_labels

    session = Mock()
    session.write_transaction = write_transaction_mock
    # Hand the checking callback to the writer and flush.
    writer.flush_nodes(session)

    # The synonym map (used later for edge correction) records the remapping.
    assert 'MESH:D000096' in writer.synonym_map
    assert writer.synonym_map['MESH:D000096'] == 'CHEBI:15347'
Exemple #5
0
def test_flush_nodes_non_normilizable():
    """Flush one buffered node whose curie is expected to stay unchanged.

    The node is expected to be flushed with no export labels and an empty
    type set, and to map to itself in the writer's synonym map.
    """
    expected_props = {'a': 'some prop'}
    node = KNode('SOME:curie', type=node_types.CHEMICAL_SUBSTANCE)
    node.properties = expected_props

    writer = BufferedWriter(rosetta_mock)
    writer.write_node(node)

    def write_transaction_mock(export_func, nodes, types):
        print(types)
        # The writer must hand over the node-chunk exporter.
        assert export_func == export_node_chunk
        # Our node must be in the flushed batch under its current id.
        assert node.id in nodes
        flushed = nodes[node.id]
        # Properties survive the trip through the buffer.
        assert flushed.properties == expected_props
        # Nothing was resolved, so there are no labels and no types.
        assert flushed.export_labels == []
        assert types == frozenset()

    session = Mock()
    session.write_transaction = write_transaction_mock
    # Hand the checking callback to the writer and flush.
    writer.flush_nodes(session)

    # The synonym map (used later for edge correction) maps the id to itself.
    assert 'SOME:curie' in writer.synonym_map
    assert writer.synonym_map['SOME:curie'] == 'SOME:curie'
def a_test_gwascatalog_variant_to_phenotype(gwascatalog, rosetta):
    """Check variant -> disease/phenotype lookups against gwascatalog.

    Turned off for now (hence the a_ prefix) because it relies on gwascatalog
    being precached for CAIDs. An earlier variant of the first check read the
    relations from rosetta.cache under the key
    'gwascatalog.sequence_variant_to_disease_or_phenotypic_feature(CAID:CA248392703)'
    instead of calling the service.
    """
    lookup = gwascatalog.sequence_variant_to_disease_or_phenotypic_feature

    # Variant with a single expected phenotype.
    relations = lookup(
        KNode('CAID:CA248392703', type=node_types.SEQUENCE_VARIANT))
    assert 'EFO:0002690' in [node.id for _, node in relations]
    predicate_labels = {
        relation.standard_predicate.label for relation, _ in relations
    }
    assert 'has_phenotype' in predicate_labels

    # Variant with several hits: check ids, names, publications, properties.
    relations = lookup(
        KNode('CAID:CA16058750', type=node_types.SEQUENCE_VARIANT))
    hit_ids = [node.id for _, node in relations]
    for expected_id in ('EFO:0003898', 'EFO:0001359', 'ORPHANET:1572'):
        assert expected_id in hit_ids
    hit_names = [node.name for _, node in relations]
    for expected_name in ('ankylosing spondylitis',
                          'chronic childhood arthritis'):
        assert expected_name in hit_names
    assert ['PMID:26301688'] in [
        relation.publications for relation, _ in relations
    ]
    edge_properties = [relation.properties for relation, _ in relations]
    assert edge_properties[0]['pvalue'] == 8.0E-11

    # A dbSNP identifier is expected to return nothing.
    results = lookup(
        KNode('DBSNP:rs369602258', type=node_types.SEQUENCE_VARIANT))
    assert len(results) == 0
 def parse_edges(self, provided_by, limit=0):
     """Yield (index, edge) pairs built from the rows of edges.txt.

     :param provided_by: forwarded to create_edge; must be truthy
     :param limit: when non-zero, stop after this many edges were yielded
     :return: generator of (zero-based row index, edge) tuples
     :raises RuntimeError: on first iteration if provided_by is falsy
     """
     if not provided_by:
         raise RuntimeError(
             'Error edge property provided by is not specified')
     edges_path = os.path.join(self.cord_dir, 'edges.txt')
     with open(edges_path) as edges_file:
         rows = csv.DictReader(edges_file, delimiter='\t')
         for seen, row in enumerate(rows, start=1):
             # Every row gets the same fixed predicate.
             predicate = LabeledID(identifier='SEMMEDDB:ASSOCIATED_WITH',
                                   label='related_to')
             source_node = KNode(row['Term1'])
             target_node = KNode(row['Term2'])
             edge_properties = {
                 'num_publications': float(row['Effective_Pubs']),
                 'enrichment_p': float(row['Enrichment_p'])
             }
             edge = self.create_edge(source_node=source_node,
                                     target_node=target_node,
                                     input_id=row['Term1'],
                                     provided_by=provided_by,
                                     predicate=predicate,
                                     publications=[],
                                     properties=edge_properties)
             edge.standard_predicate = predicate
             # The limit is checked only after the edge has been built.
             if limit and seen > limit:
                 break
             yield seen - 1, edge
def synonymize_knowledge_graph(knowledge_graph):
    """Synonymize every node of a knowledge graph in place.

    For each entry under ``knowledge_graph['nodes']`` one KNode is built per
    declared type and run through the default Rosetta synonymizer. The graph
    node then receives the identifiers found (``equivalent_identifiers``) and
    the identifier chosen after synonymization (``id``).

    :param knowledge_graph: mapping whose 'nodes' entry is a list of node
        dicts, each with an 'id' and a 'type' (one type or a list of types)
    :return: tuple of the (mutated) knowledge_graph and a dict mapping each
        original node id to the id it ended up with
    """
    id_mappings = {}
    if 'nodes' not in knowledge_graph:
        logger.warning('Unable to locate nodes in knowledge graph')
        return knowledge_graph, id_mappings

    rosetta = rossetta_setup_default()
    for kg_node in knowledge_graph['nodes']:
        # Drop whatever sits between a '.' and the last ':' after it, keeping
        # a single ':' between the remaining parts.
        curie = ':'.join(re.split(r'\..*:', kg_node['id']))

        # Build one single-typed KNode per declared type.
        declared = kg_node['type']
        declared_types = declared if isinstance(declared, list) else [declared]
        typed_nodes = [
            KNode(id=curie, type=node_type) for node_type in declared_types
        ]

        id_picks = [curie]
        for typed_node in typed_nodes:
            rosetta.synonymizer.synonymize(typed_node)
            # Track ids that differ from the ones already seen; the most
            # recently added one wins below.
            if typed_node.id not in id_picks:
                id_picks.append(typed_node.id)
            equivalents = kg_node.setdefault('equivalent_identifiers', [])
            # Built as a list first so the membership test sees the
            # identifiers present before this node's synonyms are added.
            equivalents.extend([
                synonym[0] for synonym in typed_node.synonyms
                if synonym[0] not in equivalents
            ])

        final_id = id_picks[-1]
        id_mappings[kg_node['id']] = final_id
        kg_node['id'] = final_id
    return knowledge_graph, id_mappings
Exemple #9
0
def test_list_returns_zero(omnicorpus):
    """Two nodes should give exactly one result entry, and it should be empty."""
    anatomy_node = KNode('UBERON:0013694', type=node_types.ANATOMICAL_ENTITY)
    process_node = KNode('GO:0045892', type=node_types.BIOLOGICAL_PROCESS)
    results = omnicorpus.get_all_shared_pmids([anatomy_node, process_node])
    assert len(results) == 1
    only_entry = list(results.values())[0]
    assert len(only_entry) == 0
    def get_edges_from_file(self, file_name, provided_by, delimiter):
        """
        Yield one edge per row of a delimited edge file.

        Stop-gap until kgx can merge edges: this follows the pattern of a
        robokop service and leaves the rest to the writer.
        :param file_name: path of the edge file; a falsy value yields nothing
        :param provided_by: forwarded to create_edge as provided_by
        :param delimiter: column delimiter handed to csv.DictReader
        :return: generator of edges
        """
        if not file_name:
            return

        # NOTE(review): only the disabled resolve_curie lookup used this
        # object; the construction is kept so nothing changes at runtime.
        bl_resolver = BL_lookup()
        with open(file_name) as edge_file:
            for row in csv.DictReader(edge_file, delimiter=delimiter):
                # The label is whatever follows the last ':' in edge_label.
                label = row['edge_label'].rsplit(':', 1)[-1]
                # The raw relation is used as the identifier; resolving it
                # via bl_resolver.resolve_curie(label) is currently disabled.
                predicate = LabeledID(identifier=row['relation'], label=label)
                subject_node = KNode(row['subject'])
                object_node = KNode(row['object'])
                edge = self.create_edge(
                    source_node=subject_node,
                    target_node=object_node,
                    input_id=subject_node.id,
                    provided_by=provided_by,
                    predicate=predicate,
                )
                edge.standard_predicate = predicate
                yield edge
Exemple #11
0
    def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
        """Look up genes for the given variants and write the nodes and edges.

        :param variant_nodes: variant nodes handed to the genetics service
        :param writer: receives every node and edge through write_node / write_edge
        """
        hits_by_variant = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
        for variant_id, hits in hits_by_variant.items():
            # Turn the simple edges and nodes into graph objects and write them.
            for simple_edge, simple_node in hits:
                gene_node = KNode(simple_node.id,
                                  type=simple_node.type,
                                  name=simple_node.name,
                                  properties=simple_node.properties)
                if self.recreate_sv_node:
                    sv_node = KNode(variant_id, type=node_types.SEQUENCE_VARIANT)
                    sv_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                    writer.write_node(sv_node)
                # Each gene node is written once per instance.
                if gene_node.id not in self.written_genes:
                    writer.write_node(gene_node)
                    self.written_genes.add(gene_node.id)

                predicate = LabeledID(identifier=simple_edge.predicate_id,
                                      label=simple_edge.predicate_label)
                # Only original_predicate is set here; standard_predicate is
                # deliberately not passed.
                writer.write_edge(
                    KEdge(source_id=variant_id,
                          target_id=gene_node.id,
                          provided_by=simple_edge.provided_by,
                          ctime=simple_edge.ctime,
                          original_predicate=predicate,
                          input_id=simple_edge.input_id,
                          properties=simple_edge.properties))
            logger.info(f'added {len(hits)} variant relationships for {variant_id}')
Exemple #12
0
def test_get_biological_process_by_gene_family(panther):
    """A gene family and one of its subfamilies return the same GO terms."""
    family_nodes = [
        KNode('PTHR11003',
              type=node_types.GENE_FAMILY,
              name='POTASSIUM CHANNEL, SUBFAMILY K'),
        KNode('PTHR11003:SF241',
              type=node_types.GENE_FAMILY,
              name='POTASSIUM CHANNEL, SUBFAMILY K Member 5'),
    ]
    # (what the term is, GO curie)
    expected_terms = [
        ('molecular activity', 'GO:0005261'),
        ('biological process', 'GO:0006811'),
    ]

    for family_node in family_nodes:
        response = panther.get_biological_process_or_activity_by_gene_family(
            family_node)
        node_ids = [relation[1].id for relation in response]
        for _, node in response:
            assert node.type == node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY
        for _, go_id in expected_terms:
            assert go_id in node_ids
Exemple #13
0
 def process_associations(self, r, predicate, target_node_type, reverse=False):
     """Turn a biolink response into a list of (edge, node) pairs.

     Biolink sometimes returns the queried entity as the object instead of the
     subject (for example pathway->genes). Pass reverse=True in that case so
     the subject of each association is returned rather than the object.

     :param r: response mapping; its 'associations' entry is iterated
     :param predicate: handed to every KEdge created
     :param target_node_type: type given to every KNode created
     :param reverse: take the association's subject instead of its object
     :return: list of (KEdge, KNode) tuples, one per association
     """
     # Which end of each association carries the node to return.
     node_key = 'subject' if reverse else 'object'
     edge_nodes = []
     for association in r['associations']:
         pubs = []
         if association.get('publications') is not None:
             # Some entries are not PMIDs (e.g. "uniprotkb"); skip those.
             pubs = [
                 pub['id'] for pub in association['publications']
                 if pub['id'][:4].upper() == 'PMID'
             ]
         endpoint = association[node_key]
         obj = KNode(endpoint['id'], target_node_type, endpoint['label'])
         relation = association['relation']
         props = {
             'publications': pubs,
             'relation': {
                 'typeid': relation['id'],
                 'label': relation['label']
             }
         }
         edge_nodes.append((KEdge('biolink', predicate, props), obj))
     return edge_nodes
 def parse_nodes(self, limit=0):
     """
     Yield (index, node) pairs built from the rows of nodes.txt.
     :param limit: for testing; when non-zero, stop after this many nodes
     :return: generator of (zero-based row index, KNode) tuples
     """
     print('parsing nodes...')
     # Brackets, backslashes and single quotes are stripped from the raw
     # semantic_type column before it is split on commas.
     strip_table = str.maketrans('', '', "[]\\'")
     nodes_path = os.path.join(self.cord_dir, 'nodes.txt')
     with open(nodes_path) as nodes_file:
         rows = csv.DictReader(nodes_file, delimiter='\t')
         for seen, raw_node in enumerate(rows, start=1):
             # Map the file's column headers onto KNode attributes.
             labels = raw_node.get('semantic_type').translate(
                 strip_table).split(',')
             node = KNode({
                 'id': raw_node.get('normalized_curie'),
                 'type': labels[0],
                 'name': raw_node.get('name'),
                 'properties': {
                     'input_term': raw_node.get('input_term')
                 }
             })
             node.add_export_labels(labels)
             # The limit is checked only after the node has been built.
             if limit and seen > limit:
                 break
             yield seen - 1, node
def test_phenotype(rosetta):
    """A MedDRA phenotype picks up more than ten synonyms, at least one HP."""
    phenotype = KNode("MEDDRA:10014408", type=node_types.PHENOTYPIC_FEATURE)
    synonymize(phenotype, rosetta.core)
    assert len(phenotype.synonyms) > 10
    hp_synonyms = phenotype.get_synonyms_by_prefix("HP")
    assert len(hp_synonyms) > 0
    print(hp_synonyms)
def test_hgnc_label(rosetta):
    """Synonymizing an HGNC gene should leave it with a non-empty name."""
    gene = KNode('HGNC:18729', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    # Result is not checked; the lookup is kept as in the original test.
    hgnc_synonyms = gene.get_synonyms_by_prefix('HGNC')
    assert gene.name is not None
    assert gene.name != ''
Exemple #17
0
 def term_get_ancestors(self, node_type, root_iri):
     """Query the triplestore for parent/child rows and build nodes and edges.

     :param node_type: type given to both the parent and the child node
     :param root_iri: passed to the query template as 'root_uri'
     :return: tuple (set of nodes, set of edges); every edge points from the
         child to its ancestor
     """
     rows = self.triplestore.query_template(
         template_text=self.query,
         inputs={'root_uri': root_iri},
         outputs=['parent_id', 'parent_label', 'child_id', 'child_label'])
     print('found total ', len(rows), ' results.')
     nodes, edges = set(), set()
     for row in rows:
         # Both ends get the input type (open question in the original:
         # would the output type be the same as the input type?).
         ancestor_node = KNode(Text.obo_to_curie(row['parent_id']),
                               name=row['parent_label'],
                               type=node_type)
         child_node = KNode(Text.obo_to_curie(row['child_id']),
                            name=row['child_label'],
                            type=node_type)
         # Rows relating a term to itself produce neither nodes nor an edge.
         if ancestor_node.id != child_node.id:
             predicate = LabeledID(identifier='rdfs:subClassOf',
                                   label='subclass of')
             edge = self.create_edge(
                 source_node=child_node,
                 target_node=ancestor_node,
                 predicate=predicate,
                 provided_by='uberongraph.term_get_ancestors',
                 input_id=child_node.id)
             nodes.add(child_node)
             nodes.add(ancestor_node)
             edges.add(edge)
     return nodes, edges
def create_node(session):
    """Delete any node with TEST_ID, then export a fresh disease node for it."""
    # Start clean: remove whatever already carries the test id.
    # NOTE(review): `{id}` is the older Cypher parameter form; newer Neo4j
    # servers may require `$id` - confirm against the target server version.
    session.run("MATCH (a {id:{id}}) DETACH DELETE a", {"id": TEST_ID})
    assert get_node(TEST_ID, session) is None
    disease_node = KNode(TEST_ID, node_types.DISEASE)
    disease_node.add_synonyms(ORIGINAL_SYNONYMS)
    export_node(disease_node, session)
def x_test_event_to_drug(mychem):
    """A phenotype with a MedDRA synonym should return at least one drug."""
    nausea = KNode('HP:0002018',
                   type=node_types.PHENOTYPIC_FEATURE,
                   name='Nausea')
    meddra_synonym = LabeledID(identifier='MedDRA:10028813', label='Nausea')
    nausea.add_synonyms({meddra_synonym})
    drugs = mychem.get_drug_from_adverse_events(nausea)
    assert len(drugs) > 0
Exemple #20
0
def x_test_event_to_drug(mychem):
    """A disease with a MedDRA synonym should return at least one drug."""
    depression = KNode('MONDO:0002050',
                       type=node_types.DISEASE,
                       name='Mental Depression')
    meddra_synonym = LabeledID(identifier='MedDRA:10002855',
                               label='Depression')
    depression.add_synonyms({meddra_synonym})
    drugs = mychem.get_drug_from_adverse_events(depression)
    assert len(drugs) > 0
Exemple #21
0
def test_list_with_bad_curie(omnicorpus):
    """Four nodes, one with a made-up prefix, still give six result entries."""
    nodes = [
        KNode('CL:0000097', type=node_types.CELL),
        KNode('MONDO:0004979', type=node_types.DISEASE),
        KNode('CHEBI:45783', type=node_types.CHEMICAL_SUBSTANCE),
        # The "stinker": a curie with a prefix that does not exist.
        KNode('FAKEO:102830', type=node_types.CHEMICAL_SUBSTANCE),
    ]
    results = omnicorpus.get_all_shared_pmids(nodes)
    assert len(results) == 6
def future_test_disease_normalization(rosetta):
    """A DOID disease should gain MONDO synonyms and end up with a MONDO id."""
    disease = KNode('DOID:4325', type=node_types.DISEASE)
    found_synonyms = synonymize(disease, rosetta.core)
    print(found_synonyms)
    disease.add_synonyms(found_synonyms)
    mondo_synonyms = disease.get_synonyms_by_prefix('MONDO')
    assert len(mondo_synonyms) > 0
    assert Text.get_curie(disease.id) == 'MONDO'
def test2():
    """Build a ChemotextSupport and two chemical nodes; nothing is asserted."""
    from greent.rosetta import Rosetta
    core = Rosetta().core
    support = ChemotextSupport(core)
    from greent.graph_components import KNode
    # Two CTD chemicals; neither node is used after construction.
    node_a = KNode('CTD:1,2-linoleoylphosphatidylcholine',
                   type=node_types.CHEMICAL_SUBSTANCE,
                   name='1,2-linoleoylphosphatidylcholine')
    node_b = KNode('CTD:Hydrogen Peroxide',
                   type=node_types.CHEMICAL_SUBSTANCE,
                   name='Hydrogen Peroxide')
def xxtest_go(rosetta):
    """A synonymized gene should return at least one process or function."""
    gene = KNode("HGNC:10593", label="SCN5A", type=node_types.GENE)
    # Cache read kept from the original test; its result is not used.
    cached_synonyms = rosetta.cache.get('synonymize(HGNC:10593)')
    rosetta.synonymizer.synonymize(gene)
    print(gene.get_synonyms_by_prefix('UNIPROTKB'))
    processes = rosetta.core.biolink.gene_get_process_or_function(gene)
    assert len(processes) > 0
Exemple #25
0
def test_gene_to_drug_synonym(ctd):
    """CTD should work from a synonym when the main id is not one it knows.

    The node's main identifier is a fake DB: curie; the NCBIGene synonym is
    what the lookup is expected to pick up.
    """
    gene = KNode("DB:FakeID", type=node_types.GENE)
    gene.add_synonyms({"NCBIGene:5743"})
    results = ctd.gene_to_drug(gene)
    for _, drug_node in results:
        assert drug_node.type == node_types.DRUG
    drug_ids = [drug_node.id for _, drug_node in results]
    # Cox2 for a cox2 inhibitor
    assert 'MESH:D000068579' in drug_ids
Exemple #26
0
def test_complicated(rosetta):
    """A deeply nested cast expression resolves to an op that returns results."""
    op_name = 'caster.output_filter(input_filter(upcast(hetio~disease_to_phenotype,disease_or_phenotypic_feature),disease,typecheck~is_disease),disease,typecheck~is_disease)'
    op = rosetta.get_ops(op_name)
    assert op is not None
    phenotype = KNode('HP:0007354', type=node_types.PHENOTYPIC_FEATURE)
    als_synonym = LabeledID(identifier='DOID:332', label='ALS')
    phenotype.add_synonyms({als_synonym})
    assert op(phenotype) is not None
def test_smdb_id_normalizer(hmdb):
    """Every SMPDB pathway id returned has a ten-character local part."""
    metabolite = KNode('HMDB:HMDB0112245', type=node_types.CHEMICAL_SUBSTANCE)
    metabolite.add_synonyms(['HMDB:HMDB0112245'])
    for relation in hmdb.metabolite_to_pathway(metabolite):
        pathway_node = relation[1]
        if not pathway_node.id.startswith('SMPDB'):
            continue
        assert len(Text.un_curie(pathway_node.id)) == 10
def test_uniprot(rosetta):
    """UniProtKB:O75381 alone should synonymize to HGNC:8856 (PEX14)."""
    gene = KNode('UniProtKB:O75381', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    hgnc_ids = gene.get_synonyms_by_prefix('HGNC')
    # Exactly one HGNC synonym, which also becomes the node id.
    assert len(hgnc_ids) == 1
    assert hgnc_ids.pop() == 'HGNC:8856'
    assert gene.id == 'HGNC:8856'
    assert gene.name == 'PEX14'
def test_crappy_uniprot(rosetta):
    """UniProtKB:A0A024QZH5 alone should synonymize to HGNC:18859 (SPHK2)."""
    gene = KNode('UniProtKB:A0A024QZH5', type=node_types.GENE)
    rosetta.synonymizer.synonymize(gene)
    hgnc_ids = gene.get_synonyms_by_prefix('HGNC')
    # Exactly one HGNC synonym, which also becomes the node id.
    assert len(hgnc_ids) == 1
    assert hgnc_ids.pop() == 'HGNC:18859'
    assert gene.id == 'HGNC:18859'
    assert gene.name == 'SPHK2'
def test_failing_uniprot_2(rosetta):
    """UniProtKB:P14416 with an empty name should become HGNC:3023 (DRD2)."""
    gene = KNode('UniProtKB:P14416', type=node_types.GENE, name='')
    rosetta.synonymizer.synonymize(gene)
    hgnc_ids = gene.get_synonyms_by_prefix('HGNC')
    # Exactly one HGNC synonym, which also becomes the node id.
    assert len(hgnc_ids) == 1
    assert hgnc_ids.pop() == 'HGNC:3023'
    assert gene.id == 'HGNC:3023'
    assert gene.name == 'DRD2'