def __init__(self, rosetta: Rosetta):
        """Store the Rosetta handle and set up bookkeeping, predicates and helper.

        :param rosetta: Rosetta instance; kept on the object and also handed
            to the GTExUtils helper created at the end.
        """
        self.rosetta = rosetta

        # Bookkeeping containers; both start empty.
        self.written_anatomical_entities = set()
        self.written_genes = set()
        self.max_nodes = 100_000

        # Static labels for the edge predicates.
        # NOTE(review): the first predicate's identifier ends in "_of" while
        # its label ends in "_in" -- confirm that this is intentional.
        self.variant_anatomy_predicate = LabeledID(
            identifier='biolink:affects_expression_of',
            label='affects_expression_in',
        )
        self.gene_anatomy_predicate = LabeledID(
            identifier='biolink:gene_to_expression_site_association',
            label='gene_to_expression_site_association',
        )
        self.variant_gene_sqtl_predicate = LabeledID(
            identifier='biolink:affects_splicing_of',
            label='affects_splicing_of',
        )
        self.increases_expression_predicate = LabeledID(
            identifier='biolink:increases_expression_of',
            label='increases_expression_of',
        )
        self.decreases_expression_predicate = LabeledID(
            identifier='biolink:decreases_expression_of',
            label='decreases_expression_of',
        )

        # Keep a reference to the util class.
        self.gtu = GTExUtils(self.rosetta)
Ejemplo n.º 2
0
 def chemical_get_enzyme(self,chemnode):
     """Return (edge, enzyme node) pairs for the enzymes acting on a chemical.

     The reactions of the chemical are looked up first; every reaction that
     lists enzymes yields one gene node per enzyme. The predicate depends on
     whether one of the chemical's KEGG ids appears among the reaction's
     reactants (degradation) or its products (synthesis). Reactions in which
     the chemical is found on neither side are logged and skipped."""
     reaction_ids = self.chemical_get_reaction(chemnode)
     kegg_ids = {Text.un_curie(syn) for syn in chemnode.get_synonyms_by_prefix('KEGG')}
     found = []
     for reaction_id in reaction_ids:
         for rxn in self.get_reaction(reaction_id):
             if 'enzyme' not in rxn:
                 continue
             for gene_id in rxn['enzyme']:
                 enzyme = KNode(gene_id, type=node_types.GENE)
                 consumed = kegg_ids.intersection(rxn['reactants'])
                 if len(consumed) > 0:
                     # The chemical is a reactant of this reaction.
                     predicate = LabeledID('CTD:increases_degradation_of', label='increases degradation of')
                     input_identifier = consumed.pop()
                 else:
                     produced = kegg_ids.intersection(rxn['products'])
                     if len(produced) == 0:
                         logger.error(f"Mismatch between query and answer: {rxn} {kegg_ids}")
                         continue
                     # The chemical is a product of this reaction.
                     predicate = LabeledID('CTD:increases_synthesis_of', label='increases synthesis of')
                     input_identifier = produced.pop()
                 edge = self.create_edge(enzyme, chemnode, 'kegg.chemical_get_enzyme',  input_identifier, predicate)
                 found.append((edge, enzyme))
     return found
 def synonymize(node, retry=3):
     """Normalize ``node`` in place using the node-normalization service.

     On a 200 reply the node's id, name, synonyms, export labels and type are
     replaced with the values returned for ``node.id``. On any failure the
     node is left untouched and the problem is logged.

     :param node: node to normalize; ``node.id`` is sent as the ``curie`` query
         parameter.
     :param retry: only reported in the log message; no retry is performed
         by this function.
     :return: None
     """
     normalization_url = f'{Synonymizer.NODE_NORMALIZATION_URL}?curie={node.id}'
     response = None
     try:
         with requests.Session() as session:
             response = session.get(normalization_url)
     except Exception:
         # Best effort: the failure is logged and handled below. Unlike a bare
         # ``except`` this does not swallow KeyboardInterrupt/SystemExit.
         logger.error(
             f"Failed to contact {normalization_url} retries left --- {retry}"
         )
     # Compare with None explicitly: a requests Response is falsy for any
     # 4xx/5xx status, which would wrongly be reported as a network error.
     if response is None:
         logger.error(
             f"Synonymization network error -- Failed for {normalization_url}"
         )
         return
     if response.status_code != 200:
         logger.debug(
             f'failed to normalize node {node.id} on {normalization_url}')
         logger.debug(f'{response.content.decode()}')
         return

     payload = response.json().get(node.id)
     if not payload:
         # No (or a null) entry for this curie: nothing to apply.
         logger.debug(
             f'failed to normalize node {node.id} on {normalization_url}')
         return

     main_id = LabeledID(**payload['id'])
     node.id = main_id.identifier
     node.name = main_id.label
     node.add_synonyms(
         LabeledID(**synonym) for synonym in payload['equivalent_identifiers'])
     node.add_export_labels(frozenset(payload['type']))
     if node.name == '':
         # The main id had no label: use the first equivalent id that has one.
         for syns in payload['equivalent_identifiers']:
             if 'label' in syns:
                 node.name = syns['label']
                 break
     node.type = payload['type']
Ejemplo n.º 4
0
def synonymize_with_MONDO(node,gt):
    """Collect the DOID, UMLS and EFO identifiers MONDO knows for ``node.id``.

    :param node: node whose ``id`` is looked up.
    :param gt: object exposing the ``mondo`` lookups (``mondo_get_doid``,
        ``mondo_get_umls`` and ``mondo_get_efo``).
    :return: set of LabeledIDs with empty labels. The node itself is not
        modified; adding the synonyms is left to the caller.
    """
    syns = set()
    syns.update(LabeledID(identifier=x, label="") for x in gt.mondo.mondo_get_doid(node.id))
    syns.update(LabeledID(identifier=x, label="") for x in gt.mondo.mondo_get_umls(node.id))
    syns.update(LabeledID(identifier=x, label="") for x in gt.mondo.mondo_get_efo(node.id))
    return syns
def get_labels_multiple(curie_list, url='https://onto.renci.org/lable'):
    """
    Grab labels and return labeledIDs

    The curies are looked up in chunks of 1000 so the server is not
    overloaded. Results whose label starts with 'obsolete' are dropped.

    :param curie_list: sliceable sequence of curies to look up.
    :param url: currently unused by this function; kept so existing callers
        keep working.
    :return: list of LabeledIDs, one per non-obsolete result.
    """
    req_per_round = 1000
    chunks = [
        curie_list[i:i + req_per_round]
        for i in range(0, len(curie_list), req_per_round)
    ]
    results = []
    # One event loop for all chunks, always closed afterwards; creating a new
    # loop per chunk without closing it leaks the loop's resources.
    loop = asyncio.new_event_loop()
    try:
        for chunk in chunks:
            # each call returns [{id: 'xxxx', label: 'xxxxx'}]
            results += loop.run_until_complete(get_label_async(chunk))
    finally:
        loop.close()

    return [
        LabeledID(identifier=result['id'], label=result['label'])
        for result in results
        if not result['label'].startswith('obsolete')  # filter obsolete results
    ]
Ejemplo n.º 6
0
def build_exact_sets(o,u):
    """Build one set of exact-match identifiers per id returned by ``o``.

    Each set holds the upper-cased exact matches of the id (ICD codes
    removed) plus a LabeledID for the id itself, labelled via ``u``.
    Progress and intermediate values are printed along the way.

    :param o: provides ``get_ids()`` and ``get_exact_matches(id)``.
    :param u: provides ``get_label(id)``.
    :return: list of sets, in the order of ``o.get_ids()``.
    """
    mids = o.get_ids()
    total = len(mids)
    print(total)
    started = dt.now()
    sets = []
    for n, mid in enumerate(mids):
        if n > 0 and n % 100 == 0:
            # Progress report every 100 ids.
            elapsed = (dt.now() - started).seconds
            f = n / total
            print(f'{n}/{total} = {f} in {elapsed} s')
            print(f'  estimated time remaining = {elapsed * (1-f)/(f)}')
        # FWIW, ICD codes tend to be mapped to multiple MONDO identifiers,
        # leading to mass confusion, so they are excised here. If that
        # decision is revisited we probably want a 'glommable' and a
        # 'not glommable' set.
        print(mid)
        exact_matches = [Text.upper_curie(x) for x in o.get_exact_matches(mid)]
        print(exact_matches)
        dbx = {x for x in exact_matches if not x.startswith('ICD')}
        label = u.get_label(mid)
        print(label)
        dbx.add(LabeledID(Text.upper_curie(mid), label))
        sets.append(dbx)
    return sets
Ejemplo n.º 7
0
def get_variants_without_genes_from_graph(rosetta: object) -> list:
    """Return a LabeledID for every sequence variant with no gene neighbour.

    :param rosetta: passed through to ``query_the_graph``.
    :return: list of LabeledIDs; the variant id is used as both identifier
        and label.
    """
    cypher = 'match (s:sequence_variant) where not (s)--(:gene) return distinct s.id'
    rows = query_the_graph(rosetta, cypher)
    # Each row carries the variant id in its first column.
    return [LabeledID(row[0], row[0]) for row in rows]
    def __init__(self, *args, **kwargs):
        """Create a node from a CURIE (or list of CURIEs) and/or other arguments.

        A single positional string is taken as the node id. A single
        positional list whose first element is a string contributes that
        first element as the id. Anything else is forwarded unchanged to the
        parent initializer (presumably it fills attributes from the keyword
        arguments -- confirm against the base class).
        """
        self.id = None
        self.name = None
        self.type = None
        self.original_curie = None
        self.properties = {}

        if len(args) == 1:
            only_arg = args[0]
            if isinstance(only_arg, str):
                self.id = only_arg
                args = []
            # TODO: Currently hack to only utilize the 1st curie in a list if multiple curies provided
            elif isinstance(only_arg, list) and isinstance(only_arg[0], str):
                self.id = only_arg[0]
                args = []

        super().__init__(*args, **kwargs)

        # Another hack to keep things running: a list of names collapses to
        # its first entry.
        if isinstance(self.name, list):
            self.name = self.name[0]

        # IRIs are converted to CURIEs.
        if self.id.startswith('http'):
            self.id = Text.obo_to_curie(self.id)

        # Synonyms is just for CURIEs; it starts with the node's own id.
        self.synonyms = {LabeledID(identifier=self.id, label=self.name)}

        # List of labels to attach to exports
        self.export_labels = []
0
def get_all_variant_ids_from_graph(rosetta: object) -> list:
    """Return a LabeledID for every sequence variant returned by the graph query.

    :param rosetta: passed through to ``query_the_graph``.
    :return: list of LabeledIDs; the variant id is used as both identifier
        and label.
    """
    cypher = 'match (s:sequence_variant) return distinct s.id'
    rows = query_the_graph(rosetta, cypher)
    # Each row carries the variant id in its first column.
    return [LabeledID(row[0], row[0]) for row in rows]
Ejemplo n.º 10
0
def get_gwas_knowledge_variants_from_graph(rosetta: object) -> list:
    """Return a LabeledID for every variant linked to a disease/phenotype by GWAS Catalog.

    Only edges whose ``edge_source`` contains the GWAS Catalog source name
    are considered by the query.

    :param rosetta: passed through to ``query_the_graph``.
    :return: list of LabeledIDs; the variant id is used as both identifier
        and label.
    """
    cypher = 'match (s:sequence_variant)-[x]-(d:disease_or_phenotypic_feature) where "gwascatalog.sequence_variant_to_disease_or_phenotypic_feature" in x.edge_source return distinct s.id'
    rows = query_the_graph(rosetta, cypher)
    # Each row carries the variant id in its first column.
    return [LabeledID(row[0], row[0]) for row in rows]
 def add_synonyms(self, new_synonym_set):
     """Accepts a collection of either String CURIES or LabeledIDs

     ``None`` entries are skipped. Strings are wrapped in a LabeledID with an
     empty label; every other entry is added to ``self.synonyms`` as is.
     """
     # Once we are sure that only LabeledIDs are sent in, this can go back to
     # a plain self.synonyms.update(new_synonym_set).
     for newsyn in new_synonym_set:
         # Identity check: does not depend on how the entries implement
         # comparison against None.
         if newsyn is None:
             continue
         if isinstance(newsyn, str):
             self.synonyms.add(LabeledID(identifier=newsyn, label=""))
         else:
             # Better be a LabeledID
             self.synonyms.add(newsyn)
 def parse_dict_to_knode(nn_dict: dict) -> KNode:
     """Build a KNode from a dictionary with 'id', 'type' and 'equivalent_identifiers'.

     Missing keys fall back to an empty identifier/label, the type
     ``named_thing`` and no equivalent identifiers.

     :param nn_dict: dictionary describing the node.
     :return: the populated KNode.
     """
     id_info = nn_dict.get('id', {})
     type_list = nn_dict.get('type', ['named_thing'])
     node = KNode(
         id=id_info.get('identifier', ''),
         name=id_info.get('label', ''),
         type=type_list[0],  # only the first listed type becomes the node type
     )
     equivalents = {
         LabeledID(**entry)
         for entry in nn_dict.get('equivalent_identifiers', [])
     }
     node.add_synonyms(equivalents)
     # The full type list is attached as export labels.
     node.add_export_labels(nn_dict.get('type', ['named_thing']))
     return node
Ejemplo n.º 13
0
def build_sets(o, ignore_list=('ICD',)):
    """Build one set of normalized xrefs per id returned by ``o``.

    Each set holds the upper-cased, normalized xrefs of the id plus a
    LabeledID for the id itself. Xrefs starting with one of the ignored
    prefixes are dropped.

    FWIW, ICD codes tend to be mapped to multiple MONDO identifiers, leading
    to mass confusion, so they are excised by default. If that decision is
    revisited we probably want a 'glommable' and a 'not glommable' set.

    :param o: provides ``get_ids()``, ``get_xrefs(id)`` and ``get_label(id)``.
    :param ignore_list: iterable of prefixes; an xref is skipped when it
        starts with any of them.
    :return: list of sets, in the order of ``o.get_ids()``.
    """
    # str.startswith accepts a tuple of prefixes (an empty tuple never matches).
    ignored_prefixes = tuple(ignore_list)
    sets = []
    for mid in o.get_ids():
        xrefs = {
            Text.upper_curie(x)
            for x in o.get_xrefs(mid)
            if not x.startswith(ignored_prefixes)
        }
        dbx = {norm(x) for x in xrefs}
        label = o.get_label(mid)
        dbx.add(LabeledID(Text.upper_curie(mid), label))
        sets.append(dbx)
    return sets
Ejemplo n.º 14
0
 def chemical_get_chemical(self,chemnode):
     """Return (edge, chemical node) pairs for chemicals linked through reactions.

     One chemical might be produced from the metabolism of another or it may
     produce another as a metabolite. The reactions of the input chemical are
     looked up first; each reaction that lists at least one enzyme yields the
     chemicals on the opposite side of the reaction. The edge runs from the
     input chemical to the other chemical when the input is a reactant and
     the other way round when it is a product."""
     reaction_ids = self.chemical_get_reaction(chemnode)
     kegg_ids = {Text.un_curie(syn) for syn in chemnode.get_synonyms_by_prefix('KEGG')}
     results = []
     for reaction_id in reaction_ids:
         for rxn in self.get_reaction(reaction_id):
             # Only rxns with enzymes are directional I think.
             if 'enzyme' not in rxn or len(rxn['enzyme']) == 0:
                 continue
             as_reactant = kegg_ids.intersection(rxn['reactants'])
             if len(as_reactant) > 0:
                 forward = True
                 input_identifier = as_reactant.pop()
                 other_chems = rxn['products']
             else:
                 as_product = kegg_ids.intersection(rxn['products'])
                 if len(as_product) == 0:
                     logger.error(f"Mismatch between query and answer: {rxn} {kegg_ids}")
                     continue
                 forward = False
                 input_identifier = as_product.pop()
                 other_chems = rxn['reactants']
             # Both directions use the same predicate; only the edge
             # orientation differs.
             predicate = LabeledID('RO:0001001','derives into')
             for chem in other_chems:
                 output = KNode(f'KEGG:{chem}', type=node_types.METABOLITE)
                 subj, obj = (chemnode, output) if forward else (output, chemnode)
                 edge = self.create_edge(subj, obj, 'kegg.chemical_get_chemical',  input_identifier, predicate)
                 results.append((edge, output))
     return results
Ejemplo n.º 15
0
def synonymize_with_OXO(node,gt):
    """Add OXO synonyms to ``node`` and try to find MONDO ids through DOIDs.

    The OXO synonyms are merged into ``node.synonyms``. If the node then has
    no MONDO identifier, each of its DOIDs is mapped to MONDO ids, which are
    added to the returned set.

    :param node: node being synonymized; ``node.synonyms`` is updated.
    :param gt: passed to the OXO synonymizer; its ``mondo`` member is used
        for the DOID lookup.
    :return: the synonym set from OXO, plus any MONDO ids found via DOIDs.
    """
    synonyms = oxo_synonymizer.synonymize(node,gt)
    node.synonyms.update(synonyms)

    def identifiers_with_prefix(prefix):
        # Identifiers of the node's synonyms whose CURIE prefix matches.
        return {
            s.identifier
            for s in node.synonyms
            if s.identifier.split(':')[0].upper() == prefix
        }

    # Now, if we didn't start with a MONDO id, OXO is not going to give us
    # one. So let's get any doids we have and get a mondo from them.
    if len(identifiers_with_prefix('MONDO')) != 0:
        return synonyms
    for doid in identifiers_with_prefix('DOID'):
        # The lookup yields a list of identifiers and one shared label.
        mids, label = gt.mondo.get_mondo_id_and_label(doid)
        moremondos = [LabeledID(mid, label) for mid in mids]
        if len(moremondos) > 0:
            synonyms.update(moremondos)
    return synonyms
Ejemplo n.º 16
0
def run(id_list, service):
    """Run ``bake_programs`` for every supported triplet that has identifiers.

    :param id_list: mapping from a grouped-type key (something like
        ``gene:gene_or_gene_product:macromolecular ...``) to identifiers.
    :param service: service name used to look up the supported triplets.
    """
    rosy = Rosetta()
    for triplet in get_supported_types(service_name=service, rosetta=rosy):
        # triplet[0] is something like 'gene' or 'disease' (the name attr of
        # the concept graph). This mini 'crawl' only runs for a type that
        # occurs in one of the grouped-type keys.
        matching_keys = [k for k in id_list.keys() if triplet[0] in k]
        if not len(matching_keys):
            # if there is no match continue for others
            continue
        first_key = matching_keys[0]
        identifiers = [LabeledID(identifier=y) for y in id_list[first_key]]
        print(f'running {triplet[0]} --> {triplet[2]}')
        bake_programs(triplet, rosy, identifier_list=identifiers)
Ejemplo n.º 17
0
    def __init__(self, *args, **kwargs):
        """Create a node from a CURIE and/or other arguments.

        A single positional string is taken as the node id. Anything else is
        forwarded unchanged to the parent initializer (presumably it fills
        attributes from the keyword arguments -- confirm against the base
        class).
        """
        self.id = None
        self.name = None
        self.type = None
        self.properties = {}

        if len(args) == 1 and isinstance(args[0], str):
            # The lone string is consumed here and not passed on.
            self.id = args[0]
            args = []

        super().__init__(*args, **kwargs)

        # IRIs are converted to CURIEs.
        if self.id.startswith('http'):
            self.id = Text.obo_to_curie(self.id)

        # Synonyms is just for CURIEs; it starts with the node's own id.
        self.synonyms = {LabeledID(identifier=self.id, label=self.name)}
Ejemplo n.º 18
0
def start(args) :
    """Annotate and synonymize everything of the type named by ``args.annotate``.

    The labeled ids are split into chunks (about twice the pool size) and
    handed to a process pool running ``run_wrapper``.

    :param args: object with an ``annotate`` attribute naming the annotator.
    :raises Exception: when ``args.annotate`` is empty or names no known
        annotator.
    """
    if not args.annotate:
        raise Exception('No argument passed.')

    rosetta = Rosetta()
    if args.annotate not in annotator_class_list:
        raise Exception(f'No annotator found for {args.annotate}')

    print('starting annotation and synonmization')
    results = grab_all(args.annotate, rosetta)
    lids = [LabeledID(x['id'],x['label']) for x in results]

    pool_size = 10
    chunks = pool_size * 2
    # At least one id per chunk, so the slicing step is never zero.
    single_run_size = max(len(lids) // chunks, 1)
    lids_chunks = [
        lids[i: i + single_run_size]
        for i in range(0, len(lids), single_run_size)
    ]
    partial_run = partial(run_wrapper,f'{args.annotate}')

    print('starting processes')
    pool = Pool(processes = pool_size)
    pool.map_async(partial_run, lids_chunks, error_callback = lambda error: print(error))
    # close() + join() wait for the submitted work to finish.
    pool.close()
    pool.join()
    print('done.')
Ejemplo n.º 19
0
def json_2_identifiers(gene_dict):
    """Turn one gene record into the set of LabeledIDs that identify it.

    Every identifier is labelled with the gene symbol. The HGNC id and the
    HGNC symbol are always present; Entrez, UniProtKB/PR, Ensembl and IUPHAR
    identifiers are added when the record carries them.

    :param gene_dict: gene record; 'symbol' and 'hgnc_id' are required.
    :return: set of LabeledIDs.
    """
    symbol = gene_dict['symbol']

    def labeled(curie):
        # All identifiers of a gene share the gene symbol as label.
        return LabeledID(identifier=curie, label=symbol)

    idset = {labeled(gene_dict['hgnc_id']), labeled(f"HGNC.SYMBOL:{symbol}")}

    if 'entrez_id' in gene_dict:
        idset.add(labeled(f"NCBIGENE:{gene_dict['entrez_id']}"))

    # There's a strong debate to be had about whether UniProtKB id's belong
    # with genes or with proteins. In SwissProt, an identifier is meant to be
    # 1:1 with a gene, which makes it a gene here, so UniProtKB stays grouped
    # with them. Individual protein or peptide sequences become gene_products.
    # A PR identifier is generated from each uniprot id as well (PR uses
    # uniprot ids for uniprot things).
    if 'uniprot_ids' in gene_dict:
        for uniprotkbid in gene_dict['uniprot_ids']:
            idset.add(labeled(f"UniProtKB:{uniprotkbid}"))
            idset.add(labeled(f"PR:{uniprotkbid}"))

    if 'ensembl_gene_id' in gene_dict:
        idset.add(labeled(f"ENSEMBL:{gene_dict['ensembl_gene_id']}"))

    if 'iuphar' in gene_dict and gene_dict['iuphar'].startswith('objectId'):
        # The part after the first colon is used as the IUPHAR id.
        gid = gene_dict['iuphar'].split(':')[1]
        idset.add(labeled(f'IUPHAR:{gid}'))

    # Enzyme ids are deliberately not added: enzymes aren't really genes, and
    # the mapping in this file is kind of crappy anyway.
    return idset
def _kegg_compound_lids():
    """Download the KEGG compound list and return one LabeledID per compound.

    Each non-empty line must be ``<identifier>\\t<names>``; only the first of
    the ';'-separated names is kept as label. On a line without a tab the raw
    payload is written to 'kegg-file.xml' and the process exits with status 1.
    """
    content = requests.get(
        'http://rest.kegg.jp/list/compound').content.decode('utf-8')
    kegg_lids = []
    line_counter = 0
    for line in content.split('\n'):
        if not line:
            continue
        if '\t' not in line:
            print(f"expected tab but not found line : {line} ")
            print(f"error parsing line {line_counter}")
            # Keep the payload around so the malformed reply can be inspected.
            with open('kegg-file.xml', 'w') as file:
                file.write(content)
            raise SystemExit(1)
        identifier, label = line.split('\t')
        identifier = identifier.replace('cpd', 'KEGG')
        identifier = identifier.replace('CPD', 'KEGG')
        # maybe pick the first one for kegg,
        label = label.split(';')[0].strip(' ')
        kegg_lids.append(LabeledID(identifier, label))
        line_counter += 1
    return kegg_lids


def get_identifiers(input_type, rosetta):
    """Collect the LabeledIDs of every known entity of ``input_type``.

    Depending on the type the identifiers come from the ontology service,
    HGNC, ChEBI, KEGG, GtoPdb, PANTHER, FooDB or the graph itself. The result
    is pickled through ``pickle_labeled_ids`` before it is returned. For an
    unknown type a message is printed and an empty list is pickled/returned.

    :param input_type: one of the ``node_types`` constants.
    :param rosetta: Rosetta instance giving access to the core services.
    :return: list of LabeledIDs.
    """
    lids = []
    if input_type == node_types.DISEASE:
        identifiers = rosetta.core.mondo.get_ids()
        lids = get_labels_multiple(identifiers)
    elif input_type == node_types.PHENOTYPIC_FEATURE:
        # filtering to avoid things like
        # "C0341110" http://www.orpha.net/ORDO/Orphanet:73247
        identifiers = [
            x for x in requests.get(
                'https://onto.renci.org/descendants/HP:0000118').json()
            if x.startswith('HP:')
        ]
        lids = get_labels_multiple(identifiers)
    elif input_type == node_types.GENETIC_CONDITION:
        # Grabbing the descendants of the genetic-disease roots is faster
        # than testing every MONDO id individually.
        identifiers = []
        GENETIC_DISEASE = ('MONDO:0020573', 'MONDO:0003847')
        for disease in GENETIC_DISEASE:
            identifiers += requests.get(
                f'https://onto.renci.org/descendants/{disease}').json()
        lids = get_labels_multiple(identifiers)
    elif input_type == node_types.ANATOMICAL_ENTITY:
        identifiers = requests.get(
            "https://onto.renci.org/descendants/UBERON:0001062").json()
        # filter out some bad ids
        identifiers = [
            x for x in identifiers
            if x not in bad_idents
            and x.split(':')[0] in ['UBERON', 'CL', 'GO']
        ]
        lids = get_labels_multiple(identifiers)
    elif input_type == node_types.CELL:
        identifiers = [
            x for x in requests.get(
                "https://onto.renci.org/descendants/CL:0000000").json()
            if x not in bad_idents
        ]
        lids = get_labels_multiple(identifiers)
        # NOTE(review): the result is pickled again at the end of the
        # function; this call looks redundant -- confirm before removing.
        pickle_labeled_ids(node_types.CELL, lids)
    elif input_type == node_types.GENE:
        print("Pull genes")
        file_name = get_most_recent_file_name(
            'ftp.ebi.ac.uk',
            '/pub/databases/genenames/hgnc/archive/monthly/json')
        data = pull_via_ftp(
            'ftp.ebi.ac.uk',
            '/pub/databases/genenames/hgnc/archive/monthly/json', file_name)
        hgnc_json = loads(data.decode())
        hgnc_genes = hgnc_json['response']['docs']
        for gene_dict in hgnc_genes:
            symbol = gene_dict['symbol']
            lids.append(
                LabeledID(identifier=gene_dict['hgnc_id'], label=symbol))
        print("OK")
    elif input_type == node_types.CELLULAR_COMPONENT:
        lids = get_pickled_labeled_ids(node_types.CELLULAR_COMPONENT)
        if lids == []:
            print('Pulling cellular compnent descendants')
            # "cell" is a cellular component, and therefore every type of
            # cell is a cellular component. For querying neo4j, this is
            # confusing, so things in CL are left out here.
            identifiers = [
                x for x in requests.get(
                    "https://onto.renci.org/descendants/GO:0005575").json()
                if x not in bad_idents and not x.startswith('CL:')
            ]
            lids = get_labels_multiple(identifiers)
            pickle_labeled_ids(node_types.CELLULAR_COMPONENT, lids)
    elif input_type == node_types.CHEMICAL_SUBSTANCE:
        print('pull chem ids')
        identifiers = requests.get(
            "https://onto.renci.org/descendants/CHEBI:23367").json()
        identifiers = [x for x in identifiers if 'CHEBI' in x]
        print('pull labels...')
        # Asking the label service once per id is the good way to do this,
        # but it's soooooo slow. Instead the labels are read from the ChEBI
        # OBO file.
        chebiobo = pull_via_ftp('ftp.ebi.ac.uk',
                                '/pub/databases/chebi/ontology',
                                'chebi_lite.obo').decode()
        chebi_labels = {}
        tid = None
        for line in chebiobo.split('\n'):
            if line.startswith('[Term]'):
                tid = None
            elif line.startswith('id:'):
                tid = line[3:].strip()
            elif line.startswith('name:') and tid is not None:
                chebi_labels[tid] = line[5:].strip()
        # go for KEGG
        print('pull KEGG')
        lids.extend(_kegg_compound_lids())

        for ident in identifiers:
            try:
                lids.append(LabeledID(ident, chebi_labels[ident]))
            except KeyError:
                # Not in the OBO file: fall back to the label service.
                res = get_label(ident)
                lids.append(LabeledID(ident, res['label']))

        print('pull GTOPDB')
        gtopdb_ligands = requests.get(
            'https://www.guidetopharmacology.org/services/ligands').json()
        n = 0
        for gtopdb_ligand in gtopdb_ligands:
            try:
                lids.append(
                    LabeledID(f"gtpo:{gtopdb_ligand['ligandId']}",
                              gtopdb_ligand['name']))
                n += 1
            except Exception:
                # Best effort: report the malformed record and carry on.
                print(gtopdb_ligand)
        print(n, len(gtopdb_ligands))
    elif input_type == node_types.BIOLOGICAL_PROCESS_OR_ACTIVITY:
        # pull Biological process decendants
        identifiers = requests.get(
            'https://onto.renci.org/descendants/GO:0008150').json()
        # merge with molecular activity descendants
        identifiers += requests.get(
            'https://onto.renci.org/descendants/GO:0003674').json()
        identifiers = [x for x in identifiers if x not in bad_idents]
        lids = get_labels_multiple(identifiers)
    elif input_type == node_types.GENE_FAMILY:
        gene_fam_data = rosetta.core.panther.gene_family_data
        for key in gene_fam_data:
            name = gene_fam_data[key]['family_name']
            name = f'{name} ({key})' if 'NOT NAMED' in name else name
            lids.append(LabeledID(f'PANTHER.FAMILY:{key}', name))
            sub_keys = [
                k for k in gene_fam_data[key].keys() if k != 'family_name'
            ]
            for k in sub_keys:
                # NOTE(review): sub-families are labelled with the raw
                # 'sub_family_name'; unlike families, 'NOT NAMED' entries get
                # no key suffix -- confirm whether that is intended.
                lids.append(
                    LabeledID(f'PANTHER.FAMILY:{key}:{k}',
                              gene_fam_data[key][k]['sub_family_name']))
    elif input_type == node_types.FOOD:
        # get the full list of Food ids
        foods = rosetta.core.foodb.load_all_foods('foods.csv')
        lids = [LabeledID(f'FOODB:{x[0]}', x[1]) for x in foods]
    elif input_type == node_types.SEQUENCE_VARIANT:
        # grab every variant already in the graph
        lids = get_all_variant_ids_from_graph(rosetta)
    elif input_type == node_types.METABOLITE:
        # since we are calling kegg, treat kegg compounds as metabolites
        print('pull KEGG')
        lids.extend(_kegg_compound_lids())
    else:
        print(f'Not configured for input type: {input_type}')
    pickle_labeled_ids(input_type, lids)
    return lids
 def make_predicate(self, json_node):
     """Return ``(predicate, negated)`` for a JSON node.

     For a negated node no predicate is built and ``(None, True)`` is
     returned. Otherwise the node's relation is used as the label and looked
     up in ``self.predicates`` to get the identifier.
     """
     if not json_node['negated']:
         relation = json_node['relation']
         predicate = LabeledID(identifier=self.predicates[relation], label=relation)
         return predicate, False
     return None, True
 def parse_dict_to_kedge(en_dict: dict) -> LabeledID:
     """Build a LabeledID from ``en_dict`` by passing its items as keyword arguments."""
     return LabeledID(**en_dict)
 def load_attribute(self, key, value):
     """Convert a stored attribute value back into its in-memory form.

     The two predicate attributes are turned into LabeledIDs when they are
     stored as dictionaries and returned unchanged otherwise. Every other
     key is delegated to the parent class.
     """
     if key not in ('original_predicate', 'standard_predicate'):
         return super().load_attribute(key, value)
     if isinstance(value, dict):
         return LabeledID(**value)
     return value