def build_exact_sets(o,u):
    """Build one equivalence set per identifier returned by ``o.get_ids()``.

    For every id the exact matches reported by ``o.get_exact_matches`` are
    passed through ``Text.upper_curie``; any result starting with ``'ICD'``
    is dropped, and a ``LabeledID`` for the id itself (label taken from
    ``u.get_label``) is added to the set.

    Progress and debugging information is printed to stdout: the number of
    ids up front, a timing line every 100 ids, and the id, its raw matches
    and its label for every id.

    Args:
        o: object providing ``get_ids()`` and ``get_exact_matches(id)``.
        u: object providing ``get_label(id)``.

    Returns:
        A list of sets, one per id, in the order ``o.get_ids()`` yields them.
    """
    sets = []
    mids = o.get_ids()
    total = len(mids)
    print(total)
    start = dt.now()
    for n, mid in enumerate(mids):
        if n % 100 == 0 and n > 0:
            # total_seconds() (truncated to keep the integer output format)
            # instead of .seconds: the latter discards whole days, so the
            # elapsed time and the estimate would wrap around on long runs.
            delt = int((dt.now() - start).total_seconds())
            f = n / total
            print(f'{n}/{total} = {f} in {delt} s')
            print(f' estimated time remaining = {delt * (1-f)/(f)}')
        # FWIW, ICD codes tend to be mapped to multiple MONDO identifiers,
        # leading to mass confusion. So we just excise them here. It's
        # possible that we'll want to revisit this decision in the future.
        # If so, then we probably will want to set a 'glommable' and
        # 'not glommable' set.
        print(mid)
        dbx = [Text.upper_curie(x) for x in o.get_exact_matches(mid)]
        print(dbx)
        dbx = {x for x in dbx if not x.startswith('ICD')}
        label = u.get_label(mid)
        print(label)
        mid = Text.upper_curie(mid)
        dbx.add(LabeledID(mid, label))
        sets.append(dbx)
    return sets
def build_sets(o, ignore_list = ('ICD',)):
    """Build one cross-reference set per identifier returned by ``o.get_ids()``.

    For every id, the xrefs from ``o.get_xrefs`` that do not start with any
    prefix in ``ignore_list`` are passed through ``Text.upper_curie`` and then
    ``norm``. A ``LabeledID`` for the id itself (label from ``o.get_label``)
    is added to the resulting set.

    Args:
        o: object providing ``get_ids()``, ``get_xrefs(id)`` and
            ``get_label(id)``.
        ignore_list: iterable of prefixes; an xref whose raw text (before
            ``Text.upper_curie`` is applied) starts with one of them is
            skipped. Defaults to ignoring ``'ICD'``.

    Returns:
        A list of sets, one per id, in the order ``o.get_ids()`` yields them.
    """
    # str.startswith accepts a tuple of candidates, so build it once instead
    # of folding over the prefixes for every single xref.
    ignored_prefixes = tuple(ignore_list)
    sets = []
    for mid in o.get_ids():
        # FWIW, ICD codes tend to be mapped to multiple MONDO identifiers,
        # leading to mass confusion. So we just excise them here. It's
        # possible that we'll want to revisit this decision in the future.
        # If so, then we probably will want to set a 'glommable' and
        # 'not glommable' set.
        dbx = {
            Text.upper_curie(x)
            for x in o.get_xrefs(mid)
            if not x.startswith(ignored_prefixes)
        }
        dbx = {norm(x) for x in dbx}
        label = o.get_label(mid)
        mid = Text.upper_curie(mid)
        dbx.add(LabeledID(mid, label))
        sets.append(dbx)
    return sets
def gene_to_drug_expanded(self, gene_node):
    """Return (edge, drug_node) pairs for the chemicals linked to a gene.

    Each ``NCBIGENE`` synonym of ``gene_node`` is looked up at the
    ``CTD_chem_gene_expanded_geneID`` endpoint under ``self.url``. Rows
    rejected by ``check_expanded_gene_chemical_row`` are skipped. Within the
    results of a single identifier, only the first row for each
    (chemical id, original predicate label) pair is kept.

    Args:
        gene_node: node providing ``get_synonyms_by_prefix``.

    Returns:
        A list of ``(edge, drug_node)`` tuples.
    """
    results = []
    for ncbi_curie in gene_node.get_synonyms_by_prefix('NCBIGENE'):
        # De-duplication is scoped to one identifier: the set starts empty
        # for every synonym.
        seen_keys = set()
        geneid = Text.un_curie(ncbi_curie)
        url = f"{self.url}CTD_chem_gene_expanded_geneID/ncbigene:{geneid}/"
        rows = requests.get(url).json()
        for row in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(row)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            # Should this be substance?
            drug_node = KNode(
                Text.upper_curie(row['chemicalID']),
                type=node_types.CHEMICAL_SUBSTANCE,
                name=row['chem_label'],
            )
            # '->' means the chemical is the subject of the edge; any other
            # value puts the gene first.
            if row['direction'] == '->':
                source_node, target_node = drug_node, gene_node
            else:
                source_node, target_node = gene_node, drug_node
            edge = self.create_edge(
                source_node,
                target_node,
                'ctd.gene_to_drug_expanded',
                ncbi_curie,
                predicate,
                properties=props,
                url=url,
                publications=pmids,
            )
            # This is what we'd like it to be, but right now there's not
            # enough real specificity on the predicates:
            # key = (drug_node.id, edge.standard_predicate.label)
            key = (drug_node.id, edge.original_predicate.label)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            results.append((edge, drug_node))
    return results
def drug_to_gene_expanded(self, drug):
    """Return (edge, gene_node) pairs for the genes linked to a chemical.

    Each ``MESH`` synonym of ``drug`` is looked up at the
    ``CTD_chem_gene_expanded_chemicalID`` endpoint under ``self.url``. Rows
    rejected by ``check_expanded_gene_chemical_row`` are skipped; every
    remaining row yields one edge (no de-duplication is performed here).

    Args:
        drug: node providing ``get_synonyms_by_prefix``.

    Returns:
        A list of ``(edge, gene_node)`` tuples.
    """
    results = []
    for mesh_curie in drug.get_synonyms_by_prefix('MESH'):
        url = f"{self.url}CTD_chem_gene_expanded_chemicalID/mesh:{Text.un_curie(mesh_curie)}/"
        rows = requests.get(url).json()
        for row in rows:
            good_row, predicate_label, props, pmids = self.check_expanded_gene_chemical_row(row)
            if not good_row:
                continue
            predicate = self.normalize_predicate(
                LabeledID(identifier=f"CTD:{Text.snakify(predicate_label)}", label=predicate_label)
            )
            gene_node = KNode(
                Text.upper_curie(row['geneID']),
                name=row['gene_label'],
                type=node_types.GENE,
            )
            # '->' means the chemical is the subject of the edge; any other
            # value puts the gene first.
            chemical_first = row['direction'] == '->'
            source_node = drug if chemical_first else gene_node
            target_node = gene_node if chemical_first else drug
            edge = self.create_edge(
                source_node,
                target_node,
                'ctd.drug_to_gene_expanded',
                mesh_curie,
                predicate,
                publications=pmids,
                properties=props,
                url=url,
            )
            results.append((edge, gene_node))
    return results
def check_expanded_gene_chemical_row(self, r):
    """Decide whether an expanded chemical-gene row is worth keeping.

    A row is rejected when it has too few publications for its interaction
    type: fewer than three for expression / methylation / molecular
    modification interactions, fewer than two for splicing interactions.
    Every other row is accepted.

    Args:
        r: mapping with the keys ``'interaction'``, ``'taxonID'`` and
            ``'PMID'`` (a ``'|'``-separated string of publication ids).

    Returns:
        A tuple ``(good_row, predicate_label, props, pmids)``: the verdict,
        the interaction text, a properties dict with ``'description'`` and
        ``'taxon'`` entries, and the publication ids after
        ``Text.upper_curie`` has been applied to each.
    """
    props = {"description": r['interaction'], 'taxon': f"taxon:{r['taxonID']}"}
    raw_pmids = r['PMID'].split('|')
    predicate_label = r['interaction']

    # There are lots of garbage microarrays backed by very few papers. They
    # goop the place up, so ignore them.
    # NOTE(review): the original remark said "only one paper", but the
    # threshold applied to this group is fewer than three - confirm intent.
    weak_below_three = (
        'affects expression of',
        'increases expression of',
        'decreases expression of',
        'affects methylation of',
        'increases methylation of',
        'decreases methylation of',
        'affects molecular modification of',
        'increases molecular modification of',
        'decreases molecular modification of',
    )
    weak_below_two = (
        'affects splicing of',
        'increases splicing of',
        'decreases splicing of',
    )

    publication_count = len(raw_pmids)
    too_thin = (
        (publication_count < 3 and predicate_label in weak_below_three)
        or (publication_count < 2 and predicate_label in weak_below_two)
    )
    good_row = not too_thin

    pmids = [Text.upper_curie(p) for p in raw_pmids]
    return good_row, predicate_label, props, pmids