def searchAlias(key, propertyID, value):
    '''
    Look an item up through the wikidata "Also known as" (alias) field.

    Arguments:
      key        : text to search for, e.g. "entrez:1234", "uniprot:P1234"
                   or "go:1234"
      propertyID : wikidata property the wanted item must carry
      value      : value expected for that property

    Returns whatever wikidata.search_claim yields for the search hits, or
    an empty string when the search itself returns nothing.
    '''
    hits = wikidata.search_Item(key)
    if not hits:
        return ""
    return wikidata.search_claim(hits, propertyID, value)
def run_MouseProtein(self, MouseProtein):
    '''
    Run the bot for a Mouse Protein item.

    Arguments:
      MouseProtein : MouseProtein item constructed from mygeneinfo.api

    The already created Mouse Protein item is searched for by wikidata
    label (the 'Name' field). If that does not yield an item carrying the
    expected Uniprot ID claim (P352), the "Also known as" (alias) field,
    which has values such as "uniprot:1234", is searched as a backup.

    Raises:
      wikidata.WikidataSearchError  : neither search found the item
      wikidata.WikidataConstructItem: the existing item could not be parsed
    '''
    key = MouseProtein.fieldsdict['Uniprot ID']
    title = MouseProtein.fieldsdict['Name']
    uniprot = 'P352'

    label_res = wikidata.search_Item(title)
    res = label_res
    ID = None
    if res:
        ID = wikidata.search_claim(res, uniprot, key)
    if not ID:
        # Backup search through the alias field. It is attempted even when
        # the label search returned nothing, and search_claim is only
        # called when there is a result to inspect.
        res = wikidata.search_Item('uniprot:' + str(key))
        if res:
            ID = wikidata.search_claim(res, uniprot, key)
    if not ID:
        if label_res:
            message = 'Failed to retreive MouseProtein item with uniprot:{UP} from search result:{RES}'.format(
                RES=res, UP=key)
        else:
            message = 'Failed to search already created MouseProtein Wikidata item with UniprotID={val}'.format(
                val=key)
        raise wikidata.WikidataSearchError(message)
    Item = self.genewikidata.get_item(ID)

    try:
        CurMProtein = wikidata.construct_from_item(Item, WItem.MouseProtein())
        updatedMProtein, _summary, updatedClaims = CurMProtein.updateWith(
            MouseProtein)
    except wikidata.WikidataConstructItem as err:
        # Only parse failures are re-labelled; every other exception
        # propagates untouched, keeping its original traceback.
        message = 'WikidataParseFailure. ErrorCause:{e} '.format(e=err)
        raise wikidata.WikidataConstructItem(message)

    message = self.write(Item, updatedMProtein, updatedClaims)
    self.logger(0, Item.getID(), msg=message)
def run_HumanGene(self, HumanGene):
    '''
    Run the bot for the Human Gene item.

    Arguments:
      HumanGene : HumanGene object constructed from mygeneinfo.api

    The already created Human Gene item is searched for by wikidata label
    (the 'Name' field). If that does not yield an item carrying the
    expected Entrez Gene ID claim (P351), the "Also known as" (alias)
    field, which has values such as "entrez:1234", is searched as a backup.

    Raises:
      wikidata.WikidataSearchError  : neither search found the item
      wikidata.WikidataConstructItem: the existing item could not be parsed
    '''
    key = HumanGene.fieldsdict['Entrez Gene ID']
    title = HumanGene.fieldsdict['Name']
    entrez = 'P351'

    label_res = wikidata.search_Item(title)
    res = label_res
    ID = None
    if res:
        ID = wikidata.search_claim(res, entrez, key)
    if not ID:
        # Backup search through the alias field. It is attempted even when
        # the label search returned nothing, and search_claim is only
        # called when there is a result to inspect.
        res = wikidata.search_Item('entrez:' + str(key))
        if res:
            ID = wikidata.search_claim(res, entrez, key)
    if not ID:
        if label_res:
            message = 'Failed to retreive HumanGene item with entrez:{EZ} from search result:{RES}'.format(
                RES=res, EZ=key)
        else:
            message = 'Failed to search by label already created HumanGene Wikidata item with EntrezID={val}'.format(
                val=key)
        raise wikidata.WikidataSearchError(message)
    Item = self.genewikidata.get_item(ID)

    try:
        CurHGene = wikidata.construct_from_item(Item, WItem.HumanGene())
        updatedHGene, _summary, updatedClaims = CurHGene.updateWith(
            HumanGene)
    except wikidata.WikidataConstructItem as err:
        # Only parse failures are re-labelled; every other exception
        # propagates untouched, keeping its original traceback.
        message = 'WikidataParseFailure. ErrorCause:{e} '.format(e=err)
        raise wikidata.WikidataConstructItem(message)

    message = self.write(Item, updatedHGene, updatedClaims)
    self.logger(0, Item.getID(), msg=message)
def _human_protein_go_item(annotation, go_property):
    '''
    Return the wikidata identifier of the item for one GO annotation,
    creating the item when no existing one can be found.

    Arguments:
      annotation  : one GO entry of the mygeneinfo document; its 'id' and
                    'term' keys are read
      go_property : wikidata property used for the Gene Ontology ID claim

    Lookup order: exact label match on the term, then the alias field,
    then creation of a new item (which is reported via CreatedItemlogger).
    '''
    go_id = annotation['id']
    # The claim value is the id without its first three characters
    # (presumably the "GO:" prefix -- confirm against mygeneinfo output).
    go_value = go_id[3:]

    title = annotation['term']
    if title.find('/') != -1:
        # title has multiple words separated by '/': keep only the run of
        # word/space characters directly in front of the first '/'
        match = re.search(r'([\w ]*)\/.*', title)
        title = match.group(1)

    GID = None
    res = wikidata.search_Item(title)
    if res:
        for hit in res:
            if hit['label'] == title:
                GID = hit['id']
                wikidata.setLabel(GID, go_id)
                wikidata.set_GO_Terms(str(GID), go_value)
                break
    if not GID:
        GID = searchAlias(go_id, go_property, go_value)
    if not GID:
        # Create GO item if it does not exist
        GID = wikidata.create_Item(title)
        # Single-argument form: same output as the former Python 2 print
        # statement, and also valid under Python 3.
        print("created GO item  %s %s" % (go_property, GID))
        wikidata.setLabel(GID, go_id)
        wikidata.addClaim(GID, go_property, go_value, 'Gene Ontology ID')
        CreatedItemlogger(Item=GID,
                          Type='GO TERM',
                          field='Gene Ontology ID',
                          value=go_value,
                          name=str(title))
    return GID


def parse_HumanProtein_json(gene_json, label):
    '''
    Construct the Human Protein item from gene_json.
    The entire fields are specified in WItem.

    Arguments:
      gene_json - mygeneinfo json document for the given gene
      label     - passed through to wikidata.setHumanProtein together with
                  the protein name and Uniprot ID

    Returns the populated HumanProtein item.

    Raises UniProtError when no reviewed Uniprot entry is found.

    Side effects: GO term items and the encoding human gene item are
    created on wikidata when they cannot be found.
    '''
    root = gene_json
    HPItem = HumanProtein()

    entrez = get(root, 'entrezgene')
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    if not uniprot:
        raise UniProtError('Could not find uniprot ID')
    HPItem.setField("Uniprot ID", uniprot)
    HPItem.setField("aliases", 'uniprot:' + str(uniprot))

    # First setup Human protein Item with label = HGNC name and uniprot ID
    name = get(root, 'name')
    wikidata.setHumanProtein(name, label, uniprot)

    # found in taxon wikidata item human=Q5 , protein=Q8054
    # for proteins label = HGNC fullname
    HPItem.setField("Name", name)
    HPItem.setField("description", "human protein")
    HPItem.setField("found in taxon", "Q5")
    HPItem.setField("subclass of", "Q8054")

    # adding refseq id's based on valid accession prefixes
    initial = get(get(root, 'refseq'), 'protein')
    HPItem.setField("RefSeq Protein ID",
                    parse_accession(initial, "RefSeq Protein ID"))
    HPItem.setField("Ensembl Protein ID",
                    get(get(root, 'ensembl'), 'protein'))

    # Wikidata items for GO terms
    GO_ID = 'P686'
    go_fields = {
        'CC': "cell component",
        'MF': "molecular function",
        'BP': "biological process",
    }
    GO_DICT = get(root, 'go')
    if GO_DICT:
        for category in GO_DICT:
            annotations = GO_DICT[category]
            if 'term' in annotations:
                # A single annotation arrives as a bare entry instead of a
                # list; wrap it so both shapes take the same path.
                annotations = [annotations]
            ID = []
            for annotation in annotations:
                item_title = _human_protein_go_item(annotation, GO_ID).title()
                if item_title not in ID:
                    ID.append(item_title)
            if category in go_fields:
                HPItem.setField(go_fields[category], ID)

    # PDB
    pdbs = rcsb.pdbs_for_uniprot(uniprot)
    if not pdbs:
        pdbs = get(root, 'pdb')
    HPItem.setField("PDB", pdbs)

    # For "encoded by" property search for corresponding gene item.
    # If not present, create it and obtain wikidata identifier.
    # search_title = HGNC symbol
    key = get(root, 'symbol')
    entrezID = 'P351'
    ID = None
    res = wikidata.search_Item(key)
    # search for human gene, property = entrez
    if res:
        ID = wikidata.search_claim(res, entrezID, entrez)
    if not ID:
        ID = searchAlias("entrez:" + str(entrez), entrezID, entrez)
    # search result is null or corresponding human gene does not exist
    if not ID:
        # create human gene item
        ID = wikidata.create_Item(key)
        print("created human gene %s" % (entrez,))
        elabel = "entrez:" + str(entrez)
        wikidata.setLabel(ID, elabel)
        # add entrez claim to human gene item
        wikidata.addClaim(ID, entrezID, str(entrez), 'Entrez Gene ID')
        CreatedItemlogger(Item=ID,
                          Type='Human Gene',
                          field='Entrez Gene ID',
                          value=entrez,
                          name=str(key))
    HPItem.setField("encoded by", ID.title())
    return HPItem
def _mouse_protein_go_item(annotation, go_property):
    '''
    Return the wikidata identifier of the item for one GO annotation,
    creating the item when no existing one can be found.

    Arguments:
      annotation  : one GO entry of the mygeneinfo document; its 'id' and
                    'term' keys are read
      go_property : wikidata property used for the Gene Ontology ID claim

    Lookup order: exact label match on the term, then the alias field,
    then creation of a new item (which is reported via CreatedItemlogger).
    '''
    go_id = annotation['id']
    # The claim value is the id without its first three characters
    # (presumably the "GO:" prefix -- confirm against mygeneinfo output).
    go_value = go_id[3:]

    title = annotation['term']
    if title.find('/') != -1:
        # title has multiple words separated by '/': keep only the run of
        # word/space characters directly in front of the first '/'
        match = re.search(r'([\w ]*)\/.*', title)
        title = match.group(1)

    GID = None
    res = wikidata.search_Item(title)
    if res:
        for hit in res:
            if hit['label'] == title:
                GID = hit['id']
                wikidata.setLabel(GID, str(go_id))
                wikidata.set_GO_Terms(str(GID), go_value)
                break
    if not GID:
        GID = searchAlias(go_id, go_property, go_value)
    if not GID:
        # Create GO item if it does not exist
        GID = wikidata.create_Item(title)
        # Single-argument form: same output as the former Python 2 print
        # statement, and also valid under Python 3.
        print("created GO item  %s %s" % (go_property, GID))
        # add created id's for the go terms
        wikidata.setLabel(GID, str(go_id))
        wikidata.addClaim(GID, go_property, go_value, 'Gene Ontology ID')
        CreatedItemlogger(Item=GID,
                          Type='GO TERM',
                          field='Gene Ontology ID',
                          value=go_value,
                          name=str(title))
    return GID


def parse_MouseProtein_json(Homolog_json):
    '''
    Construct the Mouse Protein item from Homolog_json.
    The entire fields are specified in WItem.

    Arguments:
      Homolog_json - mygeneinfo json document for the mouse gene

    Returns the populated MouseProtein item.

    Raises:
      UniProtError                 : no reviewed Uniprot entry is found
      wikidata.WikidataSearchError : the encoding mouse gene item cannot
                                     be found on wikidata

    Side effects: GO term items are created on wikidata when they cannot
    be found.
    '''
    root = Homolog_json
    MPItem = MouseProtein()

    # found in taxon wikidata item mouse=Q83310 , protein=Q8054
    MPItem.setField("Name", get(root, 'name'))
    MPItem.setField("description", "mouse protein")
    MPItem.setField("found in taxon", "Q83310")
    MPItem.setField("subclass of", "Q8054")

    entrez = get(root, 'entrezgene')
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    if not uniprot:
        raise UniProtError('Could not find uniprot ID')
    MPItem.setField("Uniprot ID", uniprot)
    MPItem.setField("aliases", 'uniprot:' + str(uniprot))

    initial = get(get(root, 'refseq'), 'protein')
    MPItem.setField("RefSeq Protein ID",
                    parse_accession(initial, "RefSeq Protein ID"))
    MPItem.setField("Ensembl Protein ID",
                    get(get(root, 'ensembl'), 'protein'))

    # GO TERMS
    # Wikidata items for GO terms
    GO_ID = 'P686'
    go_fields = {
        'CC': "cell component",
        'MF': "molecular function",
        'BP': "biological process",
    }
    GO_DICT = get(root, 'go')
    if GO_DICT:
        for category in GO_DICT:
            annotations = GO_DICT[category]
            if 'term' in annotations:
                # A single annotation arrives as a bare entry instead of a
                # list; wrap it so both shapes take the same path.
                annotations = [annotations]
            ID = []
            for annotation in annotations:
                item_title = _mouse_protein_go_item(annotation, GO_ID).title()
                if item_title not in ID:
                    ID.append(item_title)
            if category in go_fields:
                MPItem.setField(go_fields[category], ID)

    # PDB - CHECK what if proteins do not have pdb Id?
    pdbs = rcsb.pdbs_for_uniprot(uniprot)
    if not pdbs:
        pdbs = get(root, 'pdb')
    MPItem.setField("PDB", pdbs)

    # For "encoded by" property search for corresponding gene item.
    # search_title = gene symbol
    key = get(root, 'symbol')
    entrezID = 'P351'
    ID = None
    res = wikidata.search_Item(key)
    # search for mouse gene, property = entrez
    if res:
        ID = wikidata.search_claim(res, entrezID, entrez)
    if not ID:
        ID = searchAlias("entrez:" + str(entrez), entrezID, entrez)
    # search result is null or corresponding mouse gene does not exist
    if not ID:
        message = "Failed to retreive Mouse Gene wikidata item with entrez:{ez}".format(
            ez=entrez)
        raise wikidata.WikidataSearchError(message)
    MPItem.setField("encoded by", ID.title())
    return MPItem
def parse_MouseGene_json(homolog_json, gene_json):
    '''
    Construct the Mouse Gene item from homolog_json.
    The entire fields are specified in WItem.

    Arguments:
      homolog_json - mygeneinfo json document for the mouse gene
      gene_json    - mygeneinfo json document for the corresponding human
                     gene (used for the "ortholog" field)

    Returns the populated MouseGene item.

    Raises wikidata.WikidataSearchError when the orthologous human gene
    item cannot be found on wikidata.

    Side effects: the encoded mouse protein item is created on wikidata
    when it cannot be found.
    '''
    MGItem = MouseGene()
    root = homolog_json

    MGItem.setField("found in taxon", "Q83310")
    MGItem.setField("subclass of", "Q7187")
    MGItem.setField("description", "mouse gene")

    # for mouse gene label = MGI symbol
    symbol = get(root, 'symbol')
    MGItem.setField("Name", symbol)
    entrez = get(root, 'entrezgene')
    MGItem.setField("Entrez Gene ID", entrez)
    MGItem.setField("aliases", 'entrez:' + str(entrez))
    MGItem.setField("Homologene ID", get(get(root, 'homologene'), 'id'))
    MGItem.setField("gene symbol", symbol)

    ensembl = get(root, 'ensembl')
    MGItem.setField("Ensembl Gene ID", get(ensembl, 'gene'))
    MGItem.setField("Ensembl Transcript ID", get(ensembl, 'transcript'))

    genomic_pos = get(root, 'genomic_pos')
    MGItem.setField("GenLoc_chr", get(genomic_pos, 'chr'))
    MGItem.setField("GenLoc_start", get(genomic_pos, 'start'))
    MGItem.setField("GenLoc_end", get(genomic_pos, 'end'))

    # adding id's based on valid refseq prefixes
    initial = get(get(root, 'refseq'), 'rna')
    MGItem.setField("RefSeq", parse_accession(initial, "RefSeq"))
    initial = get(get(root, 'accession'), 'rna')
    MGItem.setField("RefSeq RNA ID", parse_accession(initial, "RefSeq RNA ID"))

    # encodes -- search for mouse protein, property = uniprot ID
    # most surely mouse protein is present, but still....
    key = get(root, 'name')
    uniprotID = 'P352'
    # NOTE(review): unlike the protein parsers, a missing Uniprot entry is
    # not rejected here, so the lookups below may run with a falsy
    # uniprot -- confirm whether that is intended.
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    ID = None
    res = wikidata.search_Item(key)
    if res:
        ID = wikidata.search_claim(res, uniprotID, uniprot)
    if not ID:
        ID = searchAlias("uniprot:" + str(uniprot), uniprotID, uniprot)
    # search result is null or corresponding mouse protein does not exist
    if not ID:
        # create mouse protein item
        # Single-argument form: same output as the former Python 2 print
        # statement, and also valid under Python 3.
        print("creating mouse protein with uniprot ID %s" % (uniprot,))
        ID = wikidata.create_Item(key)
        ulabel = "uniprot:" + str(uniprot)
        wikidata.setLabel(ID, ulabel)
        # add uniprot claim to mouse protein item
        wikidata.addClaim(ID, uniprotID, uniprot, 'Uniprot ID')
        # Log the actual Uniprot ID rather than the literal text 'uniprot'.
        CreatedItemlogger(Item=ID,
                          Type='Mouse Protein',
                          field='Uniprot',
                          value=uniprot,
                          name=str(key))
    MGItem.setField("encodes", ID.title())

    # ortholog -- search for human gene, property = entrez ID
    key = get(gene_json, 'symbol')
    entrezID = 'P351'
    human_entrez = get(gene_json, 'entrezgene')
    ID = None
    res = wikidata.search_Item(key)
    if res:
        ID = wikidata.search_claim(res, entrezID, human_entrez)
    if not ID:
        ID = searchAlias("entrez:" + str(human_entrez), entrezID,
                         human_entrez)
    # search result is null or corresponding human gene does not exist
    if not ID:
        message = "Failed to retreive Human Gene wikidata item with entrez:{ez}".format(
            ez=human_entrez)
        raise wikidata.WikidataSearchError(message)
    MGItem.setField("ortholog", ID.title())
    return MGItem
def parse_HumanGene_json(gene_json, homolog_json):
    '''
    Construct the Human Gene item from gene_json.
    The entire fields are specified in WItem.

    Arguments:
      gene_json    - mygeneinfo json document for the given gene
      homolog_json - mygeneinfo json document for the corresponding mouse
                     gene; when falsy the "ortholog" field is left unset

    Returns the populated HumanGene item.

    Raises wikidata.WikidataSearchError when the encoded human protein
    item cannot be found on wikidata.

    Side effects: the orthologous mouse gene item is created on wikidata
    when it cannot be found.
    '''
    HGItem = HumanGene()
    root = gene_json

    HGItem.setField("found in taxon", "Q5")
    HGItem.setField("description", "human gene")
    HGItem.setField("subclass of", "Q7187")

    # for genes label = HGNC symbol
    symbol = get(root, 'symbol')
    HGItem.setField("Name", symbol)
    entrez = get(root, 'entrezgene')
    HGItem.setField("Entrez Gene ID", entrez)
    HGItem.setField("aliases", 'entrez:' + str(entrez))
    HGItem.setField("Homologene ID", get(get(root, 'homologene'), 'id'))
    HGItem.setField("gene symbol", symbol)

    ensembl = get(root, 'ensembl')
    HGItem.setField("Ensembl Gene ID", get(ensembl, 'gene'))
    HGItem.setField("Ensembl Transcript ID", get(ensembl, 'transcript'))

    genomic_pos = get(root, 'genomic_pos')
    HGItem.setField("GenLoc_chr", get(genomic_pos, 'chr'))
    HGItem.setField("GenLoc_start", get(genomic_pos, 'start'))
    HGItem.setField("GenLoc_end", get(genomic_pos, 'end'))

    # adding id's based on valid refseq prefixes
    initial = get(get(root, 'refseq'), 'rna')
    HGItem.setField("RefSeq", parse_accession(initial, "RefSeq"))
    initial = get(get(root, 'accession'), 'rna')
    HGItem.setField("RefSeq RNA ID", parse_accession(initial, "RefSeq RNA ID"))
    HGItem.setField("HGNC ID", get(root, 'HGNC'))
    HGItem.setField("OMIM ID", get(root, 'MIM'))

    # encodes -- search for human protein, property = uniprot ID
    key = get(root, 'name')
    uniprotID = 'P352'
    uniprot = findReviewedUniprotEntry(get(root, 'uniprot'), entrez)
    ID = None
    res = wikidata.search_Item(key)
    if res:
        ID = wikidata.search_claim(res, uniprotID, uniprot)
    if not ID:
        ID = searchAlias("uniprot:" + str(uniprot), uniprotID, uniprot)
    # search result is null or corresponding human protein does not exist
    if not ID:
        message = "Failed to retreive HumanProteinItem with Uniprot:{up}".format(
            up=uniprot)
        raise wikidata.WikidataSearchError(message)
    # following convention of having capitalised wikidata identifiers
    HGItem.setField("encodes", ID.title())

    if not homolog_json:
        return HGItem

    # ortholog -- search for mouse gene, property = entrez ID
    key = get(homolog_json, 'symbol')
    entrezID = 'P351'
    mouse_entrez = get(homolog_json, 'entrezgene')
    ID = None
    res = wikidata.search_Item(key)
    if res:
        ID = wikidata.search_claim(res, entrezID, mouse_entrez)
    # backup search
    if not ID:
        ID = searchAlias("entrez:" + str(mouse_entrez), entrezID,
                         mouse_entrez)
    # search result is null or corresponding mouse gene does not exist
    if not ID:
        # create mouse gene item
        ID = wikidata.create_Item(key)
        # add entrez claim to mouse gene item
        elabel = "entrez:" + str(mouse_entrez)
        wikidata.setLabel(ID, elabel)
        wikidata.addClaim(ID, entrezID, str(mouse_entrez), 'Entrez Gene ID')
        CreatedItemlogger(Item=ID,
                          Type='Mouse Gene',
                          field='Entrez',
                          value=mouse_entrez,
                          name=str(key))
        # Single-argument form: same output as the former Python 2 print
        # statement, and also valid under Python 3.
        print("created mouse gene item -- with entrez %s" % (mouse_entrez,))
    # following convention of having capitalised wikidata identifiers
    HGItem.setField("ortholog", ID.title())
    return HGItem