def create_ref_sources(self):
    """
    Build the reference sources used when creating statements.

    Always sets ``self.entrez_ref``. ``self.ensembl_ref`` is set only when
    ``self.external_ids`` has an 'Ensembl Gene ID' entry.

    :raises ValueError: if the 'Ensembl Gene ID' entry does not contain exactly one ID
    :return: None
    """
    # the entrez reference is always created
    entrez_source = self.record['entrezgene']['@source']
    self.entrez_ref = make_ref_source(entrez_source, PROPS['Entrez Gene ID'],
                                      self.external_ids['Entrez Gene ID'], login=self.login)

    # the ensembl reference is optional
    if 'Ensembl Gene ID' not in self.external_ids:
        return

    if len(self.external_ids['Ensembl Gene ID']) != 1:
        raise ValueError("more than one ensembl gene ID: {}".format(self.record['entrezgene']))

    # exactly one element at this point, so taking the first one is unambiguous
    ensembl_gene_id = next(iter(self.external_ids['Ensembl Gene ID']))
    ensembl_source = self.record['ensembl']['@source']
    self.ensembl_ref = make_ref_source(ensembl_source, PROPS['Ensembl Gene ID'],
                                       ensembl_gene_id, login=self.login)
def create_ref_sources(self):
    """
    Build the reference sources used when creating statements.

    Always sets ``self.entrez_ref``. ``self.ensembl_ref`` is set from the
    'Reference Ensembl Gene ID' entry when there is one; otherwise from the
    'Ensembl Gene ID' entry, but only if that entry contains exactly one ID.
    In every other case ``self.ensembl_ref`` is left untouched.

    :return: None
    """
    self.entrez_ref = make_ref_source(self.record['entrezgene']['@source'],
                                      PROPS['Entrez Gene ID'],
                                      self.external_ids['Entrez Gene ID'],
                                      login=self.login)

    # pick the ensembl gene ID the reference should point at, if any
    if 'Reference Ensembl Gene ID' in self.external_ids:
        ensembl_gene_id = self.external_ids['Reference Ensembl Gene ID']
    elif 'Ensembl Gene ID' in self.external_ids:
        if len(self.external_ids['Ensembl Gene ID']) != 1:
            # zero or several candidates and no designated reference ID: no ensembl ref
            return
        ensembl_gene_id = list(self.external_ids['Ensembl Gene ID'])[0]
    else:
        return

    self.ensembl_ref = make_ref_source(self.record['ensembl']['@source'],
                                       PROPS['Ensembl Gene ID'],
                                       ensembl_gene_id,
                                       login=self.login)
def create_statements(self):
    """
    Create the statements common to all proteins.

    ID statements, in order: UniProt ID (always), then Saccharomyces Genome Database ID,
    Ensembl Protein IDs and RefSeq Protein IDs for whichever of those keys are present in
    ``self.external_ids``. These are followed by the protein statements: instance of (Q8054),
    found in taxon and encoded by.

    :return: list of wdi_core statements
    """
    entrez_gene = self.external_ids['Entrez Gene ID']
    uniprot_ref = make_ref_source(self.record['uniprot']['@source'], PROPS['UniProt ID'],
                                  self.external_ids['UniProt ID'], login=self.login)
    entrez_ref = make_ref_source(self.record['entrezgene']['@source'], PROPS['Entrez Gene ID'],
                                 self.external_ids['Entrez Gene ID'], login=self.login)

    ############
    # ID statements
    ############
    # uniprot ID is required
    s = [wdi_core.WDString(self.external_ids['UniProt ID'], PROPS['UniProt ID'],
                           references=[uniprot_ref])]

    # single-valued optional ID, referenced to entrez
    sgd_key = 'Saccharomyces Genome Database ID'
    if sgd_key in self.external_ids:
        s.append(wdi_core.WDString(self.external_ids[sgd_key], PROPS[sgd_key],
                                   references=[entrez_ref]))

    # ensembl protein IDs: each statement is referenced to its own ensembl protein ID
    ensembl_key = 'Ensembl Protein ID'
    if ensembl_key in self.external_ids:
        for ensembl_protein_id in self.external_ids[ensembl_key]:
            ensembl_protein_ref = make_ref_source(self.record['ensembl']['@source'],
                                                  PROPS[ensembl_key], ensembl_protein_id,
                                                  login=self.login)
            s.append(wdi_core.WDString(ensembl_protein_id, PROPS[ensembl_key],
                                       references=[ensembl_protein_ref]))

    # refseq protein IDs: each statement gets its own reference built from the entrez gene ID
    refseq_key = 'RefSeq Protein ID'
    if refseq_key in self.external_ids:
        for refseq_protein_id in self.external_ids[refseq_key]:
            refseq_ref = make_ref_source(self.record['refseq']['@source'],
                                         PROPS['Entrez Gene ID'], entrez_gene,
                                         login=self.login)
            s.append(wdi_core.WDString(refseq_protein_id, PROPS[refseq_key],
                                       references=[refseq_ref]))

    ############
    # Protein statements (all referenced to uniprot)
    ############
    # instance of protein
    s.append(wdi_core.WDItemID("Q8054", PROPS['instance of'], references=[uniprot_ref]))
    # found in taxon
    s.append(wdi_core.WDItemID(self.organism_info['wdid'], PROPS['found in taxon'],
                               references=[uniprot_ref]))
    # encoded by
    s.append(wdi_core.WDItemID(self.gene_wdid, PROPS['encoded by'], references=[uniprot_ref]))

    return s
def create_gp_statements(self):
    """
    Create strand orientation, genomic start and genomic end statements.

    Only the first entry of ``record['genomic_pos']['@value']`` is used. Every statement
    carries a chromosome qualifier whose item is looked up in ``self.refseq_qid_chrom``
    by the value of the position's 'chr' field.

    :return: list of three statements: strand orientation, genomic start, genomic end
    """
    position = self.record['genomic_pos']['@value'][0]
    source = self.record['genomic_pos']['@source']
    id_prop = source_ref_id[source['id']]

    # the external ID the reference points at has to be a single string
    assert isinstance(self.external_ids[id_prop], str)
    external_id = self.external_ids[id_prop]
    reference = make_ref_source(source, PROPS[id_prop], external_id, login=self.login)

    # qualifier for the chromosome item; 'chr' is the key into refseq_qid_chrom
    chr_qid = self.refseq_qid_chrom[position['chr']]
    qualifiers = [wdi_core.WDItemID(value=chr_qid, prop_nr=PROPS['chromosome'],
                                    is_qualifier=True)]

    # strand == 1 maps to Q22809680, anything else to Q22809711
    if position['strand'] == 1:
        strand_orientation = 'Q22809680'
    else:
        strand_orientation = 'Q22809711'

    return [
        wdi_core.WDItemID(strand_orientation, PROPS['strand orientation'],
                          references=[reference], qualifiers=qualifiers),
        wdi_core.WDString(str(int(position['start'])), PROPS['genomic start'],
                          references=[reference], qualifiers=qualifiers),
        wdi_core.WDString(str(int(position['end'])), PROPS['genomic end'],
                          references=[reference], qualifiers=qualifiers),
    ]
def make_gene_encodes(self, write=True):
    """
    Add an "encodes" statement, pointing at this protein, to the gene item.

    Any exception raised while building or writing the gene item is printed with its
    traceback and logged as an ERROR via ``wdi_core.WDItemEngine.log``; it is not re-raised.

    :param write: passed through to ``wdi_helpers.try_write``
    :return: None
    """
    uniprot_ref = make_ref_source(self.record['uniprot']['@source'], PROPS['UniProt ID'],
                                  self.external_ids['UniProt ID'], login=self.login)

    try:
        encodes = wdi_core.WDItemID(self.protein_wdid, PROPS['encodes'],
                                    references=[uniprot_ref])
        wd_item_gene = wdi_core.WDItemEngine(
            wd_item_id=self.gene_wdid,
            data=[encodes],
            append_value=[PROPS['encodes']],
            fast_run=fast_run,
            fast_run_base_filter={
                PROPS['Entrez Gene ID']: '',
                PROPS['found in taxon']: self.organism_info['wdid'],
            },
            global_ref_mode="CUSTOM",
            ref_handler=update_retrieved_if_new,
            core_props=core_props,
        )
        wdi_helpers.try_write(wd_item_gene, self.external_ids['UniProt ID'],
                              PROPS['UniProt ID'], self.login, write=write)
    except Exception as e:
        # best effort: report the failure and carry on
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['UniProt ID'], PROPS['UniProt ID'],
                                     None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def make_gene_encodes(self, write=True):
    """
    Add an "encodes" statement, pointing at this protein, to the gene item.

    Any exception raised while building or writing the gene item is printed with its
    traceback and logged as an ERROR via ``wdi_core.WDItemEngine.log``; it is not re-raised.

    :param write: passed through to ``wdi_helpers.try_write``
    :return: None
    """
    uniprot_ref = make_ref_source(self.record['uniprot']['@source'], PROPS['UniProt ID'],
                                  self.external_ids['UniProt ID'], login=self.login)

    try:
        encodes = wdi_core.WDItemID(self.protein_wdid, PROPS['encodes'],
                                    references=[uniprot_ref])
        wd_item_gene = wdi_core.WDItemEngine(wd_item_id=self.gene_wdid,
                                             domain='genes',
                                             data=[encodes],
                                             append_value=[PROPS['encodes']])
        wdi_helpers.try_write(wd_item_gene, self.external_ids['UniProt ID'],
                              PROPS['UniProt ID'], self.login, write=write)
    except Exception as e:
        # best effort: report the failure and carry on
        traceback.print_exception(*sys.exc_info())
        msg = wdi_helpers.format_msg(self.external_ids['UniProt ID'], PROPS['UniProt ID'],
                                     None, str(e), msg_type=type(e))
        wdi_core.WDItemEngine.log("ERROR", msg)
def create_gp_statements_chr(self):
    """
    Create strand orientation, genomic start, genomic end and chromosome statements
    for a gene located on a chromosome.

    The chromosome item is looked up in ``self.chr_num_wdid`` by the position's 'chr' value.
    Genomic start and end carry it as a qualifier; strand orientation has no qualifier.

    :return: list of four statements: strand orientation, genomic start, genomic end, chromosome
    """
    position = self.record['genomic_pos']['@value']
    source = self.record['genomic_pos']['@source']
    id_prop = source_ref_id[source['id']]
    reference = make_ref_source(source, PROPS[id_prop], self.external_ids[id_prop],
                                login=self.login)

    # chromosome qualifier, used on the start/end statements only
    chrom_wdid = self.chr_num_wdid[position['chr']]
    qualifiers = [wdi_core.WDItemID(chrom_wdid, PROPS['chromosome'], is_qualifier=True)]

    # strand == 1 maps to Q22809680, anything else to Q22809711
    if position['strand'] == 1:
        strand_orientation = 'Q22809680'
    else:
        strand_orientation = 'Q22809711'

    return [
        # strand orientation
        wdi_core.WDItemID(strand_orientation, PROPS['strand orientation'],
                          references=[reference]),
        # genomic start and end
        wdi_core.WDString(str(int(position['start'])), PROPS['genomic start'],
                          references=[reference], qualifiers=qualifiers),
        wdi_core.WDString(str(int(position['end'])), PROPS['genomic end'],
                          references=[reference], qualifiers=qualifiers),
        # chromosome
        wdi_core.WDItemID(chrom_wdid, PROPS['chromosome'], references=[reference]),
    ]
def create_statements(self):
    """
    Create the statements for a human gene.

    Starts from the statements produced by ``Gene.create_statements``, then adds the HGNC ID
    and HGNC Gene Symbol statements (when present in ``self.external_ids``) and, when the
    record has a 'genomic_pos' entry, whatever ``self.do_gp_human()`` returns.

    :return: list of wdi_core statements
    """
    # generic gene statements
    s = Gene.create_statements(self)

    entrez_ref = make_ref_source(self.record['entrezgene']['@source'], PROPS['Entrez Gene ID'],
                                 self.external_ids['Entrez Gene ID'], login=self.login)

    # human specific ID statements, referenced to entrez
    s.extend(
        wdi_core.WDString(self.external_ids[key], PROPS[key], references=[entrez_ref])
        for key in ('HGNC ID', 'HGNC Gene Symbol')
        if key in self.external_ids
    )

    # gene position statements; do_gp_human may yield nothing
    position_statements = self.do_gp_human() if 'genomic_pos' in self.record else None
    if position_statements:
        s.extend(position_statements)

    return s
def create_gp_statements(self):
    """
    Create strand orientation, genomic start and genomic end statements without a
    chromosome item.

    Instead of a chromosome item, each statement is qualified with a P2249 string holding
    the position's 'chr' value (per the original comment, the chromosome's RefSeq ID).

    :return: list of three statements: strand orientation, genomic start, genomic end
    """
    position = self.record['genomic_pos']['@value']
    source = self.record['genomic_pos']['@source']
    id_prop = source_ref_id[source['id']]
    reference = make_ref_source(source, PROPS[id_prop], self.external_ids[id_prop],
                                login=self.login)

    # qualifier holding the chromosome REFSEQ ID (not the chromosome item)
    rs_chrom = wdi_core.WDString(value=position['chr'], prop_nr='P2249', is_qualifier=True)

    # strand == 1 maps to Q22809680, anything else to Q22809711
    if position['strand'] == 1:
        strand_orientation = 'Q22809680'
    else:
        strand_orientation = 'Q22809711'

    return [
        wdi_core.WDItemID(strand_orientation, PROPS['strand orientation'],
                          references=[reference], qualifiers=[rs_chrom]),
        wdi_core.WDString(str(int(position['start'])), PROPS['genomic start'],
                          references=[reference], qualifiers=[rs_chrom]),
        wdi_core.WDString(str(int(position['end'])), PROPS['genomic end'],
                          references=[reference], qualifiers=[rs_chrom]),
    ]
def do_gp_human(self):
    """
    Create genomic position, chromosome and strand orientation statements for a human gene,
    qualified with the genomic assembly.

    Statements are always built from ``record['genomic_pos']`` (assembly qualifier Q20966585).
    When ``record['genomic_pos_hg19']`` is present, a second set is built from it (assembly
    qualifier Q21067546). If strand orientation or chromosome is identical in both assemblies,
    a single statement carrying both assembly qualifiers is emitted instead of two.

    Genes that are on an unlocalized scaffold (their 'chr' value is not a key of
    ``self.chr_num_wdid``) get no genomic position statements.
    example: https://mygene.info/v3/gene/102724770
    https://www.wikidata.org/wiki/Q20970159

    The same rule is applied per assembly: if only the hg19 position is on an unknown
    chromosome, the hg19 statements are skipped and the primary-assembly statements are still
    returned (previously this case raised KeyError).

    :return: list of wdi_core statements; empty if the primary position's chromosome is unknown
    """
    genomic_pos_value = self.record['genomic_pos']['@value']
    if genomic_pos_value['chr'] not in self.chr_num_wdid:
        return []

    genomic_pos_source = self.record['genomic_pos']['@source']
    genomic_pos_id_prop = source_ref_id[genomic_pos_source['id']]
    genomic_pos_ref = make_ref_source(genomic_pos_source, PROPS[genomic_pos_id_prop],
                                      self.external_ids[genomic_pos_id_prop], login=self.login)
    assembly = wdi_core.WDItemID("Q20966585", PROPS['genomic assembly'], is_qualifier=True)

    # create qualifier for start/stop
    chrom_wdid = self.chr_num_wdid[genomic_pos_value['chr']]
    qualifiers = [wdi_core.WDItemID(chrom_wdid, PROPS['chromosome'], is_qualifier=True),
                  assembly]
    strand_orientation = 'Q22809680' if genomic_pos_value['strand'] == 1 else 'Q22809711'

    # hg19 is optional, and is only usable when its chromosome is a known one
    do_hg19 = False
    strand_orientation_hg19 = None
    assembly_hg19 = None
    genomic_pos_ref_hg19 = None
    genomic_pos_value_hg19 = None
    qualifiers_hg19 = None
    chrom_wdid_hg19 = None
    if 'genomic_pos_hg19' in self.record:
        genomic_pos_value_hg19 = self.record['genomic_pos_hg19']['@value']
        do_hg19 = genomic_pos_value_hg19['chr'] in self.chr_num_wdid

    if do_hg19:
        genomic_pos_source_hg19 = self.record['genomic_pos_hg19']['@source']
        genomic_pos_id_prop_hg19 = source_ref_id[genomic_pos_source_hg19['id']]
        genomic_pos_ref_hg19 = make_ref_source(genomic_pos_source_hg19,
                                               PROPS[genomic_pos_id_prop_hg19],
                                               self.external_ids[genomic_pos_id_prop_hg19],
                                               login=self.login)
        assembly_hg19 = wdi_core.WDItemID("Q21067546", PROPS['genomic assembly'],
                                          is_qualifier=True)
        chrom_wdid_hg19 = self.chr_num_wdid[genomic_pos_value_hg19['chr']]
        qualifiers_hg19 = [wdi_core.WDItemID(chrom_wdid_hg19, PROPS['chromosome'],
                                             is_qualifier=True),
                           assembly_hg19]
        strand_orientation_hg19 = ('Q22809680' if genomic_pos_value_hg19['strand'] == 1
                                   else 'Q22809711')

    s = []

    # strand orientation
    # if the same for both assemblies, only put one statement
    if do_hg19 and strand_orientation == strand_orientation_hg19:
        s.append(wdi_core.WDItemID(strand_orientation, PROPS['strand orientation'],
                                   references=[genomic_pos_ref],
                                   qualifiers=[assembly, assembly_hg19]))
    else:
        s.append(wdi_core.WDItemID(strand_orientation, PROPS['strand orientation'],
                                   references=[genomic_pos_ref], qualifiers=[assembly]))
        if do_hg19:
            s.append(wdi_core.WDItemID(strand_orientation_hg19, PROPS['strand orientation'],
                                       references=[genomic_pos_ref_hg19],
                                       qualifiers=[assembly_hg19]))

    # genomic start and end for both assemblies
    s.append(wdi_core.WDString(str(int(genomic_pos_value['start'])), PROPS['genomic start'],
                               references=[genomic_pos_ref], qualifiers=qualifiers))
    s.append(wdi_core.WDString(str(int(genomic_pos_value['end'])), PROPS['genomic end'],
                               references=[genomic_pos_ref], qualifiers=qualifiers))
    if do_hg19:
        s.append(wdi_core.WDString(str(int(genomic_pos_value_hg19['start'])),
                                   PROPS['genomic start'],
                                   references=[genomic_pos_ref_hg19],
                                   qualifiers=qualifiers_hg19))
        s.append(wdi_core.WDString(str(int(genomic_pos_value_hg19['end'])),
                                   PROPS['genomic end'],
                                   references=[genomic_pos_ref_hg19],
                                   qualifiers=qualifiers_hg19))

    # chromosome
    # if the same for both assemblies, only put one statement
    if do_hg19 and chrom_wdid == chrom_wdid_hg19:
        s.append(wdi_core.WDItemID(chrom_wdid, PROPS['chromosome'],
                                   references=[genomic_pos_ref],
                                   qualifiers=[assembly, assembly_hg19]))
    else:
        s.append(wdi_core.WDItemID(chrom_wdid, PROPS['chromosome'],
                                   references=[genomic_pos_ref], qualifiers=[assembly]))
        if do_hg19:
            s.append(wdi_core.WDItemID(chrom_wdid_hg19, PROPS['chromosome'],
                                       references=[genomic_pos_ref_hg19],
                                       qualifiers=[assembly_hg19]))

    return s
def create_statements(self):
    """
    Create the statements common to all genes.

    ID statements, in order: Entrez Gene ID (always), Ensembl Gene ID and Ensembl Transcript
    IDs, RefSeq RNA IDs, then a fixed list of organism database IDs, each added only when its
    key is present in ``self.external_ids``. These are followed by the gene statements:
    instance of (Q7187) and found in taxon.

    :return: list of wdi_core statements
    """
    ############
    # ID statements (required)
    ############
    entrez_ref = make_ref_source(self.record['entrezgene']['@source'], PROPS['Entrez Gene ID'],
                                 self.external_ids['Entrez Gene ID'], login=self.login)
    s = [wdi_core.WDString(self.external_ids['Entrez Gene ID'], PROPS['Entrez Gene ID'],
                           references=[entrez_ref])]

    # optional ID statements
    ensembl_ref = None
    if 'Ensembl Gene ID' in self.external_ids:
        ensembl_ref = make_ref_source(self.record['ensembl']['@source'],
                                      PROPS['Ensembl Gene ID'],
                                      self.external_ids['Ensembl Gene ID'], login=self.login)
        s.append(wdi_core.WDString(self.external_ids['Ensembl Gene ID'],
                                   PROPS['Ensembl Gene ID'], references=[ensembl_ref]))

        # no ensembl transcript ID unless ensembl gene is there also
        if 'Ensembl Transcript ID' in self.external_ids:
            s.extend(
                wdi_core.WDString(transcript_id, PROPS['Ensembl Transcript ID'],
                                  references=[ensembl_ref])
                for transcript_id in self.external_ids['Ensembl Transcript ID']
            )

    # multi-valued refseq RNA IDs, referenced to entrez
    if 'RefSeq RNA ID' in self.external_ids:
        s.extend(
            wdi_core.WDString(rna_id, PROPS['RefSeq RNA ID'], references=[entrez_ref])
            for rna_id in self.external_ids['RefSeq RNA ID']
        )

    # single-valued organism database IDs, referenced to entrez
    single_value_keys = (
        'NCBI Locus tag',
        'Saccharomyces Genome Database ID',
        'Mouse Genome Informatics ID',
        'MGI Gene Symbol',
        'HomoloGene ID',
        'Rat Genome Database ID',
        'FlyBase Gene ID',
        'Wormbase Gene ID',
        'ZFIN Gene ID',
    )
    for key in single_value_keys:
        if key not in self.external_ids:
            continue
        s.append(wdi_core.WDString(self.external_ids[key], PROPS[key],
                                   references=[entrez_ref]))

    ############
    # Gene statements
    ############
    # if there is an ensembl ID, this comes from ensembl, otherwise, entrez
    gene_ref = entrez_ref if ensembl_ref is None else ensembl_ref

    # instance of 'gene'
    s.append(wdi_core.WDItemID('Q7187', PROPS['instance of'], references=[gene_ref]))
    # found in taxon
    s.append(wdi_core.WDItemID(self.organism_info['wdid'], PROPS['found in taxon'],
                               references=[gene_ref]))

    return s