def run(login, gene_records, chrom_wdid):
    """
    Hand every gene record with a single genomic position to
    wd_item_construction.

    A record is skipped, with a WARNING logged through
    PBB_Core.WDItemEngine.log, when it has no 'genomic_pos' key or when
    its 'genomic_pos' value is a list.

    NOTE: strain_info is not a parameter of this function; it is read
    from the enclosing (module) scope.

    :param login: forwarded unchanged to wd_item_construction
    :param gene_records: iterable of gene record dicts
    :param chrom_wdid: forwarded unchanged to wd_item_construction
    :return: None
    """
    for gene in tqdm(gene_records):
        if 'genomic_pos' not in gene:
            # see: http://mygene.info/v3/gene/855814
            skip_reason = "no_position"
        elif isinstance(gene['genomic_pos']['@value'], list):
            # see: http://mygene.info/v3/gene/853483
            skip_reason = "multiple_positions"
        else:
            skip_reason = None

        if skip_reason is None:
            wd_item_construction(gene, strain_info, chrom_wdid, login)
        else:
            PBB_Core.WDItemEngine.log(
                "WARNING",
                format_msg(gene['_id']['@value'], skip_reason, '',
                           ENTREZ_PROP))
def run(login, records, add_pubmed):
    """
    Call protein_item for every record whose entrez gene id has a known
    Wikidata item.

    Two lookup tables are fetched once through WDHelper().id_mapper
    before the loop. Records whose entrez gene id is absent from the
    gene table are logged as ERROR ("gene_not_found") and skipped.

    NOTE: strain_info is not a parameter of this function; it is read
    from the enclosing (module) scope.

    :param login: forwarded unchanged to protein_item
    :param records: iterable of record dicts
    :param add_pubmed: forwarded unchanged to protein_item
    :return: None
    """
    # get all entrez gene id -> wdid mappings, where found in taxon is
    # this strain
    gene_wdid_mapping = WDHelper().id_mapper(
        "P351", (("P703", strain_info['organism_wdid']), ))
    # get all goID to wdid mappings
    go_wdid_mapping = WDHelper().id_mapper("P686")

    for rec in tqdm(records, desc=strain_info['organism_name']):
        entrez_id = str(rec['entrezgene']['@value'])
        if entrez_id in gene_wdid_mapping:
            protein_item(rec, strain_info, gene_wdid_mapping[entrez_id],
                         go_wdid_mapping, login, add_pubmed)
        else:
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(rec['_id']['@value'], "gene_not_found", None,
                           ENTREZ_PROP))
def run_encodes(login, records):
    """
    Call gene_encodes_statement for every record whose gene and protein
    both have a known Wikidata item.

    Two lookup tables are fetched once through PBB_Helpers.id_mapper
    before the loop. A record is logged as ERROR and skipped when its
    entrez gene id is missing from the gene table ("gene_not_found") or
    its ensembl protein id is missing from the protein table
    ("protein_not_found"), so one unmapped record does not abort the
    rest of the batch.

    NOTE: strain_info is not a parameter of this function; it is read
    from the enclosing (module) scope.

    :param login: forwarded unchanged to gene_encodes_statement
    :param records: iterable of record dicts
    :return: None
    """
    # get all entrez gene id -> wdid mappings, where found in taxon is
    # this strain
    gene_wdid_mapping = PBB_Helpers.id_mapper(
        "P351", (("P703", strain_info['organism_wdid']), ))
    # get all ensembl protein id -> wdid mappings, where found in taxon
    # is this strain
    protein_wdid_mapping = PBB_Helpers.id_mapper(
        "P705", (("P703", strain_info['organism_wdid']), ))

    for record in tqdm(records, desc=strain_info['organism_name']):
        entrez_gene = str(record['entrezgene']['@value'])
        if entrez_gene not in gene_wdid_mapping:
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(record['_id']['@value'], "gene_not_found", None,
                           ENTREZ_PROP))
            continue

        ensembl_protein = record['ensembl']['@value']['protein']
        if ensembl_protein not in protein_wdid_mapping:
            # Mirror the gene_not_found handling instead of letting a
            # KeyError from the mapping lookup stop the whole run.
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(record['_id']['@value'], "protein_not_found",
                           None, ENTREZ_PROP))
            continue

        gene_encodes_statement(gene_wdid_mapping[entrez_gene],
                               protein_wdid_mapping[ensembl_protein],
                               'ncbi_gene', entrez_gene,
                               record['ensembl']['@source'], login)
def protein_item(record, strain_info, gene_qid, go_wdid_mapping, login,
                 add_pubmed):
    """
    Collect the statements for one protein record, build a
    PBB_Core.WDItemEngine item from them and pass it to try_write.

    If constructing or labelling the item raises, the exception is
    printed, logged as ERROR and the function returns without writing.

    :param record: record dict; keys read here: 'name', 'ensembl',
        'entrezgene', 'refseq', 'uniprot', 'go', 'symbol', 'locus_tag'
    :param strain_info: dict; keys read here: 'organism_type',
        'organism_name', 'organism_wdid'
    :param gene_qid: item id used as the value of the P702 statement
    :param go_wdid_mapping: mapping from GO id to item id
    :param login: forwarded to PubmedStub(...).create and try_write
    :param add_pubmed: when truthy, every entry of a GO annotation's
        'pubmed' list is added as a P248 reference
    :return: None
    """
    label = '{} {}'.format(record['name']['@value'],
                           record['ensembl']['@value']['protein'])
    description = '{} protein found in {}'.format(
        strain_info['organism_type'], strain_info['organism_name'])

    statements = []

    ############
    # external IDs
    ############
    # will be used for reference statements
    external_ids = {
        'entrez_gene': str(record['entrezgene']['@value']),
        'ensembl_protein': record['ensembl']['@value']['protein'],
        'ensembl_gene': record['ensembl']['@value']['gene'],
        'refseq_protein': record['refseq']['@value']['protein'],
        'uniprot': record['uniprot']['@value']['Swiss-Prot']
    }

    # (record key, external_ids key, property), in emission order:
    # ensembl protein id, refseq protein id, uniprot id
    id_statement_specs = (
        ('ensembl', 'ensembl_protein', 'P705'),
        ('refseq', 'refseq_protein', 'P637'),
        ('uniprot', 'uniprot', 'P352'),
    )
    source_refs = {}
    for source_key, id_key, prop_nr in id_statement_specs:
        source_refs[source_key] = make_ref_source(
            record[source_key]['@source'], id_key, external_ids[id_key])
        statements.append(
            PBB_Core.WDString(external_ids[id_key], prop_nr,
                              references=[source_refs[source_key]]))
    # reused below for the statements that have no source of their own
    ensembl_ref = source_refs['ensembl']

    ############
    # GO terms
    # TODO: https://www.wikidata.org/wiki/Q3460832
    ############
    preprocess_go(record)
    print(record)
    go_block = record['go']
    go_source = go_block['@source']
    go_id_prop = source_ref_id[go_source['_id']]
    base_reference = make_ref_source(go_source, go_id_prop,
                                     external_ids[go_id_prop])

    for go_level, annotations in go_block['@value'].items():
        level_wdid = go_props[go_level]
        for annotation in annotations:
            term_wdid = go_wdid_mapping[annotation['id']]
            evidence_wdid = go_evidence_codes[annotation['evidence']]
            evidence_qualifier = PBB_Core.WDItemID(value=evidence_wdid,
                                                   prop_nr='P459',
                                                   is_qualifier=True)
            # each annotation gets its own copy so pubmed references do
            # not accumulate on the shared base reference
            annotation_ref = copy.deepcopy(base_reference)
            pubmed_ids = annotation['pubmed'] if add_pubmed else ()
            for pubmed_id in pubmed_ids:
                pmid_wdid = PBB_Helpers.PubmedStub(pubmed_id).create(login)
                annotation_ref.append(
                    PBB_Core.WDItemID(pmid_wdid, 'P248',
                                      is_reference=True))
            statements.append(
                PBB_Core.WDItemID(term_wdid, level_wdid,
                                  references=[annotation_ref],
                                  qualifiers=[evidence_qualifier]))

    ############
    # statements with no referencable sources (make by hand, for now...)
    ############
    statements.extend([
        # subclass of protein
        PBB_Core.WDItemID('Q8054', 'P279', references=[ensembl_ref]),
        # found in taxon
        PBB_Core.WDItemID(strain_info['organism_wdid'], 'P703',
                          references=[ensembl_ref]),
        # encodes gene
        PBB_Core.WDItemID(gene_qid, 'P702', references=[ensembl_ref]),
    ])

    try:
        wd_item_protein = PBB_Core.WDItemEngine(
            item_name=label,
            domain='proteins',
            data=statements,
            append_value=['P279'],
            fast_run=True,
            fast_run_base_filter={
                'P352': '',
                'P703': strain_info['organism_wdid']
            })
        wd_item_protein.set_label(label)
        wd_item_protein.set_description(description, lang='en')
        wd_item_protein.set_aliases(
            [record['symbol']['@value'], record['locus_tag']['@value']])
    except Exception as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            "ERROR",
            format_msg(record['entrezgene']['@value'], str(e), None,
                       ENTREZ_PROP))
    else:
        try_write(wd_item_protein, record['entrezgene']['@value'], 'P351',
                  login)
def make_chroms(strain_info, retrieved, login):
    """
    Create or update one item per chromosome of a strain and return the
    chromosome -> item id mapping.

    For every (chromosome number, genome id) pair in
    strain_info['chrom_genomeid_map'] a PBB_Core.WDItemEngine item is
    built with three statements (P279, P703, P2249). If the engine
    reports require_write the item is labelled and written; otherwise
    a "SKIP" entry is logged.

    The item id is recorded both when the item is skipped and when it
    was written successfully, so the returned mapping also covers
    chromosomes that were just created or updated. A chromosome whose
    write raised is logged as ERROR and left out of the mapping.

    :param strain_info: dict; keys read here: 'chrom_genomeid_map',
        'organism_name', 'organism_type', 'organism_wdid'
    :param retrieved: forwarded unchanged to make_ref
    :param login: passed to wd_item.write
    :return: dict mapping chromosome number to wd_item_id
    """
    chrom_wdid = {}
    for chrom_num, genome_id in strain_info['chrom_genomeid_map'].items():
        item_name = '{} chromosome {}'.format(strain_info['organism_name'],
                                              chrom_num)
        item_description = '{} chromosome'.format(
            strain_info['organism_type'])
        print(item_name)
        print(genome_id)

        reference = make_ref(retrieved, genome_id)
        statements = [
            # subclass of chromosome
            PBB_Core.WDItemID(value='Q37748', prop_nr='P279',
                              references=[reference]),
            # found in taxon
            PBB_Core.WDItemID(value=strain_info['organism_wdid'],
                              prop_nr='P703', references=[reference]),
            # genome id
            PBB_Core.WDString(value=genome_id, prop_nr='P2249',
                              references=[reference]),
        ]

        wd_item = PBB_Core.WDItemEngine(
            item_name=item_name,
            domain='chromosome',
            data=statements,
            append_value=['P279'],
            fast_run=True,
            fast_run_base_filter={
                'P703': strain_info['organism_wdid'],
                'P2249': ''
            })

        if not wd_item.require_write:
            chrom_wdid[chrom_num] = wd_item.wd_item_id
            PBB_Core.WDItemEngine.log(
                "INFO",
                format_msg(genome_id, "SKIP", wd_item.wd_item_id,
                           external_id_prop='P2249'))
            continue

        print("require write")
        wd_item.set_label(item_name)
        wd_item.set_description(item_description, lang='en')
        try:
            # decide the log verb before write() is called
            msg = "CREATE" if wd_item.create_new_item else "UPDATE"
            wd_item.write(login=login)
        except Exception as e:
            print(e)
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(genome_id, str(e), wd_item.wd_item_id,
                           external_id_prop='P2249'))
        else:
            # Record written items too; otherwise callers only ever see
            # chromosomes that needed no write.
            chrom_wdid[chrom_num] = wd_item.wd_item_id
            PBB_Core.WDItemEngine.log(
                "INFO",
                format_msg(genome_id, msg, wd_item.wd_item_id,
                           external_id_prop='P2249'))
    return chrom_wdid