def create_reference(self):
    """
    Build the reference shared by this InterPro entry's statements.

    Items:
        Q3047275: InterPro

    Properties used:
        stated in (P248)
        imported from (P143)
        software version (P348)
        publication date (P577)
        P2926, carrying this entry's own ID (``self.id``)

    The resulting list is stored on ``self.reference`` and every member is
    flagged with ``overwrite_references = True``.
    """
    interpro_qid = "Q3047275"
    release_date = self.date.strftime("+%Y-%m-%dT00:00:00Z")

    # One reference block is reused for every statement of this entry.
    self.reference = [
        PBB_Core.WDItemID(interpro_qid, 'P248', is_reference=True),
        PBB_Core.WDItemID(interpro_qid, 'P143', is_reference=True),
        PBB_Core.WDString(self.version, 'P348', is_reference=True),
        PBB_Core.WDTime(release_date, 'P577', is_reference=True),
        PBB_Core.WDString(self.id, "P2926", is_reference=True),
    ]
    for entry in self.reference:
        entry.overwrite_references = True
def make_ref_source(source_doc, id_prop, identifier, login=None):
    """
    Build the reference (a list of reference statements) for one source.

    The reference is made up of:
        stated in (P248): the release item if the source has a release #,
            otherwise the source item itself
        link to id: the identifier in the source
        retrieved (P813): only if the source has no release #

    Example source docs::

        {'_id': 'uniprot', 'timestamp': '20161006'}
        {'_id': 'ensembl', 'release': 86, 'timestamp': '20161005'}

    ``source_doc`` is only read; it is never modified.

    :param source_doc: dict describing the source ('_id', and either
        'release' or a 'timestamp' formatted as YYYYMMDD)
    :param id_prop: key into ``prop_ids`` naming the identifier property
    :param identifier: identifier of the record in the source (stringified)
    :param login: must be passed if you want to be able to create new
        release items; handed to ``Release.get_or_create``
    :return: list of reference statements
    :raises ValueError: if the source or ``id_prop`` is unknown
    """
    source = source_doc['_id']
    if source not in source_items:
        raise ValueError(
            "Unknown source for reference creation: {}".format(source))
    if id_prop not in prop_ids:
        raise ValueError(
            "Unknown id_prop for reference creation: {}".format(id_prop))

    link_to_id = PBB_Core.WDString(value=str(identifier),
                                   prop_nr=prop_ids[id_prop],
                                   is_reference=True)

    if "release" in source_doc:
        # Work on a local string so the caller's dict is left untouched.
        release_number = str(source_doc['release'])
        title = "{} Release {}".format(source, release_number)
        description = "Release {} of {}".format(release_number, source)
        release = PBB_Helpers.Release(
            title, description, release_number,
            edition_of_wdid=source_items[source]).get_or_create(login)
        stated_in = PBB_Core.WDItemID(value=release, prop_nr='P248',
                                      is_reference=True)
        return [stated_in, link_to_id]

    # No release number: cite the source itself plus the retrieval date.
    retrieved_on = datetime.strptime(source_doc['timestamp'], "%Y%m%d")
    stated_in = PBB_Core.WDItemID(value=source_items[source], prop_nr='P248',
                                  is_reference=True)
    retrieved = PBB_Core.WDTime(retrieved_on.strftime('+%Y-%m-%dT00:00:00Z'),
                                prop_nr='P813', is_reference=True)
    return [stated_in, retrieved, link_to_id]
def create_relationships(self, ipr_wd):
    """
    Write this entry's parent / contains / found-in links to its item.

    Nothing is written when the entry has none of these relationships.
    API errors are printed and logged; a successful write is logged too.

    :param ipr_wd: dict mapping InterPro IDs to wikidata IDs
    """
    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    statements = [
        PBB_Core.WDExternalID(value=self.id, prop_nr=INTERPRO,
                              references=[self.reference])
    ]
    if self.parent:
        # subclass of
        statements.append(
            PBB_Core.WDItemID(value=ipr_wd[self.parent], prop_nr='P279',
                              references=[self.reference]))
    for child_id in self.contains or ():
        # has part
        statements.append(
            PBB_Core.WDItemID(value=ipr_wd[child_id], prop_nr='P527',
                              references=[self.reference]))
    for container_id in self.found_in or ():
        # part of
        statements.append(
            PBB_Core.WDItemID(value=ipr_wd[container_id], prop_nr='P361',
                              references=[self.reference]))

    # Only the InterPro ID statement is present: no relationships to write.
    if len(statements) == 1:
        return

    # write data
    item = PBB_Core.WDItemEngine(item_name=self.name, domain='interpro',
                                 data=statements, server=SERVER,
                                 append_value=["P279", "P527", "P361"])
    try:
        item.write(self.login)
    except WDApiError as e:
        print(e)
        # NOTE(review): reads self.wd_item_id, which this method never sets;
        # confirm it is always assigned before this can run.
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_template.format(main_data_id=self.id,
                                exception_type=type(e),
                                message=e.__str__(),
                                wd_id=self.wd_item_id,
                                duration=datetime.now()))
        return

    written = [(x.prop_nr, x.value) for x in statements]
    PBB_Core.WDItemEngine.log(
        'INFO',
        log_template.format(
            main_data_id=self.id,
            exception_type='',
            message='created interpro relationships: {}'.format(written),
            wd_id=item.wd_item_id,
            duration=datetime.now()))
def create_relationships(self, login):
    """
    Add subclass-of / has-part / part-of statements to this term's item.

    The wikidata IDs of related terms are resolved first through
    ``self.do_wdid_lookup()``; if that raises ``KeyError`` the error is
    logged and nothing is written.  Nothing is written either when the term
    has no relationships at all.

    :param login: login object handed to ``PBB_Helpers.try_write``
    """
    try:
        # endpoint may not get updated in time?
        self.do_wdid_lookup()
    except KeyError as lookup_error:
        PBB_Core.WDItemEngine.log(
            "ERROR",
            format_msg(self.id, INTERPRO, None, str(lookup_error),
                       type(lookup_error)))
        return

    def linked_item(target_wdid, prop_nr):
        # Item-valued statement carrying this term's reference.
        return PBB_Core.WDItemID(value=target_wdid, prop_nr=prop_nr,
                                 references=[self.reference])

    statements = [
        PBB_Core.WDExternalID(value=self.id, prop_nr=INTERPRO,
                              references=[self.reference])
    ]
    if self.parent:
        # subclass of
        statements.append(linked_item(self.parent_wdid, 'P279'))
    if self.contains:
        # has part
        statements.extend(
            linked_item(wdid, 'P527') for wdid in self.contains_wdid)
    if self.found_in:
        # part of
        statements.extend(
            linked_item(wdid, 'P361') for wdid in self.found_in_wdid)

    # Only the InterPro ID statement is present: no relationships to write.
    if len(statements) == 1:
        return

    wd_item = PBB_Core.WDItemEngine(
        wd_item_id=self.wdid,
        domain='interpro',
        data=statements,
        append_value=['P279', 'P527', 'P361'],
        fast_run=True,
        fast_run_base_filter=IPRTerm.fast_run_base_filter)
    PBB_Helpers.try_write(
        wd_item, self.id, INTERPRO, login,
        edit_summary="create/update subclass/has part/part of")
def gene_encodes_statement(gene_qid, protein_qid, id_prop, external_id,
                           source, login):
    """
    Add an "encodes" (P688) statement pointing from a gene to its protein.

    The statement is written to the existing gene item; this function
    refuses to create a new item.

    :param gene_qid: wikidata ID of the gene item to edit
    :param protein_qid: wikidata ID of the encoded protein
    :param id_prop: identifier property key, passed to ``make_ref_source``
        and ``try_write``
    :param external_id: identifier of the record in the source
    :param source: source document passed to ``make_ref_source``
    :param login: login object; forwarded to ``make_ref_source`` (so a
        missing release item can be created) and to ``try_write``
    :return: None
    :raises ValueError: if the item engine would create a new item
    """
    reference = make_ref_source(source, id_prop, external_id, login=login)

    # gene -> encodes -> protein
    gene_encodes = PBB_Core.WDItemID(value=protein_qid, prop_nr='P688',
                                     references=[reference])

    wd_item_gene = PBB_Core.WDItemEngine(
        wd_item_id=gene_qid,
        domain='genes',
        data=[gene_encodes],
        fast_run=True,
        fast_run_base_filter={
            'P351': '',
            'P703': strain_info['organism_wdid']
        })

    # The gene item must already exist; never create one from here.
    if wd_item_gene.create_new_item:
        raise ValueError(
            "refusing to create a new item for gene {}".format(gene_qid))

    try_write(wd_item_gene, external_id, id_prop, login)
def create_item(self, login):
    """
    Create or update the wikidata item for this InterPro term.

    The item gets the InterPro ID, a "subclass of" (P279) statement pointing
    at ``self.type_wdid``, an English label, the per-language descriptions
    from ``self.lang_descr`` and the short name and ID as aliases.

    :param login: login object handed to ``PBB_Helpers.try_write``
    :return: the ``WDItemEngine`` instance that was written
    """
    interpro_id_statement = PBB_Core.WDExternalID(
        value=self.id, prop_nr=INTERPRO, references=[self.reference])
    subclass_statement = PBB_Core.WDItemID(
        value=self.type_wdid, prop_nr="P279", references=[self.reference])

    wd_item = PBB_Core.WDItemEngine(
        item_name=self.name,
        domain='interpro',
        data=[interpro_id_statement, subclass_statement],
        append_value=["P279"],
        fast_run=True,
        fast_run_base_filter=IPRTerm.fast_run_base_filter)

    wd_item.set_label(self.name, lang='en')
    for language in self.lang_descr:
        wd_item.set_description(self.lang_descr[language], lang=language)
    wd_item.set_aliases([self.short_name, self.id])

    PBB_Helpers.try_write(wd_item, self.id, INTERPRO, login)
    return wd_item
def create_reference(self):
    """
    Store the shared reference on ``self.reference``.

    The reference consists of "stated in" (P248) pointing at
    ``self.MONDO_WDID`` and "retrieved" (P813) set to ``self.retrieved``.
    """
    stated_in = PBB_Core.WDItemID(self.MONDO_WDID, 'P248', is_reference=True)
    retrieved_on = self.retrieved.strftime('+%Y-%m-%dT00:00:00Z')
    retrieved = PBB_Core.WDTime(retrieved_on, 'P813', is_reference=True)
    self.reference = [stated_in, retrieved]
def create_uniprot_relationships(login, release_wdid, collection, taxon=None):
    """
    Add InterPro family / domain statements to existing protein items.

    Only uniprot proteins that are already in wikidata are processed.  For
    each matching document in ``collection`` the InterPro terms listed under
    'subclass' become "subclass of" (P279) statements and those under
    'has_part' become "has part" (P527) statements.

    :param login: login object handed to ``PBB_Helpers.try_write``
    :param release_wdid: wikidata ID of the InterPro release, used as the
        "stated in" (P248) value of every reference
    :param collection: collection queried with ``find``; documents are read
        through the keys '_id', 'subclass' and 'has_part'
    :param taxon: optional taxon wikidata ID restricting the proteins
    :raises ValueError: if the item engine would create a new item
    """
    if taxon:
        uniprot2wd = PBB_Helpers.id_mapper(UNIPROT, (("P703", taxon),))
        fast_run_base_filter = {UNIPROT: "", "P703": taxon}
    else:
        uniprot2wd = PBB_Helpers.id_mapper(UNIPROT)
        fast_run_base_filter = {UNIPROT: ""}

    cursor = collection.find({'_id': {'$in': list(uniprot2wd)}},
                             no_cursor_timeout=True)
    # The cursor never times out on its own, so close it even on errors.
    try:
        for doc in tqdm(cursor, total=cursor.count()):
            uniprot_id = doc['_id']

            if uniprot_id not in uniprot2wd:
                # No wikidata item to attach statements to: log and skip.
                print("wdid_not_found " + uniprot_id)
                PBB_Core.WDItemEngine.log(
                    "ERROR",
                    PBB_Helpers.format_msg(uniprot_id, UNIPROT, None,
                                           "wdid_not_found"))
                continue

            ## References
            # stated in Interpro version XX.X
            ref_stated_in = PBB_Core.WDItemID(release_wdid, 'P248',
                                              is_reference=True)
            ref_ipr = PBB_Core.WDString(
                "http://www.ebi.ac.uk/interpro/protein/{}".format(uniprot_id),
                "P854", is_reference=True)
            reference = [ref_stated_in, ref_ipr]

            statements = []
            if doc['subclass']:
                for family in doc['subclass']:
                    statements.append(
                        PBB_Core.WDItemID(value=IPRTerm.ipr2wd[family],
                                          prop_nr='P279',
                                          references=[reference]))
            if doc['has_part']:
                for part in doc['has_part']:
                    statements.append(
                        PBB_Core.WDItemID(value=IPRTerm.ipr2wd[part],
                                          prop_nr='P527',
                                          references=[reference]))

            wd_item = PBB_Core.WDItemEngine(
                wd_item_id=uniprot2wd[uniprot_id],
                domain="proteins",
                data=statements,
                fast_run=True,
                fast_run_base_filter=fast_run_base_filter,
                append_value=["P279", "P527", "P361"])

            if wd_item.create_new_item:
                raise ValueError("something bad happened")
            PBB_Helpers.try_write(
                wd_item, uniprot_id, INTERPRO, login,
                edit_summary="add/update family and/or domains")
    finally:
        cursor.close()
def create_reference(self):
    """
    Create the wikidata reference for this InterPro entry.

    The same reference is reused for every statement of the entry.  It holds
    "stated in" (P248) pointing at the InterPro release item
    (``self.release_wdid``) and the entry's own InterPro ID.
    """
    self.reference = [
        # stated in Interpro version XX.X
        PBB_Core.WDItemID(self.release_wdid, 'P248', is_reference=True),
        # interpro ID
        PBB_Core.WDString(self.id, INTERPRO, is_reference=True),
    ]
def make_ref(retrieved, genome_id):
    """
    Build the reference used for chromosome statements.

    :param retrieved: object with ``strftime`` giving the retrieval date
    :param genome_id: Refseq Genome ID the reference links to
    :return: list of [stated in (P248), Refseq Genome ID (P2249),
        retrieved (P813)] reference statements
    """
    # stated in ncbi gene
    stated_in = PBB_Core.WDItemID(value='Q20641742', prop_nr='P248',
                                  is_reference=True)
    # Link to Refseq Genome ID
    genome_link = PBB_Core.WDString(value=genome_id, prop_nr='P2249',
                                    is_reference=True)
    retrieved_on = PBB_Core.WDTime(retrieved.strftime('+%Y-%m-%dT00:00:00Z'),
                                   prop_nr='P813', is_reference=True)
    return [stated_in, genome_link, retrieved_on]
def make_reference(source, id_prop, identifier, retrieved):
    """
    Build a reference citing a source, an identifier and a retrieval date.

    :param source: key into ``source_items`` naming the cited source
    :param id_prop: key into ``prop_ids`` naming the identifier property
    :param identifier: identifier of the record in the source (stringified)
    :param retrieved: object with ``strftime`` giving the retrieval date
    :return: list of [stated in (P248), link to ID, retrieved (P813)]
        reference statements
    """
    # stated in
    stated_in = PBB_Core.WDItemID(value=source_items[source], prop_nr='P248',
                                  is_reference=True)
    # Link to ID
    link_to_id = PBB_Core.WDString(value=str(identifier),
                                   prop_nr=prop_ids[id_prop],
                                   is_reference=True)
    retrieved_on = PBB_Core.WDTime(retrieved.strftime('+%Y-%m-%dT00:00:00Z'),
                                   prop_nr='P813', is_reference=True)
    return [stated_in, link_to_id, retrieved_on]
def create_item(self):
    """
    Create or update the wikidata item for this InterPro entry.

    The item gets the InterPro ID, a "subclass of" (P279) statement chosen
    from ``IPRItem.type2subclass``, a label, per-language descriptions and
    the short name and ID as aliases.  On success the item's wikidata ID is
    stored on ``self.wd_item_id``; API errors are printed and logged.
    """
    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'

    interpro_id_statement = PBB_Core.WDExternalID(
        value=self.id, prop_nr=INTERPRO, references=[self.reference])
    subclass_statement = PBB_Core.WDItemID(
        value=IPRItem.type2subclass[self.type], prop_nr="P279",
        references=[self.reference])

    item = PBB_Core.WDItemEngine(
        item_name=self.name, domain='interpro',
        data=[interpro_id_statement, subclass_statement], server=SERVER)

    item.set_label(self.name)
    for language in self.description:
        item.set_description(self.description[language], lang=language)
    item.set_aliases([self.short_name, self.id])

    try:
        item.write(login=self.login)
    except WDApiError as e:
        print(e)
        # NOTE(review): self.wd_item_id is only assigned below, after a
        # successful write; confirm it is initialised elsewhere.
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_template.format(main_data_id=self.id,
                                exception_type=type(e),
                                message=e.__str__(),
                                wd_id=self.wd_item_id,
                                duration=datetime.now()))
        return

    self.wd_item_id = item.wd_item_id
    PBB_Core.WDItemEngine.log(
        'INFO',
        log_template.format(main_data_id=self.id,
                            exception_type='',
                            message='created/updated interpro item',
                            wd_id=item.wd_item_id,
                            duration=datetime.now()))
def protein_item(record, strain_info, gene_qid, go_wdid_mapping, login,
                 add_pubmed):
    """
    Generate the pbb_core item for one protein and write it.

    Statements cover the external IDs (Ensembl protein, RefSeq protein,
    UniProt), the GO annotations (with an evidence qualifier and optionally
    PubMed references), "subclass of" protein, "found in taxon" and
    "encoded by" the given gene.

    :param record: annotation record; each field is read through its
        '@value' / '@source' keys
    :param strain_info: dict read through 'organism_type',
        'organism_name' and 'organism_wdid'
    :param gene_qid: wikidata ID of the gene encoding this protein
    :param go_wdid_mapping: dict mapping GO IDs to wikidata IDs
    :param login: login object; forwarded to ``make_ref_source`` (so a
        missing release item can be created), to the PubMed stub creation
        and to ``try_write``
    :param add_pubmed: when truthy, add a "stated in" reference for every
        PubMed ID of a GO annotation
    :return: None
    """
    item_name = '{} {}'.format(record['name']['@value'],
                               record['ensembl']['@value']['protein'])
    item_description = '{} protein found in {}'.format(
        strain_info['organism_type'], strain_info['organism_name'])

    s = []

    ############
    # external IDs
    ############
    # will be used for reference statements
    external_ids = {
        'entrez_gene': str(record['entrezgene']['@value']),
        'ensembl_protein': record['ensembl']['@value']['protein'],
        'ensembl_gene': record['ensembl']['@value']['gene'],
        'refseq_protein': record['refseq']['@value']['protein'],
        'uniprot': record['uniprot']['@value']['Swiss-Prot']
    }

    # ensembl protein id
    ensembl_ref = make_ref_source(record['ensembl']['@source'],
                                  'ensembl_protein',
                                  external_ids['ensembl_protein'],
                                  login=login)
    s.append(
        PBB_Core.WDString(external_ids['ensembl_protein'], 'P705',
                          references=[ensembl_ref]))

    # refseq protein id
    refseq_ref = make_ref_source(record['refseq']['@source'],
                                 'refseq_protein',
                                 external_ids['refseq_protein'],
                                 login=login)
    s.append(
        PBB_Core.WDString(external_ids['refseq_protein'], 'P637',
                          references=[refseq_ref]))

    # uniprot id
    uniprot_ref = make_ref_source(record['uniprot']['@source'], 'uniprot',
                                  external_ids['uniprot'], login=login)
    s.append(
        PBB_Core.WDString(external_ids['uniprot'], 'P352',
                          references=[uniprot_ref]))

    ############
    # GO terms
    # TODO: https://www.wikidata.org/wiki/Q3460832
    ############
    preprocess_go(record)
    print(record)
    go_source = record['go']['@source']
    go_id_prop = source_ref_id[go_source['_id']]
    reference = make_ref_source(go_source, go_id_prop,
                                external_ids[go_id_prop], login=login)
    for go_level, go_records in record['go']['@value'].items():
        level_wdid = go_props[go_level]
        for go_record in go_records:
            go_wdid = go_wdid_mapping[go_record['id']]
            evidence_wdid = go_evidence_codes[go_record['evidence']]
            evidence_statement = PBB_Core.WDItemID(value=evidence_wdid,
                                                   prop_nr='P459',
                                                   is_qualifier=True)
            # Each annotation gets its own copy so that PubMed references
            # added below do not leak into the other annotations.
            this_reference = copy.deepcopy(reference)
            if add_pubmed:
                for pubmed in go_record['pubmed']:
                    pmid_wdid = PBB_Helpers.PubmedStub(pubmed).create(login)
                    this_reference.append(
                        PBB_Core.WDItemID(pmid_wdid, 'P248',
                                          is_reference=True))
            s.append(
                PBB_Core.WDItemID(go_wdid, level_wdid,
                                  references=[this_reference],
                                  qualifiers=[evidence_statement]))

    ############
    # statements with no referencable sources (make by hand, for now...)
    ############
    # subclass of protein
    s.append(PBB_Core.WDItemID('Q8054', 'P279', references=[ensembl_ref]))

    # found in taxon
    s.append(
        PBB_Core.WDItemID(strain_info['organism_wdid'], 'P703',
                          references=[ensembl_ref]))

    # encoded by gene
    s.append(PBB_Core.WDItemID(gene_qid, 'P702', references=[ensembl_ref]))

    try:
        wd_item_protein = PBB_Core.WDItemEngine(
            item_name=item_name,
            domain='proteins',
            data=s,
            append_value=['P279'],
            fast_run=True,
            fast_run_base_filter={
                'P352': '',
                'P703': strain_info['organism_wdid']
            })
        wd_item_protein.set_label(item_name)
        wd_item_protein.set_description(item_description, lang='en')
        wd_item_protein.set_aliases(
            [record['symbol']['@value'], record['locus_tag']['@value']])
    except Exception as e:
        # Building the item failed: log against the gene ID and give up on
        # this protein.
        print(e)
        PBB_Core.WDItemEngine.log(
            "ERROR",
            format_msg(record['entrezgene']['@value'], str(e), None,
                       ENTREZ_PROP))
        return

    try_write(wd_item_protein, record['entrezgene']['@value'], 'P351', login)
def create_protein_ipr(uniprot_id, uniprot_wdid, families, has_part,
                       release_info, login):
    """
    Create interpro relationships to one protein.

    :param uniprot_id: uniprot ID of the protein to modify
    :type uniprot_id: str
    :param uniprot_wdid: wikidata ID of the protein
    :param families: list of ipr wd ids the protein is a (P279) subclass of
    :param has_part: list of ipr wd ids the protein has (P527) has part
    :param release_info: dict read through its 'date' and 'version' keys
    :param login: login object handed to the item's ``write``
    :return: None
    """
    log_template = '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
    interpro_qid = "Q3047275"

    release_date = release_info['date']
    release_version = release_info['version']

    # create ref
    reference = [
        PBB_Core.WDItemID(interpro_qid, 'P248', is_reference=True),
        PBB_Core.WDItemID(interpro_qid, 'P143', is_reference=True),
        PBB_Core.WDString(release_version, 'P348', is_reference=True),
        PBB_Core.WDTime(release_date.strftime("+%Y-%m-%dT00:00:00Z"), 'P577',
                        is_reference=True),
        PBB_Core.WDString(
            "http://www.ebi.ac.uk/interpro/protein/{}".format(uniprot_id),
            "P854", is_reference=True),
    ]
    for entry in reference:
        entry.overwrite_references = True

    statements = []
    for family_wdid in families or ():
        statements.append(
            PBB_Core.WDItemID(value=family_wdid, prop_nr='P279',
                              references=[reference]))
    for part_wdid in has_part or ():
        statements.append(
            PBB_Core.WDItemID(value=part_wdid, prop_nr='P527',
                              references=[reference]))

    item = PBB_Core.WDItemEngine(wd_item_id=uniprot_wdid, data=statements,
                                 server=SERVER,
                                 append_value=["P279", "P527", "P361"])
    try:
        item.write(login)
    except WDApiError as e:
        print(e)
        PBB_Core.WDItemEngine.log(
            'ERROR',
            log_template.format(main_data_id=uniprot_id,
                                exception_type=type(e),
                                message=e.__str__(),
                                wd_id=uniprot_wdid,
                                duration=datetime.now()))
        return

    written = [(x.prop_nr, x.value) for x in statements]
    PBB_Core.WDItemEngine.log(
        'INFO',
        log_template.format(
            main_data_id=uniprot_id,
            exception_type='',
            message='created protein interpro relationships: {}'.format(
                written),
            wd_id=uniprot_wdid,
            duration=datetime.now()))
def make_chroms(strain_info, retrieved, login):
    """
    Create or update one chromosome item per entry of the strain's genome map.

    :param strain_info: dict read through 'chrom_genomeid_map',
        'organism_name', 'organism_type' and 'organism_wdid'
    :param retrieved: retrieval date passed to ``make_ref``
    :param login: login object handed to the item's ``write``
    :return: dict mapping chromosome number to the wikidata ID of its item;
        chromosomes whose write failed are left out
    """
    chrom_wdid = {}
    for chrom_num, genome_id in strain_info['chrom_genomeid_map'].items():
        item_name = '{} chromosome {}'.format(strain_info['organism_name'],
                                              chrom_num)
        item_description = '{} chromosome'.format(
            strain_info['organism_type'])
        print(item_name)
        print(genome_id)

        reference = make_ref(retrieved, genome_id)
        statements = [
            # subclass of chromosome
            PBB_Core.WDItemID(value='Q37748', prop_nr='P279',
                              references=[reference]),
            # found in taxon
            PBB_Core.WDItemID(value=strain_info['organism_wdid'],
                              prop_nr='P703', references=[reference]),
            # genome id
            PBB_Core.WDString(value=genome_id, prop_nr='P2249',
                              references=[reference]),
        ]

        wd_item = PBB_Core.WDItemEngine(
            item_name=item_name,
            domain='chromosome',
            data=statements,
            append_value=['P279'],
            fast_run=True,
            fast_run_base_filter={
                'P703': strain_info['organism_wdid'],
                'P2249': ''
            })

        if not wd_item.require_write:
            # Item is already up to date.
            chrom_wdid[chrom_num] = wd_item.wd_item_id
            PBB_Core.WDItemEngine.log(
                "INFO",
                format_msg(genome_id, "SKIP", wd_item.wd_item_id,
                           external_id_prop='P2249'))
            continue

        print("require write")
        wd_item.set_label(item_name)
        wd_item.set_description(item_description, lang='en')
        msg = "CREATE" if wd_item.create_new_item else "UPDATE"
        try:
            wd_item.write(login=login)
        except Exception as e:
            # Best effort: report the failure and carry on with the next
            # chromosome.
            print(e)
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(genome_id, str(e), wd_item.wd_item_id,
                           external_id_prop='P2249'))
            continue

        # Record freshly created/updated items too, so that callers get a
        # wikidata ID for every chromosome that was written successfully.
        chrom_wdid[chrom_num] = wd_item.wd_item_id
        PBB_Core.WDItemEngine.log(
            "INFO",
            format_msg(genome_id, msg, wd_item.wd_item_id,
                       external_id_prop='P2249'))

    return chrom_wdid
def gene_item_statements():
    """
    Construct the list of referenced statements to pass to the PBB_Core
    item engine for one gene.

    Reads ``record``, ``strain_info`` and ``chrom_wdid`` from the enclosing
    scope.

    :return: list of statements (external IDs, subclass of gene, found in
        taxon, strand orientation, genomic start/end and chromosome)
    """
    ############
    # external IDs
    ############
    # will be used for reference statements
    external_ids = {
        'entrez_gene': str(record['entrezgene']['@value']),
        'ensembl_gene': record['ensembl']['@value']['gene'],
        'locus_tag': record['locus_tag']['@value']
    }

    entrez_ref = make_ref_source(record['entrezgene']['@source'],
                                 'entrez_gene', external_ids['entrez_gene'])
    entrez_statement = PBB_Core.WDString(external_ids['entrez_gene'],
                                         PROPS['Entrez Gene ID'],
                                         references=[entrez_ref])

    ensembl_ref = make_ref_source(record['ensembl']['@source'],
                                  'ensembl_gene',
                                  external_ids['ensembl_gene'])
    ensembl_statement = PBB_Core.WDString(external_ids['ensembl_gene'],
                                          PROPS['Ensembl Gene ID'],
                                          references=[ensembl_ref])

    # ncbi locus tag
    locus_tag_statement = PBB_Core.WDString(external_ids['locus_tag'],
                                            PROPS['NCBI Locus tag'],
                                            references=[entrez_ref])

    s = [entrez_statement, ensembl_statement, locus_tag_statement]

    ############
    # statements with no referencable sources (make by hand, for now...)
    ############
    # subclass of gene
    s.append(
        PBB_Core.WDItemID('Q7187', PROPS['subclass of'],
                          references=[ensembl_ref]))
    # found in taxon
    s.append(
        PBB_Core.WDItemID(strain_info['organism_wdid'],
                          PROPS['found in taxon'],
                          references=[ensembl_ref]))

    ############
    # genomic position: start, end, strand orientation, chromosome
    ############
    position = record['genomic_pos']['@value']
    position_source = record['genomic_pos']['@source']
    position_id_prop = source_ref_id[position_source['_id']]
    position_ref = make_ref_source(position_source, position_id_prop,
                                   external_ids[position_id_prop])

    # create chromosome qualifier (Refseq Genome ID)
    chrom_genomeid = strain_info['chrom_genomeid_map'][position['chr']]
    rs_chrom = PBB_Core.WDString(chrom_genomeid, 'P2249', is_qualifier=True)

    # strand orientation
    if position['strand'] == 1:
        strand_orientation = 'Q22809680'
    else:
        strand_orientation = 'Q22809711'
    s.append(
        PBB_Core.WDItemID(strand_orientation, PROPS['strand orientation'],
                          references=[position_ref]))

    # genomic start and end
    for coordinate, prop_name in ((position['start'], 'genomic start'),
                                  (position['end'], 'genomic end')):
        s.append(
            PBB_Core.WDString(str(int(coordinate)), PROPS[prop_name],
                              references=[position_ref],
                              qualifiers=[rs_chrom]))

    # chromosome
    # NOTE(review): chrom_wdid is indexed by the genome id here; confirm
    # that matches how the mapping is built.
    s.append(
        PBB_Core.WDItemID(chrom_wdid[chrom_genomeid], PROPS['chromosome'],
                          references=[position_ref]))

    return s
def __init__(self, object):
    """
    Collect the annotations of one gene, turn them into wikidata statements
    and write the gene item.

    The gene annotations come from ``self.annotate_gene()``.  If
    ``object["wdid"]`` is not None that existing item is updated; otherwise
    a new item is created and its ID stored on ``self.wdid``.

    :type self: object
    :param object: dict read through the keys "start", "entrezgene",
        "uniprotwikidataids", "speciesInfo", "logincreds" and "wdid"
    """

    def as_list(value):
        # Annotation fields hold either a single value or a list of values.
        return value if isinstance(value, list) else [value]

    def collect_positions(key):
        # Return (chromosomes, starts, ends) for gene_annotations[key],
        # keeping only positions on a chromosome known for this species;
        # (None, None, None) when the annotation is absent.
        if key not in gene_annotations:
            return None, None, None
        chromosomes, starts, ends = [], [], []
        for position in as_list(gene_annotations[key]):
            known = ProteinBoxBotKnowledge.chromosomes[self.genomeInfo["name"]]
            if position["chr"] in known.keys():
                chromosomes.append(known[position["chr"]])
                starts.append(position["start"])
                ends.append(position["end"])
        return chromosomes, starts, ends

    self.start = object["start"]
    self.entrezgene = object["entrezgene"]
    self.uniprotwikidataids = object["uniprotwikidataids"]
    gene_annotations = self.annotate_gene()
    self.genomeInfo = object["speciesInfo"][str(gene_annotations['taxid'])]
    self.content = object
    self.name = gene_annotations["name"]
    self.logincreds = object["logincreds"]
    if "_timestamp" in gene_annotations.keys():
        self.annotationstimestamp = gene_annotations["_timestamp"]
    self.wdid = object["wdid"]

    # symbol:
    self.symbol = gene_annotations["symbol"]
    print(self.symbol)

    # HGNC
    if "HGNC" in gene_annotations:
        self.hgnc = as_list(gene_annotations["HGNC"])
    else:
        self.hgnc = None

    # Ensembl Gene & transcript (attributes stay unset without "ensembl")
    if "ensembl" in gene_annotations:
        ensembl = gene_annotations["ensembl"]
        self.ensembl_gene = (
            as_list(ensembl["gene"]) if "gene" in ensembl else None)
        self.ensembl_transcript = (
            as_list(ensembl["transcript"]) if "transcript" in ensembl
            else None)

    # Homologene
    if "homologene" in gene_annotations:
        self.homologene = [
            str(i) for i in as_list(gene_annotations["homologene"]["id"])]
    else:
        self.homologene = None

    # Refseq
    if "refseq" in gene_annotations and "rna" in gene_annotations["refseq"]:
        self.refseq_rna = as_list(gene_annotations["refseq"]["rna"])
    else:
        self.refseq_rna = None

    # MGI
    if "MGI" in gene_annotations:
        self.MGI = as_list(gene_annotations["MGI"])
    else:
        self.MGI = None

    # Genomic position on the current assembly
    self.chromosome, self.startpos, self.endpos = collect_positions(
        "genomic_pos")

    # Encoded proteins (Swiss-Prot accessions)
    self.encodes = None
    if "uniprot" in gene_annotations.keys():
        if "Swiss-Prot" in gene_annotations["uniprot"].keys():
            self.encodes = list(
                as_list(gene_annotations["uniprot"]["Swiss-Prot"]))

    # Genomic position on the previous assembly
    self.chromosomeHg19, self.startposHg19, self.endposHg19 = (
        collect_positions("genomic_pos_hg19"))

    # type of Gene
    gene_type_wdids = (
        ("ncRNA", "Q427087"),
        ("snRNA", "Q284578"),
        ("snoRNA", "Q284416"),
        ("rRNA", "Q215980"),
        ("tRNA", "Q201448"),
        ("pseudo", "Q277338"),
        ("protein-coding", "Q20747295"),
    )
    if "type_of_gene" in gene_annotations:
        gene_type = gene_annotations["type_of_gene"]
        # An unrecognised type yields an empty list.
        self.type_of_gene = [
            wdid for name, wdid in gene_type_wdids if gene_type == name]
    else:
        self.type_of_gene = None

    # Reference section
    # Prepare references
    refStatedIn = PBB_Core.WDItemID(value=self.genomeInfo["release"],
                                    prop_nr='P248', is_reference=True)
    refStatedIn.overwrite_references = True
    refImported = PBB_Core.WDItemID(value='Q20641742', prop_nr='P143',
                                    is_reference=True)
    refImported.overwrite_references = True
    timeStringNow = strftime("+%Y-%m-%dT00:00:00Z", gmtime())
    refRetrieved = PBB_Core.WDTime(timeStringNow, prop_nr='P813',
                                   is_reference=True)
    refRetrieved.overwrite_references = True
    gene_reference = [refStatedIn, refImported, refRetrieved]

    refStatedInEnsembl = PBB_Core.WDItemID(value='Q21996330', prop_nr='P248',
                                           is_reference=True)
    refStatedInEnsembl.overwrite_references = True
    refImportedEnsembl = PBB_Core.WDItemID(value='Q1344256', prop_nr='P143',
                                           is_reference=True)
    refImportedEnsembl.overwrite_references = True
    ensembl_reference = [refStatedInEnsembl, refImportedEnsembl, refRetrieved]

    genomeBuildQualifier = PBB_Core.WDItemID(
        value=self.genomeInfo["genome_assembly"], prop_nr='P659',
        is_qualifier=True)
    genomeBuildPreviousQualifier = PBB_Core.WDItemID(
        value=self.genomeInfo["genome_assembly_previous"], prop_nr='P659',
        is_qualifier=True)

    # Every statement gets its own deep copy of the reference/qualifier.
    def gene_item(value, prop_nr):
        return PBB_Core.WDItemID(
            value=value, prop_nr=prop_nr,
            references=[copy.deepcopy(gene_reference)])

    def gene_string(value, prop_nr):
        return PBB_Core.WDString(
            value=value, prop_nr=prop_nr,
            references=[copy.deepcopy(gene_reference)])

    def position_string(pos, prop_nr, build_qualifier):
        return PBB_Core.WDString(
            value=str(pos), prop_nr=prop_nr,
            references=[copy.deepcopy(ensembl_reference)],
            qualifiers=[copy.deepcopy(build_qualifier)])

    prep = dict()
    prep['P703'] = [gene_item(self.genomeInfo['wdid'], 'P703')]
    if self.genomeInfo["name"] == "human":
        prep['P353'] = [gene_string(self.symbol, 'P353')]
    prep['P351'] = [gene_string(str(self.entrezgene), 'P351')]

    prep['P279'] = [gene_item('Q7187', 'P279')]
    if self.type_of_gene is not None:
        for gene_type_wdid in self.type_of_gene:
            prep['P279'].append(gene_item(gene_type_wdid, 'P279'))

    ensembl_gene = getattr(self, "ensembl_gene", None)
    if ensembl_gene is not None:
        prep['P594'] = [gene_string(e, 'P594') for e in ensembl_gene]

    ensembl_transcript = getattr(self, "ensembl_transcript", None)
    if ensembl_transcript is not None:
        prep['P704'] = [gene_string(e, 'P704') for e in ensembl_transcript]

    if self.encodes is not None:
        # Only proteins that already have a wikidata item can be linked.
        prep['P688'] = [
            gene_item(self.uniprotwikidataids[uniprot], 'P688')
            for uniprot in self.encodes
            if uniprot in self.uniprotwikidataids.keys()]

    if self.hgnc is not None:
        prep['P354'] = [gene_string(hugo, 'P354') for hugo in self.hgnc]

    if self.homologene is not None:
        prep['P593'] = [
            gene_string(ortholog, 'P593') for ortholog in self.homologene]

    if self.refseq_rna is not None:
        prep['P639'] = [
            gene_string(refseq, 'P639') for refseq in self.refseq_rna]

    prep['P1057'] = []
    if self.chromosome is not None:
        for chrom in list(set(self.chromosome)):
            prep['P1057'].append(gene_item(chrom, 'P1057'))

    # Genomic start (P644) / end (P645): current assembly first, then the
    # previous one appended to the same property lists.
    prep['P644'] = [
        position_string(pos, 'P644', genomeBuildQualifier)
        for pos in self.startpos or []]
    prep['P645'] = [
        position_string(pos, 'P645', genomeBuildQualifier)
        for pos in self.endpos or []]
    prep['P644'].extend(
        position_string(pos, 'P644', genomeBuildPreviousQualifier)
        for pos in self.startposHg19 or [])
    prep['P645'].extend(
        position_string(pos, 'P645', genomeBuildPreviousQualifier)
        for pos in self.endposHg19 or [])

    prep['P671'] = [gene_string(mgi, 'P671') for mgi in self.MGI or []]

    if "alias" in gene_annotations.keys():
        # Copy so that appending the symbol leaves the annotations intact.
        self.synonyms = list(as_list(gene_annotations["alias"]))
        self.synonyms.append(self.symbol)
        print(self.synonyms)
    else:
        self.synonyms = None

    data2add = []
    for key in prep.keys():
        for statement in prep[key]:
            data2add.append(statement)
            print(statement.prop_nr, statement.value)

    item_exists = self.wdid is not None
    if item_exists:
        wdPage = PBB_Core.WDItemEngine(self.wdid, item_name=self.name,
                                       data=data2add,
                                       server="www.wikidata.org",
                                       domain="genes")
    else:
        wdPage = PBB_Core.WDItemEngine(item_name=self.name, data=data2add,
                                       server="www.wikidata.org",
                                       domain="genes")

    # Fill in descriptions that are missing or still a bare placeholder.
    if wdPage.get_description() == "":
        wdPage.set_description(
            description=self.genomeInfo['name'] + ' gene', lang='en')
    if wdPage.get_description(lang='fr') in ("", "gène"):
        wdPage.set_description(
            description="Un gène " + self.genomeInfo['fr-name'], lang='fr')
    if wdPage.get_description(lang='nl') in ("", "gen"):
        wdPage.set_description(
            description="Een " + self.genomeInfo['nl-name'] + " gen",
            lang='nl')
    if self.synonyms is not None:
        wdPage.set_aliases(aliases=self.synonyms, lang='en', append=True)

    if item_exists:
        print(self.wdid)
    self.wd_json_representation = wdPage.get_wd_json_representation()
    PBB_Debug.prettyPrint(self.wd_json_representation)
    PBB_Debug.prettyPrint(data2add)

    if item_exists:
        wdPage.write(self.logincreds)
        print("aa")
    else:
        # A new item was created: remember the ID returned by the write.
        self.wdid = wdPage.write(self.logincreds)

    # NOTE(review): `f` is not defined anywhere in this method; confirm it
    # is a module-level object, otherwise this raises NameError.
    PBB_Core.WDItemEngine.log(
        'INFO',
        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'.format(
            main_data_id=str(self.entrezgene),
            exception_type='',
            message=f.name,
            wd_id=self.wdid,
            duration=time.time() - self.start
        ))