def create_item(self, login):
    """
    Create or update the Wikidata item for this InterPro term.

    Two statements are written, both referenced with ``self.reference``:
    the InterPro ID of the term and a P279 statement pointing at
    ``self.type_wdid``. The English label, every description found in
    ``self.lang_descr`` and the aliases (short name and InterPro ID) are
    set before the item is handed to ``PBB_Helpers.try_write``.

    :param login: login object forwarded to ``PBB_Helpers.try_write``
    :return: the ``PBB_Core.WDItemEngine`` instance that was built
    """
    interpro_id_statement = PBB_Core.WDExternalID(
        value=self.id,
        prop_nr=INTERPRO,
        references=[self.reference],
    )
    type_statement = PBB_Core.WDItemID(
        value=self.type_wdid,
        prop_nr="P279",
        references=[self.reference],
    )

    wd_item = PBB_Core.WDItemEngine(
        item_name=self.name,
        domain='interpro',
        data=[interpro_id_statement, type_statement],
        append_value=["P279"],
        fast_run=True,
        fast_run_base_filter=IPRTerm.fast_run_base_filter,
    )

    wd_item.set_label(self.name, lang='en')
    for language in self.lang_descr:
        wd_item.set_description(self.lang_descr[language], lang=language)
    wd_item.set_aliases([self.short_name, self.id])

    PBB_Helpers.try_write(wd_item, self.id, INTERPRO, login)
    return wd_item
def create_relationships(self, login):
    """
    Write the parent / contains / found-in relationships of this term.

    The Wikidata IDs of the related terms are resolved first through
    ``self.do_wdid_lookup()``. If that lookup raises ``KeyError`` the error
    is logged and nothing is written. Nothing is written either when the
    term has no relationship at all (only the InterPro ID statement exists).

    Statements produced (all referenced with ``self.reference``):
      * P279 for ``self.parent``
      * P527 for each entry of ``self.contains``
      * P361 for each entry of ``self.found_in``

    :param login: login object forwarded to ``PBB_Helpers.try_write``
    :return: None
    """
    try:
        # endpoint may not get updated in time?
        self.do_wdid_lookup()
    except KeyError as lookup_error:
        message = format_msg(self.id, INTERPRO, None, str(lookup_error),
                             type(lookup_error))
        PBB_Core.WDItemEngine.log("ERROR", message)
        return

    # The InterPro ID statement is always present.
    statements = [
        PBB_Core.WDExternalID(value=self.id, prop_nr=INTERPRO,
                              references=[self.reference])
    ]

    if self.parent:
        # subclass of
        statements.append(
            PBB_Core.WDItemID(value=self.parent_wdid, prop_nr='P279',
                              references=[self.reference]))

    if self.contains:
        # has part
        statements.extend([
            PBB_Core.WDItemID(value=part_wdid, prop_nr='P527',
                              references=[self.reference])
            for part_wdid in self.contains_wdid
        ])

    if self.found_in:
        # part of
        statements.extend([
            PBB_Core.WDItemID(value=whole_wdid, prop_nr='P361',
                              references=[self.reference])
            for whole_wdid in self.found_in_wdid
        ])

    # Only the ID statement is present: there is no relationship to write.
    if len(statements) == 1:
        return

    wd_item = PBB_Core.WDItemEngine(
        wd_item_id=self.wdid,
        domain='interpro',
        data=statements,
        append_value=['P279', 'P527', 'P361'],
        fast_run=True,
        fast_run_base_filter=IPRTerm.fast_run_base_filter,
    )
    PBB_Helpers.try_write(
        wd_item, self.id, INTERPRO, login,
        edit_summary="create/update subclass/has part/part of")
def make_ref_source(source_doc, id_prop, identifier, login=None):
    """
    Build the list of reference statements for one identifier of one source.

    The reference is made up of:
      * stated in (P248): the release item when ``source_doc`` has a
        ``'release'`` key, otherwise the item of the source itself
      * link to id: the identifier in the source, stored under
        ``prop_ids[id_prop]``
      * retrieved (P813): only when the source has no release; taken from
        ``source_doc['timestamp']`` (format ``%Y%m%d``)

    Example source documents::

        {'_id': 'uniprot', 'timestamp': '20161006'}
        {'_id': 'ensembl', 'release': 86, 'timestamp': '20161005'}

    ``source_doc`` is only read; it is not modified.

    :param source_doc: dict describing the source (see examples above)
    :param id_prop: key into ``prop_ids`` naming the identifier property
    :param identifier: identifier value within the source
    :param login: must be passed if you want to be able to create new
        release items
    :return: list of reference statements
    :raises ValueError: if the source or ``id_prop`` is unknown
    """
    source = source_doc['_id']
    if source not in source_items:
        raise ValueError(
            "Unknown source for reference creation: {}".format(source))
    if id_prop not in prop_ids:
        raise ValueError(
            "Unknown id_prop for reference creation: {}".format(id_prop))

    link_to_id = PBB_Core.WDString(value=str(identifier),
                                   prop_nr=prop_ids[id_prop],
                                   is_reference=True)

    if "release" in source_doc:
        # Work on a local string instead of overwriting the caller's value.
        release_label = str(source_doc['release'])
        title = "{} Release {}".format(source, release_label)
        description = "Release {} of {}".format(release_label, source)
        edition_of_wdid = source_items[source]
        release_wdid = PBB_Helpers.Release(
            title, description, release_label,
            edition_of_wdid=edition_of_wdid).get_or_create(login)

        stated_in = PBB_Core.WDItemID(value=release_wdid, prop_nr='P248',
                                      is_reference=True)
        return [stated_in, link_to_id]

    # No release number: reference the source itself plus a retrieval date.
    retrieved_on = datetime.strptime(source_doc['timestamp'], "%Y%m%d")
    stated_in = PBB_Core.WDItemID(value=source_items[source], prop_nr='P248',
                                  is_reference=True)
    retrieved = PBB_Core.WDTime(retrieved_on.strftime('+%Y-%m-%dT00:00:00Z'),
                                prop_nr='P813', is_reference=True)
    return [stated_in, retrieved, link_to_id]
def create_uniprot_relationships(login, release_wdid, collection, taxon=None):
    """
    Add InterPro family (P279) and domain (P527) statements to protein items.

    Only UniProt proteins that already have a Wikidata item are processed:
    the documents are fetched from ``collection`` by the IDs returned from
    ``PBB_Helpers.id_mapper``. Each document is expected to provide the keys
    ``'_id'`` (UniProt ID), ``'subclass'`` and ``'has_part'`` (InterPro IDs).

    :param login: login object forwarded to ``PBB_Helpers.try_write``
    :param release_wdid: Wikidata ID of the InterPro release, used as the
        "stated in" (P248) reference
    :param collection: MongoDB collection holding the protein documents
    :param taxon: optional taxon Wikidata ID; when given, only proteins with
        that P703 value are looked up
    :raises ValueError: if the item engine would create a new item instead
        of updating the existing protein item
    """
    # only do uniprot proteins that are already in wikidata
    if taxon:
        uniprot2wd = PBB_Helpers.id_mapper(UNIPROT, (("P703", taxon),))
        fast_run_base_filter = {UNIPROT: "", "P703": taxon}
    else:
        uniprot2wd = PBB_Helpers.id_mapper(UNIPROT)
        fast_run_base_filter = {UNIPROT: ""}

    cursor = collection.find({'_id': {'$in': list(uniprot2wd)}},
                             no_cursor_timeout=True)
    try:
        for doc in tqdm(cursor, total=cursor.count()):
            uniprot_id = doc['_id']

            if uniprot_id not in uniprot2wd:
                # The ID has no Wikidata item, so there is nothing to look up
                # or update: report it and move on to the next document.
                print("wdid_not_found " + uniprot_id)
                PBB_Core.WDItemEngine.log(
                    "ERROR",
                    PBB_Helpers.format_msg(uniprot_id, UNIPROT, None,
                                           "wdid_not_found"))
                continue

            # References: stated in InterPro version XX.X + protein page URL.
            ref_stated_in = PBB_Core.WDItemID(release_wdid, 'P248',
                                              is_reference=True)
            ref_ipr = PBB_Core.WDString(
                "http://www.ebi.ac.uk/interpro/protein/{}".format(uniprot_id),
                "P854", is_reference=True)
            reference = [ref_stated_in, ref_ipr]

            # No UniProt ID statement is added: the item is addressed
            # directly through its Wikidata ID below.
            statements = []
            if doc['subclass']:
                for family in doc['subclass']:
                    statements.append(
                        PBB_Core.WDItemID(value=IPRTerm.ipr2wd[family],
                                          prop_nr='P279',
                                          references=[reference]))
            if doc['has_part']:
                for part in doc['has_part']:
                    statements.append(
                        PBB_Core.WDItemID(value=IPRTerm.ipr2wd[part],
                                          prop_nr='P527',
                                          references=[reference]))

            wd_item = PBB_Core.WDItemEngine(
                wd_item_id=uniprot2wd[uniprot_id],
                domain="proteins",
                data=statements,
                fast_run=True,
                fast_run_base_filter=fast_run_base_filter,
                append_value=["P279", "P527", "P361"])

            if wd_item.create_new_item:
                raise ValueError("something bad happened")

            # NOTE(review): the record ID is a UniProt ID but the property
            # passed here is INTERPRO -- confirm this is intended.
            PBB_Helpers.try_write(
                wd_item, uniprot_id, INTERPRO, login,
                edit_summary="add/update family and/or domains")
    finally:
        # The cursor was opened with no_cursor_timeout, so it must be closed
        # explicitly even when processing fails part-way through.
        cursor.close()
def run_encodes(login, records):
    """
    Link each gene item to its protein item for every record.

    Gene items are found by Entrez gene ID (P351) and protein items by
    Ensembl protein ID (P705), both restricted to items whose P703 value is
    ``strain_info['organism_wdid']``. Records whose gene or protein has no
    Wikidata item are logged as errors and skipped.

    NOTE(review): ``strain_info`` is not a parameter; it is presumably a
    module-level name -- confirm against the rest of the file.

    :param login: login object forwarded to ``gene_encodes_statement``
    :param records: iterable of record dicts providing ``'_id'``,
        ``'entrezgene'`` and ``'ensembl'`` entries
    """
    organism_filter = (("P703", strain_info['organism_wdid']),)
    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = PBB_Helpers.id_mapper("P351", organism_filter)
    # get all ensembl protein id -> wdid mappings, where found in taxon is this strain
    protein_wdid_mapping = PBB_Helpers.id_mapper("P705", organism_filter)

    for record in tqdm(records, desc=strain_info['organism_name']):
        entrez_gene = str(record['entrezgene']['@value'])
        if entrez_gene not in gene_wdid_mapping:
            # Argument order (id, property, wdid, message) matches the other
            # format_msg calls in this code base.
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(record['_id']['@value'], ENTREZ_PROP, None,
                           "gene_not_found"))
            continue

        ensembl_protein = record['ensembl']['@value']['protein']
        if ensembl_protein not in protein_wdid_mapping:
            # A missing protein item must not abort the whole run.
            PBB_Core.WDItemEngine.log(
                "ERROR",
                format_msg(ensembl_protein, "P705", None,
                           "protein_not_found"))
            continue

        gene_qid = gene_wdid_mapping[entrez_gene]
        protein_qid = protein_wdid_mapping[ensembl_protein]
        gene_encodes_statement(gene_qid, protein_qid, 'ncbi_gene',
                               entrez_gene, record['ensembl']['@source'],
                               login)
def main(log_dir="./logs", run_id=None):
    """
    Entry point of the gene bot: fetch the records, set up logging and run.

    Run metadata (run ID, timestamp, log name, source versions) is stored in
    the module-level ``__metadata__`` dict and written as the log header.

    :param log_dir: directory the log file is written to
    :param run_id: identifier of this run; defaults to the current time
        formatted as ``%Y%m%d_%H:%M``
    """
    if run_id is None:
        run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = 'YeastBot_gene-{}.log'.format(run_id)

    __metadata__['run_id'] = run_id
    __metadata__['timestamp'] = str(datetime.now())
    __metadata__['log_name'] = log_name
    __metadata__['sources'] = get_source_versions()

    records = get_data_from_mygene()
    login = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)
    chrom_wdid = PBB_Helpers.id_mapper("P2249", (("P703", "Q27510868"), ))

    if PBB_Core.WDItemEngine.logger is not None:
        # Drop handlers left over from a previous run so this run logs only
        # to its own file. The attribute on logging.Logger is ``handlers``;
        # assigning to ``handles`` created an unused attribute instead.
        PBB_Core.WDItemEngine.logger.handlers = []
    PBB_Core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    run(login, records, chrom_wdid)
def main(version_info, log_dir="./logs", run_id=None,
         mongo_uri="mongodb://localhost:27017", mongo_db="wikidata_src",
         mongo_coll="interpro_protein", taxon=None):
    """
    Entry point of the InterPro protein bot.

    Gets or creates the Wikidata item of the InterPro release described by
    ``version_info``, sets up logging, then adds the InterPro relationships
    to the protein items via ``create_uniprot_relationships``.

    :param version_info: release information parsed from the InterPro xml
        file, e.g. ``{"_id": "INTERPRO", "dbname": "INTERPRO",
        "file_date": "03-NOV-16", "version": "60.0",
        "entry_count": "29700"}``
    :param log_dir: directory for the log file; ``None`` means ``"./logs"``
    :param run_id: identifier of this run; defaults to the current time
        formatted as ``%Y%m%d_%H:%M``
    :param mongo_uri: MongoDB connection URI
    :param mongo_db: name of the MongoDB database
    :param mongo_coll: name of the collection holding the protein documents
    :param taxon: optional taxon Wikidata ID restricting the proteins
    :return: path of the log file
    """
    # data sources
    db = MongoClient(mongo_uri)[mongo_db]
    collection = db[mongo_coll]

    if run_id is None:
        run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    if log_dir is None:
        log_dir = "./logs"
    __metadata__['run_id'] = run_id
    __metadata__['timestamp'] = str(datetime.now())

    login = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # handle version_info (parsed from the interpro xml file)
    version = version_info['version']
    pub_date = date_parse(version_info['file_date'])
    release = PBB_Helpers.Release(
        title="InterPro Release {}".format(version),
        description="Release {} of the InterPro database & software".format(version),
        edition_of_wdid="Q3047275",
        edition=version,
        pub_date=pub_date,
        archive_url="ftp://ftp.ebi.ac.uk/pub/databases/interpro/{}/".format(version))
    release_wdid = release.get_or_create(login)

    __metadata__['release'] = {
        'InterPro': {
            'release': version,
            '_id': 'InterPro',
            'wdid': release_wdid,
            'timestamp': str(pub_date),
        }
    }

    log_name = '{}-{}.log'.format(__metadata__['name'], __metadata__['run_id'])
    if PBB_Core.WDItemEngine.logger is not None:
        # Drop handlers left over from a previous run so this run logs only
        # to its own file. The attribute on logging.Logger is ``handlers``;
        # assigning to ``handles`` created an unused attribute instead.
        PBB_Core.WDItemEngine.logger.handlers = []
    PBB_Core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    create_uniprot_relationships(login, release_wdid, collection, taxon=taxon)

    return os.path.join(log_dir, log_name)
def wd_item_construction(record, strain_info, chrom_wdid, login):
    """
    Build the gene item for one record and write it to Wikidata.

    The item carries referenced statements for the external IDs (Entrez
    gene, Ensembl gene, NCBI locus tag), "subclass of" gene (Q7187), the
    taxon of the strain, and the genomic position (strand orientation,
    start, end, chromosome).

    :param record: gene record; every field is a dict with ``'@value'`` and,
        for referenced fields, ``'@source'``
    :param strain_info: dict providing ``organism_wdid``, ``organism_type``,
        ``organism_name`` and ``chrom_genomeid_map``
    :param chrom_wdid: mapping from genome ID to chromosome Wikidata ID
    :param login: login object forwarded to ``PBB_Helpers.try_write``
    """
    # If the source is "entrez", the reference identifier to be used is
    # "entrez_gene". These are defined in HelperBot.
    source_ref_id = {
        'Ensembl': 'ensembl_gene',
        'Entrez': 'entrez_gene',
        'Uniprot': 'uniprot'
    }

    label = '{} {}'.format(record['name']['@value'],
                           record['ensembl']['@value']['gene'])
    description = '{} gene found in {}'.format(strain_info['organism_type'],
                                               strain_info['organism_name'])

    statements = []
    add = statements.append

    ############
    # external IDs (also used for the reference statements)
    ############
    external_ids = {
        'entrez_gene': str(record['entrezgene']['@value']),
        'ensembl_gene': record['ensembl']['@value']['gene'],
        'locus_tag': record['locus_tag']['@value']
    }

    # entrez gene id
    entrez_ref = make_ref_source(record['entrezgene']['@source'],
                                 'entrez_gene', external_ids['entrez_gene'])
    add(PBB_Core.WDString(external_ids['entrez_gene'],
                          PROPS['Entrez Gene ID'],
                          references=[entrez_ref]))

    # ensembl gene id
    ensembl_ref = make_ref_source(record['ensembl']['@source'],
                                  'ensembl_gene', external_ids['ensembl_gene'])
    add(PBB_Core.WDString(external_ids['ensembl_gene'],
                          PROPS['Ensembl Gene ID'],
                          references=[ensembl_ref]))

    # ncbi locus tag
    add(PBB_Core.WDString(external_ids['locus_tag'],
                          PROPS['NCBI Locus tag'],
                          references=[entrez_ref]))

    ############
    # statements with no referencable sources (made by hand, for now...)
    ############
    # subclass of gene
    add(PBB_Core.WDItemID('Q7187', PROPS['subclass of'],
                          references=[ensembl_ref]))
    # found in taxon
    add(PBB_Core.WDItemID(strain_info['organism_wdid'],
                          PROPS['found in taxon'],
                          references=[ensembl_ref]))

    ############
    # genomic position: start, end, strand orientation, chromosome
    ############
    position = record['genomic_pos']['@value']
    position_source = record['genomic_pos']['@source']
    position_id_prop = source_ref_id[position_source['_id']]
    position_ref = make_ref_source(position_source, position_id_prop,
                                   external_ids[position_id_prop])

    # The genome ID of the chromosome serves both as qualifier value
    # (Refseq Genome ID) and as key into chrom_wdid.
    genome_id = strain_info['chrom_genomeid_map'][position['chr']]
    chrom_qualifier = PBB_Core.WDString(genome_id, 'P2249', is_qualifier=True)

    # strand orientation
    if position['strand'] == 1:
        strand_wdid = 'Q22809680'
    else:
        strand_wdid = 'Q22809711'
    add(PBB_Core.WDItemID(strand_wdid, PROPS['strand orientation'],
                          references=[position_ref]))

    # genomic start and end
    add(PBB_Core.WDString(str(int(position['start'])),
                          PROPS['genomic start'],
                          references=[position_ref],
                          qualifiers=[chrom_qualifier]))
    add(PBB_Core.WDString(str(int(position['end'])),
                          PROPS['genomic end'],
                          references=[position_ref],
                          qualifiers=[chrom_qualifier]))

    # chromosome
    add(PBB_Core.WDItemID(chrom_wdid[genome_id], PROPS['chromosome'],
                          references=[position_ref]))

    base_filter = {
        PROPS['Entrez Gene ID']: '',
        PROPS['found in taxon']: strain_info['organism_wdid']
    }
    wd_item_gene = PBB_Core.WDItemEngine(
        item_name=label,
        domain='genes',
        data=statements,
        append_value=[PROPS['subclass of']],
        fast_run=True,
        fast_run_base_filter=base_filter)

    wd_item_gene.set_label(label)
    wd_item_gene.set_description(description, lang='en')
    aliases = [record['symbol']['@value'], record['locus_tag']['@value']]
    wd_item_gene.set_aliases(aliases)

    PBB_Helpers.try_write(wd_item_gene, record['_id']['@value'], ENTREZ_PROP,
                          login)
def main(version_info, log_dir="./logs", run_id=None,
         mongo_uri="mongodb://localhost:27017", mongo_db="wikidata_src",
         mongo_coll="interpro", debug=False):
    """
    Entry point of the InterPro item bot.

    Gets or creates the Wikidata item of the InterPro release described by
    ``version_info``, sets up logging, creates/updates one item per document
    of the InterPro collection and finally writes the relationships between
    those items.

    :param version_info: release information parsed from the InterPro xml
        file, e.g. ``{"_id": "INTERPRO", "dbname": "INTERPRO",
        "file_date": "03-NOV-16", "version": "60.0",
        "entry_count": "29700"}``
    :param log_dir: directory for the log file; ``None`` means ``"./logs"``
    :param run_id: identifier of this run; defaults to the current time
        formatted as ``%Y%m%d_%H:%M``
    :param mongo_uri: MongoDB connection URI
    :param mongo_db: name of the MongoDB database
    :param mongo_coll: name of the collection holding the InterPro terms
    :param debug: when true, stop creating items once the document index
        exceeds 100
    :return: path of the log file
    """
    # data sources
    db = MongoClient(mongo_uri)[mongo_db]
    interpro_coll = db[mongo_coll]

    if run_id is None:
        run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    if log_dir is None:
        log_dir = "./logs"
    __metadata__['run_id'] = run_id
    __metadata__['timestamp'] = str(datetime.now())

    login = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # handle version_info (parsed from the interpro xml file)
    version = version_info['version']
    pub_date = date_parse(version_info['file_date'])
    release = PBB_Helpers.Release(
        title="InterPro Release {}".format(version),
        description="Release {} of the InterPro database & software".format(
            version),
        edition_of_wdid="Q3047275",
        edition=version,
        pub_date=pub_date,
        archive_url="ftp://ftp.ebi.ac.uk/pub/databases/interpro/{}/".format(
            version))
    release_wdid = release.get_or_create(login)

    __metadata__['release'] = {
        'InterPro': {
            'release': version,
            '_id': 'InterPro',
            'wdid': release_wdid,
            'timestamp': str(pub_date)
        }
    }

    log_name = '{}-{}.log'.format(__metadata__['name'], __metadata__['run_id'])
    if PBB_Core.WDItemEngine.logger is not None:
        # Drop handlers left over from a previous run so this run logs only
        # to its own file. The attribute on logging.Logger is ``handlers``;
        # assigning to ``handles`` created an unused attribute instead.
        PBB_Core.WDItemEngine.logger.handlers = []
    PBB_Core.WDItemEngine.setup_logging(log_dir=log_dir,
                                        log_name=log_name,
                                        header=json.dumps(__metadata__))

    # create/update all interpro items
    terms = []
    cursor = interpro_coll.find(no_cursor_timeout=True)
    try:
        for n, doc in tqdm(enumerate(cursor), total=cursor.count()):
            doc['release_wdid'] = release_wdid
            term = IPRTerm(**doc)
            term.create_item(login)
            terms.append(term)
            if debug and n > 100:
                break
    finally:
        # The cursor was opened with no_cursor_timeout, so it must be closed
        # explicitly even when item creation fails part-way through.
        cursor.close()

    # create/update interpro item relationships
    # (the ID mapping is refreshed first so newly created items are found)
    IPRTerm.refresh_ipr_wd()
    for term in tqdm(terms):
        term.create_relationships(login)

    return os.path.join(log_dir, log_name)
def protein_item(record, strain_info, gene_qid, go_wdid_mapping, login,
                 add_pubmed):
    """
    Build the protein item for one record and write it to Wikidata.

    The item carries referenced statements for the external IDs (Ensembl
    protein, RefSeq protein, UniProt), one statement per GO annotation
    (qualified with its evidence code), "subclass of" protein (Q8054), the
    taxon of the strain and the link to the gene item.

    NOTE(review): ``source_ref_id``, ``go_props`` and ``go_evidence_codes``
    are not defined in this function; they are presumably module-level
    names -- confirm against the rest of the file.

    :param record: protein record; every field is a dict with ``'@value'``
        and, for referenced fields, ``'@source'``
    :param strain_info: dict providing ``organism_wdid``, ``organism_type``
        and ``organism_name``
    :param gene_qid: Wikidata ID of the gene item of this protein
    :param go_wdid_mapping: mapping from GO ID to Wikidata ID
    :param login: login object used for writing and for creating PubMed stubs
    :param add_pubmed: when true, add each PubMed ID of a GO annotation as an
        extra "stated in" (P248) reference
    """
    item_name = '{} {}'.format(record['name']['@value'],
                               record['ensembl']['@value']['protein'])
    item_description = '{} protein found in {}'.format(
        strain_info['organism_type'], strain_info['organism_name'])

    s = []

    ############
    # external IDs (also used for the reference statements)
    ############
    external_ids = {
        'entrez_gene': str(record['entrezgene']['@value']),
        'ensembl_protein': record['ensembl']['@value']['protein'],
        'ensembl_gene': record['ensembl']['@value']['gene'],
        'refseq_protein': record['refseq']['@value']['protein'],
        'uniprot': record['uniprot']['@value']['Swiss-Prot']
    }

    # ensembl protein id
    ensembl_ref = make_ref_source(record['ensembl']['@source'],
                                  'ensembl_protein',
                                  external_ids['ensembl_protein'])
    s.append(
        PBB_Core.WDString(external_ids['ensembl_protein'], 'P705',
                          references=[ensembl_ref]))

    # refseq protein id
    refseq_ref = make_ref_source(record['refseq']['@source'],
                                 'refseq_protein',
                                 external_ids['refseq_protein'])
    s.append(
        PBB_Core.WDString(external_ids['refseq_protein'], 'P637',
                          references=[refseq_ref]))

    # uniprot id
    uniprot_ref = make_ref_source(record['uniprot']['@source'], 'uniprot',
                                  external_ids['uniprot'])
    s.append(
        PBB_Core.WDString(external_ids['uniprot'], 'P352',
                          references=[uniprot_ref]))

    ############
    # GO terms
    # TODO: https://www.wikidata.org/wiki/Q3460832
    ############
    preprocess_go(record)
    go_source = record['go']['@source']
    go_id_prop = source_ref_id[go_source['_id']]
    reference = make_ref_source(go_source, go_id_prop,
                                external_ids[go_id_prop])
    for go_level, go_records in record['go']['@value'].items():
        level_wdid = go_props[go_level]
        for go_record in go_records:
            go_wdid = go_wdid_mapping[go_record['id']]
            evidence_wdid = go_evidence_codes[go_record['evidence']]
            evidence_statement = PBB_Core.WDItemID(value=evidence_wdid,
                                                   prop_nr='P459',
                                                   is_qualifier=True)
            # Each annotation gets its own copy so PubMed references added
            # for one annotation do not leak into the others.
            this_reference = copy.deepcopy(reference)
            if add_pubmed:
                for pubmed in go_record['pubmed']:
                    pmid_wdid = PBB_Helpers.PubmedStub(pubmed).create(login)
                    this_reference.append(
                        PBB_Core.WDItemID(pmid_wdid, 'P248',
                                          is_reference=True))
            s.append(
                PBB_Core.WDItemID(go_wdid, level_wdid,
                                  references=[this_reference],
                                  qualifiers=[evidence_statement]))

    ############
    # statements with no referencable sources (made by hand, for now...)
    ############
    # subclass of protein
    s.append(PBB_Core.WDItemID('Q8054', 'P279', references=[ensembl_ref]))
    # found in taxon
    s.append(
        PBB_Core.WDItemID(strain_info['organism_wdid'], 'P703',
                          references=[ensembl_ref]))
    # link to the gene item (P702)
    s.append(PBB_Core.WDItemID(gene_qid, 'P702', references=[ensembl_ref]))

    try:
        wd_item_protein = PBB_Core.WDItemEngine(
            item_name=item_name,
            domain='proteins',
            data=s,
            append_value=['P279'],
            fast_run=True,
            fast_run_base_filter={
                'P352': '',
                'P703': strain_info['organism_wdid']
            })
        wd_item_protein.set_label(item_name)
        wd_item_protein.set_description(item_description, lang='en')
        wd_item_protein.set_aliases(
            [record['symbol']['@value'], record['locus_tag']['@value']])
    except Exception as e:
        # Best effort: a record that cannot be turned into an item is
        # reported and skipped so the rest of the run can continue.
        print(e)
        # Argument order (id, property, wdid, message) matches the other
        # format_msg calls in this code base.
        PBB_Core.WDItemEngine.log(
            "ERROR",
            format_msg(record['entrezgene']['@value'], ENTREZ_PROP, None,
                       str(e)))
        return

    try_write(wd_item_protein, record['entrezgene']['@value'], 'P351', login)
def refresh_ipr_wd(cls):
    """Re-run the InterPro ID lookup and store the result on ``cls.ipr2wd``."""
    refreshed_mapping = PBB_Helpers.id_mapper(INTERPRO)
    cls.ipr2wd = refreshed_mapping
class IPRTerm:
    """
    Represents one interproscan term/item

    Example of the document a term is built from::

        {'children': ['IPR020635'],
         'contains': ['IPR001824', 'IPR002011', 'IPR008266', 'IPR017441'],
         'description': 'InterPro Domain',
         'found_in': ['IPR009136', 'IPR012234', 'IPR020777'],
         'id': 'IPR001245',
         'name': 'Serine-threonine/tyrosine-protein kinase catalytic domain',
         'parent': 'IPR000719',
         'short_name': 'Ser-Thr/Tyr_kinase_cat_dom',
         'type': 'Domain',
         'type_wdid': 'Q898273'}
    """
    # Base filter handed to the item engine for fast-run mode.
    fast_run_base_filter = {INTERPRO: ''}
    # InterPro ID lookup shared by all terms; populated when the class is
    # defined and reloaded through refresh_ipr_wd().
    ipr2wd = PBB_Helpers.id_mapper(INTERPRO)

    # Default English description for each InterPro entry type.
    type2desc = {
        "Active_site": "InterPro Active Site",
        "Binding_site": "InterPro Binding Site",
        "Conserved_site": "InterPro Conserved Site",
        "Domain": "InterPro Domain",
        "Family": "InterPro Family",
        "PTM": "InterPro PTM",
        "Repeat": "InterPro Repeat"
    }
    # Wikidata item each InterPro entry type is a subclass of.
    type2wdid = {
        "Active_site": "Q423026",  # Active site
        "Binding_site": "Q616005",  # Binding site
        "Conserved_site": "Q7644128",  # Supersecondary_structure
        "Domain": "Q898273",  # Protein domain
        "Family": "Q417841",  # Protein family
        "PTM": "Q898362",  # Post-translational modification
        "Repeat": "Q3273544"  # Structural motif
    }

    def __init__(self, name=None, short_name=None, id=None, parent=None,
                 children=None, contains=None, found_in=None, type=None,
                 description=None, release_wdid=None, **kwargs):
        """
        Store the fields of one InterPro term and build its reference.

        The ``*_wdid`` attributes of related terms stay ``None`` until
        ``do_wdid_lookup`` is called. Extra keyword arguments are ignored.

        :raises KeyError: if ``type`` is not a key of ``type2wdid``
        """
        self.name = name
        self.short_name = short_name
        self.id = id
        self.wdid = None

        # subclass of (P279)
        self.parent = parent
        self.parent_wdid = None
        # not added to wd
        self.children = children
        self.children_wdid = None
        # has part (P527)
        self.contains = contains
        self.contains_wdid = None
        # part of (P361)
        self.found_in = found_in
        self.found_in_wdid = None

        self.type = type
        # subclass of (from type2wdid)
        self.type_wdid = IPRTerm.type2wdid[self.type]

        # Fall back to the generic per-type description when none is given.
        if description is None and self.type:
            description = IPRTerm.type2desc[self.type]
        self.description = description
        self.lang_descr = {'en': self.description}

        self.release_wdid = release_wdid
        self.reference = None
        self.create_reference()

    def __repr__(self):
        return '{0}: {1}'.format(self.id, self.name)

    def __str__(self):
        return '{0}: {1}'.format(self.id, self.name)

    @classmethod
    def refresh_ipr_wd(cls):
        """Re-run the InterPro ID lookup and store it on ``cls.ipr2wd``."""
        refreshed_mapping = PBB_Helpers.id_mapper(INTERPRO)
        cls.ipr2wd = refreshed_mapping

    def do_wdid_lookup(self):
        """
        Resolve the Wikidata IDs of this term and of its related terms.

        This can only be done after all items have been created.

        :raises KeyError: if any InterPro ID is missing from ``ipr2wd``
        """
        lookup = IPRTerm.ipr2wd
        self.wdid = lookup[self.id]
        if self.parent:
            self.parent_wdid = lookup[self.parent]
        # children aren't added (reverse of parent relationship)
        if self.contains:
            self.contains_wdid = [lookup[ipr_id] for ipr_id in self.contains]
        if self.found_in:
            self.found_in_wdid = [lookup[ipr_id] for ipr_id in self.found_in]

    def create_reference(self):
        """
        Create wikidata references for interpro

        This same reference will be used for everything. Except for a ref to
        the interpro item itself
        """
        self.reference = [
            # stated in Interpro version XX.X
            PBB_Core.WDItemID(self.release_wdid, 'P248', is_reference=True),
            # interpro ID
            PBB_Core.WDString(self.id, INTERPRO, is_reference=True),
        ]

    def create_item(self, login):
        """
        Create or update the Wikidata item for this InterPro term.

        :param login: login object forwarded to ``PBB_Helpers.try_write``
        :return: the ``PBB_Core.WDItemEngine`` instance that was built
        """
        interpro_id_statement = PBB_Core.WDExternalID(
            value=self.id, prop_nr=INTERPRO, references=[self.reference])
        type_statement = PBB_Core.WDItemID(
            value=self.type_wdid, prop_nr="P279",
            references=[self.reference])

        wd_item = PBB_Core.WDItemEngine(
            item_name=self.name,
            domain='interpro',
            data=[interpro_id_statement, type_statement],
            append_value=["P279"],
            fast_run=True,
            fast_run_base_filter=IPRTerm.fast_run_base_filter)

        wd_item.set_label(self.name, lang='en')
        for language in self.lang_descr:
            wd_item.set_description(self.lang_descr[language], lang=language)
        wd_item.set_aliases([self.short_name, self.id])

        PBB_Helpers.try_write(wd_item, self.id, INTERPRO, login)
        return wd_item

    def create_relationships(self, login):
        """
        Write the parent / contains / found-in relationships of this term.

        Nothing is written when the Wikidata ID lookup fails (the error is
        logged) or when the term has no relationship at all.

        :param login: login object forwarded to ``PBB_Helpers.try_write``
        """
        try:
            # endpoint may not get updated in time?
            self.do_wdid_lookup()
        except KeyError as lookup_error:
            message = format_msg(self.id, INTERPRO, None, str(lookup_error),
                                 type(lookup_error))
            PBB_Core.WDItemEngine.log("ERROR", message)
            return

        # The InterPro ID statement is always present.
        statements = [
            PBB_Core.WDExternalID(value=self.id, prop_nr=INTERPRO,
                                  references=[self.reference])
        ]

        if self.parent:
            # subclass of
            statements.append(
                PBB_Core.WDItemID(value=self.parent_wdid, prop_nr='P279',
                                  references=[self.reference]))

        if self.contains:
            # has part
            statements.extend([
                PBB_Core.WDItemID(value=part_wdid, prop_nr='P527',
                                  references=[self.reference])
                for part_wdid in self.contains_wdid
            ])

        if self.found_in:
            # part of
            statements.extend([
                PBB_Core.WDItemID(value=whole_wdid, prop_nr='P361',
                                  references=[self.reference])
                for whole_wdid in self.found_in_wdid
            ])

        # Only the ID statement is present: no relationship to write.
        if len(statements) == 1:
            return

        wd_item = PBB_Core.WDItemEngine(
            wd_item_id=self.wdid,
            domain='interpro',
            data=statements,
            append_value=['P279', 'P527', 'P361'],
            fast_run=True,
            fast_run_base_filter=IPRTerm.fast_run_base_filter)
        PBB_Helpers.try_write(
            wd_item, self.id, INTERPRO, login,
            edit_summary="create/update subclass/has part/part of")