def test_make_gene_class():
    """
    Exercise the ``Gene`` class end to end on a single mygene record.

    Fetches the document whose ``_id`` is ``'100861512'`` from the
    ``wikidata_src.mygene`` Mongo collection, validates and tags it with the
    source metadata, builds a ``Gene`` for it, then calls ``create_item`` and
    ``remove_deprecated_statements``.

    NOTE(review): ``create_item`` is called with ``write=True`` using the
    ``WDUSER``/``WDPASS`` credentials, so this presumably performs real edits --
    confirm before running.
    """
    coll = MongoClient().wikidata_src.mygene
    metadata_coll = MongoClient().wikidata_src.mygene_sources
    metadata = metadata_coll.find_one()

    doc_filter = {'_id': '100861512'}
    docs = coll.find(doc_filter)
    print("total number of records: {}".format(coll.find(doc_filter).count()))

    validate_type = 'eukaryotic'
    docs = HelperBot.validate_docs(docs, validate_type, 'P351')
    records = HelperBot.tag_mygene_docs(docs, metadata)
    # only the first validated record is used
    record = next(records)

    organism_info = {
        # the species name had been mangled to "H**o sapiens"; restored here
        "name": "Homo sapiens",
        "type": "mammalian",
        "wdid": "Q15978631",
        'taxid': 9606
    }

    login = wdi_login.WDLogin(WDUSER, WDPASS)

    gene = Gene(record, organism_info, login)
    gene.create_item(fast_run=False, write=True)
    gene.remove_deprecated_statements()
def validate_all_human_genes():
    """
    Push every human gene record through the validator.

    Selects all documents with ``taxid`` 9606 that carry an ``entrezgene``
    field from ``wikidata_src.mygene``, validates them as ``'eukaryotic'`` and
    tags them with the source metadata. The tagged records are consumed and
    thrown away; according to the original author's note the purpose is the
    log file produced along the way.
    """
    gene_coll = MongoClient().wikidata_src.mygene
    source_versions = MongoClient().wikidata_src.mygene_sources.find_one()

    human_with_entrez = {'taxid': 9606, 'entrezgene': {'$exists': True}}
    cursor = gene_coll.find(human_with_entrez)
    n_records = gene_coll.find(human_with_entrez).count()
    print("total number of records: {}".format(n_records))

    validated = HelperBot.validate_docs(cursor, 'eukaryotic', 'P351')
    tagged = HelperBot.tag_mygene_docs(validated, source_versions)

    # drain the pipeline so that every record actually gets validated
    _ = [record for record in tagged]
def genes():
    """
    Disambiguate duplicated gene symbols for taxon 7955.

    Loads all records of taxon 7955 that have an ``entrezgene`` field, keeps
    only those whose symbol occurs more than once, rewrites each such symbol to
    ``"symbol (entrezgene)"``, drops records whose Entrez id is not present in
    the ``id_mapper("P351")`` mapping (no new items are created) and hands the
    remainder to a ``ChromosomalGeneBot`` run with ``fast_run=True`` and
    ``write=True``.
    """
    taxid = 7955

    known_entrez = id_mapper("P351")
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    gene_coll = MongoClient().wikidata_src.mygene
    source_versions = MongoClient().wikidata_src.mygene_sources.find_one()
    organism_info = organisms_info[taxid]

    cursor = gene_coll.find({'taxid': taxid, 'entrezgene': {'$exists': True}}).batch_size(20)
    total = cursor.count()
    print("total number of records: {}".format(total))

    validated = HelperBot.validate_docs(cursor, 'eukaryotic', PROPS['Entrez Gene ID'])
    records = list(HelperBot.tag_mygene_docs(validated, source_versions))

    # symbols that are shared by more than one record
    symbol_counts = Counter(rec['symbol']['@value'] for rec in records)
    dupe_names = {symbol for symbol, count in symbol_counts.items() if count > 1}

    # only the records with a duplicated symbol are processed any further;
    # each of them is renamed to "symbol (entrezgene)"
    records = [rec for rec in records if rec['symbol']['@value'] in dupe_names]
    for rec in records:
        rec['symbol']['@value'] += " (" + str(rec['entrezgene']['@value']) + ")"

    # skip items that aren't already in wikidata (DONT CREATE NEW ITEMS!)
    records = [rec for rec in records if str(rec['entrezgene']['@value']) in known_entrez]
    print("len records: {}".format(len(records)))

    chr_num_wdid = ChromosomeBot().get_or_create(organism_info, login=login)
    bot = GeneBot.ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    # replace the bot's own filter with a pass-through: the records were filtered above
    bot.filter = lambda x: iter(x)
    bot.run(records, total=total, fast_run=True, write=True)
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Create or update the gene items of one organism.

    In order: check that the taxon exists in wikidata, log in, (re)configure
    logging, pick the bot matching the organism type, download and validate the
    mygene records, run the bot, wait ten minutes and finally ask the bot to
    clean up statements that refer to outdated source releases.

    :param taxid: taxon to use (ncbi tax id); converted with ``int``
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: label used in the log file name; when None the current time formatted
        as ``%Y%m%d_%H:%M`` is used
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None
    """
    taxid = int(taxid)

    # nothing to do when the organism is not found in wikidata
    if not wdi_helpers.prop2qid("P685", str(taxid)):
        print("organism {} not found in wikidata".format(taxid))
        return None

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # detach whatever a previous run left on the shared logger before setting it up again
    engine = wdi_core.WDItemEngine
    if engine.logger is not None:
        engine.logger.handles = []
        engine.logger.handlers = []

    if run_id is None:
        run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    # the taxid is recorded in the module metadata so that it ends up in the log header
    __metadata__['taxid'] = taxid
    engine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                         header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    # TODO: this can be pulled from wd
    is_microbial = taxid not in organisms_info or organisms_info[taxid]['type'] == "microbial"
    if is_microbial:
        # check if its one of the reference microbial genomes
        # raises valueerror if not...
        organism_info = mcb.get_organism_info(taxid)
        refseq_qid_chrom = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, refseq_qid_chrom, login)
        validate_type = "microbial"
    else:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        chromosomes = ChromosomeBot().get_or_create(organism_info, login=login)
        chr_num_wdid = {name.upper(): wdid for name, wdid in chromosomes.items()}
        bot_class = HumanGeneBot if int(organism_info['taxid']) == 9606 else ChromosomalGeneBot
        bot = bot_class(organism_info, chr_num_wdid, login)

    # Get handle to mygene records
    downloader = MyGeneDownloader()
    if entrez:
        single_doc, total = downloader.get_mg_gene(entrez)
        docs = iter([single_doc])
    else:
        def doc_filter(x):
            return (x.get("type_of_gene") != "biological-region") and ("entrezgene" in x)

        docs, total = downloader.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))

    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    validated = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(validated, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for container in engine.fast_run_store:
        container.clear()

    print("done updating, waiting 10 min")
    time.sleep(10 * 60)

    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    # only these three sources are considered for the cleanup
    cleanup_sources = {'uniprot', 'ensembl', 'entrez'}
    metadata = {name: value for name, value in metadata.items() if name in cleanup_sources}
    for source, version in parse_mygene_src_version(metadata).items():
        if "release" not in version:
            # sources without a release are tracked by their timestamp instead
            last_updated[source_items[source]] = datetime.strptime(version["timestamp"], "%Y%m%d")
            continue
        if source not in releases:
            releases[source] = wdi_helpers.id_mapper('P393', (('P629', source_items[source]),))
        known_releases = releases[source]
        current = version['release']
        # every mapped release item except the one for the current release is to be removed
        obsolete = set(known_releases.values())
        obsolete.discard(known_releases[current])
        releases_to_remove.update(obsolete)
        print(
            "{}: Removing releases: {}, keeping release: {}".format(source,
                                                                    ", ".join(set(known_releases) - {current}),
                                                                    current))

    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating genes

    Selects the protein-coding records of the taxon that have a ``genomic_pos``
    field from ``coll``, validates and tags them, and runs the bot that matches
    the organism. The Mongo cursor is always closed, even when the run fails.

    :param coll: mongo collection containing gene data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id); converted with ``int``
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): log_name is not assigned in this function; presumably a module-level name -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        # its one of fungal, mammalian, plant (not microbe)
        validate_type = 'gene'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = MammalianGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, login)
        validate_type = "microbial"

    # only do certain records
    doc_filter = {
        'taxid': taxid,
        'type_of_gene': 'protein-coding',
        'genomic_pos': {
            '$exists': True
        }
    }
    # Keep the cursor under its own name: it is opened with no_cursor_timeout=True,
    # so it has to be closed explicitly, and the validated stream below must not shadow it.
    cursor = coll.find(doc_filter, no_cursor_timeout=True)
    try:
        total = cursor.count()
        print("total number of records: {}".format(total))
        docs = HelperBot.validate_docs(cursor, validate_type, PROPS['Entrez Gene ID'])
        records = HelperBot.tag_mygene_docs(docs, metadata)

        bot.run(records, total=total, fast_run=fast_run, write=write)
    finally:
        cursor.close()

    # after the run is done, disconnect the logging handler
    # so that if we start another, it doesn't write twice
    if wdi_core.WDItemEngine.logger is not None:
        # "handles" is kept for backward compatibility; "handlers" is the list
        # a logging.Logger actually writes to
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    :param taxid: taxon to use (ncbi tax id); converted with ``int``
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: label used in the log file name; when None the current time formatted
        as ``%Y%m%d_%H:%M`` is used
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # drop handlers left over from a previous run so nothing is written twice
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    # the taxid is recorded in the module metadata so that it ends up in the log header
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
        print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))

    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))

    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    # wait 10 min after the run before collecting the releases to clean up
    time.sleep(10 * 60)
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    # only these three sources are considered for the cleanup
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            # every mapped release item except the one for the current release is to be removed
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(releases[k]) - {v['release']}),
                                                                        v['release']))
        else:
            # sources without a release are tracked by their timestamp instead
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handler
    # so that if we start another, it doesn't write twice
    if wdi_core.WDItemEngine.logger is not None:
        # mirror the reset done at the start of the run: "handlers" is the list
        # a logging.Logger actually writes to, "handles" is kept for compatibility
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []
def main(metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating ortholog statements on genes

    Downloads all protein-coding mygene records that have homologene data,
    groups the orthologs of every gene by taxon and calls ``do_item`` once per
    gene. Exceptions raised for a single gene are logged and counted; they do
    not abort the run.

    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
        the ``homologene`` entry is used to build the release item
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode (currently not used by this function)
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): log_name is not assigned in this function; presumably a module-level name -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger', log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get all ids mappings
    entrez_wdid = wdi_helpers.id_mapper(PROPS['Entrez Gene ID'])
    wdid_entrez = {v: k for k, v in entrez_wdid.items()}
    homo_wdid = wdi_helpers.id_mapper(PROPS['HomoloGene ID'], return_as_set=True)
    # invert homologene id -> {wdids} into wdid -> homologene id
    wdid_homo = dict()
    for homo, wdids in homo_wdid.items():
        for wdid in wdids:
            wdid_homo[wdid] = homo
    entrez_homo = {wdid_entrez[wdid]: homo for wdid, homo in wdid_homo.items() if wdid in wdid_entrez}
    taxon_wdid = wdi_helpers.id_mapper(PROPS['NCBI Taxonomy ID'])

    # only do certain records
    mgd = MyGeneDownloader(q="_exists_:homologene AND type_of_gene:protein-coding",
                           fields=','.join(['taxid', 'homologene', 'entrezgene']))
    docs, total = mgd.query()
    docs = list(tqdm(docs, total=total))
    records = HelperBot.tag_mygene_docs(docs, metadata)

    # group together all orthologs
    # d[taxid][entrezgene] = { set of entrezgene ids for orthologs }
    d = defaultdict(lambda: defaultdict(set))
    entrez_taxon = dict()  # keep this for the qualifier on the statements
    for doc in records:
        this_taxid = doc['taxid']['@value']
        this_entrez = doc['entrezgene']['@value']
        entrez_taxon[str(this_entrez)] = str(this_taxid)
        if str(this_entrez) not in entrez_wdid:
            continue
        for taxid, entrez in doc['homologene']['@value']['genes']:
            if taxid == 4932 and this_taxid == 559292:
                # ridiculous workaround because entrez has the taxid for the strain and homologene has it for the species
                # TODO: This needs to be fixed if you want to use other things that may have species/strains
                continue
            if taxid != this_taxid and str(entrez) in entrez_wdid:
                d[str(this_taxid)][str(this_entrez)].add(str(entrez))

    print("taxid: # of genes : {}".format({k: len(v) for k, v in d.items()}))

    homogene_ver = metadata['homologene']
    release = wdi_helpers.Release("HomoloGene build{}".format(homogene_ver), "Version of HomoloGene", homogene_ver,
                                  edition_of_wdid='Q468215',
                                  archive_url='ftp://ftp.ncbi.nih.gov/pub/HomoloGene/build{}/'.format(
                                      homogene_ver)).get_or_create(login)

    # builds the reference block (release item + homologene id) for one statement
    reference = lambda homogeneid: [wdi_core.WDItemID(release, PROPS['stated in'], is_reference=True),
                                    wdi_core.WDExternalID(homogeneid, PROPS['HomoloGene ID'], is_reference=True)]

    ec = 0
    for taxid, subd in tqdm(d.items()):
        for entrezgene, orthologs in tqdm(subd.items(), leave=False):
            try:
                do_item(entrezgene, orthologs, reference, entrez_homo, entrez_taxon, taxon_wdid, entrez_wdid, login,
                        write)
            except Exception as e:
                # one failing gene must not stop the run, but the failure is written
                # to the log instead of being dropped silently
                msg = wdi_helpers.format_msg(entrezgene, PROPS['Entrez Gene ID'], None, str(e), type(e))
                wdi_core.WDItemEngine.log("ERROR", msg)
                ec += 1
        # clear the fast run store once we move on to the next taxon
        wdi_core.WDItemEngine.fast_run_store = []
        wdi_core.WDItemEngine.fast_run_container = None

    print("Completed succesfully with {} exceptions".format(ec))
def proteins():
    """
    Disambiguate duplicated protein names for taxon 7955.

    Loads all records of taxon 7955 that have both ``uniprot`` and
    ``entrezgene`` fields, picks one UniProt id per record (Swiss-Prot first,
    TrEMBL otherwise), keeps only records whose id is a single string and whose
    name occurs more than once, rewrites each such name to ``"name (uniprot)"``,
    drops records whose UniProt id is not present in the ``id_mapper("P352")``
    mapping (no new items are created) and hands the remainder to a
    ``ProteinBot`` run with ``fast_run=False`` and ``write=True``.
    """
    taxid = 7955

    known_uniprot = id_mapper("P352")
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    gene_coll = MongoClient().wikidata_src.mygene
    source_versions = MongoClient().wikidata_src.mygene_sources.find_one()
    organism_info = organisms_info[taxid]

    query = {'taxid': taxid, 'uniprot': {'$exists': True}, 'entrezgene': {'$exists': True}}
    cursor = gene_coll.find(query).batch_size(20)
    total = cursor.count()
    print("total number of records: {}".format(total))

    validated = HelperBot.validate_docs(cursor, 'eukaryotic', PROPS['Entrez Gene ID'])
    records = list(HelperBot.tag_mygene_docs(validated, source_versions))

    # choose the uniprot id: the first of these sections that is present wins
    for rec in records:
        uniprot = rec['uniprot']['@value']
        for section in ('Swiss-Prot', 'TrEMBL'):
            if section in uniprot:
                rec['uniprot_id'] = uniprot[section]
                break

    # records without an id, or with several ids (not a plain string), are dropped
    records = [rec for rec in records if 'uniprot_id' in rec and isinstance(rec['uniprot_id'], str)]

    # names that are shared by more than one record
    name_counts = Counter(rec['name']['@value'] for rec in records)
    dupe_names = {name for name, count in name_counts.items() if count > 1}

    # only the records with a duplicated name are processed any further;
    # each of them is renamed to "name (uniprot)"
    records = [rec for rec in records if rec['name']['@value'] in dupe_names]
    print("len dupe records: {}".format(len(records)))
    for rec in records:
        rec['name']['@value'] += " (" + rec['uniprot_id'] + ")"

    # skip items that aren't already in wikidata (DONT CREATE NEW ITEMS!)
    records = [rec for rec in records if rec['uniprot_id'] in known_uniprot]
    print("len records: {}".format(len(records)))

    chr_num_wdid = ChromosomeBot().get_or_create(organism_info, login=login)
    bot = ProteinBot.ProteinBot(organism_info, chr_num_wdid, login)
    # replace the bot's own filter with a pass-through: the records were filtered above
    bot.filter = lambda x: iter(x)
    bot.run(records, total=total, fast_run=False, write=True)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating proteins

    Selects the protein-coding records of the taxon from ``coll``, validates
    and tags them, and runs a ``ProteinBot`` over them. The Mongo cursor is
    always closed, even when the run fails.

    :param coll: mongo collection containing protein data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id); converted with ``int``
    :type taxid: str
    :param metadata: looks like: {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): log_name is not assigned in this function; presumably a module-level name -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    validate_type = 'protein'
    if taxid in organisms_info:
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']), ))

    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # only do certain records
    doc_filter = {'taxid': taxid, 'type_of_gene': 'protein-coding'}
    # Keep the cursor under its own name: it is opened with no_cursor_timeout=True,
    # so it has to be closed explicitly, and the validated stream below must not shadow it.
    cursor = coll.find(doc_filter, no_cursor_timeout=True)
    try:
        total = cursor.count()
        print("total number of records: {}".format(total))
        docs = HelperBot.validate_docs(cursor, validate_type, PROPS['Entrez Gene ID'])
        records = HelperBot.tag_mygene_docs(docs, metadata)

        bot.run(records, total=total, fast_run=fast_run, write=write)
    finally:
        cursor.close()

    # after the run is done, disconnect the logging handler
    # so that if we start another, it doesn't write twice
    if wdi_core.WDItemEngine.logger is not None:
        # "handles" is kept for backward compatibility; "handlers" is the list
        # a logging.Logger actually writes to
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []