def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Create or update the wikidata gene items of a single organism

    Steps: check that the taxon exists in wikidata, log in, (re)configure logging,
    pick the gene bot matching the organism, download the mygene records, run the bot,
    then (after a 10 minute pause) call the bot's cleanup with the outdated releases.

    :param taxid: taxon to use (ncbi tax id); converted with int() before use
    :type taxid: str
    :param metadata: source versions, looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: label used in the log file name; when None, the current time
        formatted as '%Y%m%d_%H:%M' is used
    :type run_id: str
    :param fast_run: use fast run mode (passed through to the bot's run())
    :type fast_run: bool
    :param write: actually perform write (passed through to the bot's run())
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None (also returned early when the organism is not found in wikidata)
    """
    taxid = int(taxid)

    # nothing to do if wikidata has no item carrying this taxon id (P685)
    if not wdi_helpers.prop2qid("P685", str(taxid)):
        print("organism {} not found in wikidata".format(taxid))
        return None

    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # detach whatever an earlier run left on the engine's logger before setting it up again
    engine_logger = wdi_core.WDItemEngine.logger
    if engine_logger is not None:
        engine_logger.handles = []
        engine_logger.handlers = []

    if run_id is None:
        run_id = datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(
        log_dir=log_dir,
        logger_name='WD_logger',
        log_name=log_name,
        header=json.dumps(__metadata__),
    )

    # get organism metadata (name, organism type, wdid) and the matching bot
    # TODO: this can be pulled from wd
    if taxid not in organisms_info or organisms_info[taxid]['type'] == "microbial":
        # must be one of the reference microbial genomes;
        # get_organism_info raises valueerror if not...
        organism_info = mcb.get_organism_info(taxid)
        chrom_qids = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, chrom_qids, login)
        validate_type = "microbial"
    else:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        chromosome_bot = ChromosomeBot()
        found = chromosome_bot.get_or_create(organism_info, login=login)
        # chromosome names are keyed in upper case from here on
        chrom_qids = {name.upper(): qid for name, qid in found.items()}
        # human (taxid 9606) has its own bot class
        is_human = int(organism_info['taxid']) == 9606
        bot_class = HumanGeneBot if is_human else ChromosomalGeneBot
        bot = bot_class(organism_info, chrom_qids, login)

    # Get handle to mygene records
    downloader = MyGeneDownloader()
    if entrez:
        single_doc, total = downloader.get_mg_gene(entrez)
        docs = [single_doc]
    else:
        def keep_doc(doc):
            # skip biological regions and anything without an entrez gene id
            return doc.get("type_of_gene") != "biological-region" and "entrezgene" in doc

        docs, total = downloader.get_mg_cursor(taxid, keep_doc)
    print("total number of records: {}".format(total))

    # the scroll_id/cursor times out from mygene if we iterate lazily, so pull everything now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for fast_run_container in wdi_core.WDItemEngine.fast_run_store:
        fast_run_container.clear()

    print("done updating, waiting 10 min")
    # NOTE(review): presumably gives wikidata time to catch up before cleanup -- confirm
    time.sleep(10 * 60)

    # work out which release items are outdated and when each source was last updated
    release_qids = {}
    releases_to_remove = set()
    last_updated = {}
    cleanup_sources = {'uniprot', 'ensembl', 'entrez'}
    source_versions = {src: ver for src, ver in metadata.items() if src in cleanup_sources}
    for source, version in parse_mygene_src_version(source_versions).items():
        if "release" not in version:
            # timestamp-versioned source
            last_updated[source_items[source]] = datetime.strptime(version["timestamp"], "%Y%m%d")
            continue

        if source not in release_qids:
            release_qids[source] = wdi_helpers.id_mapper('P393', (('P629', source_items[source]),))
        current = version['release']
        # every known release of this source except the current one is stale
        stale = set(release_qids[source].values())
        stale.discard(release_qids[source][current])
        releases_to_remove.update(stale)
        print(
            "{}: Removing releases: {}, keeping release: {}".format(
                source, ", ".join(set(release_qids[source]) - {current}), current))

    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    Checks that the taxon exists in wikidata, logs in, (re)configures logging, downloads the
    protein-coding mygene records for the taxon, runs ProteinBot over them, waits 10 minutes,
    calls the bot's cleanup with the outdated releases and finally detaches the log handlers.

    :param taxid: taxon to use (ncbi tax id); converted with int() before use
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: label used in the log file name; when None, the current time
        formatted as '%Y%m%d_%H:%M' is used
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None (also returned early when the organism is not found in wikidata)
    """

    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)

    # detach handlers left over from a previous run before logging is set up again
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []

    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises valueerror if not...
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
        print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))

    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        # only protein-coding genes that carry both a uniprot and an entrezgene entry
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    # NOTE(review): presumably gives wikidata time to catch up before cleanup -- confirm
    time.sleep(10 * 60)

    # work out which release items are outdated and when each source was last updated
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            # every known release of this source except the current one gets removed
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print(
                "{}: Removing releases: {}, keeping release: {}".format(k, ", ".join(set(releases[k]) - {v['release']}),
                                                                        v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handler
    # so that if we start another, it doesn't write twice
    if wdi_core.WDItemEngine.logger is not None:
        # `handlers` is the list a logging.Logger dispatches to, so clearing it is what
        # actually detaches the handler; `handles` is kept only because the original code
        # set it, and mirrors the reset done at the top of this function
        wdi_core.WDItemEngine.logger.handles = []
        wdi_core.WDItemEngine.logger.handlers = []