def get_or_create_chromosomes(self, taxid, login=None):
    """Ensure Wikidata items exist for all chromosomes/plasmids of a bacterial organism.

    Main entry point for chromosome handling of a microbial reference genome.

    :param taxid: NCBI taxonomy id of the organism
    :param login: wikidata login object, used when items must be created
    :return: dict with key = refseq ID, value = QID of the chromosome item
    """
    # lazily populate the reference genome table on first use
    if self.df.empty:
        self.get_microbial_ref_genome_table()
    tax_str = str(taxid)
    record = self.df[self.df.TaxID == tax_str].to_dict("records")[0]
    org_name = record['Organism/Name']
    org_qid = prop2qid(PROPS['NCBI Taxonomy ID'], tax_str)

    # recognized chromosome names mapped to their chromosome-type label;
    # anything containing "plasmid" is typed as a plasmid
    name_to_type = {
        'chromosome circular': 'circular',
        'chromosome linear': 'chromosome',
        'chromosome': 'chromosome',
    }

    refseq_to_qid = {}
    for info in self.get_chromosome_info(tax_str):
        lowered = info['name'].lower()
        refseq_id = info['refseq']
        if lowered in name_to_type:
            kind = name_to_type[lowered]
        elif "plasmid" in lowered:
            kind = 'plasmid'
        else:
            raise ValueError("unknown chromosome type: {}".format(info['name']))
        refseq_to_qid[refseq_id] = self.create_chrom(
            org_name, org_qid, lowered, refseq_id, kind, login=login)
    return refseq_to_qid
def get_organism_info(self, taxid):
    """Return basic organism metadata for a microbial reference-genome taxid.

    :param taxid: NCBI taxonomy id
    :raises ValueError: if the taxid is not one of the microbial reference genomes
    :return: dict with keys 'name', 'type', 'wdid', 'qid', 'taxid'
    """
    tax_str = str(taxid)
    if tax_str not in self.get_all_taxids():
        raise ValueError("taxid {} not found in microbe ref genomes".format(tax_str))
    record = self.df[self.df.TaxID == tax_str].to_dict("records")[0]
    organism_qid = prop2qid(PROPS['NCBI Taxonomy ID'], tax_str)
    # 'wdid' and 'qid' intentionally carry the same value for downstream callers
    return {
        'name': record['Organism/Name'],
        'type': "microbial",
        'wdid': organism_qid,
        'qid': organism_qid,
        'taxid': tax_str,
    }
def published_in_isbn(self, value):
    """Set the venue ISBN-13(s) and resolve them to a Wikidata item.

    Accepts a single ISBN string or a list of them. If all ISBNs resolve to the
    same QID, sets ``self.published_in_qid``; otherwise a warning is recorded.
    A warning is also recorded when no item was found at all.

    :param value: one ISBN-13 string, or a list of them
    """
    # a publication venue is identified by either an ISSN or an ISBN, never both
    assert self.published_in_issn is None, "Can't give both ISSN and ISBN"
    if not isinstance(value, list):
        value = [value]
    self._published_in_isbn = value
    # resolve each ISBN to a QID; a set collapses duplicates (and Nones)
    qids = {prop2qid(PROPS['ISBN-13'], v) for v in value}
    if len(qids) == 1:
        self.published_in_qid = next(iter(qids))
    else:
        # fixed typo in warning message: "conflictings" -> "conflicting"
        self.warnings.append("conflicting ISBN qids: {}".format(qids))
    if not self.published_in_qid:
        self.warnings.append("ISBN:{} not found".format(value))
def main(taxon, file, retrieved, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for annotating GO terms on proteins

    :param taxon: taxon to use (ncbi tax id)
    :type taxon: str
    :param file: path to gaf_file to use. See below for format
    :type file: str
    :param retrieved: date that the GO annotations were retrieved
    :type retrieved: datetime
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None

    The following columns are expected in the gaf file
    ['db','id','go_id','reference','evidence','aspect','taxon','source']
    This can be created by selecting columns $1,$2,$5,$6,$7,$9,$13,$15 from
    goa_uniprot_all.gaf.gz downloaded from
    ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz
    """
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # reset handlers left from a previous run so log lines aren't duplicated
    # (fixed: logging.Logger exposes `handlers`; assigning `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    organism_wdid = wdi_helpers.prop2qid("P685", taxon)
    if not organism_wdid:
        raise ValueError("organism {} not found".format(taxon))
    print("Running organism: {} {}".format(taxon, organism_wdid))

    # get all uniprot id -> wdid mappings, where found in taxon is this organism
    prot_wdid_mapping = wdi_helpers.id_mapper(UNIPROT, (("P703", organism_wdid),))
    # get all goID to wdid mappings
    go_map = wdi_helpers.id_mapper("P686")

    # Get GO terms from our local store for this taxon
    colnames = ['db', 'id', 'go_id', 'reference', 'evidence', 'aspect', 'taxon', 'source']
    # NOTE(review): file is read space-separated (not tab); assumes the gaf extract
    # was written with spaces -- confirm against the upstream extraction step
    df = pd.read_csv(file, sep=' ', names=colnames, index_col=False)
    df = df[df.taxon == "taxon:" + str(taxon)]
    if len(df) == 0:
        print("No GO annotations found for taxid: {}".format(taxon))
        return None
    print("Found {} GO annotations".format(len(df)))

    # get all pmids and make items for them
    pmids = {x[5:] for x in df['reference'] if x.startswith("PMID:")}
    print("Need {} pmids".format(len(pmids)))
    pmid_map = get_values("P698", pmids)
    print("Found {} pmids".format(len(pmid_map)))
    pmids_todo = pmids - set(pmid_map.keys())
    print("Creating {} pmid items".format(len(pmids_todo)))
    new_pmids = create_articles(pmids_todo, login, write)
    pmid_map.update(new_pmids)
    print("Done creating pmid items")

    # get all external IDs we may need by uniprot id
    external_ids = defaultdict(dict)
    external_ids_info = {
        'Saccharomyces Genome Database ID': 'P3406',
        'Mouse Genome Informatics ID': 'P671',
        'UniProt ID': 'P352'
    }
    for external_id_name, prop in external_ids_info.items():
        id_map = wdi_helpers.id_mapper(prop, (("P703", organism_wdid),))
        if not id_map:
            continue
        for id, wdid in id_map.items():
            external_ids[wdid][external_id_name] = id

    # groupby ID, GOID & evidence, then make references a list
    go_annotations = df.groupby(
        ['id', 'go_id', 'evidence', 'source', 'db', 'aspect'])['reference'].apply(list)

    # iterate through all proteins & write
    failed_items = []
    for uniprot_id, item_wdid in tqdm(prot_wdid_mapping.items()):
        if uniprot_id not in go_annotations:
            continue
        this_go = go_annotations[uniprot_id]
        external_id = external_ids[item_wdid]
        try:
            statements = make_go_statements(uniprot_id, this_go, go_map, pmid_map,
                                            external_id, retrieved)
            wditem = wdi_core.WDItemEngine(
                wd_item_id=item_wdid,
                domain='protein',
                data=statements,
                fast_run=fast_run,
                fast_run_base_filter={UNIPROT: "", "P703": organism_wdid},
                fast_run_use_refs=True,
                ref_handler=update_retrieved_if_new,
                append_value=['P680', 'P681', 'P682'],
                core_props=core_props)
            wdi_helpers.try_write(wditem, record_id=uniprot_id, record_prop=UNIPROT,
                                  edit_summary="update GO terms", login=login, write=write)
        except Exception as e:
            print(e)
            traceback.print_exc()
            failed_items.append(uniprot_id)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(uniprot_id, UNIPROT, item_wdid, str(e),
                                       msg_type=type(e)))

    print("{} items failed: {}".format(len(failed_items), failed_items))

    # disconnect logging handlers so a subsequent run doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(coll, taxon, retrieved, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for annotating GO terms on proteins

    :param coll: mongo collection containing GO annotations
    :type coll: pymongo.collection.Collection
    :param taxon: taxon to use (ncbi tax id)
    :type taxon: str
    :param retrieved: date that the GO annotations were retrieved
    :type retrieved: datetime
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # reset handlers left from a previous run so log lines aren't duplicated
    # (fixed: logging.Logger exposes `handlers`; assigning `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    organism_wdid = wdi_helpers.prop2qid("P685", taxon)
    if not organism_wdid:
        raise ValueError("organism {} not found".format(taxon))
    print("Running organism: {} {}".format(taxon, organism_wdid))

    # get all uniprot id -> wdid mappings, where found in taxon is this organism
    prot_wdid_mapping = wdi_helpers.id_mapper(UNIPROT, (("P703", organism_wdid),))
    # get all goID to wdid mappings
    go_map = wdi_helpers.id_mapper("P686")

    # Get GO terms from our local store for this taxon
    df = pd.DataFrame(list(coll.find({'Taxon': int(taxon)})))
    if len(df) == 0:
        print("No GO annotations found for taxid: {}".format(taxon))
        return None

    # get all pmids and make items for them
    pmids = {x[5:] for x in df['Reference'] if x.startswith("PMID:")}
    pmid_map = wdi_helpers.id_mapper("P698")
    print("Total number of pmids: {}".format(len(pmids)))
    pmids_todo = pmids - set(pmid_map.keys())
    print("Creating {} pmid items".format(len(pmids_todo)))
    new_pmids = create_articles(pmids_todo, login, write)
    pmid_map.update(new_pmids)
    print("Done creating pmid items")

    # get all external IDs we may need by uniprot id
    external_ids = defaultdict(dict)
    external_ids_info = {
        'Saccharomyces Genome Database ID': 'P3406',
        'Mouse Genome Informatics ID': 'P671',
        'UniProt ID': 'P352'
    }
    for external_id_name, prop in external_ids_info.items():
        id_map = wdi_helpers.id_mapper(prop, (("P703", organism_wdid),))
        if not id_map:
            continue
        for id, wdid in id_map.items():
            external_ids[wdid][external_id_name] = id

    # groupby ID, GOID & evidence, then make references a list
    go_annotations = df.groupby(
        ['ID', 'GO ID', 'Evidence', 'Source', 'DB', 'Aspect'])['Reference'].apply(list)

    # iterate through all proteins & write
    failed_items = []
    for uniprot_id, item_wdid in tqdm(prot_wdid_mapping.items()):
        if uniprot_id not in go_annotations:
            continue
        this_go = go_annotations[uniprot_id]
        external_id = external_ids[item_wdid]
        try:
            statements = make_go_statements(item_wdid, uniprot_id, this_go, retrieved,
                                            go_map, pmid_map, external_id, login)
            wditem = wdi_core.WDItemEngine(
                wd_item_id=item_wdid,
                domain='protein',
                data=statements,
                fast_run=fast_run,
                fast_run_base_filter={UNIPROT: "", "P703": organism_wdid})
            wdi_helpers.try_write(wditem, record_id=uniprot_id, record_prop=UNIPROT,
                                  edit_summary="update GO terms", login=login, write=write)
        except Exception as e:
            print(e)
            failed_items.append(uniprot_id)
            wdi_core.WDItemEngine.log(
                "ERROR",
                wdi_helpers.format_msg(uniprot_id, UNIPROT, item_wdid, str(e),
                                       msg_type=type(e)))

    print("{} items failed: {}".format(len(failed_items), failed_items))

    # disconnect logging handlers so a subsequent run doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating genes

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier appended to the log file name (defaults to a timestamp)
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one gene
    :type entrez: int
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    if wdi_core.WDItemEngine.logger is not None:
        # removed dead `.handles = []` assignment -- logging.Logger only has `handlers`
        wdi_core.WDItemEngine.logger.handlers = []
    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger',
                                        log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    # TODO: this can be pulled from wd
    if taxid in organisms_info and organisms_info[taxid]['type'] != "microbial":
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        chr_num_wdid = {k.upper(): v for k, v in chr_num_wdid.items()}
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = ChromosomalGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the reference microbial genomes
        # raises ValueError if not
        organism_info = mcb.get_organism_info(taxid)
        refseq_qid_chrom = mcb.get_or_create_chromosomes(taxid, login)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, refseq_qid_chrom, login)
        validate_type = "microbial"

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") != "biological-region") and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()

    print("done updating, waiting 10 min")
    time.sleep(10 * 60)

    # remove deprecated release statements and record last-updated dates
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print("{}: Removing releases: {}, keeping release: {}".format(
                k, ", ".join(set(releases[k]) - {v['release']}), v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating genes

    :param coll: mongo collection containing gene data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, logger_name='WD_logger',
                                        log_name=log_name, header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        # its one of fungal, mammalian, plant (not microbe)
        validate_type = 'gene'
        organism_info = organisms_info[taxid]
        # make sure all chromosome items are found in wikidata
        cb = ChromosomeBot()
        chr_num_wdid = cb.get_or_create(organism_info, login=login)
        if int(organism_info['taxid']) == 9606:
            bot = HumanGeneBot(organism_info, chr_num_wdid, login)
        else:
            bot = MammalianGeneBot(organism_info, chr_num_wdid, login)
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
        print(organism_info)
        bot = MicrobeGeneBot(organism_info, login)
        validate_type = "microbial"

    # only do certain records
    doc_filter = {
        'taxid': taxid,
        'type_of_gene': 'protein-coding',
        'genomic_pos': {'$exists': True}
    }
    docs = coll.find(doc_filter, no_cursor_timeout=True)
    total = docs.count()
    print("total number of records: {}".format(total))
    # NOTE(review): `docs` is rebound below, so `docs.close()` at the end targets
    # validate_docs' return value, not this cursor -- confirm validate_docs
    # passes the cursor through
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    docs.close()

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: logging.Logger exposes `handlers`; `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def main(taxid, metadata, log_dir="./logs", run_id=None, fast_run=True, write=True, entrez=None):
    """
    Main function for creating/updating proteins

    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param run_id: identifier appended to the log file name (defaults to a timestamp)
    :type run_id: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :param entrez: Only run this one protein (given by entrezgene id)
    :type entrez: int
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    if wdi_core.WDItemEngine.logger is not None:
        # removed dead `.handles = []` assignment -- logging.Logger only has `handlers`
        wdi_core.WDItemEngine.logger.handlers = []
    run_id = run_id if run_id is not None else datetime.now().strftime('%Y%m%d_%H:%M')
    log_name = '{}-{}.log'.format(__metadata__['name'], run_id)
    __metadata__['taxid'] = taxid
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    if taxid in organisms_info:
        validate_type = 'eukaryotic'
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
        validate_type = 'microbial'
    print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))
    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # Get handle to mygene records
    mgd = MyGeneDownloader()
    if entrez:
        doc, total = mgd.get_mg_gene(entrez)
        docs = iter([doc])
    else:
        doc_filter = lambda x: (x.get("type_of_gene") == "protein-coding") and ("uniprot" in x) and ("entrezgene" in x)
        docs, total = mgd.get_mg_cursor(taxid, doc_filter)
    print("total number of records: {}".format(total))
    # the scroll_id/cursor times out from mygene if we iterate. So.... get the whole thing now
    docs = list(docs)
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    for frc in wdi_core.WDItemEngine.fast_run_store:
        frc.clear()
    time.sleep(10 * 60)

    # remove deprecated release statements and record last-updated dates
    releases = dict()
    releases_to_remove = set()
    last_updated = dict()
    metadata = {k: v for k, v in metadata.items() if k in {'uniprot', 'ensembl', 'entrez'}}
    for k, v in parse_mygene_src_version(metadata).items():
        if "release" in v:
            if k not in releases:
                releases[k] = wdi_helpers.id_mapper('P393', (('P629', source_items[k]),))
            to_remove = set(releases[k].values())
            to_remove.discard(releases[k][v['release']])
            releases_to_remove.update(to_remove)
            print("{}: Removing releases: {}, keeping release: {}".format(
                k, ", ".join(set(releases[k]) - {v['release']}), v['release']))
        else:
            last_updated[source_items[k]] = datetime.strptime(v["timestamp"], "%Y%m%d")
    print(last_updated)
    bot.cleanup(releases_to_remove, last_updated)

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: `.handlers`, not `.handles`)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []
def lookup_author(orcid_id):
    """Resolve an ORCID iD to the QID of the matching Wikidata author item."""
    orcid_prop = PROPS['orcid id']
    return prop2qid(orcid_prop, orcid_id)
def main(coll, taxid, metadata, log_dir="./logs", fast_run=True, write=True):
    """
    Main function for creating/updating proteins

    :param coll: mongo collection containing protein data from mygene
    :type coll: pymongo.collection.Collection
    :param taxid: taxon to use (ncbi tax id)
    :type taxid: str
    :param metadata: looks like:
        {"ensembl" : 84, "cpdb" : 31, "netaffy" : "na35", "ucsc" : "20160620", .. }
    :type metadata: dict
    :param log_dir: dir to store logs
    :type log_dir: str
    :param fast_run: use fast run mode
    :type fast_run: bool
    :param write: actually perform write
    :type write: bool
    :return: None
    """
    # make sure the organism is found in wikidata
    taxid = int(taxid)
    organism_wdid = wdi_helpers.prop2qid("P685", str(taxid))
    if not organism_wdid:
        print("organism {} not found in wikidata".format(taxid))
        return None

    # login
    login = wdi_login.WDLogin(user=WDUSER, pwd=WDPASS)
    # NOTE(review): `log_name` is not defined in this function; assumed to be a
    # module-level global -- confirm
    wdi_core.WDItemEngine.setup_logging(log_dir=log_dir, log_name=log_name,
                                        header=json.dumps(__metadata__))

    # get organism metadata (name, organism type, wdid)
    validate_type = 'protein'
    if taxid in organisms_info:
        organism_info = organisms_info[taxid]
    else:
        # check if its one of the microbe refs
        # raises ValueError if not
        organism_info = get_organism_info(taxid)
    print(organism_info)

    # get all entrez gene id -> wdid mappings, where found in taxon is this strain
    gene_wdid_mapping = id_mapper("P351", (("P703", organism_info['wdid']),))
    bot = ProteinBot(organism_info, gene_wdid_mapping, login)

    # only do certain records
    doc_filter = {'taxid': taxid, 'type_of_gene': 'protein-coding'}
    docs = coll.find(doc_filter, no_cursor_timeout=True)
    total = docs.count()
    print("total number of records: {}".format(total))
    # NOTE(review): `docs` is rebound below, so `docs.close()` at the end targets
    # validate_docs' return value, not this cursor -- confirm validate_docs
    # passes the cursor through
    docs = HelperBot.validate_docs(docs, validate_type, PROPS['Entrez Gene ID'])
    records = HelperBot.tag_mygene_docs(docs, metadata)

    bot.run(records, total=total, fast_run=fast_run, write=write)
    docs.close()

    # after the run is done, disconnect the logging handlers
    # so that if we start another, it doesn't write twice
    # (fixed: logging.Logger exposes `handlers`; `.handles` was a silent no-op)
    if wdi_core.WDItemEngine.logger is not None:
        wdi_core.WDItemEngine.logger.handlers = []