def get_ref_microbe_taxids():
    """
    Downloads the latest bacterial genome assembly summary from the NCBI
    genome FTP site and generates a list of taxids of the bacterial
    reference genomes.

    :return: list of taxids
    """
    import urllib.request
    import csv

    urlbase = 'ftp://ftp.ncbi.nlm.nih.gov'
    urlextension = '/genomes/refseq/bacteria/assembly_summary.txt'
    assembly = urllib.request.urlopen(urlbase + urlextension)
    datareader = csv.reader(assembly.read().decode().splitlines(),
                            delimiter="\t")
    taxid = []
    for row in datareader:
        # skip comment/header lines
        if len(row) == 1 and row[0].startswith("#"):
            continue
        # row[4] is the refseq_category column, row[5] is the taxid
        if row[4] in ['reference genome', 'representative genome']:
            taxid.append(row[5])
    ts = get_timestamp()
    dump(taxid, "ref_microbe_taxids_{}.pyobj".format(ts))
    return taxid
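# Usage sketch for get_ref_microbe_taxids (illustrative; assumes loadobj is the
# biothings.utils.common counterpart of the dump helper used throughout this
# module, and the timestamped filename below is hypothetical):
def _example_ref_taxids():
    taxids = get_ref_microbe_taxids()  # downloads the summary and pickles the taxid list
    # later runs can reload the pickled list instead of re-downloading, e.g.:
    #   taxids = loadobj("ref_microbe_taxids_<timestamp>.pyobj")
    return taxids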
def diff_worker_old_vs_new(id_list_old, new_db_col_names, batch_num, diff_folder):
    new = create_backend(new_db_col_names)
    docs_common = new.mget_from_ids(id_list_old)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_old = list(set(id_list_old) - set(ids_common))
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'delete': id_in_old,
        'add': [],
        'update': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    summary = {"add": 0, "update": 0, "delete": len(id_in_old)}
    if len(id_in_old) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity once downloaded
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }
    return summary
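# A minimal worked example of the set arithmetic behind the two diff workers
# (this one computes 'delete'; diff_worker_new_vs_old below computes 'add' and
# 'update'); the ids are hypothetical:
def _example_diff_sets():
    old_ids = {"a", "b", "c"}
    new_ids = {"b", "c", "d"}
    to_delete = old_ids - new_ids  # in old only -> 'delete' entries
    to_add = new_ids - old_ids     # in new only -> 'add' entries
    common = old_ids & new_ids     # candidates for jsondiff 'update' entries
    assert (to_delete, to_add, common) == ({"a"}, {"d"}, {"b", "c"})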
def parse_gbff(self, gbff_files, job_manager):
    out_d = {}
    jobs = []
    got_error = False
    for infile in gbff_files:
        baseinfile = os.path.basename(infile)
        pinfo = self.get_pinfo()
        pinfo["step"] = "post-dump (gbff)"
        pinfo["description"] = baseinfile
        job = yield from job_manager.defer_to_process(
            pinfo, partial(parser_worker, infile))

        def parsed(res, fn):
            nonlocal out_d, got_error
            try:
                out_li = res.result()
                self.logger.info("%d records parsed from %s" % (len(out_li), fn))
                species = os.path.basename(fn).split('.')[0]
                out_d.setdefault(species, []).extend(out_li)
            except Exception as e:
                self.logger.error("Failed parsing gbff file '%s': %s" % (fn, e))
                got_error = e

        job.add_done_callback(partial(parsed, fn=infile))
        jobs.append(job)
        # stop the loop asap if error
        if got_error:
            raise got_error
    if jobs:
        yield from asyncio.gather(*jobs)
        if got_error:
            raise got_error
    # if we get here, result is ready to be dumped
    outfile = os.path.join(self.new_data_folder, 'rna.gbff.parsed.pyobj')
    self.logger.info("Dump gbff parsed data to '%s'" % outfile)
    dump(out_d, outfile, compress="lzma")
    # output gene2summary text file
    self.logger.info("Generate gene2summary")
    sumout = os.path.join(self.new_data_folder, 'gene2summary_all.txt')
    output_gene2summary(out_d, sumout)
    assert os.path.getsize(sumout) > 0
    # output gene2ec text file
    self.logger.info("Generate gene2ec")
    ecout = os.path.join(self.new_data_folder, 'gene2ec_all.txt')
    output_gene2ec(out_d, ecout)
    assert os.path.getsize(ecout) > 0
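# Sketch of the defer-and-callback pattern parse_gbff relies on, using the
# stdlib ProcessPoolExecutor as a stand-in for the biothings job_manager
# (the submitted callable and filename are placeholders, not parser_worker):
def _example_defer_with_callback():
    from concurrent.futures import ProcessPoolExecutor
    from functools import partial

    def on_done(fut, fn):
        # fut.result() re-raises any exception from the worker process,
        # mirroring the error handling in parsed() above
        print("%s -> %d records" % (fn, len(fut.result())))

    # run under an `if __name__ == "__main__":` guard on spawn-based platforms
    with ProcessPoolExecutor() as pool:
        fut = pool.submit(sorted, [3, 1, 2])
        fut.add_done_callback(partial(on_done, fn="example.gbff"))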
def reset_synced(self, diff_folder, backend=None):
    """Remove "synced" flag from any pyobj file in diff_folder"""
    diff_files = glob.glob(os.path.join(diff_folder, "*.pyobj"))
    for diff in diff_files:
        pyobj = loadobj(diff)
        try:
            if pyobj.get("synced"):
                if backend:
                    self.logger.info(
                        "Removing synced flag from '%s' for backend '%s'" %
                        (diff, backend))
                    pyobj["synced"].pop(backend, None)
                else:
                    self.logger.info("Removing synced flag from '%s'" % diff)
                    pyobj.pop("synced")
                dump(pyobj, diff)
        except AttributeError:
            # pyobj is not a dict
            continue
def diff_mapping(old, new, diff_folder):
    summary = {}
    old_build = get_src_build().find_one({"_id": old.target_collection.name})
    new_build = get_src_build().find_one({"_id": new.target_collection.name})
    if old_build and new_build:
        # mapping diff is always stored as jsondiff
        mapping_diff = jsondiff(old_build["mapping"], new_build["mapping"])
        if mapping_diff:
            file_name = os.path.join(diff_folder, "mapping.pyobj")
            dump(mapping_diff, file_name)
            md5 = md5sum(file_name)
            summary["mapping_file"] = {
                "name": os.path.basename(file_name),
                "md5sum": md5
            }
    else:
        logging.info("Neither '%s' nor '%s' has a mapping associated to it, skipping" %
                     (old.target_collection.name, new.target_collection.name))
    return summary
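# Illustration of a mapping diff using the real `jsonpatch` library (the
# project's jsondiff helper is assumed to produce a comparable RFC 6902
# operation list; the mappings are made up):
def _example_mapping_diff():
    import jsonpatch
    old_mapping = {"symbol": {"type": "text"}}
    new_mapping = {"symbol": {"type": "keyword"}, "taxid": {"type": "integer"}}
    patch = jsonpatch.make_patch(old_mapping, new_mapping)
    # patch.patch is a plain list of operations, suitable for pickling with dump()
    assert jsonpatch.apply_patch(old_mapping, patch) == new_mapping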
def diff_worker_new_vs_old(id_list_new, old_db_col_names, new_db_col_names,
                           batch_num, diff_folder, diff_func,
                           exclude=[], selfcontained=False):
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    docs_common = old.mget_from_ids(id_list_new)
    ids_common = [_doc['_id'] for _doc in docs_common]
    id_in_new = list(set(id_list_new) - set(ids_common))
    _updates = []
    if len(ids_common) > 0:
        _updates = diff_func(old, new, list(ids_common), exclude_attrs=exclude)
    file_name = os.path.join(diff_folder, "%s.pyobj" % str(batch_num))
    _result = {
        'add': id_in_new,
        'update': _updates,
        'delete': [],
        'source': new.target_name,
        'timestamp': get_timestamp()
    }
    if selfcontained:
        # store the full documents, not only their ids
        _result["add"] = new.mget_from_ids(id_in_new)
    summary = {"add": len(id_in_new), "update": len(_updates), "delete": 0}
    if len(_updates) != 0 or len(id_in_new) != 0:
        dump(_result, file_name)
        # compute md5 so users can check integrity once downloaded
        md5 = md5sum(file_name)
        summary["diff_file"] = {
            "name": os.path.basename(file_name),
            "md5sum": md5
        }
    return summary
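# Sketch of the integrity check implied by the "md5sum" field in the summaries
# above: recompute the digest on the downloaded file and compare (stdlib
# hashlib; the md5sum() helper above is assumed to compute the same hex digest):
def _example_verify_md5(path, expected_md5):
    import hashlib
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest() == expected_md5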
def finalize(self):
    '''dump target_dict into a file.'''
    from biothings.utils.common import dump
    dump(self.target_dict, self.target_name + '.pyobj')
def get_geneid_d(data_folder, species_li=None, load_cache=True,
                 save_cache=True, only_for={}):
    '''Return a dictionary mapping current/retired geneids to current geneids.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids. If species_li is None, genes from all species are
       loaded. Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species]["tax_id"] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(data_folder)

    # check cache file
    _cache_file = 'geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene_info.gz') and \
       file_newer(_cache_file, 'gene_history.gz'):
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(data_folder, 'gene_info.gz')
    if species_li:
        # keep genes for the requested species, optionally restricted to only_for
        species_filter = lambda ld: int(ld[0]) in taxid_set and (not only_for or ld[1] in only_for)
    elif only_for:
        species_filter = lambda ld: ld[1] in only_for
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))

    DATAFILE = os.path.join(data_folder, 'gene_history.gz')
    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)
    # includefn above makes sure taxid is for species_li and keeps only entries
    # whose mapped_to geneid exists in the gene_info list

    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    # TODO: this fills memory with key == value ...
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
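# The cache check above only trusts geneid_d.pyobj if it is newer than both
# source files; a stand-in for file_newer using os.path.getmtime directly
# (file_newer itself is assumed to compare modification times):
def _example_cache_is_fresh(cache_file, *source_files):
    import os
    if not os.path.exists(cache_file):
        return False
    cache_mtime = os.path.getmtime(cache_file)
    return all(cache_mtime > os.path.getmtime(src) for src in source_files)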
def sync_es_jsondiff_worker(diff_file, es_config, new_db_col_names, batch_size,
                            cnt, force=False, selfcontained=False, metadata={}):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    new = create_backend(new_db_col_names)  # mongo collection to sync from
    indexer = create_backend(es_config).target_esidxer
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("es"):
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
    for docs in iter_n(cur, batch_size):
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, doc["_id"], action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": doc["_id"], "file": diff_file, "error": e})
                    import pickle
                    with open("errors", "wb") as fout:
                        pickle.dump(errors, fout)
                    raise

    # update: get doc from indexer and apply diff
    batch = []
    ids = [p["_id"] for p in diff["update"]]
    for i, doc in enumerate(indexer.get_docs(ids)):
        try:
            patch_info = diff["update"][i]  # same order as what's returned by get_docs()...
            assert patch_info["_id"] == doc["_id"]  # ... but just make sure
            newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
            if newdoc == doc:
                # already applied
                res["skipped"] += 1
                continue
            batch.append(newdoc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += indexer.index_bulk(batch, batch_size)[0]
            batch = []
    if batch:
        res["updated"] += indexer.index_bulk(batch, batch_size)[0]

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("es", True)
    dump(diff, diff_file)
    return res
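# Minimal round-trip of the jsonpatch calls used by the sync workers
# (real `jsonpatch` API; the document and patch are made up):
def _example_jsonpatch():
    import jsonpatch
    doc = {"_id": "1017", "taxid": 9606}
    patch = [{"op": "add", "path": "/symbol", "value": "CDK2"}]
    assert jsonpatch.apply_patch(doc, patch) == \
        {"_id": "1017", "taxid": 9606, "symbol": "CDK2"}
    try:
        # removing a non-existent field raises JsonPatchConflict, which the
        # workers above count as "skipped" (assumed already applied)
        jsonpatch.apply_patch({}, [{"op": "remove", "path": "/symbol"}])
    except jsonpatch.JsonPatchConflict:
        pass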
def sync_mongo_jsondiff_worker(diff_file, old_db_col_names, new_db_col_names,
                               batch_size, cnt, force=False,
                               selfcontained=False, metadata={}):
    """Worker to sync data between a new and an old mongo collection"""
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff file was already synced
    if not force and diff.get("synced", {}).get("mongo"):
        logging.info("Diff file '%s' already synced, skip it" % diff_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    assert new.target_collection.name == diff["source"], \
        "Source is different in diff file '%s': %s" % (diff_file, diff["source"])

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False,
                         query={'_id': {'$in': diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use a generator, otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applied
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection, so invalidate the cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    diff.setdefault("synced", {}).setdefault("mongo", True)
    dump(diff, diff_file)
    return res
def load_all(data_folder):
    '''Load "uniprot" using yield, building the "PDB" and "PIR" data dicts
       while reading the data file. These dicts are then dumped (pickled)
       and stored later.'''

    def cvt_fn(pdb_id):
        return pdb_id.split(':')[0]

    def merge(xli, transcode=False):
        xli2 = []
        uniprot_acc, section, entrez_id, ensembl_id = xli
        if entrez_id:
            xli2.append((uniprot_acc, section, entrez_id))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                # if ensembl_id can be mapped to entrez_id
                entrez_id = ensembl2geneid[ensembl_id]
                for _eid in entrez_id:
                    xli2.append((uniprot_acc, section, _eid))
            except KeyError:
                xli2.append((uniprot_acc, section, ensembl_id))
        return xli2

    def transform(xli2):
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs

    def merge_x(xli, gene2x, transcode=False, cvt_fn=None, k=None):
        xli2 = []
        entrez_id, ensembl_id, x_value = xli
        if not x_value:
            return
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            xli2.append((entrez_id, x_value))
        elif ensembl_id:
            if not transcode:
                raise KeyError(ensembl_id)
            try:
                # if ensembl_id can be mapped to entrez_id
                entrez_id = x_ensembl2geneid[ensembl_id]
                for _eid in entrez_id:
                    xli2.append((_eid, x_value))
            except KeyError:
                xli2.append((ensembl_id, x_value))
        for x in xli2:
            gene2x.setdefault(x[0], []).append(x[1])

    uniprot_datafile = os.path.join(data_folder, 'idmapping_selected.tab.gz')
    t0 = time.time()
    # cache for uniprot
    ensembl2geneid = {}
    # cache for PDB and PIR
    x_ensembl2geneid = {}
    remains = []
    pdb_remains = []
    pir_remains = []
    # once filled, will be dumped for later storage
    gene2pdb = {}
    gene2pir = {}

    # store all PDB & PIR data while looping; the whole dicts are stored later
    for ld in tabfile_feeder(uniprot_datafile, header=1,
                             assert_column_no=VALID_COLUMN_NO):
        # UniProt data will be stored as we read, line by line
        xlis = []
        pdbxlis = []
        pirxlis = []
        # raw lines for each source
        uniprotld = [ld[0], ld[1], ld[2], ld[18]]
        pdbld = [ld[2], ld[19], ld[5]]
        pirld = [ld[2], ld[19], ld[11]]

        # UniProt
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=uniprotld, dup_idx=[2, 3], dup_sep='; '):
            value = list(value)
            value[1] = get_uniprot_section(value[1])
            value = tuple(value)
            xlis.append(value)

        # PDB
        for value in dupline_seperator(dupline=pdbld, dup_sep='; '):
            pdbxlis.append(value)

        # PIR
        for value in dupline_seperator(dupline=pirld, dup_sep='; '):
            pirxlis.append(value)

        for xli in xlis:
            # feed mapping
            if xli[2] != '' and xli[3] != '':
                ensembl2geneid.setdefault(xli[3], []).append(xli[2])
            try:
                # Postpone ensemblid->entrezid resolution while parsing uniprot,
                # as the full transcodification dict is only correct at the end.
                # Ex:
                #   1. UniprotID-A  EntrezID-A  EnsemblID
                #   2. UniprotID-B              EnsemblID
                #   3. UniprotID-C  EntrezID-B  EnsemblID
                # UniprotID-B should be associated to both EntrezID-A and
                # EntrezID-B, but we need to read up to line 3 to do so.
                xli2 = merge(xli, transcode=False)
                if not xli2:
                    continue
                docs = transform(xli2)
                for doc in docs:
                    yield doc
            except KeyError:
                remains.append(xli)

        for xli in pdbxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pdb, transcode=False, cvt_fn=cvt_fn, k="pdb")
            except KeyError:
                pdb_remains.append(xli)

        for xli in pirxlis:
            if xli[0] != '' and xli[1] != '':
                x_ensembl2geneid.setdefault(xli[1], []).append(xli[0])
            try:
                merge_x(xli, gene2pir, transcode=False)
            except KeyError:
                pir_remains.append(xli)

    # now transcode with what we have
    for remain in remains:
        try:
            xli2 = merge(remain, transcode=True)
            if not xli2:
                continue
            docs = transform(xli2)
            for doc in docs:
                yield doc
        except KeyError:
            pass

    for remain in pdb_remains:
        try:
            merge_x(remain, gene2pdb, transcode=True, cvt_fn=cvt_fn)
        except KeyError:
            pass

    for remain in pir_remains:
        try:
            merge_x(remain, gene2pir, transcode=True)
        except KeyError:
            pass

    def normalize(value, keyname):
        uniq = sorted(set(value))
        if len(uniq) > 1:
            res = {keyname: uniq}
        else:
            res = {keyname: uniq[0]}
        return res

    def normalize_pdb(value):
        return normalize(value, "pdb")

    def normalize_pir(value):
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb, pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir, pir_dumpfile)
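# Behavior of the normalize helper defined inside load_all, restated standalone
# for illustration: duplicates are removed and singleton lists collapse to a scalar:
def _example_normalize(value, keyname):
    uniq = sorted(set(value))
    return {keyname: uniq if len(uniq) > 1 else uniq[0]}

# _example_normalize(["1ABC", "1ABC"], "pdb")  -> {"pdb": "1ABC"}
# _example_normalize(["2XYZ", "1ABC"], "pdb")  -> {"pdb": ["1ABC", "2XYZ"]}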