def get_genome_in_bit(chr_fa_folder):
    '''Encode each chromosome fasta sequence into a bitarray,
       and store them in a dictionary with chr numbers as keys.
       chr_fa_folder is the folder containing all gzipped fasta files;
       fasta files can be downloaded from the NCBI FTP site:
       ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
       as chr<i>.fa.gz (e.g. chr1.fa.gz)
    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        # file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()   # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(t0)))
    return chr_bit_d
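# Every snippet in this collection calls timesofar() to pretty-print elapsed
# wall-clock time. The helper lives in biothings.utils.common; as a reference,
# here is a minimal sketch of what it does (an assumption for illustration,
# not the library's exact code -- formatting details may differ):
import time

def timesofar(t0, t1=None):
    """Return a human-readable string for the time elapsed since t0."""
    total = (t1 if t1 is not None else time.time()) - t0
    h, rem = divmod(int(total), 3600)
    m, s = divmod(rem, 60)
    if h:
        return '%dh%dm%ds' % (h, m, s)
    if m:
        return '%dm%ds' % (m, s)
    return '%.1fs' % total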
def process(self, iterable, batch_size):
    self.logger.info("Uploading to the DB...")
    t0 = time.time()
    tinner = time.time()
    total = 0
    for doc_li in self.doc_iterator(iterable, batch=True, batch_size=batch_size):
        try:
            bob = self.temp_collection.initialize_unordered_bulk_op()
            for d in doc_li:
                bob.insert(d)
            res = bob.execute()
            total += res['nInserted']
            self.logger.info("Inserted %s records [%s]" % (res['nInserted'], timesofar(tinner)))
        except BulkWriteError as e:
            self.logger.info(
                "Inserted %s records, ignoring %d [%s]" %
                (e.details['nInserted'], len(e.details["writeErrors"]), timesofar(tinner)))
        except Exception:
            raise
        tinner = time.time()
    self.logger.info('Done[%s]' % timesofar(t0))
    return total
def process(self, doc_d, batch_size):
    self.logger.info("Uploading to the DB...")
    t0 = time.time()
    tinner = time.time()
    # force step = 1
    cnt = 0
    total = 0
    dups = 0
    for doc_li in self.doc_iterator(doc_d, batch=True, batch_size=1):
        try:
            res = self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
            cnt += 1
            total += 1
            if (cnt + dups) % batch_size == 0:
                # we insert one by one, but display progress on a "batch_size" basis
                self.logger.info("Inserted %s records, ignoring %s [%s]" % (cnt, dups, timesofar(tinner)))
                cnt = 0
                dups = 0
                tinner = time.time()
        except DuplicateKeyError:
            dups += 1
    self.logger.info('Done[%s]' % timesofar(t0))
    return total
def redo_parse_gbff(path):
    '''Call this function manually to re-start the parsing step and set src_dump.
       This is used when main() is broken at the parsing step; parsing then needs
       to be re-started after the fix.
    '''
    # mark the download starts
    src_dump = get_src_dump()
    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    # mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def refresh_commands(cls):
    for num, info in sorted(cls.launched_commands.items()):
        # already processed, this command is now history
        # Note: if we have millions of commands there, it could last quite a while,
        # but IRL we only have a few
        if info.get("is_done") == True:
            continue
        is_done = {j.done() for j in info["jobs"]} == {True}
        has_err = is_done and [True for j in info["jobs"] if j.exception()] or None
        localoutputs = is_done and \
            ([str(j.exception()) for j in info["jobs"] if j.exception()] or
             [j.result() for j in info["jobs"]]) or None
        if is_done:
            cls.launched_commands[num]["is_done"] = True
            cls.launched_commands[num]["failed"] = has_err and has_err[0] or False
            cls.launched_commands[num]["results"] = localoutputs
            cls.launched_commands[num]["finished_at"] = time.time()
            cls.launched_commands[num]["duration"] = timesofar(
                t0=cls.launched_commands[num]["started_at"],
                t1=cls.launched_commands[num]["finished_at"])
            cls.save_cmd(num, cls.launched_commands[num])
            if not has_err and localoutputs and set(map(type, localoutputs)) == {str}:
                localoutputs = "\n" + "".join(localoutputs)
            cls.pending_outputs[num] = "[%s] %s {%s} %s: finished %s " % \
                (num, has_err and "ERR" or "OK", timesofar(info["started_at"]),
                 info["cmd"], localoutputs)
        else:
            cls.pending_outputs[num] = "[%s] RUN {%s} %s" % (
                num, timesofar(info["started_at"]), info["cmd"])
def process(self, doc_d, batch_size):
    self.logger.info("Uploading to the DB...")
    t0 = time.time()
    tinner = time.time()
    aslistofdict = None
    total = 0
    for doc_li in self.doc_iterator(doc_d, batch=True, batch_size=batch_size):
        toinsert = len(doc_li)
        nbinsert = 0
        self.logger.info("Inserting %s records ... " % toinsert)
        try:
            bob = self.temp_collection.initialize_unordered_bulk_op()
            for d in doc_li:
                aslistofdict = d.pop("__aslistofdict__", None)
                bob.insert(d)
            res = bob.execute()
            nbinsert += res["nInserted"]
            self.logger.info("OK [%s]" % timesofar(tinner))
        except BulkWriteError as e:
            inserted = e.details["nInserted"]
            nbinsert += inserted
            self.logger.info("Fixing %d records " % len(e.details["writeErrors"]))
            ids = [d["op"]["_id"] for d in e.details["writeErrors"]]
            # build hash of existing docs
            docs = self.temp_collection.find({"_id": {"$in": ids}})
            hdocs = {}
            for doc in docs:
                hdocs[doc["_id"]] = doc
            bob2 = self.temp_collection.initialize_unordered_bulk_op()
            for err in e.details["writeErrors"]:
                errdoc = err["op"]
                existing = hdocs[errdoc["_id"]]
                assert "_id" in existing
                _id = errdoc.pop("_id")
                merged = merge_struct(errdoc, existing, aslistofdict=aslistofdict)
                bob2.find({"_id": _id}).update_one({"$set": merged})
                # update previously fetched doc: if several errors are about the same
                # doc id, we wouldn't merge things properly without an updated document
                assert "_id" in merged
                hdocs[_id] = merged
                nbinsert += 1
            res = bob2.execute()
            self.logger.info("OK [%s]" % timesofar(tinner))
        assert nbinsert == toinsert, "nb %s to %s" % (nbinsert, toinsert)
        # end of loop so it counts the time spent in doc_iterator
        tinner = time.time()
        total += nbinsert
    self.logger.info('Done[%s]' % timesofar(t0))
    self.switch_collection()
    self.post_update_data()
    return total
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']
    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))
    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                           sync_target.name))
    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer
        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000,
                                   inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))
        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000,
                                   inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
def load(self, genedoc_d=None, update_data=True, update_master=True,
         test=False, step=10000):
    if not self.temp_collection:
        self.make_temp_collection()
    self.temp_collection.drop()   # drop all existing records just in case
    if update_data:
        genedoc_d = genedoc_d or self.load_genedoc()
        print("genedoc_d mem: %s" % sys.getsizeof(genedoc_d))
        print("Uploading to the DB...", end='')
        t0 = time.time()
        # for doc in self.doc_iterator(genedoc_d, batch=False):
        #     if not test:
        #         doc.save()
        for doc_li in self.doc_iterator(genedoc_d, batch=True, step=step):
            if not test:
                self.temp_collection.insert(doc_li, manipulate=False,
                                            check_keys=False)
        print('Done[%s]' % timesofar(t0))
        self.switch_collection()
        if getattr(self, 'ENTREZ_GENEDOC_ROOT', False):
            print('Uploading "geneid_d" to GridFS...', end='')
            t0 = time.time()
            geneid_d = self.get_geneid_d()
            dump2gridfs(geneid_d, self.__collection__ + '__geneid_d.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))
        if getattr(self, 'ENSEMBL_GENEDOC_ROOT', False):
            print('Uploading "mapping2entrezgene" to GridFS...', end='')
            t0 = time.time()
            x2entrezgene_list = self.get_mapping_to_entrez()
            dump2gridfs(x2entrezgene_list,
                        self.__collection__ + '__2entrezgene_list.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))
    if update_master:
        # update src_master collection
        if not test:
            _doc = {"_id": str(self.__collection__),
                    "name": str(self.__collection__),
                    "timestamp": datetime.datetime.now()}
            for attr in ['ENTREZ_GENEDOC_ROOT', 'ENSEMBL_GENEDOC_ROOT', 'id_type']:
                if hasattr(self, attr):
                    _doc[attr] = getattr(self, attr)
            if hasattr(self, 'get_mapping'):
                _doc['mapping'] = getattr(self, 'get_mapping')()
            coll = conn[GeneDocSourceMaster.__database__][GeneDocSourceMaster.__collection__]
            dkey = {"_id": _doc["_id"]}
            prev = coll.find_one(dkey)
            if prev:
                coll.replace_one(dkey, _doc)
            else:
                coll.insert_one(_doc)
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False,
               query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}}).
       batch_callback is a callback function as fn(cnt, t), called after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    cur = collection.find(query, no_cursor_timeout=False, projection=fields)
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving %d documents from database "%s".' % (n, collection.name))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping %d documents." % s)
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing %d-%d documents..." % (cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time() - t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing %d-%d documents..." % (cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            # Important: need to yield the last batch here
            yield doc_li
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: %s]' % timesofar(t0))
    finally:
        cur.close()
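# A hypothetical usage sketch for doc_feeder() above -- the collection handle
# (db.genedoc) and the query are made up for illustration: iterate over
# human/mouse/rat docs in batches of 1000 and report progress via the callback.
def report(cnt, t):
    print("processed %d docs so far (last batch took %.1fs)" % (cnt, t))

for batch in doc_feeder(db.genedoc, step=1000, inbatch=True,
                        query={'taxid': {'$in': [9606, 10090, 10116]}},
                        batch_callback=report):
    for doc in batch:
        pass   # do something with each doc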
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)
    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))
    target.finalize()
    print("\n")
    print("Finished.", timesofar(t0))
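# _add_docs() above relies on iter_n() to walk a list of ids in fixed-size chunks.
# The helper comes from biothings.utils.common; a minimal equivalent sketch
# (an assumption for illustration, not the library's exact code):
def iter_n(iterable, n):
    """Yield successive lists of at most n items from iterable."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == n:
            yield chunk
            chunk = []
    if chunk:
        yield chunk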
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)), end='')
        _ids = id_list[i:i + step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i * 100. / n, timesofar(t1)))
    print("=" * 20)
    print('Finished.[total time: %s]' % timesofar(t0))
def check_src_upload(self):
    running_processes = self.running_processes_upload
    jobs_finished = []
    if running_processes:
        self.idle = True
        print('Dispatcher: {} active job(s)'.format(len(running_processes)))
        print(get_process_info(running_processes))
    for src in running_processes:
        p = running_processes[src]
        returncode = p.poll()
        if returncode is None:
            p.log_f.flush()
        else:
            t1 = round(time.time() - p.t0, 0)
            d = {
                'upload.returncode': returncode,
                'upload.timestamp': datetime.now(),
                'upload.time_in_s': t1,
                'upload.time': timesofar(p.t0),
                'upload.logfile': p.logfile,
                'upload.status': "success" if returncode == 0 else "failed"
            }
            mark_upload_done(src, d)
            jobs_finished.append(src)
            p.log_f.close()
            if returncode == 0:
                msg = 'Dispatcher: "{}" uploader finished successfully with code {} (time: {})'.format(
                    src, returncode, timesofar(p.t0, t1=t1))
                print(msg)
                if hipchat_msg:
                    msg += '<a href="{}/log/dump/{}">dump log</a>'.format(DATA_WWW_ROOT_URL, src)
                    msg += '<a href="{}/log/upload/{}">upload log</a>'.format(DATA_WWW_ROOT_URL, src)
                    hipchat_msg(msg, message_format='html', color="green")
                source_upload_success.send(self, src_name=src)
            else:
                msg = 'Dispatcher: "{}" uploader failed with code {} (time: {}s)'.format(
                    src, returncode, t1)
                print(msg)
                if hipchat_msg:
                    hipchat_msg(msg, color="red")
                source_upload_failed.send(self, src_name=src)
    for src in jobs_finished:
        del running_processes[src]
def checkmem(self, pinfo=None):
    mem_req = pinfo and pinfo.get("__reqs__", {}).get("mem") or 0
    t0 = time.time()
    waited = False
    sleep_time = 5
    if mem_req:
        logger.info("Job {cat:%s,source:%s,step:%s} requires %s memory, checking if available" %
                    (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"),
                     sizeof_fmt(mem_req)))
    if self.max_memory_usage:
        hub_mem = self.hub_memory
        while hub_mem >= self.max_memory_usage:
            logger.info("Hub is using too much memory to launch job {cat:%s,source:%s,step:%s} "
                        "(%s used, more than max allowed %s), wait a little "
                        "(job's already been postponed for %s)" %
                        (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"),
                         sizeof_fmt(hub_mem), sizeof_fmt(self.max_memory_usage), timesofar(t0)))
            yield from asyncio.sleep(sleep_time)
            waited = True
            hub_mem = self.hub_memory
    if mem_req:
        # max allowed mem is either the limit we set or the OS limit
        max_mem = self.max_memory_usage and self.max_memory_usage or self.avail_memory
        # TODO: check projected memory (jobs with mem requirements currently running,
        # as those jobs may not have reached their max mem usage yet)
        hub_mem = self.hub_memory
        while mem_req >= (max_mem - hub_mem):
            logger.info("Job {cat:%s,source:%s,step:%s} needs %s to run, not enough to launch it "
                        "(hub consumes %s while max allowed is %s), wait a little "
                        "(job's already been postponed for %s)" %
                        (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"),
                         sizeof_fmt(mem_req), sizeof_fmt(hub_mem), sizeof_fmt(max_mem),
                         timesofar(t0)))
            yield from asyncio.sleep(sleep_time)
            waited = True
            # refresh limits and usage (manager can be modified from the hub,
            # thus memory usage can be modified on-the-fly)
            hub_mem = self.hub_memory
            max_mem = self.max_memory_usage and self.max_memory_usage or self.avail_memory
    pendings = len(self.process_queue._pending_work_items.keys()) - config.HUB_MAX_WORKERS
    while pendings >= config.MAX_QUEUED_JOBS:
        if not waited:
            logger.info("Can't run job {cat:%s,source:%s,step:%s} right now, too many pending "
                        "jobs in the queue (max: %s), will retry until possible" %
                        (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"),
                         config.MAX_QUEUED_JOBS))
        yield from asyncio.sleep(sleep_time)
        pendings = len(self.process_queue._pending_work_items.keys()) - config.HUB_MAX_WORKERS
        waited = True
    if waited:
        logger.info("Job {cat:%s,source:%s,step:%s} can now be launched (total waiting time: %s)" %
                    (pinfo.get("category"), pinfo.get("source"), pinfo.get("step"), timesofar(t0)))
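# checkmem() formats byte counts with sizeof_fmt(), also from biothings.utils.common.
# A minimal sketch of such a helper (assumed implementation, shown for context only):
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable size, e.g. 1536 -> '1.5KiB'."""
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Ei', suffix)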
def register_status(self, status, transient=False, **extra):
    try:
        # if status is "failed", depending on where it failed we may not be able
        # to get the new_data_folder (if the dumper didn't reach the release
        # information, for instance). Default to current if failing.
        data_folder = self.new_data_folder
    except DumperException:
        data_folder = self.current_data_folder
    self.src_doc = {
        '_id': self.src_name,
        'data_folder': data_folder,
        'release': getattr(self, self.__class__.SUFFIX_ATTR),
        'download': {
            'logfile': self.logfile,
            'started_at': datetime.now(),
            'status': status
        }
    }
    # only register time when it's a final state
    if transient:
        self.src_doc["download"]["pid"] = os.getpid()
    else:
        self.src_doc["download"]["time"] = timesofar(self.t0)
    if "download" in extra:
        self.src_doc["download"].update(extra["download"])
    else:
        self.src_doc.update(extra)
    self.src_dump.save(self.src_doc)
def load_broadinstitute_exac():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()
    for k, v in load_broadinstitute_exac_nontcga().items():
        try:
            exacs[k]["exac"]["nontcga"] = v["exac"]["nontcga"]
        except KeyError:
            exacs[k] = v
    for k, v in load_broadinstitute_exac_nonpsych().items():
        try:
            exacs[k]["exac"]["nonpsych"] = v["exac"]["nonpsych"]
        except KeyError:
            exacs[k] = v

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")
    for line in tabfile_feeder(os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")):
        _, ensid, transid, _ = line
        if transid in exacs:
            data = exacs.pop(transid)   # pop so no-match means no data in the end
            for entrezid in ensembl2entrez.get(ensid, [ensid]):
                exacs[entrezid] = data
    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))
    return exacs
def main(no_confirm=True):
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)
    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': latest_lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
def load_x(idx, fieldname, cvt_fn=None):
    '''idx is 0-based column number'''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1, assert_column_no=VALID_COLUMN_NO):
        ld = listitems(ld, *(2, 19, idx))   # GeneID, Ensembl(Gene), target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if isinstance(value, list) else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
def extract_worker_info(self, worker):
    info = OrderedDict()
    proc = worker.get("process")
    err = worker.get("err") and " !" or ""
    info["pid"] = str(worker["info"]["id"]) + err
    info["source"] = norm(worker["info"].get("source") or "", 25)
    info["category"] = norm(worker["info"].get("category") or "", 10)
    info["step"] = norm(worker["info"].get("step") or "", 20)
    info["description"] = norm(worker["info"].get("description") or "", 30)
    info["mem"] = proc and sizeof_fmt(proc.memory_info().rss)
    info["cpu"] = proc and "%.1f%%" % proc.cpu_percent()
    info["started_at"] = worker.get("started_at") or ""
    if worker.get("duration"):
        info["duration"] = worker["duration"]
    else:
        info["duration"] = timesofar(worker.get("started_at", 0))
    info["files"] = []
    if proc:
        for pfile in proc.open_files():
            # skip 'a' (logger)
            if pfile.mode == 'r':
                finfo = OrderedDict()
                finfo["path"] = pfile.path
                finfo["read"] = sizeof_fmt(pfile.position)
                size = os.path.getsize(pfile.path)
                finfo["size"] = sizeof_fmt(size)
                info["files"].append(finfo)
    return info
def register_status(self, status, **extra):
    """Register step status, i.e. status for a sub-resource"""
    upload_info = {"status": status}
    upload_info.update(extra)
    job_key = "upload.jobs.%s" % self.name
    if status == "uploading":
        # record some "in-progress" information
        upload_info['step'] = self.name   # this is the actual collection name
        upload_info['temp_collection'] = self.temp_collection_name
        upload_info['pid'] = os.getpid()
        upload_info['logfile'] = self.logfile
        upload_info['started_at'] = datetime.datetime.now()
        self.src_dump.update_one({"_id": self.main_source},
                                 {"$set": {job_key: upload_info}})
    else:
        # only register time when it's a final state;
        # also, keep previous uploading information
        upd = {}
        for k, v in upload_info.items():
            upd["%s.%s" % (job_key, k)] = v
        t1 = round(time.time() - self.t0, 0)
        upd["%s.status" % job_key] = status
        upd["%s.time" % job_key] = timesofar(self.t0)
        upd["%s.time_in_s" % job_key] = t1
        upd["%s.step" % job_key] = self.name   # collection name
        self.src_dump.update_one({"_id": self.main_source}, {"$set": upd})
def extract_worker_info(self, worker):
    info = OrderedDict()
    proc = worker.get("process", worker)
    err = worker.get("err") and " !" or ""
    info["pid"] = str(worker["job"]["id"]) + err
    info["source"] = norm(worker["job"].get("source") or "", 25)
    info["category"] = norm(worker["job"].get("category") or "", 10)
    info["step"] = norm(worker["job"].get("step") or "", 20)
    info["description"] = norm(worker["job"].get("description") or "", 30)
    info["mem"] = sizeof_fmt(proc.get("memory", {}).get("size", 0.0))
    info["cpu"] = "%.1f%%" % proc.get("cpu", {}).get("percent", 0.0)
    info["started_at"] = worker["job"]["started_at"]
    if worker.get("duration"):
        info["duration"] = worker["duration"]
    else:
        info["duration"] = timesofar(worker["job"]["started_at"])
    # for now, don't display files used by the process
    info["files"] = []
    return info
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False
    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        # handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return
    if len(job.result()) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(
            len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))
    if shutdown_ipengines_after_done:
        print("\tshutting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
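# A hypothetical usage sketch for run_jobs_on_ipythoncluster() above. The worker
# function must be importable on the engines; the trivial task here is made up
# purely for illustration.
def double(task):
    return task * 2

results = run_jobs_on_ipythoncluster(double, list(range(100)))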
def handle_src_build(self):
    # clean up src and target collections
    src_clean_archives(noconfirm=True)
    target_clean_collections(noconfirm=True)
    for config in ('mygene', 'mygene_allspecies'):
        t0 = time.time()
        p = Popen(['python', '-m', 'databuild.builder', config], cwd=src_path)
        returncode = p.wait()
        t = timesofar(t0)
        if returncode == 0:
            msg = 'Dispatcher: "{}" builder finished successfully with code {} (time: {})'.format(
                config, returncode, t)
            color = "green"
        else:
            msg = 'Dispatcher: "{}" builder failed with code {} (time: {})'.format(
                config, returncode, t)
            color = "red"
        print(msg)
        if hipchat_msg:
            msg += '<a href="{}/log/build/{}">build log</a>'.format(DATA_WWW_ROOT_URL, config)
            hipchat_msg(msg, message_format='html', color=color)
        assert returncode == 0, "Subprocess failed. Check error above."
    genedoc_merged.send(self)
def get_thread_summary(self):
    running_tids = self.get_thread_files()
    tchildren = self.thread_queue._threads
    res = {}
    for child in tchildren:
        res[child.name] = {
            "is_alive": child.is_alive(),
            "is_daemon": child.daemon,
        }
        if child.name in running_tids:
            # something is running on that child thread
            worker = running_tids[child.name]
            res[child.name]["job"] = {
                "started_at": worker["job"]["started_at"],
                "duration": timesofar(worker["job"]["started_at"], 0),
                "func_name": worker["func_name"],
                "category": worker["job"]["category"],
                "description": worker["job"]["description"],
                "source": worker["job"]["source"],
                "step": worker["job"]["step"],
                "id": worker["job"]["id"],
            }
    return res
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()
    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    # mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def _db_upload(self, doc_li, step=10000, verbose=True):
    import time
    from biothings.utils.common import timesofar
    from utils.dataload import list2dict, list_itemcnt, listsort

    output = []
    t0 = time.time()
    for i in range(0, len(doc_li), step):
        output.extend(self.target_db.update(doc_li[i:i + step]))
        if verbose:
            print('\t%d-%d Done [%s]...' % (i + 1, min(i + step, len(doc_li)), timesofar(t0)))
    res = list2dict(list_itemcnt([x[0] for x in output]), 0)
    print("Done![%s, %d OK, %d Error]" % (timesofar(t0), res.get(True, 0), res.get(False, 0)))
    res = listsort(list_itemcnt([x[2].args[0] for x in output if x[0] is False]), 1, reverse=True)
    print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
    if len(res) > 10:
        print("\t%d lines omitted..." % (len(res) - 10))
def main(daemon=False):
    running_processes = {}
    while 1:
        src_to_update_li = check_mongo()
        if src_to_update_li:
            print('\nDispatcher: found pending jobs ', src_to_update_li)
            for src_to_update in src_to_update_li:
                if src_to_update not in running_processes:
                    mark_upload_started(src_to_update)
                    p = dispatch(src_to_update)
                    src_dump.update({'_id': src_to_update},
                                    {"$set": {"upload.pid": p.pid}})
                    p.t0 = time.time()
                    running_processes[src_to_update] = p
        jobs_finished = []
        if running_processes:
            print('Dispatcher: {} active job(s)'.format(len(running_processes)))
            print(get_process_info(running_processes))
        for src in running_processes:
            p = running_processes[src]
            returncode = p.poll()
            if returncode is not None:
                t1 = round(time.time() - p.t0, 0)
                d = {
                    'upload.returncode': returncode,
                    'upload.timestamp': datetime.now(),
                    'upload.time_in_s': t1,
                    'upload.time': timesofar(p.t0),
                    'upload.logfile': p.logfile,
                }
                if returncode == 0:
                    print('Dispatcher: {} finished successfully with code {} (time: {}s)'.format(
                        src, returncode, t1))
                    d['upload.status'] = "success"
                else:
                    print('Dispatcher: {} failed with code {} (time: {}s)'.format(
                        src, returncode, t1))
                    d['upload.status'] = "failed"
                mark_upload_done(src, d)
                jobs_finished.append(src)
                p.log_f.close()
            else:
                p.log_f.flush()
        for src in jobs_finished:
            del running_processes[src]
        if running_processes:
            time.sleep(10)
        else:
            if daemon:
                # continue to monitor the src_dump collection
                print("{}".format('\b' * 50), end='')
                for i in range(100):
                    print('\b' * 2 + [chr(8212), '\\', '|', '/'][i % 4], end='')
                    time.sleep(0.1)
            else:
                break
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically without intervention.'''
    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)
    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)
    # mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
def get_process_summary(self):
    running_pids = self.get_pid_files()
    res = {}
    for child in self.pchildren:
        try:
            mem = child.memory_info().rss
            pio = child.io_counters()
            # TODO: cpu as reported here isn't reliable; the only way to get something
            # consistent is to call cpu_percent() with a waiting-time argument to
            # integrate CPU activity over that time, but this is a blocking call and
            # freezes the hub (an async implementation might be possible though).
            # Currently, pchildren is a list set at init time where process objects
            # are stored, so subsequent cpu_percent() calls should report CPU activity
            # since the last call (between /job_manager & top() calls), but it
            # consistently returns CPU > 100% even when no thread is running (that
            # could have been the explanation, but it's not).
            cpu = child.cpu_percent()
            res[child.pid] = {
                "memory": {
                    "size": mem,
                    "percent": child.memory_percent(),
                },
                "cpu": {
                    # override status() when we have cpu activity to avoid having a
                    # "sleeping" process that's actually running something (probably
                    # happening because of the delay between status() and cpu_percent(),
                    # like a race condition)
                    "status": cpu > 0.0 and "running" or child.status(),
                    "percent": cpu
                },
                "io": {
                    "read_count": pio.read_count,
                    "write_count": pio.write_count,
                    "read_bytes": pio.read_bytes,
                    "write_bytes": pio.write_bytes
                }
            }
            if child.pid in running_pids:
                # something is running on that child process
                worker = running_pids[child.pid]
                res[child.pid]["job"] = {
                    "started_at": worker["job"]["started_at"],
                    "duration": timesofar(worker["job"]["started_at"], 0),
                    "func_name": worker["func_name"],
                    "category": worker["job"]["category"],
                    "description": worker["job"]["description"],
                    "source": worker["job"]["source"],
                    "step": worker["job"]["step"],
                    "id": worker["job"]["id"],
                }
        except psutil.NoSuchProcess as e:
            print("child not found %s %s" % (child, e))
            continue
    return res
def apply_changes(self, changes):
    step = self.step
    target_col = self._target_col
    source_col = self._db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocMongoDBBackend(target_col)
    _timestamp = changes['timestamp']
    t0 = time.time()
    if changes['add']:
        logging.info("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        for _ids in iter_n(changes['add'], step):
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
            target.insert(_doc_li)
        logging.info("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        logging.info("Deleting {} discontinued docs...".format(len(changes['delete'])))
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        logging.info("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        logging.info("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        i = 0
        t1 = time.time()
        for _diff in changes['update']:
            target.update_diff(_diff, extra={'_timestamp': _timestamp})
            i += 1
            if i > 1 and i % step == 0:
                logging.info('\t{}\t{}'.format(i, timesofar(t1)))
                t1 = time.time()
        logging.info("done. [{}]".format(timesofar(t00)))
    logging.info("\n")
    logging.info("Finished. %s" % timesofar(t0))
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    noconfirm = '-b' in sys.argv
    if config == 'clean':
        clean_target_collection()
    else:
        t0 = time.time()
        build_index(config, use_parallel=use_parallel, noconfirm=noconfirm)
        print("Finished.", timesofar(t0))
def _finished(self, _doc, _job):
    doc = self._col.find_one({'_id': self._id})
    job = doc["jobs"][-1]
    t0 = job["step_started_at"].timestamp()
    job["time_in_s"] = round(time() - t0, 0)
    job["time"] = timesofar(t0)
    if self.regx:
        merge(doc, _doc)
    merge(job, _job)
    self._col.replace_one({"_id": self._id}, doc)
def doc_feeder(self, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
    q = query if query else {'query': {'match_all': {}}}
    _q_cnt = self.count(q=q, raw=True)
    n = _q_cnt['count']
    n_shards = _q_cnt['_shards']['total']
    assert n_shards == _q_cnt['_shards']['successful']
    _size = int(step / n_shards)
    assert _size * n_shards == step
    cnt = 0
    t0 = time.time()
    if verbose:
        print('\ttotal docs: {}'.format(n))
        t1 = time.time()
    res = self._es.search(self._index, self._doc_type, body=q,
                          size=_size, search_type='scan', scroll=scroll, **kwargs)
    # double-check that the initial scroll request returns no hits
    assert len(res['hits']['hits']) == 0
    while 1:
        if verbose:
            t1 = time.time()
            if cnt < n:
                print('\t{}-{}...'.format(cnt + 1, min(cnt + step, n)), end='')
        res = self._es.scroll(res['_scroll_id'], scroll=scroll)
        if len(res['hits']['hits']) == 0:
            break
        else:
            for doc in res['hits']['hits']:
                yield doc['_source']
                cnt += 1
            if verbose:
                print('done.[%.1f%%,%s]' % (min(cnt, n) * 100. / n, timesofar(t1)))
    if verbose:
        print("Finished! [{}]".format(timesofar(t0)))
    assert cnt == n, "Error: scroll query terminated early [{}, {}], please retry.\nLast response:\n{}".format(cnt, n, res)
def process(self, iterable, batch_size):
    self.logger.info("Uploading to the DB...")
    t0 = time.time()
    tinner = time.time()
    total = 0
    for doc_li in self.doc_iterator(iterable, batch=True, batch_size=batch_size):
        try:
            bob = self.temp_collection.initialize_unordered_bulk_op()
            for d in doc_li:
                bob.find({"_id": d["_id"]}).upsert().replace_one(d)
            res = bob.execute()
            nb = res["nUpserted"] + res["nModified"]
            total += nb
            self.logger.info("Upserted %s records [%s]" % (nb, timesofar(tinner)))
        except Exception:
            raise
        tinner = time.time()
    self.logger.info('Done[%s]' % timesofar(t0))
    return total
def parse_vcf(assembly, vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    t0 = time.time()
    # note: the "compressed" parameter is overridden based on the file extension
    # (the original code had "compressed ==", a no-op comparison)
    compressed = vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        if by_id:
            # one hgvs id, one doc
            if doc['_id']:
                if isinstance(doc['_id'], list):
                    for i, _id in enumerate(doc['_id']):
                        _doc = copy.copy(doc)
                        _doc['alt'] = doc['alt'][i]
                        _doc[assembly] = doc[assembly][i]
                        _doc['_id'] = _id
                        yield _doc
                        cnt_2 += 1
                        if verbose:
                            logging.info("%s\t%s" % (_doc['rsid'], _doc['_id']))
                else:
                    yield doc
                    cnt_2 += 1
                    if verbose:
                        logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s" % (doc['rsid'], doc['_id']))
            else:
                cnt_3 += 1
        cnt_1 += 1
    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
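# A hypothetical usage sketch for parse_vcf() above. The file name and the tabix
# keyword arguments are made up for illustration; the accepted parameter names
# depend on the underlying tabix reader used by the VCF library (e.g. pysam's
# fetch(reference=..., start=..., end=...)).
for doc in parse_vcf('hg19', 'dbsnp_chr1.vcf.gz', verbose=False,
                     reference='1', start=1000000, end=2000000):
    print(doc['_id'], doc.get('rsid'))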
def process(self, doc_d, batch_size):
    self.logger.info("Uploading to the DB...")
    t0 = time.time()
    total = 0
    for doc_li in self.doc_iterator(doc_d, batch=True, batch_size=batch_size):
        self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
        total += len(doc_li)
    self.logger.info('Done[%s]' % timesofar(t0))
    return total
def load_exons_for_species(species, exons_key='exons'):
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    load_start(refflat_file)
    t0 = time.time()
    ref2exons = {}
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chr = ld[2]
        if chr.startswith('chr'):
            chr = chr[3:]
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq, []).append({
            'transcript': refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })
    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])
    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def backup_timestamp(self, outfile=None, compress=True):
    '''Backup "_id" and "_timestamp" fields into an output file.'''
    ts = time.strftime('%Y%m%d')
    outfile = outfile or self._target_col.name + '_tsbk_' + ts + '.txt'
    if compress:
        outfile += '.bz'
        import bz2
    logging.info('Backing up timestamps into "{}"...'.format(outfile))
    t0 = time.time()
    file_handler = bz2.BZ2File if compress else open
    with file_handler(outfile, 'wb') as out_f:
        for doc in doc_feeder(self._target_col, step=100000, fields=['_timestamp']):
            data = '%s\t%s\n' % (doc['_id'], doc['_timestamp'].strftime('%Y%m%d'))
            out_f.write(data.encode())
    logging.info("Done. %s" % timesofar(t0))
    return outfile
def doc_feeder0(collection, step=1000, s=None, e=None, inbatch=False):
    '''An iterator for returning docs in a collection, with batch query.'''
    n = collection.count()
    s = s or 1
    e = e or n
    print('Found %d documents in database "%s".' % (n, collection.name))
    for i in range(s - 1, e + 1, step):
        print("Processing %d-%d documents..." % (i + 1, i + step), end='')
        t0 = time.time()
        res = collection.find(skip=i, limit=step, timeout=False)
        if inbatch:
            yield res
        else:
            for doc in res:
                yield doc
        print('Done.[%s]' % timesofar(t0))
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention
    refseq_release = get_refseq_release()
    # note: logging.info() doesn't accept an "end" keyword like print(), so the
    # release is logged in a single call
    logging.info("Checking latest refseq release:\t%s" % refseq_release)
    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
def load_cpdb(__metadata__):
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]
            if p_source in PATHWAY_SOURCES_INCLUDED:
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                _d[p_source].sort()
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))
    return _out
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print("Found {} species folders.".format(len(species_li)))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print(species, end='...')
        if species == 'Homo_sapiens':
            gene2exons.update(load_exons_for_human())
        elif species == 'Mus_musculus':
            gene2exons.update(load_exons_for_mouse())
        else:
            gene2exons.update(load_exons_for_species(species))
    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))
    return gene2exons
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return
    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))
    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)
        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                # os.remove(dumpfile)
            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    sources = None   # will build all sources
    target = None    # will generate a new collection name
    # "target_col:src_col1,src_col2" will specifically merge src_col1
    # and src_col2 into existing target_col (instead of merging everything)
    if not use_parallel and len(sys.argv) > 2:
        target, tmp = sys.argv[2].split(":")
        sources = tmp.split(",")
    t0 = time.time()
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    bdr.using_ipython_cluster = use_parallel
    bdr.merge(sources=sources, target=target)
    logging.info("Finished. %s" % timesofar(t0))
def main(no_confirm=True):
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)
    # mark the download starts
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True   # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
def merge(self, step=100000, restart_at=0, sources=None, target=None):
    t0 = time.time()
    self.validate_src_collections(sources)
    self.prepare_target(target_name=target)
    self.log_building_start()
    try:
        if self.using_ipython_cluster:
            if sources:
                raise NotImplementedError("merging specific sources is not supported when using parallel")
            self._merge_ipython_cluster(step=step)
        else:
            self._merge_local(step=step, restart_at=restart_at, src_collection_list=sources)
        if self.target.name == 'es':
            logging.info("Updating metadata...")
            self.update_mapping_meta()
        t1 = round(time.time() - t0, 0)
        t = timesofar(t0)
        self.log_src_build({'status': 'success', 'time': t,
                            'time_in_s': t1, 'timestamp': datetime.now()})
    finally:
        # do a simple validation here
        if getattr(self, '_stats', None):
            logging.info("Validating...")
            target_cnt = self.target.count()
            if target_cnt == self._stats['total_genes']:
                logging.info("OK [total count={}]".format(target_cnt))
            else:
                logging.info("Warning: total count of gene documents does not match "
                             "[{}, should be {}]".format(target_cnt, self._stats['total_genes']))
        if self.merge_logging:
            sys.stdout.close()