def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None):
    """Build an ES index from the last successfully-merged mongodb collection.

    The optional "es_host" argument can be used to specify another ES host,
    otherwise the default ES_HOST is used.
    The optional "es_index_name" argument can be used to pass an alternative
    index name, otherwise the mongodb collection name is used.
    """
    from pprint import pprint
    self.load_build_config(build_config)
    last_build = self._build_config['build'][last_build_idx]
    print "Last build record:"
    pprint(last_build)
    assert last_build['status'] == 'success', \
        "Abort. Last build did not succeed."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build needs to be built using the "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'

    #target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)   ######
    _db = get_target_db()
    target_collection = _db[target_collection]
    print
    print 'Source: ', target_collection.name

    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta

    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default is 5
    print "ES host:", es_idxer.conn.servers[0].geturl()
    print "ES index:", es_index_name

    if ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
            if ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            else:
                print "Abort."
                return
        es_idxer.create_index()
        #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print "Aborted."
        return

    if ask('Delete collection "{}"'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print 'Done, collection "{}" was dropped.'.format(target_collection.name)
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print "ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE)
        if ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print "Aborted."
    else:
        print "Error: target collection is not ready yet or failed to build."
def sync_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print '\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count())
    print '\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count())
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
def es_clean_indices(keep_last=2, es_host=None, verbose=True, noconfirm=False, dryrun=False):
    '''clean up es indices, only keep last <keep_last> number of indices.'''
    conn = get_es(es_host)
    index_li = list(conn.indices.get_aliases().keys())
    if verbose:
        print("Found {} indices".format(len(index_li)))

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + '_(\d{8})_\w{8}'
        _li = []
        for index in index_li:
            mat = re.match(pat, index)
            if mat:
                _li.append((mat.group(1), index))
        _li.sort()   # older indices appear first
        # keep the last <keep_last> newer indices
        index_to_remove = [x[1] for x in _li[:-keep_last]]
        if len(index_to_remove) > 0:
            print("{} \"{}*\" indices will be removed.".format(len(index_to_remove), prefix))
            if verbose:
                for index in index_to_remove:
                    print('\t', index)
            if noconfirm or ask("Continue?") == 'Y':
                for index in index_to_remove:
                    if dryrun:
                        print("dryrun=True, nothing is actually deleted")
                    else:
                        conn.indices.delete(index)
                print("Done.[%s indices removed]" % len(index_to_remove))
            else:
                print("Aborted.")
        else:
            print("Nothing needs to be removed.")
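# A minimal usage sketch for es_clean_indices (the host value below is
# hypothetical): preview what would be removed with dryrun=True first,
# then run the actual cleanup, e.g.:
#
#   es_clean_indices(keep_last=2, es_host='localhost:9200', dryrun=True)
#   es_clean_indices(keep_last=2, es_host='localhost:9200', noconfirm=True)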
def merge(src, target, step=10000, confirm=True):
    """Merge docs from the src collection into the target collection."""
    src_m = importlib.import_module('dataload.contrib.' + src + '.__init__')
    db = get_src_db()
    src_coll = db[src]
    target_coll = db[target]
    cnt = src_coll.count()
    # skip the prompt when confirm is False
    if confirm and ask('Continue to update {} docs from "{}" into "{}"?'.format(
            cnt, src_coll.name, target_coll.name)) != 'Y':
        return

    for doc in doc_feeder(src_coll, step=step):
        if src == 'ndc':
            _id = src_m.get_id_for_merging(doc, src, db)
            target_coll.update_many(
                {'drugbank.products.ndc_product_code': _id},
                {'$addToSet': {'ndc': doc['ndc']}})
        d = {}
        _id = src_m.get_id_for_merging(doc, src, db)
        d.update({'_id': _id, src: doc[src]})
        target_coll.update_one({"_id": _id}, {'$set': d}, upsert=True)
def download(url, output_folder, output_file, no_confirm=False, use_axel=False):
    orig_path = os.getcwd()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)   # create output_folder if it does not exist
    try:
        os.chdir(output_folder)
        if os.path.exists(output_file):
            if no_confirm or ask('Remove existing file "%s"?' % output_file) == 'Y':
                os.remove(output_file)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % output_file)
        if use_axel:
            #faster than wget, using 5 connections
            cmdline = 'axel -a -n 5 "{}" -o "{}"'.format(url, output_file)
        else:
            cmdline = 'wget "{}" -O "{}"'.format(url, output_file)
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("=" * 50)
    finally:
        os.chdir(orig_path)
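# A usage sketch for download() with a hypothetical URL and output location;
# pass use_axel=True to fetch with axel (5 connections) instead of wget, e.g.:
#
#   download('ftp://example.org/pub/data.txt.gz', '/tmp/mydata',
#            'data.txt.gz', no_confirm=True, use_axel=False)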
def target_clean_collections(keep_last=2, target=None, verbose=True, noconfirm=False):
    '''clean up collections in target db, only keep last <keep_last> number of collections.'''
    import re
    from utils.common import ask

    target = target or get_target_db()
    coll_list = target.collection_names()

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + '_(\d{8})_\w{8}'
        _li = []
        for coll_name in coll_list:
            mat = re.match(pat, coll_name)
            if mat:
                _li.append((mat.group(1), coll_name))
        _li.sort()   # older collections appear first
        # keep the last <keep_last> newer collections
        coll_to_remove = [x[1] for x in _li[:-keep_last]]
        if len(coll_to_remove) > 0:
            print "{} \"{}*\" collection(s) will be removed.".format(len(coll_to_remove), prefix)
            if verbose:
                for coll in coll_to_remove:
                    print '\t', coll
            if noconfirm or ask("Continue?") == 'Y':
                for coll in coll_to_remove:
                    target[coll].drop()
                print "Done.[%s collection(s) removed]" % len(coll_to_remove)
            else:
                print "Aborted."
        else:
            print "Nothing needs to be removed."
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print "\t# nodes in use: {}".format(len(lview.targets or rc.ids))
    lview.block = False

    print "\t# of tasks: {}".format(len(task_list))
    print "\tsubmitting...",
    job = lview.map_async(worker, task_list)
    print "done."
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print "Aborted, all submitted jobs are cancelled."
        else:
            print "Aborted, but your jobs are still running on the cluster."
        return

    if len(job.result) != len(task_list):
        print "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list))
    print "\ttotal time: {}".format(timesofar(t0))

    if shutdown_ipengines_after_done:
        print "\tshutting down all ipengine nodes...",
        lview.shutdown()
        print 'Done.'
    return job.result
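# A usage sketch for run_jobs_on_ipythoncluster, assuming an IPython cluster is
# already running and CLUSTER_CLIENT_JSON points to its client JSON file; the
# worker below is a hypothetical example (any picklable callable works):
#
#   def square(x):
#       return x * x
#
#   results = run_jobs_on_ipythoncluster(square, list(range(100)))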
def update_mapping(self, m):
    assert list(m) == [self._doc_type]
    # assert m[self._doc_type].keys() == ['properties']
    assert 'properties' in m[self._doc_type]
    print(json.dumps(m, indent=2))
    if ask("Continue to update above mapping?") == 'Y':
        print(self._es.indices.put_mapping(index=self._index,
                                           doc_type=self._doc_type,
                                           body=m))
def main():
    if len(sys.argv) == 2 and sys.argv[1] == 'check':
        print "Checking latest mart_version:\t",
        mart_version = chk_latest_mart_version()
        print mart_version
        return

    if len(sys.argv) > 1:
        mart_version = sys.argv[1]
    else:
        print "Checking latest mart_version:\t",
        mart_version = chk_latest_mart_version()
        print mart_version

    BM = BioMart()
    BM.species_li = get_all_species(mart_version)
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, mart_version)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            return
    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version))
    sys.stdout = LogPrint(log_f, timestamp=True)

    BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
    BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
    BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
    BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
    BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))

    sys.stdout.close()
def main():
    parser = OptionParser()
    parser.add_option("-c", "--conf", dest="config", action="store", default=None,
                      help="ES indexing building config name")
    parser.add_option("-b", "--noconfirm", dest="noconfirm", action="store_true", default=False,
                      help="do not ask for confirmation")
    parser.add_option("-e", "--es-index", dest="es_index_name", action="store", default=None,
                      help="provide an alternative ES index name")
    # parser.add_option("", "--no-cleanup", dest="nocleanup",
    #                   action="store_true", default=False,
    #                   help="do not clean up old ES indices")
    (options, args) = parser.parse_args()

    with open_tunnel() as tunnel:
        if tunnel.ok:
            es_host = '127.0.0.1:' + str(es_local_tunnel_port)
        else:
            es_host = ES_HOST

        # if not options.nocleanup:
        #     es_clean_indices(noconfirm=options.noconfirm)

        t00 = time.time()
        bdr = DataBuilder(backend='es')
        if options.config:
            config_li = [options.config]
        else:
            config_li = ['mygene', 'mygene_allspecies']

        if not options.noconfirm:
            print('\n'.join(["Ready to build these ES indices on %s (tunnel=%s):" % (es_host, tunnel.ok)] +
                            ['\t' + conf for conf in config_li]))
            if ask('Continue?') != 'Y':
                print("Aborted")
                return

        for _conf in config_li:
            t0 = time.time()
            print('>"{}">>>>>>'.format(_conf))
            bdr.build_index2(_conf,
                             es_index_name=options.es_index_name,
                             es_host=es_host,
                             noconfirm=options.noconfirm)
            print('<<<<<<"{}"...done. {}'.format(_conf, timesofar(t0)))
            print('=' * 20)

        print("Finished.", timesofar(t00))
def merge(src, target, step=10000, confirm=True):
    """Merge docs from the src collection into the target collection."""
    cnt = src.count()
    # skip the prompt when confirm is False
    if confirm and ask('Continue to update {} docs from "{}" into "{}"?'.format(cnt, src.name, target.name)) != 'Y':
        return
    for doc in doc_feeder(src, step=step):
        _id = doc['_id']
        target.update_one({"_id": _id}, {'$set': doc}, upsert=True)
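# A usage sketch for this merge() variant, with hypothetical collection names;
# it upserts every doc from the source collection into the target collection:
#
#   db = get_src_db()
#   merge(db['src_coll'], db['merged_coll'], step=10000, confirm=True)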
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention

    src_dump = get_src_dump()
    print("Checking latest mart_version:\t", end=' ')
    mart_version = chk_latest_mart_version()
    print(mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version),
                               prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
def delete_index_type(self, index_type, noconfirm=False):
    '''Delete all data under a given index_type (doc_type) in the current index.'''
    index_name = self.ES_INDEX_NAME
    # check if index_type exists
    m = self.conn.indices.get_mapping(index_name, index_type)
    if not m:
        print('Error: index type "%s" does not exist in index "%s".' % (index_type, index_name))
        return
    path = '/%s/%s' % (index_name, index_type)
    if noconfirm or ask('Confirm to delete all data under "%s":' % path) == 'Y':
        return self.conn.indices.delete_mapping(index=index_name, doc_type=index_type)
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene'
        #config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_host = 'localhost:' + str(es_local_tunnel_port)
    _es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    # for test
    #_es_host = 'localhost:9200'
    #_es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    with open_tunnel() as tunnel:
        if tunnel.ok:
            esi = ESIndexer2(_es_index, es_host=_es_host)
            meta = esi.get_mapping_meta(changes)
            print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
            pprint(meta)
            code = esi.apply_changes(changes, noconfirm=noconfirm)
            if code != -1:
                # aborted when code == -1
                _meta = {'_meta': meta}
                # somehow when only "_meta" is updated, "_timestamp" gets emptied,
                # so add "_timestamp" explicitly here. This is an ES bug.
                _meta['_timestamp'] = {
                    "enabled": True,
                    "path": "_timestamp"
                }
                #esi.update_mapping_meta(_meta)
                print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta, [esi.ES_INDEX_NAME]))
                esi.post_verify_changes(changes)
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)

    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    #src = self.get_source_collection(changes)
    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))

    target.finalize()

    print("\n")
    print("Finished.", timesofar(t0))
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention

    src_dump = get_src_dump()
    (file_name, release) = get_newest_release()
    doc = src_dump.find_one({'_id': 'clinvar'})
    if new_release_available(doc['release']):
        data_file = os.path.join(doc['data_folder'], file_name)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            return

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            return

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'clinvar_dump.log'),
                               prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    # mark the download starts
    doc = {'_id': 'clinvar',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'release': release,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download_ftp_file(no_confirm)
    finally:
        sys.stdout.close()

    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'clinvar'}, {'$set': _updates})
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention

    print("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    print(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'refseq_dump.log'),
                               prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
                ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'entrez_dump.log'),
                               prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)
    sys.stderr = sys.stdout

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, no_confirm=no_confirm)
        t_download = timesofar(t0)
        t1 = time.time()
        #mark parsing starts
        src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
        parse_gbff(DATA_FOLDER)
        t_parsing = timesofar(t1)
        t_total = timesofar(t0)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def update_mapping_meta(self, meta, confirm=True):
    allowed_keys = set(['_meta', '_timestamp'])
    if isinstance(meta, dict) and len(set(meta) - allowed_keys) == 0:
        current_meta = self.get_mapping_meta()
        print('\033[34;06m{}\033[0m:'.format('[Current _meta]'))
        print(json.dumps(current_meta, indent=2))
        print('\033[34;06m{}\033[0m:'.format('[Replace with new _meta]'))
        print(json.dumps(meta, indent=2))
        if not confirm or ask('Continue to update above _meta field to "{}" index?'.format(self._index)) == "Y":
            body = {self._doc_type: meta}
            print(self._es.indices.put_mapping(
                doc_type=self._doc_type,
                body=body,
                index=self._index
            ))
    else:
        raise ValueError('Input "meta" should have and only have "_meta" field.')
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))
    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name, sync_src.name)
    print '\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name)

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        print '=' * 20
        print 'Finished. [{}]'.format(timesofar(t00))
def merge_resume(self, build_config, at_collection, step=10000):
    '''Resume a merging process after a failure.
        .merge_resume('mygene_allspecies', 'reporter')
    '''
    from pprint import pprint
    assert not self.using_ipython_cluster, "Abort. Can only resume merging in non-parallel mode."
    self.load_build_config(build_config)
    last_build = self._build_config['build'][-1]
    print "Last build record:"
    pprint(last_build)
    assert last_build['status'] == 'building', \
        "Abort. Last build does not need to be resumed."
    assert at_collection in self._build_config['sources'], \
        'Abort. Cannot resume merging from an unknown collection "{}"'.format(at_collection)
    assert last_build['target_backend'] == self.target.name, \
        'Abort. Re-initialize the DataBuilder class using the matching backend "{}"'.format(last_build['backend'])
    assert last_build.get('stats', None), \
        'Abort. Initial build stats are not available. You should restart the build from scratch.'
    self._stats = last_build['stats']

    if ask('Continue to resume merging from "{}"?'.format(at_collection)) == 'Y':
        #TODO: resume logging
        target_name = last_build['target']
        self.validate_src_collections()
        self.prepare_target(target_name=target_name)
        src_cnt = 0
        for collection in self._build_config['sources']:
            if collection in ['entrez_gene', 'ensembl_gene']:
                continue
            src_cnt += 1
            if collection == at_collection:
                break
        self._merge_local(step=step, restart_at=src_cnt)
        if self.target.name == 'es':
            print "Updating metadata...",
            self.update_mapping_meta()
        self.log_src_build({'status': 'success', 'timestamp': datetime.now()})
def download(path, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        _expand_refseq_files()
        for subfolder in FILE_LIST:
            filedata = FILE_LIST[subfolder]
            baseurl = filedata['url']
            data_folder = os.path.join(path, subfolder)
            if not os.path.exists(data_folder):
                os.mkdir(data_folder)

            for f in filedata['files']:
                url = baseurl + f
                os.chdir(data_folder)
                filename = os.path.split(f)[1]
                if os.path.exists(filename):
                    if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                        os.remove(filename)
                    else:
                        print("Skipped!")
                        continue
                print('Downloading "%s"...' % f)
                #cmdline = 'wget %s' % url
                #cmdline = 'axel -a -n 5 %s' % url   #faster than wget, using 5 connections
                cmdline = _get_ascp_cmdline(url)
                return_code = os.system(cmdline)
                #return_code = 0; print cmdline    #for testing
                if return_code == 0:
                    print("Success.")
                else:
                    print("Failed with return code (%s)." % return_code)
                    out.append((url, return_code))
                print("=" * 50)
    finally:
        os.chdir(orig_path)

    return out
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        path, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget, using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("=" * 50)
    finally:
        os.chdir(orig_path)
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                print "Skipped!"
                return
        print 'Downloading "%s"...' % filename
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget, using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            print "Success."
        else:
            print "Failed with return code (%s)." % return_code
        print "=" * 50
    finally:
        os.chdir(orig_path)
def download(path, release, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        data_folder = os.path.join(path, release)
        if not os.path.exists(data_folder):
            os.mkdir(data_folder)
        _url = 'ftp://' + FTP_SERVER + BASE_PATH + DATA_FILE
        url_li = _expand_wildchar_urls(_url)
        print('Found {} "{}" files to download.'.format(len(url_li), DATA_FILE))

        for url in url_li:
            os.chdir(data_folder)
            filename = os.path.split(url)[1]
            if os.path.exists(filename):
                if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    print("Skipped!")
                    continue
            print('Downloading "%s"...' % filename)
            #cmdline = 'wget %s' % url
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget, using 5 connections
            cmdline = _get_ascp_cmdline(url)
            return_code = os.system(cmdline)
            #return_code = 0; print cmdline    #for testing
            if return_code == 0:
                print("Success.")
            else:
                print("Failed with return code (%s)." % return_code)
                out.append((url, return_code))
            print("=" * 50)
    finally:
        os.chdir(orig_path)

    return out
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <keep_last> number of archives.'''
    from utils.dataload import list2dict
    from utils.common import ask

    src = src or get_src_db()
    archive_li = sorted([(coll.split('_archive_')[0], coll) for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print k,
        #check if the current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print "\t\t%s archived collections marked to remove." % cnt
        else:
            print 'skipped. Missing current "%s" collection!' % k

    if len(coll_to_remove) > 0:
        print "%d archived collections will be removed." % len(coll_to_remove)
        if verbose:
            for coll in coll_to_remove:
                print '\t', coll
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print "Done.[%s collections removed]" % len(coll_to_remove)
        else:
            print "Aborted."
    else:
        print "Nothing needs to be removed."
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb') for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print "Found {} sources:".format(len(src_li))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li

    print '\tsync_src:\t{:<45}{}\t{}'.format(*src_1)
    print '\tsync_target\t{:<45}{}\t{}'.format(*src_2)
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
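# A usage sketch for diff2src: it interactively prompts for two "genedoc_*"
# sources (MongoDB collections or ES indices) and returns the computed changes:
#
#   changes = diff2src(use_parallel=True, noconfirm=False)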
no_confirm = True   # set it to True for running this script automatically without intervention

src_dump = get_src_dump()
lastmodified = check_lastmodified()
doc = src_dump.find_one({'_id': 'uniprot'})
if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
    path, filename = os.path.split(DATAFILE_PATH)
    data_file = os.path.join(doc['data_folder'], filename)
    if os.path.exists(data_file):
        print("No newer file found. Abort now.")
        sys.exit(0)

if not os.path.exists(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)
else:
    if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or
            ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
        sys.exit(0)

log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'uniprot_dump.log'),
                           prompt=(not no_confirm), default='O')
sys.stdout = LogPrint(log_f, timestamp=True)

#mark the download starts
doc = {'_id': 'uniprot',
       'timestamp': timestamp,
       'data_folder': DATA_FOLDER,
       'lastmodified': lastmodified,
       'logfile': logfile,
       'status': 'downloading'}
src_dump.save(doc)
t0 = time.time()
try: