def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print("Aborted.")
        return

    if ask('Delete collection "{}"'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print('Done, collection "{}" was dropped.'.format(target_collection.name))
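All of these snippets gate destructive or long-running operations on the ask() confirmation helper (imported in some of the examples below as "from biothings.utils.common import ask"). A minimal sketch of such a helper is shown here, assuming it simply prompts on the console until one of the offered single-letter options is typed and then returns it uppercased; the actual biothings implementation may differ in details.

# minimal sketch of a console confirmation helper; the real
# biothings.utils.common.ask may differ in details
def ask(prompt, options='YN'):
    options = options.upper()
    while True:
        # show the prompt with the accepted options, e.g. 'Continue?[Y/N]'
        answer = input("{}[{}]".format(prompt, '/'.join(options))).strip().upper()
        if answer in options:
            return answer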
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']
    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print("ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        if noconfirm or ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False

    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return

    if len(job.result()) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))

    if shutdown_ipengines_after_done:
        print("\tshutting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
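A hypothetical call to the function above, assuming an ipcluster profile is already running and CLUSTER_CLIENT_JSON points at its connection file; the worker function and task list here are illustrative placeholders, not part of the original code.

# hypothetical usage sketch; 'count_gc' and 'sequences' are made up for illustration
def count_gc(seq):
    # workers must be importable/picklable so the engines can run them
    return seq.count('G') + seq.count('C')

sequences = ['ATGC' * 10, 'GGCC' * 25, 'ATAT' * 50]
results = run_jobs_on_ipythoncluster(count_gc, sequences)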
def setUpClass(cls):
    cls.index = Index(Schema.Index.name)

    if cls.index.exists():
        if FORCE_TEST or ask('Current indexed documents will be permanently lost.') == 'Y':
            cls.index.delete()
        else:
            exit()

    # create new index as defined in Schema class
    Schema.init()

    # test dataset
    cls.testset = []

    # add a document
    url = 'https://raw.githubusercontent.com/namespacestd0/mygene.info/master/README.md'
    meta = Metadata(username='******', slug='dev', url=url)
    schema = Schema(clses=['biothings', 'smartapi'], props=['es-dsl'], _meta=meta)
    schema.save()
    cls.testset.append(schema)

    # add another document
    url = ('https://raw.githubusercontent.com/data2health/'
           'schemas/biothings/biothings/biothings_curie.jsonld')
    meta = Metadata(username='******', slug='d2h', url=url)
    schema = Schema(clses=['biothings'], _meta=meta)
    schema.save()
    cls.testset.append(schema)
def download(url, output_folder, output_file, no_confirm=False, use_axel=False):
    orig_path = os.getcwd()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)   # create output_folder if it does not exist
    try:
        os.chdir(output_folder)
        if os.path.exists(output_file):
            if no_confirm or ask('Remove existing file "%s"?' % output_file) == 'Y':
                os.remove(output_file)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % output_file)
        if use_axel:
            #faster than wget using 5 connections
            cmdline = 'axel -a -n 5 "{}" -o "{}"'.format(url, output_file)
        else:
            cmdline = 'wget "{}" -O "{}"'.format(url, output_file)
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("=" * 50)
    finally:
        os.chdir(orig_path)
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        for one_file in DATAFILES_PATH:
            path, filename = os.path.split(one_file)
            if os.path.exists(filename):
                if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    logging.info("Skipped!")
                    return
            logging.info('Downloading "%s"...' % filename)
            url = 'ftp://{}/{}'.format(FTP_SERVER, one_file)
            cmdline = 'wget %s -O %s' % (url, filename)
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
            return_code = os.system(cmdline)
            if return_code == 0:
                logging.info("Success.")
            else:
                logging.info("Failed with return code (%s)." % return_code)
            logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
def target_clean_collections(keep_last=2, target=None, verbose=True, noconfirm=False):
    '''clean up collections in target db, only keep last <keep_last> number of collections.'''
    import re
    from biothings.utils.common import ask

    target = target or get_target_db()
    coll_list = target.collection_names()

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + r'_(\d{8})_\w{8}'
        _li = []
        for coll_name in coll_list:
            mat = re.match(pat, coll_name)
            if mat:
                _li.append((mat.group(1), coll_name))
        _li.sort()   # older collection appears first
        coll_to_remove = [x[1] for x in _li[:-keep_last]]   # keep last <keep_last> newer collections
        if len(coll_to_remove) > 0:
            print('{} "{}*" collection(s) will be removed.'.format(len(coll_to_remove), prefix))
            if verbose:
                for coll in coll_to_remove:
                    print('\t', coll)
            if noconfirm or ask("Continue?") == 'Y':
                for coll in coll_to_remove:
                    target[coll].drop()
                print("Done. [%s collection(s) removed]" % len(coll_to_remove))
            else:
                print("Aborted.")
        else:
            print("Nothing needs to be removed.")
def update_mapping(self, m):
    assert list(m) == [self._doc_type]
    # assert m[self._doc_type].keys() == ['properties']
    assert 'properties' in m[self._doc_type]
    print(json.dumps(m, indent=2))
    if ask("Continue to update above mapping?") == 'Y':
        print(self._es.indices.put_mapping(index=self._index,
                                           doc_type=self._doc_type,
                                           body=m))
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))
    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically without intervention.'''
    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))
        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))
    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name, sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)

    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1

    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))

    target.finalize()

    print("\n")
    print("Finished.", timesofar(t0))
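For reference, the "changes" object consumed by apply_changes above (and by update_index elsewhere in these examples) is a plain dict. A minimal hypothetical example, consistent with the keys those functions access; the ids, collection name, and timestamp below are made up for illustration only.

# hypothetical shape of a "changes" dict; values are placeholders
changes = {
    'source': 'genedoc_mygene_20130601_abcdefgh',   # temp collection to read docs from
    'timestamp': '20130601',
    'add': ['1017', '1018'],        # _ids of new docs to insert
    'delete': ['999999'],           # _ids of discontinued docs to remove
    'update': [{'_id': '1019'}],    # docs whose _ids need to be re-indexed
}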
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    refseq_release = get_refseq_release()
    logging.info("Checking latest refseq release:\t%s" % refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()

    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host will be set depending on whether a tunnel is used or not
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST

        esi = ESIndexer2(_es_index, es_host=_es_host)
        meta = esi.get_mapping_meta(changes)
        print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
        pprint(meta)
        code = esi.apply_changes(changes, noconfirm=noconfirm)
        if code != -1:
            # aborted when code == -1
            _meta = {'_meta': meta}
            print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta, [esi.ES_INDEX_NAME]))
            esi.post_verify_changes(changes)
def main(no_confirm=True):
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'exac',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    download(no_confirm)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'exac'}, {'$set': _updates})
def merge_resume(self, build_config, at_collection, step=10000):
    '''resume a merging process after a failure.
         .merge_resume('mygene_allspecies', 'reporter')
    '''
    assert not self.using_ipython_cluster, "Abort. Can only resume merging in non-parallel mode."
    self.load_build_config(build_config)
    last_build = self._build_config['build'][-1]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    assert last_build['status'] == 'building', \
        "Abort. Last build does not need to be resumed."
    assert at_collection in self._build_config['sources'], \
        'Abort. Cannot resume merging from an unknown collection "{}"'.format(at_collection)
    assert last_build['target_backend'] == self.target.name, \
        'Abort. Re-initialize DataBuilder class using matching backend "{}"'.format(last_build['backend'])
    assert last_build.get('stats', None), \
        'Abort. Initial build stats are not available. You should restart the build from scratch.'
    self._stats = last_build['stats']

    if ask('Continue to resume merging from "{}"?'.format(at_collection)) == 'Y':
        #TODO: resume logging
        target_name = last_build['target']
        self.validate_src_collections()
        self.prepare_target(target_name=target_name)
        src_cnt = 0
        for collection in self._build_config['sources']:
            if collection in ['entrez_gene', 'ensembl_gene']:
                continue
            src_cnt += 1
            if collection == at_collection:
                break
        self._merge_local(step=step, restart_at=src_cnt)
        if self.target.name == 'es':
            logging.info("Updating metadata...")
            self.update_mapping_meta()
        self.log_src_build({'status': 'success',
                            'timestamp': datetime.now()})
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
def download(path, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        _expand_refseq_files()
        for subfolder in FILE_LIST:
            filedata = FILE_LIST[subfolder]
            baseurl = filedata['url']
            data_folder = os.path.join(path, subfolder)
            if not os.path.exists(data_folder):
                os.mkdir(data_folder)

            for f in filedata['files']:
                url = baseurl + f
                os.chdir(data_folder)
                filename = os.path.split(f)[1]
                if os.path.exists(filename):
                    if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                        os.remove(filename)
                    else:
                        logging.info("Skipped!")
                        continue
                logging.info('Downloading "%s"...' % f)
                #cmdline = 'wget %s' % url
                #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
                cmdline = _get_ascp_cmdline(url)
                return_code = os.system(cmdline)
                #return_code = 0; print cmdline    #for testing
                if return_code == 0:
                    logging.info("Success.")
                else:
                    logging.info("Failed with return code (%s)." % return_code)
                out.append((url, return_code))
                logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
    return out
def main(no_confirm=True):
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    download(no_confirm)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <keep_last> number of archives.'''
    from utils.dataload import list2dict
    from biothings.utils.common import ask

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll)
                         for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        #check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print("\t\t%s archived collections marked to remove." % cnt)
        else:
            print('skipped. Missing current "%s" collection!' % k)

    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done. [%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(config.CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False
    # move to app path
    lview.map(os.chdir, [config.APP_PATH] * cnt_nodes)

    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return

    if len(job.result()) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))

    if shutdown_ipengines_after_done:
        print("\tshutting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        path, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
def sync_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name, sync_src.name, sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index, sync_target.name, sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        cmdline = 'wget "%s" -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
def download(path, release, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        data_folder = os.path.join(path, release)
        if not os.path.exists(data_folder):
            os.mkdir(data_folder)

        _url = 'ftp://' + FTP_SERVER + BASE_PATH + DATA_FILE
        url_li = _expand_wildchar_urls(_url)
        logging.info('Found {} "{}" files to download.'.format(len(url_li), DATA_FILE))

        for url in url_li:
            os.chdir(data_folder)
            filename = os.path.split(url)[1]
            if os.path.exists(filename):
                if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    logging.info("Skipped!")
                    continue
            logging.info('Downloading "%s"...' % filename)
            #cmdline = 'wget %s' % url
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
            cmdline = _get_ascp_cmdline(url)
            return_code = os.system(cmdline)
            #return_code = 0; print cmdline    #for testing
            if return_code == 0:
                logging.info("Success.")
            else:
                logging.info("Failed with return code (%s)." % return_code)
            out.append((url, return_code))
            logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
    return out
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
    """Build ES index from the last successfully-merged mongodb collection.
        optional "es_host" argument can be used to specify another ES host, otherwise default ES_HOST.
        optional "es_index_name" argument can be used to pass an alternative index name,
        otherwise the same as the mongodb collection name.
    """
    self.load_build_config(build_config)
    assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
    last_build = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    assert last_build['status'] == 'success', \
        "Abort. Last build did not succeed."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build needs to be built using "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'

    # Get the source collection to build the ES index
    # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
    #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
    #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
    #            therefore, we use "genedoc_*_current" collection as the source here:
    #target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)
    _db = get_target_db()
    target_collection = _db[target_collection]
    logging.info("")
    logging.info('Source: %s' % target_collection.name)

    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta
    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default 5
    es_idxer.check()
    if noconfirm or ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
            if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
            else:
                logging.info("Abort.")
                return
        es_idxer.create_index()
        #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
def rename_from_temp_collection(config, from_index, no_confirm=False):
    # check if the index exists before changing anything
    sc = GeneDocSyncer(config)
    if from_index not in sc._db.collection_names():
        logging.error("Collection '%s' does not exist" % from_index)
        return
    from_col = sc._db.get_collection(from_index)
    orig_name = sc._target_col.name

    logging.info("Backing up timestamp from '%s'" % orig_name)
    if no_confirm or ask('Continue?') == 'Y':
        bckfile = backup_timestamp_main([config]).pop()
    else:
        bckfile = None

    # rename existing current collection for backup purpose
    bck_name = orig_name + "_bck_%s" % time.strftime('%Y%m%d%H%M%S')
    logging.info("Renaming %s to %s" % (orig_name, bck_name))
    if no_confirm or ask('Continue?') == 'Y':
        sc._target_col.rename(bck_name)

    logging.info("Renaming %s to %s" % (from_col.name, orig_name))
    if no_confirm or ask('Continue?') == 'Y':
        from_col.rename(orig_name)

    if bckfile is None:
        try:
            pat = "%s_current_tsbk_*.txt.bz" % config
            logging.info("Looking for '%s'" % pat)
            bckfile = sorted(glob.glob(pat))[0]
            if ask("Do you want me to apply timestamp from file '%s' to collection '%s' ?" % (bckfile, sc._target_col.name)) == 'Y':
                pass
            else:
                return
        except IndexError:
            logging.error("Can't find any timestamp file to apply, giving up...")
            return

    prev_ts = {}
    import bz2
    logging.info("Loading timestamps from '%s'" % bckfile)
    with bz2.BZ2File(bckfile, 'rb') as in_f:
        for line in in_f.readlines():
            _id, ts = line.decode().split("\t")
            prev_ts[_id.strip()] = datetime.strptime(ts.strip(), "%Y%m%d")

    logging.info("Now applying timestamp from file '%s' (if more recent than those on the collection)" % bckfile)
    cur = sc._target_col.find()
    default_ts = datetime.now()
    results = {"restored": 0, "updated": 0, "unchanged": 0, "defaulted": 0}
    bulk_cnt = 0
    bob = sc._target_col.initialize_unordered_bulk_op()
    cnt = 0
    t0 = time.time()
    while True:
        try:
            doc = next(cur)
            if "_timestamp" not in doc:
                if prev_ts.get(doc["_id"]):
                    ts = prev_ts[doc["_id"]]
                    results["restored"] += 1
                else:
                    ts = default_ts
                    results["defaulted"] += 1
                doc["_timestamp"] = ts
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id": doc["_id"]}).update_one({"$set": doc})
            elif prev_ts.get(doc["_id"]) and prev_ts[doc["_id"]] > doc["_timestamp"]:
                doc["_timestamp"] = prev_ts[doc["_id"]]
                results["updated"] += 1
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id": doc["_id"]}).update_one({"$set": doc})
            else:
                results["unchanged"] += 1
                cnt += 1
            if cnt % 1000 == 0:
                logging.info("Processed %s documents (%s) [%s]" % (cnt, results, timesofar(t0)))
                t0 = time.time()
            if bulk_cnt == 1000:
                bulk_cnt = 0
                bob.execute()
                bob = sc._target_col.initialize_unordered_bulk_op()
        except StopIteration:
            break

    cur.close()
    try:
        bob.execute()
    except InvalidOperation:
        pass
    logging.info("Done: %s" % results)