Example #1
def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print("Aborted.")
        return

    if ask('Delete collection "{}"'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print('Done, collection "{}" was dropped.'.format(target_collection.name))
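Every example in this listing gates a destructive or long-running step behind the ask helper imported from biothings.utils.common. For orientation only, a minimal confirmation prompt of this kind might look like the sketch below; the body is an assumption for illustration, not the library's actual implementation.

def ask(prompt, options='YN'):
    # Minimal sketch of a yes/no confirmation prompt (assumed behavior, not the
    # actual biothings.utils.common implementation): keep asking until the user
    # types one of the allowed single-letter options, then return it uppercased
    # so callers can compare the result against 'Y'.
    options = options.upper()
    while True:
        answer = input('{} [{}] '.format(prompt, '/'.join(options))).strip().upper()
        if answer in list(options):
            return answer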
Example #2
def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print("Aborted.")
        return

    if ask('Delete collection "{}"'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print('Done, collection "{}" was dropped.'.format(
                target_collection.name))
Example #3
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print("ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        if noconfirm or ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
Example #4
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False

    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return

    if len(job.result()) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))

    if shutdown_ipengines_after_done:
        print("\tshuting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
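As a usage sketch, run_jobs_on_ipythoncluster takes a picklable worker plus a list of task arguments and fans them out over the load-balanced view; the worker function and task data below are invented for illustration.

def square(n):
    # trivial picklable worker used only to illustrate the calling convention
    return n * n

results = run_jobs_on_ipythoncluster(square, list(range(1000)))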
Example #5
    def setUpClass(cls):

        cls.index = Index(Schema.Index.name)

        if cls.index.exists():
            if FORCE_TEST or ask(
                    'Current indexed documents will be permanently lost.'
            ) == 'Y':
                cls.index.delete()
            else:
                exit()

        # create new index as defined in Schema class
        Schema.init()

        # test dataset
        cls.testset = []

        # add a document
        url = 'https://raw.githubusercontent.com/namespacestd0/mygene.info/master/README.md'
        meta = Metadata(username='******', slug='dev', url=url)
        schema = Schema(clses=['biothings', 'smartapi'],
                        props=['es-dsl'],
                        _meta=meta)
        schema.save()
        cls.testset.append(schema)

        # add another document
        url = ('https://raw.githubusercontent.com/data2health/'
               'schemas/biothings/biothings/biothings_curie.jsonld')
        meta = Metadata(username='******', slug='d2h', url=url)
        schema = Schema(clses=['biothings'], _meta=meta)
        schema.save()
        cls.testset.append(schema)
Example #6
def download(url, output_folder, output_file, no_confirm=False, use_axel=False):
    orig_path = os.getcwd()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)  # create output_folder if it does not exist
    try:
        os.chdir(output_folder)
        if os.path.exists(output_file):
            if no_confirm or ask('Remove existing file "%s"?' % output_file) == 'Y':
                os.remove(output_file)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % output_file)
        if use_axel:
            #faster than wget using 5 connections
            cmdline = 'axel -a -n 5 "{}" -o "{}"'.format(url, output_file)
        else:
            cmdline = 'wget "{}" -O "{}"'.format(url, output_file)
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("="*50)
    finally:
        os.chdir(orig_path)
Example #7
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print("ES target: {}/{}/{}".format(es_server, es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        if noconfirm or ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE,
                                       noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
Example #8
def download(url, output_folder, output_file, no_confirm=False, use_axel=False):
    orig_path = os.getcwd()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)  # create output_folder if it does not exist
    try:
        os.chdir(output_folder)
        if os.path.exists(output_file):
            if no_confirm or ask('Remove existing file "%s"?' % output_file) == 'Y':
                os.remove(output_file)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % output_file)
        if use_axel:
            #faster than wget using 5 connections
            cmdline = 'axel -a -n 5 "{}" -o "{}"'.format(url, output_file)
        else:
            cmdline = 'wget "{}" -O "{}"'.format(url, output_file)
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("="*50)
    finally:
        os.chdir(orig_path)
Example #9
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        for one_file in DATAFILES_PATH:
            path, filename = os.path.split(one_file)
            if os.path.exists(filename):
                if no_confirm or ask(
                        'Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    logging.info("Skipped!")
                    return
            logging.info('Downloading "%s"...' % filename)
            url = 'ftp://{}/{}'.format(FTP_SERVER, one_file)
            cmdline = 'wget %s -O %s' % (url, filename)
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
            return_code = os.system(cmdline)
            if return_code == 0:
                logging.info("Success.")
            else:
                logging.info("Failed with return code (%s)." % return_code)
            logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
Example #10
def target_clean_collections(keep_last=2,
                             target=None,
                             verbose=True,
                             noconfirm=False):
    '''clean up collections in target db, only keep the last <keep_last> collections.'''
    import re
    from biothings.utils.common import ask

    target = target or get_target_db()
    coll_list = target.collection_names()

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + r'_(\d{8})_\w{8}'
        _li = []
        for coll_name in coll_list:
            mat = re.match(pat, coll_name)
            if mat:
                _li.append((mat.group(1), coll_name))
        _li.sort()  # older collection appears first
        coll_to_remove = [x[1] for x in _li[:-keep_last]
                          ]  # keep last # of newer collections
        if len(coll_to_remove) > 0:
            print('{} "{}*" collection(s) will be removed.'.format(
                len(coll_to_remove), prefix))
            if verbose:
                for coll in coll_to_remove:
                    print('\t', coll)
            if noconfirm or ask("Continue?") == 'Y':
                for coll in coll_to_remove:
                    target[coll].drop()
                print("Done.[%s collection(s) removed]" % len(coll_to_remove))
            else:
                print("Aborted.")
        else:
            print("Nothing needs to be removed.")
Example #11
def target_clean_collections(keep_last=2, target=None, verbose=True, noconfirm=False):
    '''clean up collections in target db, only keep the last <keep_last> collections.'''
    import re
    from biothings.utils.common import ask

    target = target or get_target_db()
    coll_list = target.collection_names()

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + r'_(\d{8})_\w{8}'
        _li = []
        for coll_name in coll_list:
            mat = re.match(pat, coll_name)
            if mat:
                _li.append((mat.group(1), coll_name))
        _li.sort()   # older collection appears first
        coll_to_remove = [x[1] for x in _li[:-keep_last]]   # keep last # of newer collections
        if len(coll_to_remove) > 0:
            print('{} "{}*" collection(s) will be removed.'.format(len(coll_to_remove), prefix))
            if verbose:
                for coll in coll_to_remove:
                    print('\t', coll)
            if noconfirm or ask("Continue?") == 'Y':
                for coll in coll_to_remove:
                    target[coll].drop()
                print("Done.[%s collection(s) removed]" % len(coll_to_remove))
            else:
                print("Aborted.")
        else:
            print("Nothing needs to be removed.")
Example #12
 def update_mapping(self, m):
     assert list(m) == [self._doc_type]
     # assert m[self._doc_type].keys() == ['properties']
     assert 'properties' in m[self._doc_type]
     print(json.dumps(m, indent=2))
     if ask("Continue to update above mapping?") == 'Y':
         print(self._es.indices.put_mapping(index=self._index, doc_type=self._doc_type, body=m))
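The mapping m passed in is expected to be keyed by the document type with a properties block inside, matching the assertions above; a hypothetical payload (doc type and field names invented for illustration):

m = {
    'gene': {                      # stands in for self._doc_type in this sketch
        'properties': {
            'symbol': {'type': 'keyword'},
            'taxid': {'type': 'integer'},
        }
    }
}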
Example #13
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(
        len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config,
                                                    time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Example #14
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.'''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Example #15
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Example #16
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(
        sync_target.target_esidxer.ES_INDEX_NAME, sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(
                len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection,
                                   step=1000,
                                   inbatch=True,
                                   query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #17
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(
                len(changes['delete'])),
                  end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(
                changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #18
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #19
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    logging.info("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    logging.info(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Example #20
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))

    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                           sync_target.name))

    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))

        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example #21
def main():
    no_confirm = True  # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #22
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host will be set depending on whether a tunnel is used or not
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST

        esi = ESIndexer2(_es_index, es_host=_es_host)

        meta = esi.get_mapping_meta(changes)
        print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
        pprint(meta)
        code = esi.apply_changes(changes, noconfirm=noconfirm)
        if code != -1:
            # aborted when code == -1
            _meta = {'_meta': meta}
            print(
                esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta,
                                             [esi.ES_INDEX_NAME]))
            esi.post_verify_changes(changes)
Example #23
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #24
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {
        '_id': 'exac',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'exac'}, {'$set': _updates})
Example #25
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_index = config + TARGET_ES_INDEX_SUFFIX
    # ES host will be set depending on whether a tunnel is used or not
    with open_tunnel() as tunnel:
        if tunnel.ok:
            _es_host = 'localhost:' + str(es_local_tunnel_port)
        else:
            _es_host = ES_HOST

        esi = ESIndexer2(_es_index, es_host=_es_host)

        meta = esi.get_mapping_meta(changes)
        print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
        pprint(meta)
        code = esi.apply_changes(changes, noconfirm=noconfirm)
        if code != -1:
            # aborted when code == -1
            _meta = {'_meta': meta}
            print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta, [esi.ES_INDEX_NAME]))
            esi.post_verify_changes(changes)
Example #26
    def merge_resume(self, build_config, at_collection, step=10000):
        '''resume a merging process after a failure.
             .merge_resume('mygene_allspecies', 'reporter')
        '''
        assert not self.using_ipython_cluster, "Abort. Can only resume merging in non-parallel mode."
        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'building', \
            "Abort. Last build does not need to be resumed."
        assert at_collection in self._build_config['sources'], \
            'Abort. Cannot resume merging from an unknown collection "{}"'.format(at_collection)
        assert last_build['target_backend'] == self.target.name, \
            'Abort. Re-initialize the DataBuilder class using the matching backend "{}"'.format(last_build['backend'])
        assert last_build.get('stats', None), \
            'Abort. Initial build stats are not available. You should restart the build from scratch.'
        self._stats = last_build['stats']

        if ask('Continue to resume merging from "{}"?'.format(
                at_collection)) == 'Y':
            #TODO: resume logging
            target_name = last_build['target']
            self.validate_src_collections()
            self.prepare_target(target_name=target_name)
            src_cnt = 0
            for collection in self._build_config['sources']:
                if collection in ['entrez_gene', 'ensembl_gene']:
                    continue
                src_cnt += 1
                if collection == at_collection:
                    break
            self._merge_local(step=step, restart_at=src_cnt)
            if self.target.name == 'es':
                logging.info("Updating metadata...")
                self.update_mapping_meta()
            self.log_src_build({
                'status': 'success',
                'timestamp': datetime.now()
            })
Example #27
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src,
                                        sync_target,
                                        use_parallel=use_parallel)
        return changes
Example #28
def download(path, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        _expand_refseq_files()
        for subfolder in FILE_LIST:
            filedata = FILE_LIST[subfolder]
            baseurl = filedata['url']
            data_folder = os.path.join(path, subfolder)
            if not os.path.exists(data_folder):
                os.mkdir(data_folder)

            for f in filedata['files']:
                url = baseurl + f
                os.chdir(data_folder)
                filename = os.path.split(f)[1]
                if os.path.exists(filename):
                    if no_confirm or ask(
                            'Remove existing file "%s"?' % filename) == 'Y':
                        os.remove(filename)
                    else:
                        logging.info("Skipped!")
                        continue
                logging.info('Downloading "%s"...' % f)
                #cmdline = 'wget %s' % url
                #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
                cmdline = _get_ascp_cmdline(url)
                return_code = os.system(cmdline)
                #return_code = 0;print cmdline    #for testing
                if return_code == 0:
                    logging.info("Success.")
                else:
                    logging.info("Failed with return code (%s)." % return_code)
                    out.append((url, return_code))
                logging.info("=" * 50)
    finally:
        os.chdir(orig_path)

    return out
Example #29
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
Example #30
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep the last <keep_last>
       archives.
    '''
    from utils.dataload import list2dict
    from biothings.utils.common import ask

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll)
                         for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        #check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print("\t\t%s archived collections marked to remove." % cnt)
        else:
            print('skipped. Missing current "%s" collection!' % k)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")
Example #31
def run_jobs_on_ipythoncluster(worker,
                               task_list,
                               shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(config.CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    cnt_nodes = len(lview.targets or rc.ids)
    print("\t# nodes in use: {}".format(cnt_nodes))
    lview.block = False
    # move to app path
    lview.map(os.chdir, [config.APP_PATH] * cnt_nodes)
    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return

    if len(job.result()) != len(task_list):
        print(
            "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(
                len(job.result()), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))

    if shutdown_ipengines_after_done:
        print("\tshuting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result()
Example #32
def download(path, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        _expand_refseq_files()
        for subfolder in FILE_LIST:
            filedata = FILE_LIST[subfolder]
            baseurl = filedata['url']
            data_folder = os.path.join(path, subfolder)
            if not os.path.exists(data_folder):
                os.mkdir(data_folder)

            for f in filedata['files']:
                url = baseurl + f
                os.chdir(data_folder)
                filename = os.path.split(f)[1]
                if os.path.exists(filename):
                    if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                        os.remove(filename)
                    else:
                        logging.info("Skipped!")
                        continue
                logging.info('Downloading "%s"...' % f)
                #cmdline = 'wget %s' % url
                #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
                cmdline = _get_ascp_cmdline(url)
                return_code = os.system(cmdline)
                #return_code = 0;print cmdline    #for testing
                if return_code == 0:
                    logging.info("Success.")
                else:
                    logging.info("Failed with return code (%s)." % return_code)
                    out.append((url, return_code))
                logging.info("=" * 50)
    finally:
        os.chdir(orig_path)

    return out
Example #33
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        path, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
Example #34
def sync_index(config, use_parallel=True, noconfirm=False):

    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name, sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
Example #35
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        cmdline = 'wget "%s" -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
Example #36
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep the last <keep_last>
       archives.
    '''
    from utils.dataload import list2dict
    from biothings.utils.common import ask

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll) for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        #check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print("\t\t%s archived collections marked to remove." % cnt)
        else:
            print('skipped. Missing current "%s" collection!' % k)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")
Example #37
def download(path, release, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        data_folder = os.path.join(path, release)
        if not os.path.exists(data_folder):
            os.mkdir(data_folder)

        _url = 'ftp://' + FTP_SERVER + BASE_PATH + DATA_FILE
        url_li = _expand_wildchar_urls(_url)
        logging.info('Found {} "{}" files to download.'.format(len(url_li), DATA_FILE))

        for url in url_li:
            os.chdir(data_folder)
            filename = os.path.split(url)[1]
            if os.path.exists(filename):
                if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    logging.info("Skipped!")
                    continue
            logging.info('Downloading "%s"...' % filename)
            #cmdline = 'wget %s' % url
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
            cmdline = _get_ascp_cmdline(url)
            return_code = os.system(cmdline)
            #return_code = 0;print cmdline    #for testing
            if return_code == 0:
                logging.info("Success.")
            else:
                logging.info("Failed with return code (%s)." % return_code)
                out.append((url, return_code))
            logging.info("=" * 50)
    finally:
        os.chdir(orig_path)

    return out
Example #38
    def merge_resume(self, build_config, at_collection, step=10000):
        '''resume a merging process after a failure.
             .merge_resume('mygene_allspecies', 'reporter')
        '''
        assert not self.using_ipython_cluster, "Abort. Can only resume merging in non-parallel mode."
        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'building', \
            "Abort. Last build does not need to be resumed."
        assert at_collection in self._build_config['sources'], \
            'Abort. Cannot resume merging from an unknown collection "{}"'.format(at_collection)
        assert last_build['target_backend'] == self.target.name, \
            'Abort. Re-initialize the DataBuilder class using the matching backend "{}"'.format(last_build['backend'])
        assert last_build.get('stats', None), \
            'Abort. Initial build stats are not available. You should restart the build from scratch.'
        self._stats = last_build['stats']

        if ask('Continue to resume merging from "{}"?'.format(at_collection)) == 'Y':
            #TODO: resume logging
            target_name = last_build['target']
            self.validate_src_collections()
            self.prepare_target(target_name=target_name)
            src_cnt = 0
            for collection in self._build_config['sources']:
                if collection in ['entrez_gene', 'ensembl_gene']:
                    continue
                src_cnt += 1
                if collection == at_collection:
                    break
            self._merge_local(step=step, restart_at=src_cnt)
            if self.target.name == 'es':
                logging.info("Updating metadata...")
                self.update_mapping_meta()
            self.log_src_build({'status': 'success',
                                'timestamp': datetime.now()})
Example #39
def sync_index(config, use_parallel=True, noconfirm=False):

    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
Example #40
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask(
                    'Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                logging.info("Skipped!")
                return
        logging.info('Downloading "%s"...' % filename)
        cmdline = 'wget "%s" -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            logging.info("Success.")
        else:
            logging.info("Failed with return code (%s)." % return_code)
        logging.info("=" * 50)
    finally:
        os.chdir(orig_path)
Example #41
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb') for name in sorted(target_db.collection_names()) if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
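_pick_one is not defined in this excerpt; judging by how it is called above (a list of (name, count, kind) tuples plus a prompt), a minimal interactive picker could look like the sketch below. The real helper may behave differently.

def _pick_one(src_li, prompt):
    # hypothetical helper: show numbered sources, return the chosen tuple
    for i, (name, cnt, kind) in enumerate(src_li):
        print("\t{}\t{:<45}{}\t{}".format(i, name, kind, cnt))
    while True:
        choice = input(prompt)
        if choice.isdigit() and 0 <= int(choice) < len(src_li):
            return src_li[int(choice)]
        print("Invalid choice, try again.")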
Example No. 42
0
    def build_index2(self,
                     build_config='mygene_allspecies',
                     last_build_idx=-1,
                     use_parallel=False,
                     es_host=None,
                     es_index_name=None,
                     noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.
            optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'success', \
            "Abort. Last build did not success."
        assert last_build['target_backend'] == "mongodb", \
            'Abort. Last build needs to be built using the "mongodb" backend.'
        assert last_build.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = last_build['stats']
        assert last_build.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10  # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(
                        es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
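For reference, the _meta block assembled above is nested inside the mapping handed to ESIndexer; with all three optional pieces present it would look roughly like the snippet below (all values are illustrative, not real build output).

from datetime import datetime

# illustrative shape only; real values come from get_src_version(),
# last_build['stats'] and last_build['timestamp']
_mapping['_meta'] = {
    'src_version': {'entrez': '20240101', 'ensembl': '110'},  # hypothetical
    'stats': {'total': 25000000},                             # hypothetical
    'timestamp': datetime(2024, 1, 2, 3, 4, 5),               # hypothetical
}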
Example No. 43
0
def rename_from_temp_collection(config,from_index,no_confirm=False):
    # check if the collection exists before changing anything
    sc = GeneDocSyncer(config)
    if from_index not in sc._db.collection_names():
        logging.error("Collection '%s' does not exist" % from_index)
        return
    from_col = sc._db.get_collection(from_index)
    orig_name = sc._target_col.name
    logging.info("Backing up timestamp from '%s'" % orig_name)
    if no_confirm or ask('Continue?') == 'Y':
        bckfile = backup_timestamp_main([config]).pop()
    else:
        bckfile = None
    # rename existing current for backup purpose
    bck_name = orig_name + "_bck_%s" % time.strftime('%Y%m%d%H%M%S')
    logging.info("Renaming %s to %s" % (orig_name,bck_name))
    if no_confirm or ask('Continue?') == 'Y':
        sc._target_col.rename(bck_name)
    logging.info("Renaming %s to %s" % (from_col.name,orig_name))
    if no_confirm or ask('Continue?') == 'Y':
        from_col.rename(orig_name)
    if bckfile is None:
        try:
            pat = "%s_current_tsbk_*.txt.bz" % config
            logging.info("Looking for '%s'" % pat)
            bckfile = sorted(glob.glob(pat))[0]
            if ask("Do you want me to apply timestamp from file '%s' to collection '%s' ?" % (bckfile,sc._target_col.name)) == 'Y':
                pass
            else:
                return
        except IndexError:
            logging.error("Can't find any timstamp file to apply, giving up...")
            return
    prev_ts = {}
    import bz2
    logging.info("Loading timestamps from '%s'" % bckfile)
    with bz2.BZ2File(bckfile, 'rb') as in_f:
        for line in in_f.readlines():
            _id,ts = line.decode().split("\t")
            prev_ts[_id.strip()] = datetime.strptime(ts.strip(),"%Y%m%d")

    logging.info("Now applying timestamp from file '%s' (if more recent than those on the collection)" % bckfile)
    cur = sc._target_col.find()
    default_ts = datetime.now()
    results = {"restored" : 0, "updated" : 0, "unchanged" : 0, "defaulted" : 0} 
    bulk_cnt = 0
    bob = sc._target_col.initialize_unordered_bulk_op()
    cnt = 0
    t0 = time.time()
    while True:
        try:
            doc = next(cur)

            if "_timestamp" not in doc:
                if prev_ts.get(doc["_id"]):
                    ts = prev_ts[doc["_id"]]
                    results["restored"] += 1
                else:
                    ts = default_ts
                    results["defaulted"] += 1
                doc["_timestamp"] = ts
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id" : doc["_id"]}).update_one({"$set" : doc})
            elif prev_ts.get(doc["_id"]) and prev_ts[doc["_id"]] > doc["_timestamp"]:
                doc["_timestamp"] = prev_ts[doc["_id"]]
                results["updated"] += 1
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id" : doc["_id"]}).update_one({"$set" : doc})
            else:
                results["unchanged"] += 1
                cnt += 1

            if cnt % 1000 == 0:
                logging.info("Processed %s documents (%s) [%s]" % (cnt,results,timesofar(t0)))
                t0 = time.time()
            if bulk_cnt == 1000:
                bulk_cnt = 0
                bob.execute()
                bob = sc._target_col.initialize_unordered_bulk_op()

        except StopIteration:
            cur.close()
            break
    try:
        bob.execute()
    except InvalidOperation:
        pass

    logging.info("Done: %s" % results)
Example No. 44
0
def rename_from_temp_collection(config, from_index, no_confirm=False):
    # check if the collection exists before changing anything
    sc = GeneDocSyncer(config)
    if from_index not in sc._db.collection_names():
        logging.error("Collection '%s' does not exist" % from_index)
        return
    from_col = sc._db.get_collection(from_index)
    orig_name = sc._target_col.name
    logging.info("Backing up timestamp from '%s'" % orig_name)
    if no_confirm or ask('Continue?') == 'Y':
        bckfile = backup_timestamp_main([config]).pop()
    else:
        bckfile = None
    # rename existing current for backup purpose
    bck_name = orig_name + "_bck_%s" % time.strftime('%Y%m%d%H%M%S')
    logging.info("Renaming %s to %s" % (orig_name, bck_name))
    if no_confirm or ask('Continue?') == 'Y':
        sc._target_col.rename(bck_name)
    logging.info("Renaming %s to %s" % (from_col.name, orig_name))
    if no_confirm or ask('Continue?') == 'Y':
        from_col.rename(orig_name)
    if bckfile is None:
        try:
            pat = "%s_current_tsbk_*.txt.bz" % config
            logging.info("Looking for '%s'" % pat)
            bckfile = sorted(glob.glob(pat))[0]
            if ask("Do you want me to apply timestamp from file '%s' to collection '%s' ?"
                   % (bckfile, sc._target_col.name)) == 'Y':
                pass
            else:
                return
        except IndexError:
            logging.error(
                "Can't find any timstamp file to apply, giving up...")
            return
    prev_ts = {}
    import bz2
    logging.info("Loading timestamps from '%s'" % bckfile)
    with bz2.BZ2File(bckfile, 'rb') as in_f:
        for line in in_f.readlines():
            _id, ts = line.decode().split("\t")
            prev_ts[_id.strip()] = datetime.strptime(ts.strip(), "%Y%m%d")

    logging.info(
        "Now applying timestamp from file '%s' (if more recent than those on the collection)"
        % bckfile)
    cur = sc._target_col.find()
    default_ts = datetime.now()
    results = {"restored": 0, "updated": 0, "unchanged": 0, "defaulted": 0}
    bulk_cnt = 0
    bob = sc._target_col.initialize_unordered_bulk_op()
    cnt = 0
    t0 = time.time()
    while True:
        try:
            doc = next(cur)

            if "_timestamp" not in doc:
                if prev_ts.get(doc["_id"]):
                    ts = prev_ts[doc["_id"]]
                    results["restored"] += 1
                else:
                    ts = default_ts
                    results["defaulted"] += 1
                doc["_timestamp"] = ts
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id": doc["_id"]}).update_one({"$set": doc})
            elif prev_ts.get(
                    doc["_id"]) and prev_ts[doc["_id"]] > doc["_timestamp"]:
                doc["_timestamp"] = prev_ts[doc["_id"]]
                results["updated"] += 1
                bulk_cnt += 1
                cnt += 1
                bob.find({"_id": doc["_id"]}).update_one({"$set": doc})
            else:
                results["unchanged"] += 1
                cnt += 1

            if cnt % 1000 == 0:
                logging.info("Processed %s documents (%s) [%s]" %
                             (cnt, results, timesofar(t0)))
                t0 = time.time()
            if bulk_cnt == 1000:
                bulk_cnt = 0
                bob.execute()
                bob = sc._target_col.initialize_unordered_bulk_op()

        except StopIteration:
            cur.close()
            break
    try:
        bob.execute()
    except InvalidOperation:
        pass

    logging.info("Done: %s" % results)
Example No. 45
0
    def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
        """Build ES index from last successfully-merged mongodb collection.
            optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'success', \
            "Abort. Last build did not success."
        assert last_build['target_backend'] == "mongodb", \
            'Abort. Last build needs to be built using the "mongodb" backend.'
        assert last_build.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = last_build['stats']
        assert last_build.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Get the source collection to build the ES index
        # IMPORTANT: the collection in last_build['target'] does not contain _timestamp field,
        #            only the "genedoc_*_current" collection does. When "timestamp" is enabled
        #            in mappings, last_build['target'] collection won't be indexed by ES correctly,
        #            therefore, we use "genedoc_*_current" collection as the source here:
        #target_collection = last_build['target']
        target_collection = "genedoc_{}_current".format(build_config)
        _db = get_target_db()
        target_collection = _db[target_collection]
        logging.info("")
        logging.info('Source: %s' % target_collection.name)
        _mapping = self.get_mapping()
        _meta = {}
        src_version = self.get_src_version()
        if src_version:
            _meta['src_version'] = src_version
        if getattr(self, '_stats', None):
            _meta['stats'] = self._stats
        if 'timestamp' in last_build:
            _meta['timestamp'] = last_build['timestamp']
        if _meta:
            _mapping['_meta'] = _meta
        es_index_name = es_index_name or target_collection.name
        es_idxer = ESIndexer(mapping=_mapping,
                             es_index_name=es_index_name,
                             es_host=es_host,
                             step=5000)
        if build_config == 'mygene_allspecies':
            es_idxer.number_of_shards = 10   # default 5
        es_idxer.check()
        if noconfirm or ask("Continue to build ES index?") == 'Y':
            es_idxer.use_parallel = use_parallel
            #es_idxer.s = 609000
            if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
                if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                    es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
                else:
                    logging.info("Abort.")
                    return
            es_idxer.create_index()
            #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
            es_idxer.build_index(target_collection, verbose=False)
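The exists/delete/create sequence guarded by the prompts above maps onto the elasticsearch-py client roughly as follows; exact keyword arguments differ across client versions, so treat this as a sketch rather than the ESIndexer implementation.

from elasticsearch import Elasticsearch

def recreate_index(es_host, index_name, mapping):
    # drop the index if it already exists, then create it with the given mapping
    es = Elasticsearch(es_host)
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
    # older clients take body=..., newer ones use mappings=/settings= keywords
    es.indices.create(index=index_name, body={"mappings": mapping})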
Example No. 46
0
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.'''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'],
                                 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    # mark the download start
    doc = {
        '_id': 'ensembl',
        'release': mart_version,
        'timestamp': time.strftime('%Y%m%d'),
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(
            os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
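src_dump.save() and the positional src_dump.update() call are legacy PyMongo collection methods that were removed in PyMongo 4; the same status bookkeeping can be written with replace_one and update_one. A sketch under that assumption, reusing the doc and _updates built above:

# mark the download start (upsert keeps the call idempotent)
src_dump.replace_one({'_id': 'ensembl'}, doc, upsert=True)

# ... run the BioMart downloads ...

# mark the download finished successfully
src_dump.update_one({'_id': 'ensembl'}, {'$set': _updates})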