Example #1
 def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None):
     """Build ES index from last successfully-merged mongodb collection.
         optional "es_host" argument can be used to specified another ES host, otherwise default ES_HOST.
         optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
     """
     from pprint import pprint
     self.load_build_config(build_config)
     last_build = self._build_config['build'][last_build_idx]
     print "Last build record:"
     pprint(last_build)
     assert last_build['status'] == 'success', \
         "Abort. Last build did not success."
     assert last_build['target_backend'] == "mongodb", \
         'Abort. Last build needs to be built using "mongodb" backend.'
     assert last_build.get('stats', None), \
         'Abort. Last build stats are not available.'
     self._stats = last_build['stats']
     assert last_build.get('target', None), \
         'Abort. Last build target_collection is not available.'
     #target_collection = last_build['target']
     target_collection = "genedoc_{}_current".format(build_config)  ######
     _db = get_target_db()
     target_collection = _db[target_collection]
     print
     print 'Source: ', target_collection.name
     _mapping = self.get_mapping()
     _meta = {}
     src_version = self.get_src_version()
     if src_version:
         _meta['src_version'] = src_version
     if getattr(self, '_stats', None):
         _meta['stats'] = self._stats
     if 'timestamp' in last_build:
         _meta['timestamp'] = last_build['timestamp']
     if _meta:
         _mapping['_meta'] = _meta
     es_index_name = es_index_name or target_collection.name
     es_idxer = ESIndexer(mapping=_mapping,
                          es_index_name=es_index_name,
                          es_host=es_host,
                          step=5000)
     if build_config == 'mygene_allspecies':
         es_idxer.number_of_shards = 10   # default 5
     print "ES host:", es_idxer.conn.servers[0].geturl()
     print "ES index:", es_index_name
     if ask("Continue to build ES index?") == 'Y':
         es_idxer.use_parallel = use_parallel
         #es_idxer.s = 609000
         if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
             if ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                 es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
             else:
                 print "Abort."
                 return
         es_idxer.create_index()
         #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
         es_idxer.build_index(target_collection, verbose=False)
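A minimal sketch of how build_index2 might be called from a driver script, following the pattern of the main() functions in Examples #13 and #21 below; the import path and the DataBuilder constructor arguments are assumptions, not taken from this listing.

# Hypothetical call site; "databuild.builder" is an assumed module path.
from databuild.builder import DataBuilder

bdr = DataBuilder(backend='es')
# Index the last successful "mygene_allspecies" merge into a local ES node,
# overriding the default index name.
bdr.build_index2('mygene_allspecies',
                 es_host='127.0.0.1:9200',
                 es_index_name='genedoc_mygene_allspecies_test')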
Example #2
def clean_target_collection():
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config('mygene')
    try:
        target_collection = bdr.pick_target_collection(autoselect=False)
    except KeyboardInterrupt:
        print "Aborted."
        return

    if ask('Delete collection "{}"'.format(target_collection.name)) == 'Y':
        if ask("Double check! Are you sure?") == 'Y':
            target_collection.drop()
            print 'Done, collection "{}" was dropped.'.format(target_collection.name)
Example #3
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print "ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE)
        if ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print "Aborted."
    else:
        print "Error: target collection is not ready yet or failed to build."
Example #4
def sync_index(config, use_parallel=True, noconfirm=False):

    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']


    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print '\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                           sync_src.name,
                                           sync_src.count())
    print '\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                             sync_target.name,
                                             sync_target.count())
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
Example #5
def es_clean_indices(keep_last=2, es_host=None, verbose=True, noconfirm=False, dryrun=False):
    '''clean up es indices, only keep last <keep_last> number of indices.'''
    conn = get_es(es_host)
    index_li = list(conn.indices.get_aliases().keys())
    if verbose:
        print("Found {} indices".format(len(index_li)))

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + r'_(\d{8})_\w{8}'
        _li = []
        for index in index_li:
            mat = re.match(pat, index)
            if mat:
                _li.append((mat.group(1), index))
        _li.sort()   # older collection appears first
        index_to_remove = [x[1] for x in _li[:-keep_last]]   # keep last # of newer indices
        if len(index_to_remove) > 0:
            print ("{} \"{}*\" indices will be removed.".format(len(index_to_remove), prefix))
            if verbose:
                for index in index_to_remove:
                    print ('\t', index)
            if noconfirm or ask("Continue?") == 'Y':
                for index in index_to_remove:
                    if dryrun:
                        print("dryrun=True, nothing is actually deleted")
                    else:
                        conn.indices.delete(index)
                print("Done.[%s indices removed]" % len(index_to_remove))
            else:
                print("Aborted.")
        else:
            print("Nothing needs to be removed.")
Example #6
def merge(src, target, step=10000, confirm=True):
    """Merging docs from src collection into target collection."""

    src_m = importlib.import_module('dataload.contrib.' + src + '.__init__')

    db = get_src_db()
    src_coll = db[src]
    target_coll = db[target]
    cnt = src_coll.count()
    if confirm and ask('Continue to update {} docs from "{}" into "{}"?'.format(
            cnt, src_coll.name, target_coll.name)) != 'Y':
        return

    for doc in doc_feeder(src_coll, step=step):
        if src == 'ndc':
            _id = src_m.get_id_for_merging(doc, src, db)
            target_coll.update_many(
                {'drugbank.products.ndc_product_code': _id},
                {'$addToSet': {
                    'ndc': doc['ndc']
                }})

        d = {}
        _id = src_m.get_id_for_merging(doc, src, db)
        d.update({'_id': _id, src: doc[src]})
        target_coll.update_one({"_id": _id}, {'$set': d}, upsert=True)
Example #7
def download(url, output_folder, output_file, no_confirm=False, use_axel=False):
    orig_path = os.getcwd()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)  # create output_folder if it does not exist
    try:
        os.chdir(output_folder)
        if os.path.exists(output_file):
            if no_confirm or ask('Remove existing file "%s"?' % output_file) == 'Y':
                os.remove(output_file)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % output_file)
        if use_axel:
            #faster than wget using 5 connections
            cmdline = 'axel -a -n 5 "{}" -o "{}"'.format(url, output_file)
        else:
            cmdline = 'wget "{}" -O "{}"'.format(url, output_file)
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("="*50)
    finally:
        os.chdir(orig_path)
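A small usage sketch of the download() helper above; the URL, folder and file name are placeholders, not values from this listing.

# Hypothetical invocation; all arguments below are placeholders.
download('http://example.org/data/genes.tsv.gz',   # url
         '/tmp/mydata',                            # output_folder (created if missing)
         'genes.tsv.gz',                           # output_file
         no_confirm=True,                          # overwrite any existing file without asking
         use_axel=False)                           # fall back to plain wget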
Example #8
def target_clean_collections(keep_last=2, target=None, verbose=True, noconfirm=False):
    '''clean up collections in target db, only keep last <keep_last> number of collections.'''
    import re
    from utils.common import ask

    target = target or get_target_db()
    coll_list = target.collection_names()

    for prefix in ('genedoc_mygene', 'genedoc_mygene_allspecies'):
        pat = prefix + r'_(\d{8})_\w{8}'
        _li = []
        for coll_name in coll_list:
            mat = re.match(pat, coll_name)
            if mat:
                _li.append((mat.group(1), coll_name))
        _li.sort()   # older collection appears first
        coll_to_remove = [x[1] for x in _li[:-keep_last]]   # keep last # of newer collections
        if len(coll_to_remove) > 0:
            print "{} \"{}*\" collection(s) will be removed.".format(len(coll_to_remove), prefix)
            if verbose:
                for coll in coll_to_remove:
                    print '\t', coll
            if noconfirm or ask("Continue?") == 'Y':
                for coll in coll_to_remove:
                    target[coll].drop()
                print "Done.[%s collection(s) removed]" % len(coll_to_remove)
            else:
                print "Aborted."
        else:
            print "Nothing needs to be removed."
Example #9
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print "\t# nodes in use: {}".format(len(lview.targets or rc.ids))
    lview.block = False

    print "\t# of tasks: {}".format(len(task_list))
    print "\tsubmitting...",
    job = lview.map_async(worker, task_list)
    print "done."
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print "Aborted, all submitted jobs are cancelled."
        else:
            print "Aborted, but your jobs are still running on the cluster."
        return

    if len(job.result) != len(task_list):
        print "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list))
    print "\ttotal time: {}".format(timesofar(t0))

    if shutdown_ipengines_after_done:
        print "\tshuting down all ipengine nodes...",
        lview.shutdown()
        print 'Done.'
    return job.result
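A sketch of the worker/task contract this helper expects: a picklable callable mapped over a task list via the load-balanced view. The worker below is a stand-in for illustration only.

# Hypothetical worker; any picklable callable that accepts one task item works.
def count_batch(task):
    src_name, id_batch = task
    # ... per-batch processing would happen on the remote ipengine ...
    return (src_name, len(id_batch))

task_list = [('entrez_gene', list(range(0, 1000))),
             ('ensembl_gene', list(range(0, 1000)))]
results = run_jobs_on_ipythoncluster(count_batch, task_list)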
Example #10
 def update_mapping(self, m):
     assert list(m) == [self._doc_type]
     # assert m[self._doc_type].keys() == ['properties']
     assert 'properties' in m[self._doc_type]
     print(json.dumps(m, indent=2))
     if ask("Continue to update above mapping?") == 'Y':
         print(self._es.indices.put_mapping(index=self._index, doc_type=self._doc_type, body=m))
Example #11
def main():
    if len(sys.argv) == 2 and sys.argv[1] == 'check':
        print "Checking latest mart_version:\t",
        mart_version = chk_latest_mart_version()
        print mart_version
        return

    if len(sys.argv) > 1:
        mart_version = sys.argv[1]
    else:
        print "Checking latest mart_version:\t",
        mart_version = chk_latest_mart_version()
        print mart_version

    BM = BioMart()
    BM.species_li = get_all_species(mart_version)
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, mart_version)
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (len(os.listdir(DATA_FOLDER))==0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER)=='Y'):
            return
    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version))
    sys.stdout = LogPrint(log_f, timestamp=True)

    BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
    BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
    BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

    BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
    BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
    sys.stdout.close()
Example #12
 def update_mapping(self, m):
     assert list(m) == [self._doc_type]
     # assert m[self._doc_type].keys() == ['properties']
     assert 'properties' in m[self._doc_type]
     print(json.dumps(m, indent=2))
     if ask("Continue to update above mapping?") == 'Y':
         print(self._es.indices.put_mapping(index=self._index, doc_type=self._doc_type, body=m))
Example #13
def main():
    parser = OptionParser()
    parser.add_option("-c",
                      "--conf",
                      dest="config",
                      action="store",
                      default=None,
                      help="ES indexing building config name")
    parser.add_option("-b",
                      "--noconfirm",
                      dest="noconfirm",
                      action="store_true",
                      default=False,
                      help="do not ask for confirmation")
    parser.add_option("-e",
                      "--es-index",
                      dest="es_index_name",
                      action="store",
                      default=None,
                      help="provide an alternative ES index name")
    # parser.add_option("", "--no-cleanup", dest="nocleanup",
    #                   action="store_true", default=False,
    #                   help="do not clean up old ES indices")
    (options, args) = parser.parse_args()

    with open_tunnel() as tunnel:
        if tunnel.ok:
            es_host = '127.0.0.1:' + str(es_local_tunnel_port)
        else:
            es_host = ES_HOST
        # if not options.nocleanup:
        #     es_clean_indices(noconfirm=options.noconfirm)
        t00 = time.time()
        bdr = DataBuilder(backend='es')
        if options.config:
            config_li = [options.config]
        else:
            config_li = ['mygene', 'mygene_allspecies']

        if not options.noconfirm:
            print('\n'.join([
                "Ready to build these ES indices on %s (tunnel=%s):" %
                (es_host, tunnel.ok)
            ] + ['\t' + conf for conf in config_li]))
            if ask('Continue?') != 'Y':
                print("Aborted")
                return

        for _conf in config_li:
            t0 = time.time()
            print('>"{}">>>>>>'.format(_conf))
            bdr.build_index2(_conf,
                             es_index_name=options.es_index_name,
                             es_host=es_host,
                             noconfirm=options.noconfirm)
            print('<<<<<<"{}"...done. {}'.format(_conf, timesofar(t0)))

        print('=' * 20)
        print("Finished.", timesofar(t00))
Example #14
def merge(src, target, step=10000, confirm=True):
    """Merging docs from src collection into target collection."""
    cnt = src.count()
    if confirm and ask('Continue to update {} docs from "{}" into "{}"?'.format(cnt, src.name, target.name)) != 'Y':
        return

    for doc in doc_feeder(src, step=step):
        _id = doc['_id']
        target.update_one({"_id": _id}, {'$set': doc}, upsert=True)
Example #15
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    src_dump = get_src_dump()
    print("Checking latest mart_version:\t", end=' ')
    mart_version = chk_latest_mart_version()
    print(mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Example #16
 def delete_index_type(self, index_type, noconfirm=False):
     '''Delete all documents of a given index_type (doc_type) from the index.'''
     index_name = self.ES_INDEX_NAME
     # Check if index_type exists
     m = self.conn.indices.get_mapping(index_name, index_type)
     if not m:
         print('Error: index type "%s" does not exist in index "%s".' % (index_type, index_name))
         return
     path = '/%s/%s' % (index_name, index_type)
     if noconfirm or ask('Confirm to delete all data under "%s":' % path) == 'Y':
         return self.conn.indices.delete_mapping(index=index_name, doc_type=index_type)
Example #17
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene'
        #config = 'mygene_allspecies'
    if not config.startswith('genedoc_'):
        config = 'genedoc_' + config
    assert config in ['genedoc_mygene', 'genedoc_mygene_allspecies']
    noconfirm = '-b' in sys.argv

    _changes_fn = _get_current_changes_fn(config)
    if _changes_fn:
        print("Changes file: " + _changes_fn)
    else:
        print("No changes file found. Aborted.")
        return -1
    if noconfirm or ask("Continue to load?") == 'Y':
        changes = loadobj(_changes_fn)
    else:
        print("Aborted.")
        return -2

    _es_host = 'localhost:' + str(es_local_tunnel_port)
    _es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    # for test
    #_es_host = 'localhost:9200'
    #_es_index = config + TARGET_ES_INDEX_SUFFIX    # '_current_1'

    with open_tunnel() as tunnel:
        if tunnel.ok:
            esi = ESIndexer2(_es_index, es_host=_es_host)
            meta = esi.get_mapping_meta(changes)
            print('\033[34;06m{}\033[0m:'.format('[Metadata]'))
            pprint(meta)
            code = esi.apply_changes(changes, noconfirm=noconfirm)
            if code != -1:
                # aborted when code == -1
                _meta = {'_meta': meta}
                # somehow when only update "_meta", "_timestamp" get empty
                # so add "_timestamp" explicitly here. This is an ES bug.
                _meta['_timestamp'] = {
                    "enabled": True,
                    "path": "_timestamp"
                }
                #esi.update_mapping_meta(_meta)
                print(esi.conn.indices.put_mapping(esi.ES_INDEX_TYPE, _meta, [esi.ES_INDEX_NAME]))
                esi.post_verify_changes(changes)
Example #18
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        #src = self.get_source_collection(changes)
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example #19
def main():
    no_confirm = True  # set it to True for running this script automatically without intervention.
    src_dump = get_src_dump()
    (file_name, release) = get_newest_release()
    doc = src_dump.find_one({'_id': 'clinvar'})
    if new_release_available(doc['release']):
        data_file = os.path.join(doc['data_folder'], file_name)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            return

        if not os.path.exists(DATA_FOLDER):
            os.makedirs(DATA_FOLDER)
        else:
            if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                    or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                           DATA_FOLDER) == 'Y'):
                return

        log_f, logfile = safewfile(os.path.join(DATA_FOLDER,
                                                'clinvar_dump.log'),
                                   prompt=(not no_confirm),
                                   default='O')
        sys.stdout = LogPrint(log_f, timestamp=True)

        # mark the download starts
        doc = {
            '_id': 'clinvar',
            'timestamp': timestamp,
            'data_folder': DATA_FOLDER,
            'release': release,
            'logfile': logfile,
            'status': 'downloading'
        }
        src_dump.save(doc)
        t0 = time.time()
        try:
            download_ftp_file(no_confirm)
        finally:
            sys.stdout.close()
        # mark the download finished successfully
        _updates = {
            'status': 'success',
            'time': timesofar(t0),
            'pending_to_upload': True  # a flag to trigger data uploading
        }
        src_dump.update({'_id': 'clinvar'}, {'$set': _updates})
Example #20
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    print("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    print(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'refseq_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Example #21
def main():
    parser = OptionParser()
    parser.add_option("-c", "--conf", dest="config",
                      action="store", default=None,
                      help="ES indexing building config name")
    parser.add_option("-b", "--noconfirm", dest="noconfirm",
                      action="store_true", default=False,
                      help="do not ask for confirmation")
    parser.add_option("-e", "--es-index", dest="es_index_name",
                      action="store", default=None,
                      help="provide an alternative ES index name")
    # parser.add_option("", "--no-cleanup", dest="nocleanup",
    #                   action="store_true", default=False,
    #                   help="do not clean up old ES indices")
    (options, args) = parser.parse_args()

    with open_tunnel() as tunnel:
        if tunnel.ok:
            es_host = '127.0.0.1:' + str(es_local_tunnel_port)
        else:
            es_host = ES_HOST
        # if not options.nocleanup:
        #     es_clean_indices(noconfirm=options.noconfirm)
        t00 = time.time()
        bdr = DataBuilder(backend='es')
        if options.config:
            config_li = [options.config]
        else:
            config_li = ['mygene', 'mygene_allspecies']

        if not options.noconfirm:
            print('\n'.join(["Ready to build these ES indices on %s (tunnel=%s):" % (es_host, tunnel.ok)] +
                            ['\t' + conf for conf in config_li]))
            if ask('Continue?') != 'Y':
                print("Aborted")
                return

        for _conf in config_li:
            t0 = time.time()
            print('>"{}">>>>>>'.format(_conf))
            bdr.build_index2(_conf,
                             es_index_name=options.es_index_name,
                             es_host=es_host,
                             noconfirm=options.noconfirm)
            print('<<<<<<"{}"...done. {}'.format(_conf, timesofar(t0)))

        print('=' * 20)
        print("Finished.", timesofar(t00))
Example #22
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'entrez_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)
    sys.stderr = sys.stdout

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, no_confirm=no_confirm)
        t_download = timesofar(t0)
        t1 = time.time()
        #mark parsing starts
        src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
        parse_gbff(DATA_FOLDER)
        t_parsing = timesofar(t1)
        t_total = timesofar(t0)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example #23
 def update_mapping_meta(self, meta, confirm=True):
     allowed_keys = set(['_meta', '_timestamp'])
     if isinstance(meta, dict) and len(set(meta) - allowed_keys) == 0:
         current_meta = self.get_mapping_meta()
         print('\033[34;06m{}\033[0m:'.format('[Current _meta]'))
         print(json.dumps(current_meta, indent=2))
         print('\033[34;06m{}\033[0m:'.format('[Replace with new _meta]'))
         print(json.dumps(meta, indent=2))
         if not confirm or ask('Continue to update above _meta field to "{}" index?'.format(self._index)) == "Y":
             body = {self._doc_type: meta}
             print(self._es.indices.put_mapping(
                 doc_type=self._doc_type,
                 body=body,
                 index=self._index
             ))
     else:
         raise ValueError('Input "meta" should contain only the "_meta" (and optionally "_timestamp") field.')
Example #24
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))

    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                             sync_src.name)
    print '\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                               sync_target.name)

    if noconfirm or ask("Continue?")=='Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))
        print '='*20
        print 'Finished. [{}]'.format(timesofar(t00))
Example #25
 def update_mapping_meta(self, meta, confirm=True):
     allowed_keys = set(['_meta', '_timestamp'])
     if isinstance(meta, dict) and len(set(meta) - allowed_keys) == 0:
         current_meta = self.get_mapping_meta()
         print('\033[34;06m{}\033[0m:'.format('[Current _meta]'))
         print(json.dumps(current_meta, indent=2))
         print('\033[34;06m{}\033[0m:'.format('[Replace with new _meta]'))
         print(json.dumps(meta, indent=2))
         if not confirm or ask(
                 'Continue to update above _meta field to "{}" index?'.
                 format(self._index)) == "Y":
             body = {self._doc_type: meta}
             print(
                 self._es.indices.put_mapping(doc_type=self._doc_type,
                                              body=body,
                                              index=self._index))
     else:
         raise ValueError(
             'Input "meta" should contain only the "_meta" (and optionally "_timestamp") field.')
Example #26
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.
    src_dump = get_src_dump()
    (file_name, release) = get_newest_release()
    doc = src_dump.find_one({'_id': 'clinvar'})
    if new_release_available(doc['release']):
        data_file = os.path.join(doc['data_folder'], file_name)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            return

        if not os.path.exists(DATA_FOLDER):
            os.makedirs(DATA_FOLDER)
        else:
            if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                    or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
                return

        log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'clinvar_dump.log'), prompt=(not no_confirm), default='O')
        sys.stdout = LogPrint(log_f, timestamp=True)

        # mark the download starts
        doc = {'_id': 'clinvar',
               'timestamp': timestamp,
               'data_folder': DATA_FOLDER,
               'release': release,
               'logfile': logfile,
               'status': 'downloading'}
        src_dump.save(doc)
        t0 = time.time()
        try:
            download_ftp_file(no_confirm)
        finally:
            sys.stdout.close()
        # mark the download finished successfully
        _updates = {
            'status': 'success',
            'time': timesofar(t0),
            'pending_to_upload': True    # a flag to trigger data uploading
        }
        src_dump.update({'_id': 'clinvar'}, {'$set': _updates})
Example #27
    def merge_resume(self, build_config, at_collection, step=10000):
        '''resume a merging process after a failure.
             .merge_resume('mygene_allspecies', 'reporter')
        '''
        from pprint import pprint
        assert not self.using_ipython_cluster, "Abort. Can only resume merging in non-parallel mode."
        self.load_build_config(build_config)
        last_build = self._build_config['build'][-1]
        print "Last build record:"
        pprint(last_build)
        assert last_build['status'] == 'building', \
            "Abort. Last build does not need to be resumed."
        assert at_collection in self._build_config['sources'], \
            'Abort. Cannot resume merging from an unknown collection "{}"'.format(at_collection)
        assert last_build['target_backend'] == self.target.name, \
            'Abort. Re-initialize the DataBuilder class using the matching backend "{}".'.format(last_build['backend'])
        assert last_build.get('stats', None), \
            'Abort. Initial build stats are not available. You should restart the build from scratch.'
        self._stats = last_build['stats']

        if ask('Continue to resume merging from "{}"?'.format(at_collection)) == 'Y':
            #TODO: resume logging
            target_name = last_build['target']
            self.validate_src_collections()
            self.prepare_target(target_name=target_name)
            src_cnt = 0
            for collection in self._build_config['sources']:
                if collection in ['entrez_gene', 'ensembl_gene']:
                    continue
                src_cnt += 1
                if collection == at_collection:
                    break
            self._merge_local(step=step, restart_at=src_cnt)
            if self.target.name == 'es':
                print "Updating metadata...",
                self.update_mapping_meta()
            self.log_src_build({'status': 'success',
                                'timestamp': datetime.now()})
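A hedged sketch of resuming a failed merge, mirroring the call shown in the docstring above; constructing the DataBuilder with the mongodb backend follows Example #2 and is otherwise an assumption.

# Hypothetical resume call after a failed "mygene_allspecies" merge.
bdr = DataBuilder(backend='mongodb')
bdr.merge_resume('mygene_allspecies', 'reporter', step=10000)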
Example #28
def download(path, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        _expand_refseq_files()
        for subfolder in FILE_LIST:
            filedata = FILE_LIST[subfolder]
            baseurl = filedata['url']
            data_folder = os.path.join(path, subfolder)
            if not os.path.exists(data_folder):
                os.mkdir(data_folder)

            for f in filedata['files']:
                url = baseurl + f
                os.chdir(data_folder)
                filename = os.path.split(f)[1]
                if os.path.exists(filename):
                    if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                        os.remove(filename)
                    else:
                        print("Skipped!")
                        continue
                print('Downloading "%s"...' % f)
                #cmdline = 'wget %s' % url
                #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
                cmdline = _get_ascp_cmdline(url)
                return_code = os.system(cmdline)
                #return_code = 0;print cmdline    #for testing
                if return_code == 0:
                    print("Success.")
                else:
                    print("Failed with return code (%s)." % return_code)
                    out.append((url, return_code))
                print("="*50)
    finally:
        os.chdir(orig_path)

    return out
Example #29
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        path, filename = os.path.split(DATAFILE_PATH)
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                print("Skipped!")
                return
        print('Downloading "%s"...' % filename)
        url = 'ftp://{}/{}'.format(FTP_SERVER, DATAFILE_PATH)
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            print("Success.")
        else:
            print("Failed with return code (%s)." % return_code)
        print("="*50)
    finally:
        os.chdir(orig_path)
Example #30
def download(no_confirm=False):
    orig_path = os.getcwd()
    try:
        os.chdir(DATA_FOLDER)
        filename = 'genes.zip'
        url = GENES_URL
        if os.path.exists(filename):
            if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                os.remove(filename)
            else:
                print "Skipped!"
                return
        print 'Downloading "%s"...' % filename
        cmdline = 'wget %s -O %s' % (url, filename)
        #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
        return_code = os.system(cmdline)
        if return_code == 0:
            print "Success."
        else:
            print "Failed with return code (%s)." % return_code
        print "="*50
    finally:
        os.chdir(orig_path)
Example #31
def download(path, release, no_confirm=False):
    out = []
    orig_path = os.getcwd()
    try:
        data_folder = os.path.join(path, release)
        if not os.path.exists(data_folder):
            os.mkdir(data_folder)

        _url = 'ftp://' + FTP_SERVER + BASE_PATH + DATA_FILE
        url_li = _expand_wildchar_urls(_url)
        print('Found {} "{}" files to download.'.format(len(url_li), DATA_FILE))

        for url in url_li:
            os.chdir(data_folder)
            filename = os.path.split(url)[1]
            if os.path.exists(filename):
                if no_confirm or ask('Remove existing file "%s"?' % filename) == 'Y':
                    os.remove(filename)
                else:
                    print("Skipped!")
                    continue
            print('Downloading "%s"...' % filename)
            #cmdline = 'wget %s' % url
            #cmdline = 'axel -a -n 5 %s' % url   #faster than wget using 5 connections
            cmdline = _get_ascp_cmdline(url)
            return_code = os.system(cmdline)
            #return_code = 0;print cmdline    #for testing
            if return_code == 0:
                print("Success.")
            else:
                print("Failed with return code (%s)." % return_code)
                out.append((url, return_code))
            print("="*50)
    finally:
        os.chdir(orig_path)

    return out
Example #32
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''clean up archive collections in src db, only keep last <keep_last>
       number of archives.
    '''
    from utils.dataload import list2dict
    from utils.common import ask

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll) for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print k,
        #check current collection exists
        if src[k].count() > 0:
            cnt = 0
            for coll in sorted(v)[:-keep_last]:
                coll_to_remove.append(coll)
                cnt += 1
            print "\t\t%s archived collections marked to remove." % cnt
        else:
            print 'skipped. Missing current "%s" collection!' % k
    if len(coll_to_remove) > 0:
        print "%d archived collections will be removed." % len(coll_to_remove)
        if verbose:
            for coll in coll_to_remove:
                print '\t', coll
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print "Done.[%s collections removed]" % len(coll_to_remove)
        else:
            print "Aborted."
    else:
        print "Nothing needs to be removed."
Example #33
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb') for name in sorted(target_db.collection_names()) if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices=[]
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print "Found {} sources:".format(len(src_li))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print '\tsync_src:\t{:<45}{}\t{}'.format(*src_1)
    print '\tsync_target\t{:<45}{}\t{}'.format(*src_2)
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
Example #34
    no_confirm = True   # set it to True for running this script automatically without intervention.

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'uniprot_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try: