Esempio n. 1
0
def build_index(config, use_parallel=True, noconfirm=False):
    """Build the ES index "genedoc_<name>" from the picked MongoDB collection.

    Args:
        config: build configuration handed to
            ``DataBuilder.load_build_config``.
        use_parallel: stored on the indexer as ``use_parallel``.
        noconfirm: when true, skip the interactive "Continue?" prompt and
            forward ``noconfirm=True`` to ``delete_index_type``.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if not target_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return

    es_idxer = ESIndexer(mapping=bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    es_server = es_idxer.conn.servers[0].geturl()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("ES target: {}/{}/{}".format(es_server,
                                       es_idxer.ES_INDEX_NAME,
                                       es_idxer.ES_INDEX_TYPE))

    # noconfirm must also bypass this prompt; otherwise an unattended run
    # still blocks waiting for input before anything is built.
    if not (noconfirm or ask("Continue?") == 'Y'):
        print("Aborted.")
        return

    es_idxer.create_index()
    es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
    es_idxer.build_index(target_collection, verbose=False)
    es_idxer.optimize()
Esempio n. 2
0
def build_index(config, use_parallel=True, noconfirm=False):
    """Create and populate the "genedoc_<name>" ES index for ``config``.

    The source is whatever ``DataBuilder.pick_target_collection`` returns;
    if that is falsy an error line is printed and nothing is built.
    Unless ``noconfirm`` is true the user is asked to confirm first.
    ``use_parallel`` is stored on the indexer.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    source_collection = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    if not source_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return

    indexer = ESIndexer(mapping=builder.get_mapping())
    indexer.ES_INDEX_NAME = index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel

    server_url = indexer.conn.servers[0].geturl()
    target_line = "ES target: {}/{}/{}".format(
        server_url, indexer.ES_INDEX_NAME, indexer.ES_INDEX_TYPE)
    print(target_line)

    confirmed = noconfirm or ask("Continue?") == 'Y'
    if not confirmed:
        print("Aborted.")
        return

    # Build sequence: make sure the index exists, clear the doc type,
    # index the collection, then optimize.
    indexer.create_index()
    indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=noconfirm)
    indexer.build_index(source_collection, verbose=False)
    indexer.optimize()
Esempio n. 3
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff the picked MongoDB collection against its "genedoc_<name>" ES index.

    Args:
        config: build configuration handed to
            ``DataBuilder.load_build_config``.
        use_parallel: stored on the indexer as ``use_parallel``.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The value of ``diff.diff_collections(sync_src, sync_target)``, or
        ``None`` when no target collection is available or the user
        declines the prompt.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    # Without this guard a missing collection crashes on
    # ``target_collection.name`` below instead of reporting the problem.
    if not target_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return None

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        return diff.diff_collections(sync_src, sync_target)
    return None
Esempio n. 4
0
 def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
     """Build ES index from last successfully-merged mongodb collection.

     Args:
         build_config: name of the build config to load; also used to form
             the source collection name "genedoc_<build_config>_current".
         last_build_idx: index into the config's "build" records selecting
             the record to validate (default -1, the last one).
         use_parallel: stored on the indexer as ``use_parallel``.
         es_host: optional, can be used to specify another ES host,
             otherwise default ES_HOST.
         es_index_name: optional alternative index name, otherwise the same
             as the mongodb collection name.
         noconfirm: when true, skip both interactive prompts (start of the
             build, and deletion of an already-existing index).

     Raises:
         AssertionError: if the selected build record is not a successful
             "mongodb" build, or lacks stats or a target.
     """
     from pprint import pprint
     self.load_build_config(build_config)
     last_build = self._build_config['build'][last_build_idx]
     print("Last build record:")
     pprint(last_build)
     # NOTE: these checks are asserts, so they are skipped under "python -O".
     assert last_build['status'] == 'success', \
         "Abort. Last build did not success."
     assert last_build['target_backend'] == "mongodb", \
         'Abort. Last build need to be built using "mongodb" backend.'
     assert last_build.get('stats', None), \
         'Abort. Last build stats are not available.'
     self._stats = last_build['stats']
     assert last_build.get('target', None), \
         'Abort. Last build target_collection is not available.'

     # The "genedoc_<build_config>_current" collection is indexed rather than
     # last_build['target'].
     # NOTE(review): presumably because only the *_current collection carries
     # the _timestamp field -- confirm.
     collection_name = "genedoc_{}_current".format(build_config)
     _db = get_target_db()
     target_collection = _db[collection_name]
     # Each print gets one pre-formatted string so the output is identical
     # on Python 2 and Python 3 (including the original double space).
     print("")
     print('Source:  {}'.format(target_collection.name))

     # Attach build provenance to the mapping's _meta section.
     _mapping = self.get_mapping()
     _meta = {}
     src_version = self.get_src_version()
     if src_version:
         _meta['src_version'] = src_version
     if getattr(self, '_stats', None):
         _meta['stats'] = self._stats
     if 'timestamp' in last_build:
         _meta['timestamp'] = last_build['timestamp']
     if _meta:
         _mapping['_meta'] = _meta

     es_index_name = es_index_name or target_collection.name
     es_idxer = ESIndexer(mapping=_mapping,
                          es_index_name=es_index_name,
                          es_host=es_host,
                          step=5000)
     if build_config == 'mygene_allspecies':
         es_idxer.number_of_shards = 10   # default 5
     print("ES host: {}".format(es_idxer.conn.servers[0].geturl()))
     print("ES index: {}".format(es_index_name))

     if not (noconfirm or ask("Continue to build ES index?") == 'Y'):
         return

     es_idxer.use_parallel = use_parallel
     if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
         # An existing index is only dropped after confirmation (or noconfirm).
         if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
             es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
         else:
             print("Abort.")
             return
     es_idxer.create_index()
     es_idxer.build_index(target_collection, verbose=False)
Esempio n. 5
0
    def sync_index(self, use_parallel=True):
        """Diff the target collection backend against the same-named ES index.

        The ES index name is taken from the ``target_collection.name`` of
        the object returned by ``get_target_collection()``.
        ``use_parallel`` is stored on the indexer.
        Returns whatever ``diff.diff_collections`` returns.
        """
        from utils import diff

        source_backend = self.get_target_collection()

        indexer = ESIndexer(self.get_mapping())
        indexer.ES_INDEX_NAME = source_backend.target_collection.name
        indexer.step = 10000
        indexer.use_parallel = use_parallel

        es_backend = databuild.backend.GeneDocESBackend(indexer)
        return diff.diff_collections(source_backend, es_backend)
Esempio n. 6
0
    def sync_index(self, use_parallel=True):
        """Return the diff between the target collection and its ES index.

        An ESIndexer is set up with this builder's mapping, pointed at the
        index named after the source's ``target_collection``, and wrapped in
        a ``GeneDocESBackend`` that serves as the comparison target.
        """
        from utils import diff

        src = self.get_target_collection()

        idxer = ESIndexer(self.get_mapping())
        idxer.ES_INDEX_NAME = src.target_collection.name
        idxer.step = 10000
        idxer.use_parallel = use_parallel

        return diff.diff_collections(
            src,
            databuild.backend.GeneDocESBackend(idxer),
        )
Esempio n. 7
0
 def build_index(self, use_parallel=True):
     """Index the target collection into the "genedoc_<name>" ES index.

     If ``get_target_collection()`` returns a falsy value, a message is
     logged and nothing is built. ``use_parallel`` is stored on the indexer.
     """
     source = self.get_target_collection()
     if not source:
         logging.info("Error: target collection is not ready yet or failed to build.")
         return

     indexer = ESIndexer(mapping=self.get_mapping())
     indexer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
     indexer.step = 10000
     indexer.use_parallel = use_parallel

     # Ensure the index exists, clear the doc type without prompting,
     # then index the collection and optimize.
     indexer.create_index()
     indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=True)
     indexer.build_index(source, verbose=False)
     indexer.optimize()
Esempio n. 8
0
 def build_index(self, use_parallel=True):
     """Rebuild the "genedoc_<name>" ES index from the target collection.

     Nothing is built when no target collection is available; that case is
     only reported through ``logging.info``.
     """
     collection = self.get_target_collection()
     if not collection:
         message = "Error: target collection is not ready yet or failed to build."
         logging.info(message)
     else:
         idxer = ESIndexer(mapping=self.get_mapping())
         idxer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
         idxer.step = 10000
         idxer.use_parallel = use_parallel
         idxer.create_index()
         # The doc type is cleared without asking for confirmation.
         idxer.delete_index_type(idxer.ES_INDEX_TYPE, noconfirm=True)
         idxer.build_index(collection, verbose=False)
         idxer.optimize()
Esempio n. 9
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff the picked MongoDB collection against its "genedoc_<name>" ES index.

    Args:
        config: build configuration handed to
            ``DataBuilder.load_build_config``.
        use_parallel: stored on the indexer as ``use_parallel``.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The value of ``diff.diff_collections(sync_src, sync_target)``, or
        ``None`` when no target collection is available or the user
        declines the prompt.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    # Without this guard a missing collection crashes on
    # ``target_collection.name`` below instead of reporting the problem.
    if not target_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return None

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    # Show both sides (name, backend name, document count) before asking.
    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name, sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        return diff.diff_collections(sync_src, sync_target)
    return None
Esempio n. 10
0
    def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1, use_parallel=False, es_host=None, es_index_name=None, noconfirm=False):
        """Index the "genedoc_<build_config>_current" mongodb collection into ES.

        The build record selected by "last_build_idx" must be a successful
        "mongodb" build that carries stats and a target, otherwise an
        AssertionError is raised.
            optional "es_host" argument selects another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument passes an alternative index name, otherwise same as mongodb collection name
            "noconfirm" skips both confirmation prompts.
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        build_record = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(build_record))
        assert build_record['status'] == 'success', \
            "Abort. Last build did not success."
        assert build_record['target_backend'] == "mongodb", \
            'Abort. Last build need to be built using "mongodb" backend.'
        assert build_record.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = build_record['stats']
        assert build_record.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Source collection for the ES index.
        # IMPORTANT: build_record['target'] lacks the _timestamp field; only the
        #            "genedoc_*_current" collection has it. With "timestamp"
        #            enabled in the mappings, ES would not index the former
        #            correctly, so the "genedoc_*_current" collection is used.
        source_name = "genedoc_{}_current".format(build_config)
        source_collection = get_target_db()[source_name]
        logging.info("")
        logging.info('Source: %s' % source_collection.name)

        # Record build provenance under the mapping's "_meta" key.
        mapping = self.get_mapping()
        index_meta = {}
        version = self.get_src_version()
        if version:
            index_meta['src_version'] = version
        stats = getattr(self, '_stats', None)
        if stats:
            index_meta['stats'] = stats
        if 'timestamp' in build_record:
            index_meta['timestamp'] = build_record['timestamp']
        if index_meta:
            mapping['_meta'] = index_meta

        es_index_name = es_index_name or source_collection.name
        indexer = ESIndexer(mapping=mapping,
                            es_index_name=es_index_name,
                            es_host=es_host,
                            step=5000)
        if build_config == 'mygene_allspecies':
            indexer.number_of_shards = 10   # default 5
        indexer.check()

        proceed = noconfirm or ask("Continue to build ES index?") == 'Y'
        if not proceed:
            return

        indexer.use_parallel = use_parallel
        if indexer.exists_index(indexer.ES_INDEX_NAME):
            drop_existing = noconfirm or ask('Index "{}" exists. Delete?'.format(indexer.ES_INDEX_NAME)) == 'Y'
            if not drop_existing:
                logging.info("Abort.")
                return
            indexer.conn.indices.delete(indexer.ES_INDEX_NAME)
        indexer.create_index()
        indexer.build_index(source_collection, verbose=False)
Esempio n. 11
0
    def build_index2(self,
                     build_config='mygene_allspecies',
                     last_build_idx=-1,
                     use_parallel=False,
                     es_host=None,
                     es_index_name=None,
                     noconfirm=False):
        """Build an ES index from the "genedoc_<build_config>_current" collection.

        Validates (via asserts) that the selected build record succeeded on
        the "mongodb" backend and has stats and a target before indexing.
            optional "es_host" argument can be used to specify another ES host, otherwise default ES_HOST.
            optional "es_index_name" argument can be used to pass an alternative index name, otherwise same as mongodb collection name
            "noconfirm" answers both confirmation prompts with yes.
        """
        self.load_build_config(build_config)
        assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
        last_build = self._build_config['build'][last_build_idx]
        logging.info("Last build record:")
        logging.info(pformat(last_build))
        assert last_build['status'] == 'success', \
            "Abort. Last build did not success."
        assert last_build['target_backend'] == "mongodb", \
            'Abort. Last build need to be built using "mongodb" backend.'
        assert last_build.get('stats', None), \
            'Abort. Last build stats are not available.'
        self._stats = last_build['stats']
        assert last_build.get('target', None), \
            'Abort. Last build target_collection is not available.'

        # Pick the collection that feeds the ES index.
        # IMPORTANT: the "genedoc_*_current" collection is used instead of
        #            last_build['target'], because only the former contains
        #            the _timestamp field; with "timestamp" enabled in the
        #            mappings the latter would not be indexed correctly.
        db = get_target_db()
        src_collection = db["genedoc_{}_current".format(build_config)]
        logging.info("")
        logging.info('Source: %s' % src_collection.name)

        es_mapping = self.get_mapping()
        # Optional entries are kept only when truthy; "timestamp" is copied
        # whenever the build record has the key. Key order is preserved.
        optional_entries = (
            ('src_version', self.get_src_version()),
            ('stats', getattr(self, '_stats', None)),
        )
        meta = {key: value for key, value in optional_entries if value}
        if 'timestamp' in last_build:
            meta['timestamp'] = last_build['timestamp']
        if meta:
            es_mapping['_meta'] = meta

        es_index_name = es_index_name or src_collection.name
        idxer = ESIndexer(mapping=es_mapping,
                          es_index_name=es_index_name,
                          es_host=es_host,
                          step=5000)
        if build_config == 'mygene_allspecies':
            idxer.number_of_shards = 10  # default 5
        idxer.check()

        if not noconfirm and ask("Continue to build ES index?") != 'Y':
            return

        idxer.use_parallel = use_parallel
        if idxer.exists_index(idxer.ES_INDEX_NAME):
            if not noconfirm and ask('Index "{}" exists. Delete?'.format(
                    idxer.ES_INDEX_NAME)) != 'Y':
                logging.info("Abort.")
                return
            idxer.conn.indices.delete(idxer.ES_INDEX_NAME)
        idxer.create_index()
        idxer.build_index(src_collection, verbose=False)