def build_index(config, use_parallel=True, noconfirm=False):
    """Build the ``genedoc_<name>`` ES index from the picked target collection.

    A mongodb-backed ``DataBuilder`` loads ``config`` and picks the target
    collection.  If one is available, the ES target is printed and, once
    confirmed, the indexer runs ``create_index``, ``delete_index_type``,
    ``build_index`` and ``optimize`` in that order.

    :param config: build config passed to ``DataBuilder.load_build_config``.
    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    :param noconfirm: when true, the "Continue?" prompt is skipped and the
        flag is also forwarded to ``delete_index_type``.
    :return: None.  Progress and errors are reported on stdout.
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        # noconfirm must short-circuit the prompt; otherwise an unattended
        # run would still block here waiting for input.
        if noconfirm or ask("Continue?") == 'Y':
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE,
                                       noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
def build_index(config, use_parallel=True, noconfirm=False):
    """Index the target collection of ``config`` into ``genedoc_<name>``.

    The build config is loaded into a mongodb-backed ``DataBuilder`` which
    picks the target collection.  Unless ``noconfirm`` is true, the user is
    asked to confirm before the index is created, its index type deleted,
    the collection indexed and the index optimized.

    :param config: build config passed to ``DataBuilder.load_build_config``.
    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    :param noconfirm: skip the "Continue?" prompt; also forwarded to
        ``delete_index_type``.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    source = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    # Nothing to index: report and bail out early.
    if not source:
        print("Error: target collection is not ready yet or failed to build.")
        return

    indexer = ESIndexer(mapping=builder.get_mapping())
    indexer.ES_INDEX_NAME = index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel

    server_url = indexer.conn.servers[0].geturl()
    target_line = "ES target: {}/{}/{}".format(server_url,
                                               indexer.ES_INDEX_NAME,
                                               indexer.ES_INDEX_TYPE)
    print(target_line)

    # The prompt is only shown when noconfirm is false.
    if not noconfirm and ask("Continue?") != 'Y':
        print("Aborted.")
        return

    indexer.create_index()
    indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=noconfirm)
    indexer.build_index(source, verbose=False)
    indexer.optimize()
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None,
                 noconfirm=False):
    """Build ES index from last successfully-merged mongodb collection.

    :param build_config: name of the build config to load; also used to
        derive the source collection name ``genedoc_<build_config>_current``.
    :param last_build_idx: index into the config's ``build`` records
        (default ``-1``, the last one).
    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    :param es_host: optional alternative ES host, passed to ``ESIndexer``.
    :param es_index_name: optional alternative index name; defaults to the
        name of the source mongodb collection.
    :param noconfirm: when true, both confirmation prompts are skipped and an
        existing index of the same name is deleted without asking.
    :raises AssertionError: if the selected build record is not a successful
        mongodb build, or lacks ``stats`` or ``target``.
    """
    from pprint import pprint

    self.load_build_config(build_config)
    last_build = self._build_config['build'][last_build_idx]
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("Last build record:")
    pprint(last_build)
    assert last_build['status'] == 'success', \
        "Abort. Last build did not success."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build need to be built using "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'

    # The "genedoc_<build_config>_current" collection is used as the source
    # rather than last_build['target'].
    target_collection = "genedoc_{}_current".format(build_config)
    _db = get_target_db()
    target_collection = _db[target_collection]
    print("")
    # Mirrors the old ``print 'Source: ', name`` output (literal + separator).
    print("{} {}".format('Source: ', target_collection.name))

    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta

    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default 5

    print("ES host: {}".format(es_idxer.conn.servers[0].geturl()))
    print("ES index: {}".format(es_index_name))

    if noconfirm or ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
            if noconfirm or ask('Index "{}" exists. Delete?'.format(
                    es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            else:
                print("Abort.")
                return
        es_idxer.create_index()
        es_idxer.build_index(target_collection, verbose=False)
def build_index(self, use_parallel=True):
    """Index this builder's target collection into ``genedoc_<name>``.

    When ``get_target_collection`` returns something truthy, an ``ESIndexer``
    is set up from ``get_mapping`` and runs ``create_index``,
    ``delete_index_type`` (with ``noconfirm=True``), ``build_index`` and
    ``optimize``.  Otherwise a message is logged and nothing else happens.

    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    """
    source = self.get_target_collection()
    if not source:
        logging.info("Error: target collection is not ready yet or failed to build.")
        return

    indexer = ESIndexer(mapping=self.get_mapping())
    indexer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
    indexer.step = 10000
    indexer.use_parallel = use_parallel

    indexer.create_index()
    # No prompt here: the index type is always deleted without confirmation.
    indexer.delete_index_type(indexer.ES_INDEX_TYPE, noconfirm=True)
    indexer.build_index(source, verbose=False)
    indexer.optimize()
def build_index(self, use_parallel=True):
    """Create and populate the ``genedoc_<name>`` ES index.

    The source is whatever ``get_target_collection`` returns.  If it is
    falsy, a message is logged and the method returns.  Otherwise the
    indexer creates the index, deletes the index type without confirmation,
    indexes the collection and optimizes.

    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    """
    collection = self.get_target_collection()
    if not collection:
        not_ready_msg = "Error: target collection is not ready yet or failed to build."
        logging.info(not_ready_msg)
        return

    es = ESIndexer(mapping=self.get_mapping())
    es.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
    es.step = 10000
    es.use_parallel = use_parallel

    es.create_index()
    index_type = es.ES_INDEX_TYPE
    es.delete_index_type(index_type, noconfirm=True)
    es.build_index(collection, verbose=False)
    es.optimize()
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None,
                 noconfirm=False):
    """Build the ES index from the last successfully merged mongodb build.

    :param build_config: build config to load; also determines the source
        collection ``genedoc_<build_config>_current``.
    :param last_build_idx: position of the build record to validate.
    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    :param es_host: optional alternative ES host (otherwise the indexer's
        default is used).
    :param es_index_name: optional alternative index name (otherwise the
        mongodb collection name is used).
    :param noconfirm: skip both confirmation prompts.
    :raises AssertionError: when the build record fails validation.
    """
    self.load_build_config(build_config)
    assert "build" in self._build_config, \
        "Abort. No such build records for config %s" % build_config
    build_rec = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(build_rec))

    assert build_rec['status'] == 'success', \
        "Abort. Last build did not success."
    assert build_rec['target_backend'] == "mongodb", \
        'Abort. Last build need to be built using "mongodb" backend.'
    assert build_rec.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = build_rec['stats']
    assert build_rec.get('target', None), \
        'Abort. Last build target_collection is not available.'

    # Source collection for indexing.
    # IMPORTANT: the collection named in build_rec['target'] has no
    # _timestamp field; only "genedoc_*_current" does.  With "timestamp"
    # enabled in the mappings, ES would not index build_rec['target']
    # correctly, so "genedoc_*_current" is used instead.
    source_name = "genedoc_{}_current".format(build_config)
    source = get_target_db()[source_name]
    logging.info("")
    logging.info('Source: %s' % source.name)

    mapping = self.get_mapping()
    meta = {}
    version = self.get_src_version()
    if version:
        meta['src_version'] = version
    if getattr(self, '_stats', None):
        meta['stats'] = self._stats
    if 'timestamp' in build_rec:
        meta['timestamp'] = build_rec['timestamp']
    if meta:
        mapping['_meta'] = meta

    index_name = es_index_name or source.name
    indexer = ESIndexer(mapping=mapping,
                        es_index_name=index_name,
                        es_host=es_host,
                        step=5000)
    if build_config == 'mygene_allspecies':
        indexer.number_of_shards = 10   # default 5
    indexer.check()

    if not noconfirm and ask("Continue to build ES index?") != 'Y':
        return

    indexer.use_parallel = use_parallel
    if indexer.exists_index(indexer.ES_INDEX_NAME):
        # An existing index is only dropped after confirmation (or noconfirm).
        if not noconfirm and ask('Index "{}" exists. Delete?'.format(indexer.ES_INDEX_NAME)) != 'Y':
            logging.info("Abort.")
            return
        indexer.conn.indices.delete(indexer.ES_INDEX_NAME)
    indexer.create_index()
    indexer.build_index(source, verbose=False)
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None,
                 noconfirm=False):
    """Index the last successful mongodb build of ``build_config`` into ES.

    :param build_config: build config to load; the source collection is
        ``genedoc_<build_config>_current``.
    :param last_build_idx: which entry of the config's ``build`` list to
        validate (default: the last one).
    :param use_parallel: value assigned to the indexer's ``use_parallel``.
    :param es_host: alternative ES host; ``None`` leaves the choice to
        ``ESIndexer``.
    :param es_index_name: alternative index name; defaults to the source
        collection's name.
    :param noconfirm: when true, no prompt is shown and an existing index is
        deleted without asking.
    :raises AssertionError: when the build record fails validation.
    """
    self.load_build_config(build_config)
    assert "build" in self._build_config, \
        "Abort. No such build records for config %s" % build_config
    record = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(record))

    assert record['status'] == 'success', \
        "Abort. Last build did not success."
    assert record['target_backend'] == "mongodb", \
        'Abort. Last build need to be built using "mongodb" backend.'
    assert record.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = record['stats']
    assert record.get('target', None), \
        'Abort. Last build target_collection is not available.'

    # record['target'] is deliberately not used as the source: that
    # collection has no _timestamp field, so with "timestamp" enabled in the
    # mappings ES would not index it correctly.  Only the
    # "genedoc_*_current" collection carries the field.
    current_name = "genedoc_{}_current".format(build_config)
    db = get_target_db()
    collection = db[current_name]
    logging.info("")
    logging.info('Source: %s' % collection.name)

    es_mapping = self.get_mapping()
    meta = {}
    truthy_only = (
        ('src_version', self.get_src_version()),
        ('stats', getattr(self, '_stats', None)),
    )
    for key, value in truthy_only:
        if value:
            meta[key] = value
    # The timestamp is copied whenever the key exists, whatever its value.
    if 'timestamp' in record:
        meta['timestamp'] = record['timestamp']
    if meta:
        es_mapping['_meta'] = meta

    chosen_index = es_index_name or collection.name
    es = ESIndexer(mapping=es_mapping, es_index_name=chosen_index,
                   es_host=es_host, step=5000)
    if build_config == 'mygene_allspecies':
        es.number_of_shards = 10   # default 5
    es.check()

    proceed = noconfirm or ask("Continue to build ES index?") == 'Y'
    if not proceed:
        return

    es.use_parallel = use_parallel
    if es.exists_index(es.ES_INDEX_NAME):
        drop_existing = noconfirm or ask(
            'Index "{}" exists. Delete?'.format(es.ES_INDEX_NAME)) == 'Y'
        if not drop_existing:
            logging.info("Abort.")
            return
        es.conn.indices.delete(es.ES_INDEX_NAME)
    es.create_index()
    es.build_index(collection, verbose=False)