def clone_index(createidx=False, test=True):
    """Copy all docs from the 'myvariant_all_1' index into 'myvariant_current_3'.

    With test=True (the default) this is a no-op safety latch; pass test=False
    to actually run. With createidx=True the destination index is created
    first (10 shards) and the 'variant' mapping applied.
    """
    if test:
        return

    from utils.es import ESIndexer
    from utils.common import iter_n

    dest_index = 'myvariant_current_3'
    batch_size = 10000

    if createidx:
        from mapping import get_mapping
        variant_mapping = get_mapping()
        settings = {'settings': {'number_of_shards': 10}}
        # ###
        # NOTE(review): "es" is not defined in this function -- presumably a
        # module-level client defined elsewhere in the file; confirm.
        es.indices.create(dest_index, body=settings)
        es.indices.put_mapping(index=dest_index, doc_type='variant',
                               body=variant_mapping)

    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index= new_idx, chunk_size=10000)
    feeder = ESIndexer().doc_feeder(index='myvariant_all_1',
                                    doc_type='variant', step=batch_size)
    for batch in iter_n(feeder, batch_size):
        do_index(batch, index_name=dest_index, doc_type='variant',
                 step=batch_size, verbose=False, update=True)
def test():
    """Return a (mongodb_backend, es_backend) pair for a fixed test collection.

    The mongodb side wraps one hard-coded snapshot collection; the ES side
    wraps a default ESIndexer.
    """
    target_db = get_target_db()
    mongo_side = backend.GeneDocMongoDBBackend(
        target_db['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    es_side = backend.GeneDocESBackend(ESIndexer())
    return mongo_side, es_side
def build_index(config, use_parallel=True, noconfirm=False):
    """Build the ES index 'genedoc_<config name>' from the picked target
    mongodb collection for the given build config.

    Prompts for confirmation (skipped when noconfirm=True) after printing
    the ES target location.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    src_collection = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    # Guard: nothing to index if the target collection isn't ready.
    if not src_collection:
        print("Error: target collection is not ready yet or failed to build.")
        return

    idxer = ESIndexer(mapping=builder.get_mapping())
    idxer.ES_INDEX_NAME = index_name
    idxer.step = 10000
    idxer.use_parallel = use_parallel
    server_url = idxer.conn.servers[0].geturl()
    print("ES target: {}/{}/{}".format(server_url,
                                       idxer.ES_INDEX_NAME,
                                       idxer.ES_INDEX_TYPE))

    if not (noconfirm or ask("Continue?") == 'Y'):
        print("Aborted.")
        return

    idxer.create_index()
    idxer.delete_index_type(idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
    idxer.build_index(src_collection, verbose=False)
    idxer.optimize()
def __init__(self, build_config=None, backend='mongodb'):
    """Initialize the builder with a source db connection and a target
    backend ('mongodb', 'es', 'couchdb' or 'memory').

    :param build_config: optional build-config dict, stored as-is.
    :param backend: target backend type; raises ValueError if unknown.
    """
    self.src = get_src_db()
    self.step = 10000
    self.use_parallel = False
    self.merge_logging = True        # when True, merge() output goes to a log file
    self.max_build_status = 10       # cap on records kept in src_build's "build" field
    self.using_ipython_cluster = False
    self.shutdown_ipengines_after_done = False
    self.log_folder = LOG_FOLDER

    self._build_config = build_config
    self._entrez_geneid_d = None
    self._idmapping_d_cache = {}

    self.get_src_master()

    # Select the target backend. couchdb imports are deferred so the
    # dependency is only needed when that backend is requested.
    if backend == 'mongodb':
        target = databuild.backend.GeneDocMongoDBBackend()
    elif backend == 'es':
        target = databuild.backend.GeneDocESBackend(ESIndexer())
    elif backend == 'couchdb':
        from config import COUCHDB_URL
        import couchdb
        target = databuild.backend.GeneDocCouchDBBackend(
            couchdb.Server(COUCHDB_URL))
    elif backend == 'memory':
        # NOTE(review): "Memeory" looks like a typo, but it must match the
        # real class name in databuild.backend -- verify before renaming.
        target = databuild.backend.GeneDocMemeoryBackend()
    else:
        raise ValueError('Invalid backend "%s".' % backend)
    self.target = target
def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
    """Set up ES client, indexer and source-db handles for syncing.

    index / doc_type fall back to config.ES_INDEX_NAME / config.ES_DOC_TYPE
    when not given.
    """
    self._es = get_es(es_host)
    self._index = index if index else config.ES_INDEX_NAME
    self._doc_type = doc_type if doc_type else config.ES_DOC_TYPE
    self._esi = ESIndexer(es_host=es_host)
    # keep the helper indexer pointed at the same index
    self._esi._index = self._index
    self._src = get_src_db()
    self.step = step
def get_backend(target_name, bk_type, **kwargs):
    '''Return a backend instance for given target_name and backend type.

    Currently supports MongoDB and ES backends.

    :param target_name: collection name (mongodb) or index name (es).
    :param bk_type: either 'mongodb' or 'es'.
    :param kwargs: extra keyword arguments forwarded to ESIndexer.
    :raises ValueError: for an unsupported bk_type (previously the function
        fell through and silently returned None, hiding caller typos).
    '''
    if bk_type == 'mongodb':
        return GeneDocMongoDBBackend(target_name)
    elif bk_type == 'es':
        esi = ESIndexer(target_name, **kwargs)
        return GeneDocESBackend(esi)
    else:
        # consistent with DataBuilder.__init__'s error style
        raise ValueError('Invalid backend "%s".' % bk_type)
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two genedoc sources (mongodb collections and/or
    ES indices) and diff them.

    Returns the changes dict from diff.diff_collections, or None when the
    user aborts at the confirmation prompt.
    """
    src_li = []
    target_db = get_target_db()
    # candidates: every "genedoc*" collection in the target mongodb...
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])
    # ...plus every "genedoc*" ES index
    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    # FIX: was a bare "print" (a no-op expression statement in Python 3);
    # print() emits the intended blank line between the two prompts.
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        # src is (name, count, type); type is always 'mongodb' or 'es' here
        # since src_li was built only from those two kinds.
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li

    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target,
                                        use_parallel=use_parallel)
        return changes
def sync_index(self, use_parallel=True):
    """Diff the current target collection against its ES index and return
    the changes dict."""
    from utils import diff
    source = self.get_target_collection()
    # NOTE(review): mapping is passed positionally here while other call
    # sites use mapping= as a keyword -- confirm the first positional
    # parameter of this ESIndexer really is the mapping.
    idxer = ESIndexer(self.get_mapping())
    idxer.ES_INDEX_NAME = source.target_collection.name
    idxer.step = 10000
    idxer.use_parallel = use_parallel
    es_target = databuild.backend.GeneDocESBackend(idxer)
    return diff.diff_collections(source, es_target)
def build_index(self, use_parallel=True):
    """Create the 'genedoc_<config name>' ES index and populate it from
    the current target collection, then optimize it."""
    collection = self.get_target_collection()
    if not collection:
        logging.info(
            "Error: target collection is not ready yet or failed to build."
        )
        return

    idxer = ESIndexer(mapping=self.get_mapping())
    idxer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
    idxer.step = 10000
    idxer.use_parallel = use_parallel
    idxer.create_index()
    # clear any pre-existing docs of this type before (re)building
    idxer.delete_index_type(idxer.ES_INDEX_TYPE, noconfirm=True)
    idxer.build_index(collection, verbose=False)
    idxer.optimize()
def validate(build_config=None):
    """Diff the last build's mongodb target collection against its ES index
    (reached through a local tunnel) and return the changes dict."""
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    cfg = get_src_build().find_one({'_id': build_config})
    last_build = cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)

    target_name = last_build['target']
    mongo_backend = GeneDocMongoDBBackend(get_target_db()[target_name])
    # ES is reached via a local SSH tunnel on es_local_tunnel_port
    es_backend = GeneDocESBackend(
        ESIndexer(es_index_name=target_name,
                  es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    return diff_collections(mongo_backend, es_backend,
                            use_parallel=True, step=10000)
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff the picked target collection against the 'genedoc_<config name>'
    ES index; prompts for confirmation unless noconfirm=True.

    Returns the changes dict, or None when the user aborts.
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    collection = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    mongo_side = backend.GeneDocMongoDBBackend(collection)
    # NOTE(review): mapping passed positionally; other call sites use
    # mapping= as a keyword -- confirm ESIndexer's first positional param.
    idxer = ESIndexer(builder.get_mapping())
    idxer.ES_INDEX_NAME = index_name
    idxer.step = 10000
    idxer.use_parallel = use_parallel
    es_side = backend.GeneDocESBackend(idxer)

    print('\tsync_src:\t{:<40}{}\t{}'.format(collection.name,
                                             mongo_side.name,
                                             mongo_side.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(index_name,
                                               es_side.name,
                                               es_side.count()))
    if noconfirm or ask("Continue?") == "Y":
        return diff.diff_collections(mongo_side, es_side)
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None,
                 noconfirm=False):
    """Build ES index from last successfully-merged mongodb collection.

    optional "es_host" argument can be used to specified another ES host,
    otherwise default ES_HOST.
    optional "es_index_name" argument can be used to pass an alternative
    index name, otherwise same as mongodb collection name
    """
    # Validate the last build record before doing anything expensive.
    self.load_build_config(build_config)
    assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
    last_build = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    assert last_build['status'] == 'success', \
        "Abort. Last build did not success."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build need to be built using "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'
    # Get the source collection to build the ES index
    # IMPORTANT: the collection in last_build['target'] does not contain
    # _timestamp field, only the "genedoc_*_current" collection does. When
    # "timestamp" is enabled in mappings, last_build['target'] collection
    # won't be indexed by ES correctly, therefore, we use the
    # "genedoc_*_current" collection as the source here:
    #target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)
    _db = get_target_db()
    target_collection = _db[target_collection]
    logging.info("")
    logging.info('Source: %s' % target_collection.name)
    # Attach build metadata (source versions, stats, timestamp) to the
    # mapping under "_meta" so it is stored alongside the index.
    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta
    # Index name defaults to the source collection's name.
    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default 5
    es_idxer.check()
    if noconfirm or ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        # If the index already exists it must be deleted first (with
        # confirmation) before rebuilding from scratch.
        if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
            if noconfirm or ask('Index "{}" exists. Delete?'.format(
                    es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
            else:
                logging.info("Abort.")
                return
        es_idxer.create_index()
        #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)