def update_metadata(self, indexer_env, index_name, build_name=None, _meta=None):
    """
    Update the "_meta" key stored in the ES mapping of index "index_name".

    The metadata comes either from the src_build document named "build_name"
    (its "_meta" field), or directly from the "_meta" argument.
    "indexer_env" selects the indexer environment (used to get the ES host).
    """
    env_conf = self[indexer_env]
    host = env_conf["es_host"]
    # 1st pass: instantiate a backend only to discover the doc_type
    # (we don't want to require it on the method signature)
    probe = create_backend((host, index_name, None)).target_esidxer
    mapping = probe._es.indices.get_mapping(index_name)
    found_types = list(mapping[index_name]["mappings"].keys())
    assert len(found_types) == 1, "Found more than one doc_type: " + \
        "%s" % mapping[index_name]["mappings"].keys()
    # 2nd pass: re-create the indexer, this time with the proper doc_type
    indexer = create_backend((host, index_name, found_types[0])).target_esidxer
    if build_name:
        build = get_src_build().find_one({"_id": build_name})
        assert build, "No such build named '%s'" % build_name
        _meta = build.get("_meta")
    assert _meta is not None, "No _meta found"
    return indexer.update_mapping_meta({"_meta": _meta})
def merge_index_worker(col_name, ids, pindexer, batch_num):
    """
    Index worker merging documents from collection "col_name" into an
    existing ES index: documents whose _id already exists in the index are
    updated in-place with the collection's content, the others are indexed
    as new documents.

    "ids" is the batch of _ids to process, "pindexer" a zero-argument
    callable returning an indexer instance, "batch_num" the batch number
    (unused here, kept for worker signature consistency).
    Returns a (count, errors-list) tuple, like index_bulk() does.
    """
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    cur = doc_feeder(col, step=len(ids), inbatch=False,
                     query={'_id': {'$in': ids}})
    docs = list(cur)
    # drop the transient _timestamp field so it's not indexed
    for d in docs:
        d.pop("_timestamp", None)
    dids = {d["_id"]: d for d in docs}
    # documents already present in the index, among this batch
    dexistings = {d["_id"]: d for d in idxer.get_docs(list(dids))}
    for _id, d in dexistings.items():
        # merge collection data into the existing indexed document (in-place)
        d.update(dids[_id])
        # mark as processed/updated so only brand new docs remain in dids
        dids.pop(_id)
    # updated docs (those existing in col *and* index)
    upd_cnt = idxer.index_bulk(dexistings.values(), len(dexistings))
    logging.debug("%s documents updated in index" % repr(upd_cnt))
    # new docs (only in col, *not* in index)
    new_cnt = idxer.index_bulk(dids.values(), len(dids))
    logging.debug("%s new documents in index" % repr(new_cnt))
    # need to return one: tuple(cnt,list)
    return (upd_cnt[0] + new_cnt[0], upd_cnt[1] + new_cnt[1])
def new_index_worker(col_name, ids, pindexer, batch_num):
    """
    Index worker creating brand new documents: fetch the documents matching
    "ids" from collection "col_name" and bulk-index them all.
    "batch_num" is unused, kept for worker signature consistency.
    """
    collection = create_backend(col_name).target_collection
    indexer = pindexer()
    docs = doc_feeder(collection, step=len(ids), inbatch=False,
                      query={'_id': {'$in': ids}})
    return indexer.index_bulk(docs)
def inspect_data(backend_provider, ids, mode, pre_mapping, **kwargs):
    """
    Fetch documents matching "ids" from the backend described by
    "backend_provider" and run the biothings inspector on them.
    Metadata computation is skipped here (done once all batches are merged).
    """
    collection = create_backend(backend_provider).target_collection
    docs = doc_feeder(collection, step=len(ids), inbatch=False,
                      query={'_id': {'$in': ids}})
    return btinspect.inspect_docs(docs, mode=mode, pre_mapping=pre_mapping,
                                  metadata=False, **kwargs)
def post_sync_cols(self, diff_folder, batch_size, mode, force, target_backend, steps):
    """
    Post-synchronization hook: once the ES index has been fully updated,
    recompute the 'stats' metadata by querying the index directly.

    Only the ES target backend is supported, and metadata must have been
    loaded beforehand (self._meta). The other parameters are part of the
    post-sync hook signature and are not used by this implementation.
    """
    assert self.target_backend_type == "es", "Only support ElasticSearch backend (got: %s)" % self.target_backend_type
    # idiomatic identity test (was: "not self._meta is None")
    assert self._meta is not None, "Metadata not loaded (use load_metadata(diff_folder))"
    self.logger.info(
        "Sleeping for a bit while index is being fully updated...")
    time.sleep(3 * 60)
    backend_info = self.get_target_backend()
    self.logger.info("Updating 'stats' by querying index '%s'" %
                     backend_info[1])
    indexer = create_backend(backend_info).target_esidxer
    # compute stats using ES index
    assembly = self._meta["build_config"]["assembly"]
    return update_stats(indexer, assembly)
def docfetcher(backend_url, _id=None, count=False, q=None, limit=100):
    """
    Returns documents by _id or using query "q" (default: empty query).
    Backend follows biothings.hub.databuild.backend.create_backend() format,
    see this function documentation for more.
    If "count" is True, return only the number of documents.
    "limit" caps the number of documents returned when querying.
    """
    # fix: default must not be a mutable {} (would be shared across calls)
    if q is None:
        q = {}
    backend = create_backend(backend_url)
    if count:
        return backend.count()
    elif _id:
        return backend.get_from_id(_id)
    else:
        res = backend.query(q)
        if isinstance(res, types.GeneratorType):
            # ES backend, need fetch then limit...
            return [doc for doc in res][:limit]
        else:
            # mongo
            return [doc for doc in res.limit(limit)]
def do():
    """
    Inner inspection coroutine (generator-based asyncio): register the
    "inspecting" status, inspect documents batch by batch in process
    workers, merge per-batch results, compute metadata/mapping and register
    the final result (success or failed) on the source or build document.
    """
    # let the event loop breathe before starting
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differ slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info(
        "Running inspector on %s (type:%s,data_provider:%s)" %
        (repr(data_provider), data_provider_type, backend_provider))
    if sample is not None:
        self.logger.info(
            "Sample set to %s, inspect only a subset of data", sample)
    if limit is None:
        self.logger.info("Inspecting all the documents")
    else:
        nonlocal batch_size
        # adjust batch_size so we inspect only "limit" docs if batch is smaller than the limit
        if batch_size > limit:
            batch_size = limit
        self.logger.info("Inspecting only %s documents", limit)
    # make it pickleable
    if data_provider_type == "source":
        # because register_obj is also used to fetch data, it has to be unprepare() for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'd lose the target name
        # (it'd be randomly generated again) and we won't be able to register results
        pass
    cnt = 0       # number of batches actually inspected
    doccnt = 0    # approximate number of documents seen (batch-granular)
    jobs = []
    # normalize mode param and prepare global results
    if type(mode) == str:
        mode = [mode]
    converters, mode = btinspect.get_converters(mode)
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        if sample is not None:
            # sampling: randomly skip whole batches
            if random.random() > sample:
                continue
        cnt += 1
        doccnt += batch_size
        if limit and doccnt > limit:
            break
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            # callback merging one worker's result into the global "inspected"
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(
                        inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" %
                    (bnum, e))
                raise

        pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)
    yield from asyncio.gather(*jobs)
    # compute metadata (they were skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(
                    inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(
                    inspected, m)
            except es.MappingError as e:
                # keep the intermediate mapping and the errors so they can
                # be registered (and reported) below
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)
    # just potential converters
    btinspect.run_converters(inspected, converters)

    def fully_inspected(res):
        # register the final inspection result (or failure) on the
        # source/build document
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)

            # when inspecting with "stats" mode, we can get huge number but mongo
            # can't store more than 2^64, make sure to get rid of big nums there
            def clean_big_nums(k, v):
                # TODO: same with float/double? seems mongo handles more there ?
                if isinstance(v, int) and v > 2**64:
                    return k, math.nan
                else:
                    return k, v

            dict_traverse(_map, clean_big_nums)
            # register end of inspection (differ slightly depending on type);
            # mapping generation errors produce a "failed" status
            if "mapping" in mode and "errors" in res[
                    "mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               inspect=_map)
                got_error = InspectorError(
                    res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status(
                        "success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status(
                        "success",
                        job={"step": "inspect"},
                        build={"inspect": _map})
        except Exception as e:
            self.logger.exception(
                "Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status(
                    "failed", job={"err": repr(e)})

    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
def do():
    """
    Inner inspection coroutine (generator-based asyncio): register the
    "inspecting" status, inspect all documents batch by batch in process
    workers, merge per-batch results, compute metadata/mapping and register
    the final result (success or failed) on the source or build document.
    Unlike the other variant, this one has no sampling/limit/converters.
    """
    # let the event loop breathe before starting
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differ slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
            (repr(data_provider),data_provider_type,backend_provider))
    # make it pickleable
    if data_provider_type == "source":
        # because register_obj is also used to fetch data, it has to be unprepare() for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'd lose the target name
        # (it'd be randomly generated again) and we won't be able to register results
        pass
    cnt = 0   # number of batches processed
    jobs = []
    # normalize mode param and prepare global results
    if type(mode) == str:
        mode = [mode]
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        cnt += 1
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            # callback merging one worker's result into the global "inspected"
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(
                        inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" %
                    (bnum, e))
                raise

        pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)
    yield from asyncio.gather(*jobs)
    # compute metadata (they were skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(
                    inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(
                    inspected, m)
            except es.MappingError as e:
                # keep the intermediate mapping and the errors so they can
                # be registered (and reported) below
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)

    def fully_inspected(res):
        # register the final inspection result (or failure) on the
        # source/build document
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)
            # register end of inspection (differ slightly depending on type);
            # mapping generation errors produce a "failed" status
            if "mapping" in mode and "errors" in res[
                    "mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               inspect=_map)
                got_error = InspectorError(
                    res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status(
                        "success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status(
                        "success",
                        job={"step": "inspect"},
                        build={"inspect": _map})
        except Exception as e:
            self.logger.exception(
                "Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed",
                                               subkey="inspect",
                                               err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status(
                    "failed", job={"err": repr(e)})

    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
def index(self, target_name, index_name, job_manager, steps=["index", "post"], batch_size=10000, ids=None, mode="index", worker=None):
    # NOTE(review): "steps" uses a mutable default list; it is only rebound
    # and read (never mutated) in this method, so it's benign, but fixing it
    # to steps=None + a default inside would be safer.
    """
    Build an index named "index_name" with data from collection
    "target_collection".

    "ids" can be passed to selectively index documents.

    "mode" can have the following values:
    - 'purge': will delete index if it exists
    - 'resume': will use existing index and add documents. "ids" can be
      passed as a list of missing IDs, or, if not pass, ES will be queried
      to identify which IDs are missing for each batch in order to complete
      the index.
    - 'merge': will merge data with existing index' documents, used when
      populated several distinct times (cold/hot merge for instance)
    - None (default): will create a new index, assuming it doesn't already
      exist

    This is a generator-based asyncio coroutine; batches of _ids are sent
    to process workers, then a post-index step runs in a thread.
    """
    assert job_manager
    # check what to do
    if type(steps) == str:
        steps = [steps]
    self.target_name = target_name
    self.index_name = index_name
    self.load_build()
    self.setup_log()
    # select proper index worker according to mode:
    if worker is None:  # none specified, choose correct one
        if mode == "merge":
            worker = merge_index_worker
        else:
            worker = new_index_worker
    got_error = False
    cnt = 0  # total number of _ids submitted for indexing
    if "index" in steps:
        self.register_status("indexing",
                             transient=True,
                             init=True,
                             job={"step": "index"})
        assert self.build_doc.get("backend_url")
        target_collection = create_backend(
            self.build_doc["backend_url"]).target_collection
        backend_url = self.build_doc["backend_url"]
        _mapping = self.get_mapping()
        _extra = self.get_index_creation_settings()
        # partially instantiated indexer instance for process workers
        partial_idxer = partial(ESIndexer,
                                doc_type=self.doc_type,
                                index=index_name,
                                es_host=self.host,
                                step=batch_size,
                                number_of_shards=self.num_shards,
                                number_of_replicas=self.num_replicas,
                                **self.kwargs)
        # instantiate one here for index creation
        es_idxer = partial_idxer()
        if es_idxer.exists_index():
            if mode == "purge":
                es_idxer.delete_index()
            elif mode not in ["resume", "merge"]:
                msg = "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)" % index_name
                self.register_status("failed", job={"err": msg})
                raise IndexerException(msg)
        if mode not in ["resume", "merge"]:
            try:
                es_idxer.create_index({self.doc_type: _mapping}, _extra)
            except Exception as e:
                self.logger.exception("Failed to create index")
                self.register_status("failed", job={"err": repr(e)})
                raise

        def clean_ids(ids):
            # filter out _ids ES can't index; can't use a generator,
            # it's going to be pickled
            cleaned = []
            for _id in ids:
                if type(_id) != str:
                    self.logger.warning(
                        "_id '%s' has invalid type (!str), skipped",
                        repr(_id))
                    continue
                if len(_id) > 512:  # this is an ES6 limitation
                    self.logger.warning("_id is too long: '%s'", _id)
                    continue
                cleaned.append(_id)
            return cleaned

        jobs = []
        total = target_collection.count()
        btotal = math.ceil(total / batch_size)
        bnum = 1  # current batch number (1-based)
        if ids:
            self.logger.info(
                "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d",
                target_name, batch_size)
            id_provider = iter_n(ids, batch_size)
        else:
            self.logger.info(
                "Fetch _ids from '%s', and create indexer job with batch_size=%d",
                target_name, batch_size)
            id_provider = id_feeder(target_collection,
                                    batch_size=batch_size,
                                    logger=self.logger)
        for ids in id_provider:
            # yield control to the event loop between batch submissions
            yield from asyncio.sleep(0.0)
            origcnt = len(ids)
            ids = clean_ids(ids)
            newcnt = len(ids)
            if origcnt != newcnt:
                self.logger.warning(
                    "%d document(s) can't be indexed and will be skipped (invalid _id)",
                    origcnt - newcnt)
            # progress count
            cnt += len(ids)
            pinfo = self.get_pinfo()
            pinfo["step"] = self.target_name
            try:
                descprogress = cnt / total * 100
            except ZeroDivisionError:
                # empty collection: avoid crashing on progress computation
                descprogress = 0.0
            pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                        descprogress)
            self.logger.info(
                "Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)",
                bnum, btotal, backend_url, cnt, total, descprogress)
            job = yield from job_manager.defer_to_process(
                pinfo,
                partial(indexer_worker, backend_url, ids, partial_idxer,
                        bnum, mode, worker))

            def batch_indexed(f, batch_num):
                # callback checking one worker's result; workers must return
                # a (count, ...) tuple, anything else is treated as an error
                nonlocal got_error
                try:
                    res = f.result()
                    if type(res) != tuple or type(res[0]) != int:
                        got_error = Exception(
                            "Batch #%s failed while indexing collection '%s' [result:%s]"
                            % (batch_num, self.target_name, repr(res)))
                except Exception as e:
                    got_error = e
                    self.logger.exception("Batch indexed error")
                    return

            job.add_done_callback(partial(batch_indexed, batch_num=bnum))
            jobs.append(job)
            bnum += 1
            # raise error as soon as we know
            if got_error:
                self.register_status("failed",
                                     job={"err": repr(got_error)})
                raise got_error
        self.logger.info("%d jobs created for indexing step", len(jobs))
        tasks = asyncio.gather(*jobs)

        def done(f):
            # callback run once all indexing jobs are done: check counts and
            # register the final indexing status
            nonlocal got_error
            if None in f.result():
                # NOTE(review): a None result clears got_error and skips
                # count verification/registration entirely — confirm this
                # "silent skip" is intended
                got_error = None
                return
            # compute overall inserted/updated records
            # returned values looks like [(num,[]),(num,[]),...]
            cnt = sum((val[0] for val in f.result()))
            self.register_status("success",
                                 job={"step": "index"},
                                 index={"count": cnt})
            if total != cnt:
                # raise error if counts don't match, but index is still created,
                # fully registered in case we want to use it anyways
                err = "Merged collection has %d documents but %d have been indexed (check logs for more)" % (
                    total, cnt)
                raise IndexerException(err)
            self.logger.info(
                "Index '%s' successfully created using merged collection %s",
                index_name,
                target_name,
                extra={"notify": True})

        tasks.add_done_callback(done)
        yield from tasks
    if "post" in steps:
        self.logger.info("Running post-index process for index '%s'",
                         index_name)
        self.register_status("indexing",
                             transient=True,
                             init=True,
                             job={"step": "post-index"})
        pinfo = self.get_pinfo()
        pinfo["step"] = "post_index"
        # for some reason (like maintaining object's state between pickling)
        # we can't use process there. Need to use thread to maintain that
        # state without building an unmaintainable monster
        job = yield from job_manager.defer_to_thread(
            pinfo,
            partial(self.post_index,
                    target_name,
                    index_name,
                    job_manager,
                    steps=steps,
                    batch_size=batch_size,
                    ids=ids,
                    mode=mode))

        def posted(f):
            # callback run once the post-index step is done: log/register
            # the outcome
            nonlocal got_error
            try:
                res = f.result()
                self.logger.info(
                    "Post-index process done for index '%s': %s",
                    index_name, res)
                self.register_status("indexing",
                                     job={"step": "post-index"})
            except Exception as e:
                got_error = e
                self.logger.exception(
                    "Post-index process failed for index '%s':",
                    index_name,
                    extra={"notify": True})
                return

        job.add_done_callback(posted)
        yield from job  # consume future
    if got_error:
        self.register_status("failed", job={"err": repr(got_error)})
        raise got_error
    else:
        self.register_status("success")
    return {"%s" % self.index_name: cnt}