Example #1
    def update_metadata(self,
                        indexer_env,
                        index_name,
                        build_name=None,
                        _meta=None):
        """
        Update _meta for index_name, either from build_name (_meta is then taken
        directly from the src_build document) or from an explicitly passed _meta.
        """
        idxkwargs = self[indexer_env]
        # 1st pass: get the doc_type (we don't want to ask for it in the signature...)
        indexer = create_backend(
            (idxkwargs["es_host"], index_name, None)).target_esidxer
        m = indexer._es.indices.get_mapping(index_name)
        assert len(m[index_name]["mappings"]) == 1, "Found more than one doc_type: " + \
            "%s" % m[index_name]["mappings"].keys()
        doc_type = list(m[index_name]["mappings"].keys())[0]
        # 2nd pass: re-create the correct indexer
        indexer = create_backend(
            (idxkwargs["es_host"], index_name, doc_type)).target_esidxer
        if build_name:
            build = get_src_build().find_one({"_id": build_name})
            assert build, "No such build named '%s'" % build_name
            _meta = build.get("_meta")
        assert _meta is not None, "No _meta found"
        return indexer.update_mapping_meta({"_meta": _meta})
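A quick usage sketch; the manager instance, environment and index names below are hypothetical, not taken from the example above:

# "idx_manager" is assumed to be an instance of the class defining update_metadata(),
# and "prod" a configured indexer environment known to that manager
idx_manager.update_metadata("prod",
                            "mygene_20230101",
                            build_name="mygene_20230101_abcdef")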
Example #2
def merge_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    upd_cnt = 0
    new_cnt = 0
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    docs = list(cur)
    # strip _timestamp from each doc before indexing
    for d in docs:
        d.pop("_timestamp", None)
    dids = {d["_id"]: d for d in docs}
    dexistings = {d["_id"]: d for d in idxer.get_docs(list(dids.keys()))}
    for _id in dexistings:
        d = dexistings[_id]
        # update in-place
        d.update(dids[_id])
        # mark as processed/updated
        dids.pop(_id)
    # updated docs (those existing in col *and* index)
    upd_cnt = idxer.index_bulk(dexistings.values(), len(dexistings))
    logging.debug("%s documents updated in index" % repr(upd_cnt))
    # new docs (only in col, *not* in index)
    new_cnt = idxer.index_bulk(dids.values(), len(dids))
    logging.debug("%s new documents in index" % repr(new_cnt))
    # return a single (count, list) tuple combining both results
    ret = (upd_cnt[0] + new_cnt[0], upd_cnt[1] + new_cnt[1])
    return ret
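A minimal sketch of how such a worker might be wired up; the host, index, collection name and _ids are assumptions for illustration, and in the hub the call is normally dispatched through the job manager as shown in Example #9:

from functools import partial

# pindexer must be a zero-argument callable returning a ready-to-use indexer;
# a partial over ESIndexer (as done in Example #9) fits that contract
pindexer = partial(ESIndexer,
                   index="mygene_hot",          # hypothetical index name
                   doc_type="gene",             # hypothetical doc type
                   es_host="localhost:9200",    # hypothetical ES host
                   step=10000)
ids_batch = ["1017", "1018", "1019"]            # hypothetical _ids for one batch
# process one batch of _ids coming from the merged collection
cnt, errs = merge_index_worker("mygene_20230101_abcdef", ids_batch, pindexer, 1)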
Example #3
def new_index_worker(col_name, ids, pindexer, batch_num):
    col = create_backend(col_name).target_collection
    idxer = pindexer()
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    cnt = idxer.index_bulk(cur)
    return cnt
def inspect_data(backend_provider, ids, mode, pre_mapping, **kwargs):
    col = create_backend(backend_provider).target_collection
    cur = doc_feeder(col,
                     step=len(ids),
                     inbatch=False,
                     query={'_id': {
                         '$in': ids
                     }})
    return btinspect.inspect_docs(cur,
                                  mode=mode,
                                  pre_mapping=pre_mapping,
                                  metadata=False,
                                  **kwargs)
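For illustration, a hedged sketch of how inspect_data() gets called from the inspector coroutine shown in Example #7, one batch at a time; the collection name, _ids and mode values are assumptions:

ids_batch = ["1017", "1018", "1019"]            # hypothetical _ids for one batch
res = inspect_data("mygene_20230101_abcdef",    # hypothetical merged collection
                   ids_batch,
                   mode=["type", "mapping"],    # modes are illustrative
                   pre_mapping=True)            # keep an intermediate mapping to merge later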
Example #5
    def post_sync_cols(self, diff_folder, batch_size, mode, force,
                       target_backend, steps):
        assert self.target_backend_type == "es", \
            "Only ElasticSearch backend is supported (got: %s)" % self.target_backend_type
        assert self._meta is not None, "Metadata not loaded (use load_metadata(diff_folder))"
        self.logger.info(
            "Sleeping for a bit while index is being fully updated...")
        time.sleep(3 * 60)
        backend_info = self.get_target_backend()
        self.logger.info("Updating 'stats' by querying index '%s'" %
                         backend_info[1])
        indexer = create_backend(backend_info).target_esidxer
        # compute stats using the ES index
        assembly = self._meta["build_config"]["assembly"]
        return update_stats(indexer, assembly)
Example #6
def docfetcher(backend_url, _id=None, count=False, q={}, limit=100):
    """
    Return documents by _id or matching query q. The backend specifier follows
    the biothings.hub.databuild.backend.create_backend() format; see that
    function's documentation for details.
    """
    backend = create_backend(backend_url)
    if count:
        return backend.count()
    elif _id:
        return backend.get_from_id(_id)
    else:
        q = backend.query(q)
        if isinstance(q, types.GeneratorType):
            # ES backend, need fetch then limit...
            return [doc for doc in q][:limit]
        else:
            # mongo
            return [doc for doc in q.limit(limit)]
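A few hedged call sketches; the backend specifier and query below are illustrative, and create_backend() documents the exact formats it accepts:

docfetcher("mygene_20230101_abcdef", count=True)                    # total document count
docfetcher("mygene_20230101_abcdef", _id="1017")                    # fetch one document by _id
docfetcher("mygene_20230101_abcdef", q={"taxid": 9606}, limit=10)   # query, capped at 10 documents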
Example #7
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register the beginning of the inspection (differs slightly depending on the type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info(
                    "Running inspector on %s (type:%s,data_provider:%s)" %
                    (repr(data_provider), data_provider_type,
                     backend_provider))
                if sample is not None:
                    self.logger.info(
                        "Sample set to %s, inspect only a subset of data",
                        sample)
                if limit is None:
                    self.logger.info("Inspecting all the documents")
                else:
                    nonlocal batch_size
                    # cap batch_size at limit so we don't inspect more docs than requested
                    if batch_size > limit:
                        batch_size = limit
                    self.logger.info("Inspecting only %s documents", limit)
                # make it pickleable
                if data_provider_type == "source":
                    # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, or we'd lose the target name
                    # (it would be randomly generated again) and we couldn't register results
                    pass

                cnt = 0
                doccnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]

                converters, mode = btinspect.get_converters(mode)

                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    if sample is not None:
                        if random.random() > sample:
                            continue
                    cnt += 1
                    doccnt += batch_size
                    if limit and doccnt > limit:
                        break
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                # just potential converters
                btinspect.run_converters(inspected, converters)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)

                        # when inspecting with "stats" mode, we can get huge number but mongo
                        # can't store more than 2^64, make sure to get rid of big nums there
                        def clean_big_nums(k, v):
                            # TODO: same with float/double? seems mongo handles more there ?
                            if isinstance(v, int) and v > 2**64:
                                return k, math.nan
                            else:
                                return k, v

                        dict_traverse(_map, clean_big_nums)
                        # register the inspection result (differs slightly depending on the type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register the beginning of the inspection (differs slightly depending on the type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
                        (repr(data_provider),data_provider_type,backend_provider))
                # make it pickleable
                if data_provider_type == "source":
                    # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, or we'd lose the target name
                    # (it would be randomly generated again) and we couldn't register results
                    pass

                cnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]
                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    cnt += 1
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)
                        # register the inspection result (differs slightly depending on the type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
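The two do() coroutines above follow the same recurring pattern: feed batches of _ids, defer each batch to a separate process, merge each batch's result in a done callback, and only raise any captured error once all batches have been gathered. Below is a minimal, self-contained sketch of that pattern; the toy inspect_batch() function and all names are illustrative, not part of the hub API:

import asyncio
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def inspect_batch(ids):
    # stand-in for inspect_data(): count ids per first letter
    res = {}
    for _id in ids:
        res[_id[0]] = res.get(_id[0], 0) + 1
    return res

async def run_inspection(all_ids, batch_size=2):
    loop = asyncio.get_running_loop()
    inspected = {}
    got_error = None
    jobs = []

    def batch_inspected(bnum, fut):
        # merge one batch's result into the global one, keep the first error aside
        nonlocal got_error
        try:
            for k, v in fut.result().items():
                inspected[k] = inspected.get(k, 0) + v
        except Exception as e:
            got_error = got_error or e

    with ProcessPoolExecutor() as executor:
        for bnum, start in enumerate(range(0, len(all_ids), batch_size)):
            # defer one batch to a worker process
            job = loop.run_in_executor(executor, inspect_batch,
                                       all_ids[start:start + batch_size])
            job.add_done_callback(partial(batch_inspected, bnum))
            jobs.append(job)
        await asyncio.gather(*jobs, return_exceptions=True)
    if got_error:
        raise got_error
    return inspected

# e.g. asyncio.run(run_inspection(["acta1", "actb", "brca1", "brca2", "cdk2"]))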
Example #9
    def index(self,
              target_name,
              index_name,
              job_manager,
              steps=["index", "post"],
              batch_size=10000,
              ids=None,
              mode="index",
              worker=None):
        """
        Build an index named "index_name" with data from collection "target_collection".

        "ids" can be passed to selectively index documents.

        "mode" can have the following values:
            - 'purge': will delete index if it exists
            - 'resume': will use existing index and add documents. "ids" can be passed as a list of missing IDs,
                    or, if not pass, ES will be queried to identify which IDs are missing for each batch in
                    order to complete the index.
            - 'merge': will merge data with existing index' documents, used when populated several distinct times (cold/hot merge for instance)
            - None (default): will create a new index, assuming it doesn't already exist
        """
        assert job_manager
        # check what to do
        if type(steps) == str:
            steps = [steps]
        self.target_name = target_name
        self.index_name = index_name
        self.load_build()
        self.setup_log()
        # select proper index worker according to mode:
        if worker is None:  # none specified, choose correct one
            if mode == "merge":
                worker = merge_index_worker
            else:
                worker = new_index_worker

        got_error = False
        cnt = 0

        if "index" in steps:
            self.register_status("indexing",
                                 transient=True,
                                 init=True,
                                 job={"step": "index"})
            assert self.build_doc.get("backend_url")
            target_collection = create_backend(
                self.build_doc["backend_url"]).target_collection
            backend_url = self.build_doc["backend_url"]
            _mapping = self.get_mapping()
            _extra = self.get_index_creation_settings()
            # partially instantiated indexer instance for process workers
            partial_idxer = partial(ESIndexer,
                                    doc_type=self.doc_type,
                                    index=index_name,
                                    es_host=self.host,
                                    step=batch_size,
                                    number_of_shards=self.num_shards,
                                    number_of_replicas=self.num_replicas,
                                    **self.kwargs)
            # instantiate one here for index creation
            es_idxer = partial_idxer()
            if es_idxer.exists_index():
                if mode == "purge":
                    es_idxer.delete_index()
                elif mode not in ["resume", "merge"]:
                    msg = "Index already '%s' exists, (use mode='purge' to auto-delete it or mode='resume' to add more documents)" % index_name
                    self.register_status("failed", job={"err": msg})
                    raise IndexerException(msg)

            if mode not in ["resume", "merge"]:
                try:
                    es_idxer.create_index({self.doc_type: _mapping}, _extra)
                except Exception as e:
                    self.logger.exception("Failed to create index")
                    self.register_status("failed", job={"err": repr(e)})
                    raise

            def clean_ids(ids):
                # can't use a generator, it's going to be pickled
                cleaned = []
                for _id in ids:
                    if type(_id) != str:
                        self.logger.warning(
                            "_id '%s' has invalid type (!str), skipped",
                            repr(_id))
                        continue
                    if len(_id) > 512:  # this is an ES6 limitation
                        self.logger.warning("_id is too long: '%s'", _id)
                        continue
                    cleaned.append(_id)
                return cleaned

            jobs = []
            total = target_collection.count()
            btotal = math.ceil(total / batch_size)
            bnum = 1
            if ids:
                self.logger.info(
                    "Indexing from '%s' with specific list of _ids, create indexer job with batch_size=%d",
                    target_name, batch_size)
                id_provider = iter_n(ids, batch_size)
            else:
                self.logger.info(
                    "Fetch _ids from '%s', and create indexer job with batch_size=%d",
                    target_name, batch_size)
                id_provider = id_feeder(target_collection,
                                        batch_size=batch_size,
                                        logger=self.logger)
            for ids in id_provider:
                yield from asyncio.sleep(0.0)
                origcnt = len(ids)
                ids = clean_ids(ids)
                newcnt = len(ids)
                if origcnt != newcnt:
                    self.logger.warning(
                        "%d document(s) can't be indexed and will be skipped (invalid _id)",
                        origcnt - newcnt)
                # progress count
                cnt += len(ids)
                pinfo = self.get_pinfo()
                pinfo["step"] = self.target_name
                try:
                    descprogress = cnt / total * 100
                except ZeroDivisionError:
                    descprogress = 0.0
                pinfo["description"] = "#%d/%d (%.1f%%)" % (bnum, btotal,
                                                            descprogress)
                self.logger.info(
                    "Creating indexer job #%d/%d, to index '%s' %d/%d (%.1f%%)",
                    bnum, btotal, backend_url, cnt, total, descprogress)
                job = yield from job_manager.defer_to_process(
                    pinfo,
                    partial(indexer_worker, backend_url, ids, partial_idxer,
                            bnum, mode, worker))

                def batch_indexed(f, batch_num):
                    nonlocal got_error
                    try:
                        res = f.result()
                        if type(res) != tuple or type(res[0]) != int:
                            got_error = Exception(
                                "Batch #%s failed while indexing collection '%s' [result:%s]"
                                % (batch_num, self.target_name, repr(res)))
                    except Exception as e:
                        got_error = e
                        self.logger.exception("Batch indexed error")
                        return

                job.add_done_callback(partial(batch_indexed, batch_num=bnum))
                jobs.append(job)
                bnum += 1
                # raise error as soon as we know
                if got_error:
                    self.register_status("failed",
                                         job={"err": repr(got_error)})
                    raise got_error
            self.logger.info("%d jobs created for indexing step", len(jobs))
            tasks = asyncio.gather(*jobs)

            def done(f):
                nonlocal got_error
                if None in f.result():
                    got_error = None
                    return
                # compute overall count of inserted/updated records
                # returned values look like [(num,[]),(num,[]),...]
                cnt = sum((val[0] for val in f.result()))
                self.register_status("success",
                                     job={"step": "index"},
                                     index={"count": cnt})
                if total != cnt:
                    # raise an error if counts don't match; the index is still created and
                    # fully registered in case we want to use it anyway
                    err = "Merged collection has %d documents but %d have been indexed (check logs for more)" % (
                        total, cnt)
                    raise IndexerException(err)
                self.logger.info(
                    "Index '%s' successfully created using merged collection %s",
                    index_name,
                    target_name,
                    extra={"notify": True})

            tasks.add_done_callback(done)
            yield from tasks

        if "post" in steps:
            self.logger.info("Running post-index process for index '%s'",
                             index_name)
            self.register_status("indexing",
                                 transient=True,
                                 init=True,
                                 job={"step": "post-index"})
            pinfo = self.get_pinfo()
            pinfo["step"] = "post_index"
            # for some reason (like maintaining the object's state across pickling)
            # we can't use a process here; we need to use a thread to maintain that
            # state without building an unmaintainable monster
            job = yield from job_manager.defer_to_thread(
                pinfo,
                partial(self.post_index,
                        target_name,
                        index_name,
                        job_manager,
                        steps=steps,
                        batch_size=batch_size,
                        ids=ids,
                        mode=mode))

            def posted(f):
                nonlocal got_error
                try:
                    res = f.result()
                    self.logger.info(
                        "Post-index process done for index '%s': %s",
                        index_name, res)
                    self.register_status("indexing",
                                         job={"step": "post-index"})
                except Exception as e:
                    got_error = e
                    self.logger.exception(
                        "Post-index process failed for index '%s':",
                        index_name,
                        extra={"notify": True})
                    return

            job.add_done_callback(posted)
            yield from job  # consume future

        if got_error:
            self.register_status("failed", job={"err": repr(got_error)})
            raise got_error
        else:
            self.register_status("success")
            return {"%s" % self.index_name: cnt}