Example #1
    # now it is
    assert m["vals"][list]["bla"][str] == {"split": {}}
    inspect(sd2, mapt=m, mode="mapping")
    # not splittable in sd2
    assert m["vals"]["bla"][str] == {}
    # mapping with type of type
    sd1 = {
        "_id": "123",
        "homologene": {
            "id": "bla",
            "gene": [[123, 456], [789, 102]]
        }
    }
    m = inspect_docs([sd1], mode="mapping")
    import biothings.utils.es as es
    mapping = es.generate_es_mapping(m)
    assert mapping == {
        'homologene': {
            'properties': {
                'gene': {
                    'type': 'integer'
                },
                'id': {
                    'analyzer': 'string_lowercase',
                    'type': 'string'
                }
            }
        }
    }, "mapping %s" % mapping

    # ok, "bla" is either a scalar or in a list, test merge
Example #2
def inspect_docs(docs,
                 mode="type",
                 clean=True,
                 merge=False,
                 logger=logging,
                 pre_mapping=False,
                 limit=None,
                 sample=None,
                 metadata=True,
                 auto_convert=True):
    """Inspect docs and return a summary of its structure:
    - mode:
        + "type": explore documents and report strict data structure
        + "mapping": same as type but also perform test on data so guess best mapping
          (eg. check if a string is splitable, etc...). Implies merge=True
        + "stats": explore documents and compute basic stats (count,min,max,sum)
        + "deepstats": same as stats but record values and also compute mean,stdev,median
          (memory intensive...)
        + "jsonschema", same as "type" but returned a json-schema formatted result
      (mode can also be a list of modes, eg. ["type","mapping"]. There's little
       overhead computing multiple types as most time is spent on actually getting the data)
    - clean: don't delete recorded vqlues or temporary results
    - merge: merge scalar into list when both exist (eg. {"val":..} and [{"val":...}]
    - limit: can limit the inspection to the x first docs (None = no limit, inspects all)
    - sample: in combination with limit, randomly extract a sample of 'limit' docs
              (so not necessarily the x first ones defined by limit). If random.random()
              is greater than sample, doc is inspected, otherwise it's skipped
    - metadata: compute metadata on the result
    - auto_convert: run converters automatically (converters are used to convert one mode's
                    output to another mode's output, eg. type to jsonschema)
    """

    if isinstance(mode, str):
        modes = [mode]
    else:
        modes = mode
    if auto_convert:
        converters, modes = get_converters(modes, logger=logger)
    _map = {}
    for m in modes:
        _map[m] = {}
    cnt = 0
    errors = set()
    t0 = time.time()
    innert0 = time.time()

    if sample is not None:
        assert limit, "Parameter 'sample' requires 'limit' to be defined"
        assert sample != 1, "Sample value 1 not allowed (no documents would be inspected)"
    if limit:
        limit = int(limit)
        logger.debug("Limiting inspection to the %s first documents" % limit)
    for doc in docs:
        if sample is not None:
            if random.random() <= sample:
                continue
        for m in modes:
            try:
                inspect(doc, mapt=_map[m], mode=m)
            except Exception as e:
                logger.exception(
                    "Can't inspect document (_id: %s) because: %s\ndoc: %s" %
                    (doc.get("_id"), e, pformat(doc)))
                errors.add(str(e))
        cnt += 1
        if cnt % 10000 == 0:
            logger.info("%d documents processed [%s]" %
                        (cnt, timesofar(innert0)))
            innert0 = time.time()
        if limit and cnt >= limit:
            logger.debug("done")
            break
    logger.info("Done [%s]" % timesofar(t0))
    logger.info("Post-processing")

    # post-process, specific for each mode
    for m in modes:
        mode_inst = get_mode_layer(m)
        if mode_inst:
            mode_inst.post(_map[m], m, clean)

    if auto_convert:
        run_converters(_map, converters, logger=logger)

    merge = "mapping" in modes or merge
    if merge:
        merge_scalar_list(_map["mapping"], "mapping")
    if "mapping" in modes and pre_mapping is False:
        # directly generate ES mapping
        import biothings.utils.es as es
        try:
            _map["mapping"] = es.generate_es_mapping(_map["mapping"])
            if metadata:
                # compute some extra metadata
                _map = compute_metadata(_map, "mapping")
        except es.MappingError as e:
            prem = {"pre-mapping": _map["mapping"], "errors": e.args[1]}
            _map["mapping"] = prem
    elif errors:
        _map["errors"] = errors
    return _map
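
# A short usage sketch (not part of the original module): inspect a couple of
# hypothetical docs in two modes at once, then look at the results.
if __name__ == "__main__":
    docs = [
        {"_id": "1", "name": "alpha", "score": 3},
        {"_id": "2", "name": "beta", "score": [5, 7]},
    ]
    # one pass, two reports: strict structure and a guessed ES mapping
    res = inspect_docs(docs, mode=["type", "mapping"])
    print(res["type"])     # strict data structure summary
    print(res["mapping"])  # ES mapping, or {"pre-mapping": ..., "errors": ...}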
Example #3
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differs slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info(
                    "Running inspector on %s (type:%s,data_provider:%s)" %
                    (repr(data_provider), data_provider_type,
                     backend_provider))
                if sample is not None:
                    self.logger.info(
                        "Sample set to %s, inspect only a subset of data",
                        sample)
                if limit is None:
                    self.logger.info("Inspecting all the documents")
                else:
                    nonlocal batch_size
                    # adjust batch_size so we inspect only "limit" docs if the batch is bigger than the limit
                    if batch_size > limit:
                        batch_size = limit
                    self.logger.info("Inspecting only %s documents", limit)
                # make it pickleable
                if data_provider_type == "source":
                    # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'd lose the target name
                    # (it'd be randomly generated again) and we wouldn't be able to register results
                    pass

                cnt = 0
                doccnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if isinstance(mode, str):
                    mode = [mode]

                converters, mode = btinspect.get_converters(mode)

                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    if sample is not None:
                        if random.random() > sample:
                            continue
                    cnt += 1
                    doccnt += batch_size
                    if limit and doccnt > limit:
                        break
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                # just potential converters
                btinspect.run_converters(inspected, converters)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)

                        # when inspecting with "stats" mode, we can get huge number but mongo
                        # can't store more than 2^64, make sure to get rid of big nums there
                        def clean_big_nums(k, v):
                            # TODO: same with float/double? mongo seems to handle more there
                            if isinstance(v, int) and v > 2**64:
                                return k, math.nan
                            else:
                                return k, v

                        dict_traverse(_map, clean_big_nums)
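                        # eg. a "stats" sum of 2**70 would be replaced by nan
                        # here; every other (key, value) pair goes through
                        # dict_traverse untouched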
                        # register end of inspection (differs slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
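
                # A minimal sketch (not from the original code) of the batch
                # pattern above, using hypothetical in-memory batches instead
                # of id_feeder + worker processes: each batch is inspected
                # with pre_mapping=True, the partial results are merged, and
                # the ES mapping is generated once at the end:
                #
                #     r1 = btinspect.inspect_docs(batch1, mode="mapping", pre_mapping=True)
                #     r2 = btinspect.inspect_docs(batch2, mode="mapping", pre_mapping=True)
                #     merged = btinspect.merge_record(r1["mapping"], r2["mapping"], "mapping")
                #     es_mapping = es.generate_es_mapping(merged)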
Example #4
            def do():
                yield from asyncio.sleep(0.0)
                nonlocal mode

                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differs slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
                        (repr(data_provider),data_provider_type,backend_provider))
                # make it pickleable
                if data_provider_type == "source":
                    # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'd lose the target name
                    # (it'd be randomly generated again) and we wouldn't be able to register results
                    pass

                cnt = 0
                jobs = []
                # normalize mode param and prepare global results
                if isinstance(mode, str):
                    mode = [mode]
                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    cnt += 1
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                def fully_inspected(res):
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)
                        # register end of inspection (differs slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error