# now it is
assert m["vals"][list]["bla"][str] == {"split": {}}
inspect(sd2, mapt=m, mode="mapping")  # not splittable in sd2
assert m["vals"]["bla"][str] == {}

# mapping with type of type
sd1 = {"_id": "123", "homologene": {"id": "bla", "gene": [[123, 456], [789, 102]]}}
m = inspect_docs([sd1], mode="mapping")
import biothings.utils.es as es
mapping = es.generate_es_mapping(m)
assert mapping == {
    'homologene': {
        'properties': {
            'gene': {
                'type': 'integer'
            },
            'id': {
                'analyzer': 'string_lowercase',
                'type': 'string'
            }
        }
    }
}, "mapping %s" % mapping

# ok, "bla" is either a scalar or in a list, test merge
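# Illustrative sketch only (not part of the original test suite) of what the merge test
# above is about: "bla" appears as a scalar in one document and inside a list in another,
# and mode="mapping" implies merge=True, so both shapes should collapse into a single
# "bla" field in the resulting map. The sample documents and the expectation below are
# assumptions, not asserted behavior.
d1 = {"_id": "1", "bla": "a string"}   # scalar occurrence
d2 = {"_id": "2", "bla": ["x", "y"]}   # list occurrence
merged = inspect_docs([d1, d2], mode="mapping")
# expected (assumption): merged contains a single "bla" entry typed as a string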
def inspect_docs(docs, mode="type", clean=True, merge=False, logger=logging,
                 pre_mapping=False, limit=None, sample=None, metadata=True,
                 auto_convert=True):
    """Inspect docs and return a summary of their structure:
    - mode:
        + "type": explore documents and report strict data structure
        + "mapping": same as "type" but also perform tests on the data to guess the best
          mapping (eg. check if a string is splittable, etc.). Implies merge=True
        + "stats": explore documents and compute basic stats (count, min, max, sum)
        + "deepstats": same as "stats" but record values and also compute mean, stdev, median
          (memory intensive...)
        + "jsonschema": same as "type" but return a JSON-schema formatted result
      (mode can also be a list of modes, eg. ["type", "mapping"]. There's little overhead
      computing multiple modes as most time is spent on actually getting the data)
    - clean: don't delete recorded values or temporary results
    - merge: merge scalar into list when both exist (eg. {"val": ...} and [{"val": ...}])
    - limit: limit the inspection to the first x docs (None = no limit, inspects all)
    - sample: in combination with limit, randomly extract a sample of 'limit' docs
      (so not necessarily the first x ones defined by limit). If random.random() is greater
      than sample, the doc is inspected, otherwise it's skipped
    - metadata: compute metadata on the result
    - auto_convert: run converters automatically (converters are used to convert one mode's
      output to another mode's output, eg. "type" to "jsonschema")
    """
    if isinstance(mode, str):
        modes = [mode]
    else:
        modes = mode
    if auto_convert:
        converters, modes = get_converters(modes, logger=logger)
    _map = {}
    for m in modes:
        _map[m] = {}
    cnt = 0
    errors = set()
    t0 = time.time()
    innert0 = time.time()

    if sample is not None:
        assert limit, "Parameter 'sample' requires 'limit' to be defined"
        assert sample != 1, "Sample value 1 not allowed (no documents would be inspected)"
    if limit:
        limit = int(limit)
        logger.debug("Limiting inspection to the %s first documents" % limit)
    for doc in docs:
        if sample is not None:
            if random.random() <= sample:
                continue
        for m in modes:
            try:
                inspect(doc, mapt=_map[m], mode=m)
            except Exception as e:
                logger.exception("Can't inspect document (_id: %s) because: %s\ndoc: %s" %
                                 (doc.get("_id"), e, pformat(doc)))
                errors.add(str(e))
        cnt += 1
        if cnt % 10000 == 0:
            logger.info("%d documents processed [%s]" % (cnt, timesofar(innert0)))
            innert0 = time.time()
        if limit and cnt > limit:
            logger.debug("done")
            break
    logger.info("Done [%s]" % timesofar(t0))
    logger.info("Post-processing")

    # post-process, specific for each mode
    for m in modes:
        mode_inst = get_mode_layer(m)
        if mode_inst:
            mode_inst.post(_map[m], m, clean)

    if auto_convert:
        run_converters(_map, converters, logger=logger)

    merge = True if "mapping" in modes else merge
    if merge:
        merge_scalar_list(_map["mapping"], "mapping")
    if "mapping" in modes and pre_mapping is False:
        # directly generate ES mapping
        import biothings.utils.es as es
        try:
            _map["mapping"] = es.generate_es_mapping(_map["mapping"])
            if metadata:
                # compute some extra metadata
                _map = compute_metadata(_map, "mapping")
        except es.MappingError as e:
            prem = {"pre-mapping": _map["mapping"], "errors": e.args[1]}
            _map["mapping"] = prem
    elif errors:
        _map["errors"] = errors
    return _map
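# Usage sketch for inspect_docs() (illustrative only; the sample documents and variable
# names are made up, and only the parameters documented in the docstring above are used).
# It shows inspecting the same documents in several modes at once and reading back the
# per-mode results.
sample_docs = [
    {"_id": "1", "gene": 1017, "name": "CDK2"},
    {"_id": "2", "gene": [1017, 1018], "name": "CDK2/3"},
]
res = inspect_docs(sample_docs, mode=["type", "mapping"], limit=1000, sample=0.5)
type_map = res["type"]       # strict structure report
es_mapping = res["mapping"]  # ES mapping, or {"pre-mapping": ..., "errors": ...} on failure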
def do():
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differs slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info(
        "Running inspector on %s (type:%s,data_provider:%s)" %
        (repr(data_provider), data_provider_type, backend_provider))
    if sample is not None:
        self.logger.info("Sample set to %s, inspect only a subset of data", sample)
    if limit is None:
        self.logger.info("Inspecting all the documents")
    else:
        nonlocal batch_size
        # adjust batch_size so we inspect only "limit" docs if the batch is smaller than the limit
        if batch_size > limit:
            batch_size = limit
        self.logger.info("Inspecting only %s documents", limit)
    # make it pickleable
    if data_provider_type == "source":
        # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'd lose the target name
        # (it'd be randomly generated again) and we wouldn't be able to register results
        pass
    cnt = 0
    doccnt = 0
    jobs = []
    # normalize mode param and prepare global results
    if isinstance(mode, str):
        mode = [mode]
    converters, mode = btinspect.get_converters(mode)
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        if sample is not None:
            if random.random() > sample:
                continue
        cnt += 1
        doccnt += batch_size
        if limit and doccnt > limit:
            break
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" % (bnum, e))
                raise

        pre_mapping = "mapping" in mode  # we want to generate an intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)

    yield from asyncio.gather(*jobs)

    # compute metadata (it was skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(inspected, m)
            except es.MappingError as e:
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)

    # just potential converters
    btinspect.run_converters(inspected, converters)

    def fully_inspected(res):
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)

            # when inspecting with "stats" mode, we can get huge numbers but mongo
            # can't store more than 2^64, make sure to get rid of big nums there
            def clean_big_nums(k, v):
                # TODO: same with float/double? seems mongo handles more there?
                if isinstance(v, int) and v > 2**64:
                    return k, math.nan
                else:
                    return k, v

            dict_traverse(_map, clean_big_nums)
            # register end of inspection (differs slightly depending on type)
            if "mapping" in mode and "errors" in res["mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed", subkey="inspect", inspect=_map)
                got_error = InspectorError(res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status("success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status("success",
                                                   job={"step": "inspect"},
                                                   build={"inspect": _map})
        except Exception as e:
            self.logger.exception("Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed", subkey="inspect", err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status("failed", job={"err": repr(e)})

    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
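# Self-contained sketch of the big-number sanitization used in clean_big_nums() above:
# MongoDB can't store integers above 2**64, so the result map is walked and any oversized
# int is replaced with NaN before registering it. _walk() is a simplified stand-in for
# dict_traverse() (assumption about its behavior: it applies func(k, v) -> (k, v) to every
# leaf), and the doc_stats shape below is made up for illustration.
import math

def _walk(d, func):
    # recursively apply func to every (key, value) pair, replacing values in place
    for k, v in list(d.items()):
        if isinstance(v, dict):
            _walk(v, func)
        else:
            _, d[k] = func(k, v)

doc_stats = {"val": {"_stats": {"_sum": 2**70, "_count": 3}}}
_walk(doc_stats, lambda k, v: (k, math.nan) if isinstance(v, int) and v > 2**64 else (k, v))
# doc_stats["val"]["_stats"]["_sum"] is now nan, "_count" stays 3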
def do():
    yield from asyncio.sleep(0.0)
    nonlocal mode
    pinfo = {
        "category": INSPECTOR_CATEGORY,
        "source": "%s" % repr(data_provider),
        "step": "",
        "description": ""
    }
    # register begin of inspection (differs slightly depending on type)
    if data_provider_type == "source":
        registerer_obj.register_status("inspecting", subkey="inspect")
    elif data_provider_type == "build":
        registerer_obj.register_status("inspecting",
                                       transient=True,
                                       init=True,
                                       job={"step": "inspect"})
    self.logger.info(
        "Running inspector on %s (type:%s,data_provider:%s)" %
        (repr(data_provider), data_provider_type, backend_provider))
    # make it pickleable
    if data_provider_type == "source":
        # because registerer_obj is also used to fetch data, it has to be unprepare()'d for pickling
        registerer_obj.unprepare()
    else:
        # NOTE: do not unprepare() the builder, we'd lose the target name
        # (it'd be randomly generated again) and we wouldn't be able to register results
        pass
    cnt = 0
    jobs = []
    # normalize mode param and prepare global results
    if isinstance(mode, str):
        mode = [mode]
    inspected = {}
    for m in mode:
        inspected.setdefault(m, {})
    backend = create_backend(backend_provider).target_collection
    for ids in id_feeder(backend, batch_size=batch_size):
        cnt += 1
        pinfo["description"] = "batch #%s" % cnt

        def batch_inspected(bnum, i, f):
            nonlocal inspected
            nonlocal got_error
            nonlocal mode
            try:
                res = f.result()
                for m in mode:
                    inspected[m] = btinspect.merge_record(inspected[m], res[m], m)
            except Exception as e:
                got_error = e
                self.logger.error(
                    "Error while inspecting data from batch #%s: %s" % (bnum, e))
                raise

        pre_mapping = "mapping" in mode  # we want to generate an intermediate mapping so we can merge
        # all maps later and then generate the ES mapping from there
        self.logger.info("Creating inspect worker for batch #%s" % cnt)
        job = yield from self.job_manager.defer_to_process(
            pinfo,
            partial(inspect_data,
                    backend_provider,
                    ids,
                    mode=mode,
                    pre_mapping=pre_mapping,
                    **kwargs))
        job.add_done_callback(partial(batch_inspected, cnt, ids))
        jobs.append(job)

    yield from asyncio.gather(*jobs)

    # compute metadata (it was skipped before)
    for m in mode:
        if m == "mapping":
            try:
                inspected["mapping"] = es.generate_es_mapping(inspected["mapping"])
                # metadata for mapping only once generated
                inspected = btinspect.compute_metadata(inspected, m)
            except es.MappingError as e:
                inspected["mapping"] = {
                    "pre-mapping": inspected["mapping"],
                    "errors": e.args[1]
                }
        else:
            inspected = btinspect.compute_metadata(inspected, m)

    def fully_inspected(res):
        nonlocal got_error
        try:
            res = btinspect.stringify_inspect_doc(res)
            _map = {"results": res}
            _map["data_provider"] = repr(data_provider)
            _map["started_at"] = started_at
            _map["duration"] = timesofar(t0)
            # register end of inspection (differs slightly depending on type)
            if "mapping" in mode and "errors" in res["mapping"] and "pre-mapping" in res["mapping"]:
                registerer_obj.register_status("failed", subkey="inspect", inspect=_map)
                got_error = InspectorError(res["mapping"]["errors"])
            else:
                if data_provider_type == "source":
                    registerer_obj.register_status("success", subkey="inspect", inspect=_map)
                elif data_provider_type == "build":
                    registerer_obj.register_status("success",
                                                   job={"step": "inspect"},
                                                   build={"inspect": _map})
        except Exception as e:
            self.logger.exception("Error while inspecting data: %s" % e)
            got_error = e
            if data_provider_type == "source":
                registerer_obj.register_status("failed", subkey="inspect", err=repr(e))
            elif data_provider_type == "build":
                registerer_obj.register_status("failed", job={"err": repr(e)})

    fully_inspected(inspected)
    if data_provider_type is None:
        return
    if got_error:
        raise got_error
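# Self-contained sketch of the per-batch merge pattern used in do() above: each worker
# returns a partial result and a done-callback folds it into the shared "inspected" dict
# before asyncio.gather() returns. merge_batch() and fake_worker() are made-up stand-ins
# (the real code uses btinspect.merge_record() and job_manager.defer_to_process()), and
# modern async/await syntax is used for brevity instead of yield from.
import asyncio
from functools import partial

async def demo():
    inspected = {"type": {}}

    def merge_batch(bnum, fut):
        # called when a batch finishes; fold its result into the global map
        res = fut.result()
        for m, part in res.items():
            inspected[m].update(part)  # real code: btinspect.merge_record(...)

    async def fake_worker(bnum):
        # stand-in for a deferred inspect_data() call on one batch of ids
        await asyncio.sleep(0)
        return {"type": {"field%s" % bnum: {str: {}}}}

    jobs = []
    for bnum in range(3):
        job = asyncio.ensure_future(fake_worker(bnum))
        job.add_done_callback(partial(merge_batch, bnum))
        jobs.append(job)
    await asyncio.gather(*jobs)
    return inspected

# asyncio.run(demo())  # -> {'type': {'field0': ..., 'field1': ..., 'field2': ...}}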