def test_wrap_2():
    Log.alert("Random types")
    switch = [
        lambda: Random.int(20),
        lambda: Random.string(20),
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]
    inputs = [
        switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]()
        for i in range(NUM_INPUT)
    ]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Timer("more string: to_data"):
            for v in inputs:
                results.append(to_data(v))

        results = []
        gc.collect()
        with Timer("more string: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", i=i, num=NUM_REPEAT)

def test_compare_isinstance_to_text(self):
    num = 1 * 1000 * 1000
    options = {
        0: lambda: 6,
        1: lambda: "string"
        # 2: lambda: {},
        # 3: lambda: Data(),
        # 4: lambda: Null,
    }
    data = [options[Random.int(len(options))]() for _ in range(num)]

    with Timer("isinstance check") as i_time:
        i_result = [isinstance(d, text) for d in data]
    with Timer("set check") as s_time:
        s_result = [d.__class__ in (text,) for d in data]
    with Timer("eq check") as e_time:
        e_result = [d.__class__ is text for d in data]
    with Timer("name check") as n_time:
        n_result = [is_instance(d, text) for d in data]
    with Timer("check w method") as m_time:
        m_result = [is_text(d) for d in data]

    self.assertEqual(s_result, i_result)
    self.assertEqual(m_result, i_result)
    self.assertEqual(e_result, i_result)
    self.assertEqual(n_result, i_result)

    self.assertGreater(i_time.duration, s_time.duration)
    self.assertGreater(m_time.duration, s_time.duration)

def extend(self, rows):
    if self.read_only:
        Log.error("not for writing")

    try:
        update = {}
        with Timer("encoding"):
            while True:
                output = []
                for rownum, row in enumerate(rows):
                    typed, more, add_nested = typed_encode(row, self.flake)
                    update.update(more)
                    if add_nested:
                        # row HAS NEW NESTED COLUMN!
                        # GO OVER THE rows AGAIN SO "RECORD" GETS MAPPED TO "REPEATED"
                        break
                    output.append(typed)
                else:
                    break

        if update or not self.shard:
            # BATCH HAS ADDITIONAL COLUMNS!!
            # WE CAN NOT USE THE EXISTING SHARD, MAKE A NEW ONE:
            self._create_new_shard()
            Log.note("added new shard with name: {{shard}}", shard=self.shard.table_id)

        with Timer("insert {{num}} rows to bq", param={"num": len(rows)}):
            failures = self.container.client.insert_rows_json(
                self.shard,
                json_rows=output,
                row_ids=[None] * len(output),
                skip_invalid_rows=False,
                ignore_unknown_values=False,
            )
        if failures:
            if all(r == "stopped" for r in wrap(failures).errors.reason):
                self._create_new_shard()
                Log.note(
                    "STOPPED encountered: Added new shard with name: {{shard}}",
                    shard=self.shard.table_id,
                )
            Log.error(
                "Got {{num}} failures:\n{{failures|json}}",
                num=len(failures),
                failures=failures[:5],
            )
        else:
            self.last_extend = Date.now()
            Log.note("{{num}} rows added", num=len(output))
    except Exception as e:
        e = Except.wrap(e)
        if len(rows) > 1 and "Request payload size exceeds the limit" in e:
            # TRY A SMALLER BATCH
            cut = len(rows) // 2
            self.extend(rows[:cut])
            self.extend(rows[cut:])
            return
        Log.error("Do not know how to handle", cause=e)

def test_id_vs_id(self):
    ops = [Op() for _ in range(200)]
    lang1 = {id(o): o for o in ops}
    sample = Random.sample(ops, 1000 * 1000)

    with Timer("using id()"):
        result1 = [lang1[id(o)] for o in sample]

    lang2 = [None] * (max(o.id for o in ops) + 1)
    for o in ops:
        lang2[o.id] = o
    # lang2 = tuple(lang2)

    with Timer("using o.id"):
        result2 = [lang2[o.id] for o in sample]

def _get(self, key):
    with Timer("get {{key}} from S3", {"key": key}, verbose=False) as timer:
        output = s3_get(self, key)
        if output is not None:
            timer.verbose = True
        return output

def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        c = self.todo.pop()
        if c == THREAD_STOP:
            break
        if c.last_updated >= Date.now() - TOO_OLD:
            continue

        with Timer(
            "Update {{col.es_index}}.{{col.es_column}}",
            param={"col": c},
            silent=not DEBUG,
            too_long=0.05,
        ):
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })

def _parse_properties(self, alias, mapping, meta):
    abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(SELF_PATH)
        query_paths.append(ROOT_PATH)
        self.alias_to_query_paths[alias] = query_paths

        # ADD RELATIVE NAMES
        for abs_column in abs_columns:
            abs_column.last_updated = None
            abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
            for query_path in query_paths:
                abs_column.names[query_path[0]] = relative_field(
                    abs_column.names["."], query_path[0])
            self.todo.add(self.meta.columns.add(abs_column))
    pass

def _db_load(self):
    self.last_load = Date.now()

    try:
        self.es_index = self.es_cluster.get_index(
            id=ID,
            index=META_COLUMNS_NAME,
            type=META_COLUMNS_TYPE_NAME,
            read_only=False,
        )

        result = self.es_index.search({
            "query": {
                "bool": {
                    "should": [
                        {
                            "bool": {
                                "must_not": {
                                    "exists": {"field": "cardinality.~n~"}
                                }
                            }
                        },
                        {
                            # ASSUME UNUSED COLUMNS DO NOT EXIST
                            "range": {"cardinality.~n~": {"gte": 0}}
                        },
                    ]
                }
            },
            "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
            "size": 10000,
        })

        with Timer("adding columns to structure"):
            for r in result.hits.hits._source:
                col = doc_to_column(r)
                if col:
                    self._add(col)

        Log.note("{{num}} columns loaded", num=result.hits.total)
        if not self.data.get(META_COLUMNS_NAME):
            Log.error("metadata missing from index!")
    except Exception as e:
        metadata = self.es_cluster.get_metadata(after=Date.now())
        if any(index.startswith(META_COLUMNS_NAME) for index in metadata.indices.keys()):
            Log.error("metadata already exists!", cause=e)

        Log.warning("no {{index}} exists, making one", index=META_COLUMNS_NAME, cause=e)
        self._db_create()

def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {"in": {"signature_hash": candidates.signature_hash}},
                {"exists": "num_pushes"},
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100))
    )
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")

def encode(self, value, pretty=False):
    if pretty:
        return pretty_json(value)

    try:
        with Timer("scrub", too_long=0.1):
            scrubbed = scrub(value)
        with Timer("encode", too_long=0.1):
            return text_type(self.encoder(scrubbed))
    except Exception as e:
        from mo_logs.exceptions import Except
        from mo_logs import Log

        e = Except.wrap(e)
        Log.warning("problem serializing {{type}}", type=text_type(repr(value)), cause=e)
        raise e

def _monitor(self, please_stop):
    with Timer(self.name):
        self.service.wait()
        self.debug and Log.note(
            "{{process}} STOP: returncode={{returncode}}",
            process=self.name,
            returncode=self.service.returncode,
        )
    self.service_stopped.go()
    please_stop.go()

def __init__(self, kwargs=None):
    # GENERATE PRIVATE KEY
    self.config = kwargs
    self.session = None
    with Timer("generate {{bits}} bits rsa key", {"bits": self.config.rsa.bits}):
        Log.note("This will take a while....")
        self.public_key, self.private_key = rsa_crypto.generate_key(
            bits=self.config.rsa.bits)

def test_long_file(service):
    timer = Timer("test", silent=True)
    with timer:
        service.get_tuids(
            files="gfx/angle/checkout/src/libANGLE/formatutils.cpp",
            revision="29dcc9cb77c3")
    assert timer.duration.seconds < 30

def encode(self, value, pretty=False):
    if pretty:
        return pretty_json(value)

    try:
        with Timer("scrub", too_long=0.1):
            scrubbed = scrub(value)
        param = {"size": 0}
        with Timer("encode {{size}} characters", param=param, too_long=0.1):
            output = text_type(self.encoder(scrubbed))
            param["size"] = len(output)
            return output
    except Exception as e:
        from mo_logs.exceptions import Except
        from mo_logs import Log

        e = Except.wrap(e)
        Log.warning("problem serializing {{type}}", type=text_type(repr(value)), cause=e)
        raise e

def test_save_then_load(self):
    test = {
        "data": [{"a": "b"}],
        "query": {
            "meta": {"save": True},
            "from": TEST_TABLE,
            "select": "a"
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": ["b"]
        }
    }

    settings = self.utils.fill_container(test)

    bytes = unicode2utf8(value2json({
        "from": settings.index,
        "select": "a",
        "format": "list"
    }))
    expected_hash = convert.bytes2base64(hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
    wrap(test).expecting_list.meta.saved_as = expected_hash

    self.utils.send_queries(test)

    # ENSURE THE QUERY HAS BEEN INDEXED
    Log.note("Flush saved query (with hash {{hash}})", hash=expected_hash)
    container = elasticsearch.Index(index="saved_queries", type=save_query.DATA_TYPE, kwargs=settings)
    container.flush(forced=True)
    with Timer("wait for 5 seconds"):
        Till(seconds=5).wait()

    url = URL(self.utils.testing.query)
    response = self.utils.try_till_response(
        url.scheme + "://" + url.host + ":" + text_type(url.port) + "/find/" + expected_hash,
        data=b''
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.all_content, bytes)

def test_compare_isinstance_to_class_checks(self):
    num = 1 * 1000 * 1000
    options = {
        0: lambda: {},
        1: lambda: Data(),
        2: lambda: Null,
        3: lambda: 6,
        4: lambda: "string",
    }
    data = [options[Random.int(len(options))]() for _ in range(num)]

    with Timer("isinstance check") as i_time:
        i_result = [isinstance(d, Mapping) for d in data]
    with Timer("set check") as s_time:
        s_result = [d.__class__ in MAPPING_TYPES for d in data]
    with Timer("eq check") as e_time:
        e_result = [d.__class__ is Data or d.__class__ is dict for d in data]
    with Timer("name check") as n_time:
        n_result = [is_instance(d, Data) or is_instance(d, dict) for d in data]
    with Timer("check w method") as m_time:
        m_result = [is_mapping(d) for d in data]

    self.assertEqual(s_result, i_result)
    self.assertEqual(m_result, i_result)
    self.assertEqual(e_result, i_result)
    self.assertEqual(n_result, i_result)

    self.assertGreater(i_time.duration, s_time.duration)
    self.assertGreater(m_time.duration, s_time.duration)

def upload(filename, temp_file):
    with Timer("upload file to S3 {{file}}", param={"file": filename}):
        try:
            connection = Connection(S3_CONFIG).connection
            bucket = connection.get_bucket(S3_CONFIG.bucket, validate=False)
            storage = bucket.new_key(filename)
            storage.set_contents_from_filename(
                temp_file.abspath, headers={"Content-Type": mimetype.JSON})
            if S3_CONFIG.public:
                storage.set_acl("public-read")
        except Exception as e:
            Log.error("Problem connecting to {{bucket}}", bucket=S3_CONFIG.bucket, cause=e)

def test_mode_wait(query, please_stop):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :return: nothing
    """
    if not query["from"]:
        return

    try:
        if query["from"].startswith("meta."):
            return

        alias = split_field(query["from"])[0]
        after = Date.now()
        require_cardinality = meta.ENABLE_META_SCAN
        with Timer(
            "Get columns for {{table}} after {{after}}",
            {"table": alias, "after": after},
            verbose=DEBUG,
        ):
            metadata_manager = find_container(alias, after=after).namespace

            timeout = Till(seconds=MINUTE.seconds) | please_stop
            while not timeout:
                # GET FRESH VERSIONS
                cols = metadata_manager.get_columns(table_name=alias, after=after, timeout=timeout)
                not_ready = [
                    c
                    for c in cols
                    if c.jx_type not in STRUCT and (
                        after >= c.last_updated
                        or (require_cardinality and c.cardinality == None)
                    )
                ]
                if not_ready:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}, cardinality={{col.cardinality|json}}, last_updated={{col.last_updated|datetime}}) metadata to arrive",
                        col=first(not_ready),
                    )
                else:
                    break
                Till(seconds=1).wait()
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)

def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [
                {"in": {"id": candidates.id}},
                {"exists": "num_pushes"}
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100))
    )
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")

def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: add update() method to Configuration class"):

            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for k, v in config.items():
                    if v != None:
                        self._config[k] = v

                self._config["sources"] = sorted(
                    map(os.path.expanduser, set(self._config["sources"]))
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            setattr(Configuration, "update", update)

            # UPDATE ADR CONFIGURATION
            adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging,
            level="DEBUG",
            format="{message}",
            filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()

def _parse_properties(self, abs_index, properties, meta):
    # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
    # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
    def add_column(c, query_path):
        c.last_updated = Date.now() - TOO_OLD
        if query_path[0] != ".":
            c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

        with self.meta.columns.locker:
            for alias in meta.aliases:
                c_ = copy(c)
                c_.es_index = alias
                self._upsert_column(c_)
            self._upsert_column(c)

    abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties)
    self.abs_columns.update(abs_columns)
    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(".")
        query_paths.append(SELF_PATH)

        # ADD RELATIVE COLUMNS
        for abs_column in abs_columns:
            abs_column = abs_column.__copy__()
            abs_column.type = es_type_to_json_type[abs_column.type]
            for query_path in query_paths:
                add_column(abs_column, query_path)
    pass

def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        options = taskcluster.optionsFromEnvironment()
        secrets = taskcluster.Secrets(options)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            acc[s] = secrets.get(concat_field(SECRET_PREFIX, s))['secret']
        set_default(config, acc)

def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        secrets = taskcluster.Secrets(config.taskcluster)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            secret_name = concat_field(SECRET_PREFIX, s)
            Log.note("get secret named {{name|quote}}", name=secret_name)
            acc[s] = secrets.get(secret_name)["secret"]
        set_default(config, acc)

def _parse_properties(self, alias, mapping, meta):
    abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
    if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
        Log.warning(
            "Some columns are not stored {{names}}",
            names=[
                ".".join((c.es_index, c.names['.']))
                for c in abs_columns
                if c.cardinality == 0
            ]
        )

    with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
        # LIST OF EVERY NESTED PATH
        query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
        for a, b in itertools.product(query_paths, query_paths):
            aa = a[0]
            bb = b[0]
            if aa and bb.startswith(aa):
                for i, b_prefix in enumerate(b):
                    if len(b_prefix) > len(aa):
                        continue
                    if aa == b_prefix:
                        break  # SPLIT ALREADY FOUND
                    b.insert(i, aa)
                    break
        for q in query_paths:
            q.append(SELF_PATH)
        query_paths.append(ROOT_PATH)

        self.alias_to_query_paths[alias] = query_paths
        for i in self.index_to_alias.get_domain(alias):
            self.alias_to_query_paths[i] = query_paths

        # ADD RELATIVE NAMES
        for abs_column in abs_columns:
            abs_column.last_updated = None
            abs_column.jx_type = jx_type(abs_column)
            for query_path in query_paths:
                abs_column.names[query_path[0]] = relative_field(
                    abs_column.names["."], query_path[0])
            self.todo.add(self.meta.columns.add(abs_column))
    pass

def one_request(request, please_stop):
    and_op = request.where['and']

    files = []
    for a in and_op:
        if a['in'].path:
            files = a['in'].path
        elif a.eq.path:
            files = [a.eq.path]

    with Timer("Make TUID request from {{timestamp|datetime}}", {"timestamp": request.meta.request_time}):
        try:
            result = http.post_json("http://localhost:5000/tuid", json=request, timeout=30)
            if result is None or len(result.data) != len(files):
                Log.note("incomplete response for {{thread}}", thread=Thread.current().name)
        except Exception as e:
            Log.warning("Request failure", cause=e)

def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        column = self.todo.pop()
        if column == THREAD_STOP:
            break
        # if untype_path(column.name) in ["build.type", "run.type"]:
        #     Log.note("found")

        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
            column.last_updated = Date.now()
            continue
        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
            # DO NOT UPDATE FRESH COLUMN METADATA
            DEBUG and Log.note(
                "{{column.es_column}} is still fresh ({{ago}} ago)",
                column=column,
                ago=(Date.now() - Date(column.last_updated)).seconds,
            )
            continue

        with Timer(
            "Update {{col.es_index}}.{{col.es_column}}",
            param={"col": column},
            silent=not DEBUG,
            too_long=0.05,
        ):
            if untype_path(column.name) in ["build.type", "run.type"]:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning(
                        "problem getting cardinality for {{column.name}}",
                        column=column,
                        cause=e,
                    )
            else:
                column.last_updated = Date.now()

def test_recovery_of_empty_string(self):
    test = wrap({
        "data": [
            {"a": "bee"}
        ],
        "query": {
            "from": TEST_TABLE,
            "select": "a",
            "where": {"prefix": {"a": ""}},
            "format": "list"
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": ["bee"]
        }
    })

    settings = self.utils.fill_container(test)

    bytes = value2json(test.query).encode('utf8')
    expected_hash = convert.bytes2base64(hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
    test.expecting_list.meta.saved_as = expected_hash
    test.query.meta = {"save": True}

    self.utils.send_queries(test)

    # ENSURE THE QUERY HAS BEEN INDEXED
    Log.note("Flush saved query")
    container = elasticsearch.Index(index="saved_queries", kwargs=settings)
    container.flush(forced=True)
    with Timer("wait for 5 seconds"):
        Till(seconds=5).wait()

    url = URL(self.utils.testing.query)
    response = self.utils.try_till_response(
        url.scheme + "://" + url.host + ":" + text(url.port) + "/find/" + expected_hash,
        data=b''
    )

    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.all_content, bytes)

def not_monitor(self, please_stop):
    Log.alert("metadata scan has been disabled")
    please_stop.then(lambda: self.todo.add(THREAD_STOP))
    while not please_stop:
        pair = self.todo.pop()
        if pair is THREAD_STOP:
            break
        column, after = pair

        with Timer(
            "Update {{col.es_index}}.{{col.es_column}}",
            param={"col": column},
            silent=not DEBUG,
            too_long=0.05,
        ):
            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                # DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                continue
            elif after and column.last_updated > after:
                continue  # COLUMN IS STILL YOUNG
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality > 0:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds,
                )
                continue

            if untype_path(column.name) in KNOWN_MULTITYPES:
                try:
                    self._update_cardinality(column)
                except Exception as e:
                    Log.warning(
                        "problem getting cardinality for {{column.name}}",
                        column=column,
                        cause=e,
                    )
                continue

            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })

def write_status(guid, status):
    try:
        filename = guid + ".status.json"
        with Timer("upload status to S3 {{file}}", param={"file": filename}, verbose=DEBUG):
            try:
                connection = Connection(S3_CONFIG).connection
                bucket = connection.get_bucket(S3_CONFIG.bucket, validate=False)
                storage = bucket.new_key(filename)
                storage.set_contents_from_string(
                    value2json(status), headers={"Content-Type": mimetype.JSON}
                )
                if S3_CONFIG.public:
                    storage.set_acl("public-read")
            except Exception as e:
                Log.error(
                    "Problem connecting to {{bucket}}",
                    bucket=S3_CONFIG.bucket,
                    cause=e,
                )
    except Exception as e:
        Log.warning("problem setting status", cause=e)

def es_bulkaggsop(esq, frum, query):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_filtering_values_with_partitions
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE), MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value
    # FIND CARDINALITY
    cardinality_check = Timer("Get cardinality for {{column}}", param={"column": variable.var})

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error("too many columns to bulk groupby:\n{{columns|json}}", columns=columns)
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query({
                "select": {
                    "name": "card",
                    "value": variable,
                    "aggregate": "cardinality",
                },
                "from": frum.name,
                "where": query.where,
                "format": "cube",
            }).card

        num_partitions = (cardinality + chunk_size - 1) // chunk_size
        if num_partitions > MAX_PARTITIONS:
            Log.error("Requesting more than {{num}} partitions", num=num_partitions)
        if num_partitions == 0:
            num_partitions = 1

        acc, decoders, es_query = aggop_to_es_queries(selects, query_path, schema, query)

    guid = randoms.base64(32, extra="-_")
    abs_limit = mo_math.MIN((query.limit, first(query.groupby).domain.limit))
    formatter = formatters[query.format](abs_limit)

    Thread.run(
        "extract to " + guid + ".json",
        extractor,
        guid,
        num_partitions,
        esq,
        query,
        selects,
        query_path,
        schema,
        chunk_size,
        cardinality,
        abs_limit,
        formatter,
        parent_thread=Null,
    ).release()

    output = to_data({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {
            "format": query.format,
            "timing": {"cardinality_check": cardinality_check.duration},
            "es_query": es_query,
            "num_partitions": num_partitions,
            "cardinality": cardinality,
        },
    })
    return output