def test_normal(self):
    num = 1000
    results = []
    for i in range(0, num):
        samples = numpy.random.normal(size=20)
        desc, score = deviance(samples)
        results.append(desc)

    self.assertEqual(
        list(jx.groupby(["ok", "not ok"])),
        [("not ok", ["not ok"]), ("ok", ["ok"])],
        "expecting version >=2.53",
    )

    # normals are OK
    for desc, n in jx.groupby(results):
        Log.note("{{desc}}: {{count}}", desc=desc, count=len(n))
        if desc == "OK":
            self.assertLessEqual(num * 0.99, len(n))

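# A rough pure-Python equivalent of the jx.groupby behavior the assertion
# above relies on: grouping whole (scalar) values yields (key, group) pairs
# in key order. This is only a sketch for orientation, not the jx code.
from itertools import groupby

def groupby_values(values):
    for key, group in groupby(sorted(values)):
        yield key, list(group)

assert list(groupby_values(["ok", "not ok"])) == [("not ok", ["not ok"]), ("ok", ["ok"])]
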
def _execute_backlog(self):
    if not self.backlog:
        return

    (backlog, self.backlog) = (self.backlog, [])
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                if self.debug:
                    Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                if self.debug:
                    Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                self.debug and Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

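# Minimal sketch of the two execution strategies used above, with the
# driver machinery stripped away. All names here are stand-ins for the
# surrounding class; only the control flow is the point.
def chunks(items, size):
    # Yield consecutive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def execute_backlog(cursor, backlog, batch_size, supports_multi_statement):
    if supports_multi_statement:
        # Most drivers accept a ";"-joined batch per execute() call.
        for group in chunks(backlog, batch_size):
            cursor.execute(";\n".join(group))
    else:
        # PyMySQL (issue #157) gets exactly one statement per call.
        for statement in backlog:
            cursor.execute(statement)
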
def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
    '''
    Adds a list of revisions to the table. Assumes ordered_rev_list is
    ordered based on how changesets are found in the changelog. Going
    forwards or backwards is dealt with by flipping the list.

    :param ordered_rev_list: Order given from changeset log searching.
    :param timestamp: If False, records are kept indefinitely, but if holes
        exist, e.g. (delete, None, delete, None), those deletes with Nones
        around them will not be deleted.
    :param number_forward: If True, number the revision list going forward
        from max(revnum); otherwise go backwards from min(revnum), then add
        X to all revnums and self.next_revnum, where X is the length of
        ordered_rev_list.
    :return:
    '''
    with self.conn.transaction() as t:
        current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
        current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
        if not current_min or not current_max:
            current_min = 0
            current_max = 0

        direction = -1
        start = current_min - 1
        if number_forward:
            direction = 1
            start = current_max + 1
            ordered_rev_list = ordered_rev_list[::-1]

        insert_list = [
            (start + direction * count, rev, int(time.time()) if timestamp else -1)
            for count, rev in enumerate(ordered_rev_list)
        ]

        # In case of overlapping requests
        fmt_insert_list = []
        for cset_entry in insert_list:
            tmp = self._get_one_revision(t, cset_entry)
            if not tmp:
                fmt_insert_list.append(cset_entry)

        for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
            t.execute(
                "INSERT INTO csetLog (revnum, revision, timestamp)" +
                " VALUES " +
                sql_list(
                    quote_set((revnum, revision, timestamp))
                    for revnum, revision, timestamp in tmp_insert_list
                )
            )

        # Move the revision numbers forward if needed
        self.recompute_table_revnums()

    # Start a maintenance run if needed
    if self.check_for_maintenance():
        Log.note("Scheduling maintenance run on clogger.")
        self.maintenance_signal.go()

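# The numbering rule above, isolated (names are illustrative): counting
# forward starts at max(revnum)+1 over the reversed list; counting backward
# starts at min(revnum)-1 in the given order.
def number_revisions(ordered_rev_list, current_min, current_max, number_forward=True):
    if number_forward:
        direction, start = 1, current_max + 1
        ordered_rev_list = ordered_rev_list[::-1]
    else:
        direction, start = -1, current_min - 1
    return [(start + direction * i, rev) for i, rev in enumerate(ordered_rev_list)]
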
def insert_into_db_chunked(transaction, data, cmd, sql_chunk_size=500):
    # For the `cmd` object, we expect something like (don't forget the whitespace at the end):
    # "INSERT INTO temporal (tuid, file, revision, line) VALUES "
    #
    # `data` must be a list of tuples.
    for _, inserts_list in jx.groupby(data, size=sql_chunk_size):
        transaction.execute(
            cmd + sql_list(quote_set(entry) for entry in inserts_list)
        )

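# Hypothetical usage of the helper above; `transaction` comes from the
# surrounding codebase and the row values are invented for illustration.
rows = [
    (1, "taskcluster/ci/test.py", "ffa6e5c62e7f", 10),
    (2, "taskcluster/ci/test.py", "ffa6e5c62e7f", 11),
]
insert_into_db_chunked(
    transaction,
    rows,
    "INSERT INTO temporal (tuid, file, revision, line) VALUES ",  # trailing space matters
)
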
def test_groupby_values(self):
    self.assertAlmostEqual(
        list(jx.groupby(["ok", "not ok"])),
        [
            ("not ok", ("not ok",)),
            ("ok", ("ok",))
        ],
        "expecting to group by whole value"
    )

def _execute_backlog(self):
    if not self.backlog:
        return

    backlog, self.backlog = self.backlog, []
    for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            self.debug and Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            messages = wrap(self.queue.pop_all())
            if not messages:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                for i, message in enumerate(mm):
                    if message is THREAD_STOP:
                        please_stop.go()
                        continue
                    try:
                        messages = flatten_causal_chain(message.value)
                        scrubbed.append({
                            "value": [_deep_json_to_string(m, depth=3) for m in messages]
                        })
                    except Exception as e:
                        Log.warning("Problem adding to scrubbed list", cause=e)
                self.es.extend(scrubbed)
                bad_count = 0
        except Exception as f:
            Log.warning("Problem inserting logs into ES", cause=f)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning(
                    "Given up trying to write debug logs to ES index {{index}}",
                    index=self.es.settings.index,
                )
            Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

    self.es.flush()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)

def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            return
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    self.es.extend(scrubbed)
                bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning(
                    "Given up trying to write debug logs to ES index {{index}}",
                    index=self.es.settings.index)
            Till(seconds=30).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=1).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)

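# The drain-and-batch pattern used by both _insert_loop variants above,
# sketched with stdlib pieces. Every name here is a stand-in for the
# Thread/Till/es machinery; only the shape of the loop is the point.
import queue
import threading

def drain_in_batches(q, batch_size, send, stop):
    # Pull whatever is available, then ship it in batches of at most batch_size.
    while not stop.is_set():
        batch = []
        try:
            while len(batch) < batch_size:
                batch.append(q.get(timeout=1))
        except queue.Empty:
            pass
        if batch:
            send(batch)  # e.g. es.extend(batch)
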
def process(
    sig_id,
    since,
    source,
    destination,
):
    """
    :param sig_id: The performance hash
    :param since: Only data after this date
    :param source: Database to read from
    :param destination: Container accepting the summary records
    :return:
    """
    if not isinstance(sig_id, int):
        Log.error("expecting id")

    # GET SIGNATURE DETAILS
    sig = get_signature(source, sig_id)

    # GET THE DATA
    pushes = get_dataum(source, sig_id, since, LIMIT)
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(pushes, "push.time")
            if t["push\\.time"] > since
        ],
        "push.time",
    )

    values = list(pushes.value)
    title = "-".join(
        map(
            str,
            [
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository,
            ],
        )
    )
    Log.note("With {{title}}", title=title)

    if len(values) > LIMIT:
        Log.alert(
            "Too many values for {{title}} (at least {{num}}), choosing last {{limit}}",
            title=title,
            num=len(values),
            limit=LIMIT,
        )
        values = values[-LIMIT:]

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold
        )

    if len(new_segments) == 1:
        overall_dev_status = None
        overall_dev_score = None
        last_mean = None
        last_std = None
        last_dev_status = None
        last_dev_score = None
        relative_noise = None
    else:
        # NOISE OF LAST SEGMENT
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment
        last_mean = np.mean(trimmed_segment)
        last_std = np.std(trimmed_segment)
        last_dev_status, last_dev_score = deviance(trimmed_segment)
        relative_noise = last_std / last_mean

        # FOR EACH SEGMENT, NORMALIZE MEAN AND VARIANCE
        normalized = []
        for s, e in jx.pairs(new_segments):
            data = np.array(values[s:e])
            norm = (data + last_mean - np.mean(data)) * last_std / np.std(data)
            normalized.extend(norm)

        overall_dev_status, overall_dev_score = deviance(normalized)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}\n\tpushes={{pushes}}\n\tsegments={{num_segments}}",
            title=title,
            deviance=(overall_dev_status, overall_dev_score),
            std=relative_noise,
            pushes=len(values),
            num_segments=len(new_segments) - 1,
        )

    destination.add(
        Data(
            id=sig_id,
            title=title,
            num_pushes=len(values),
            num_segments=len(new_segments) - 1,
            relative_noise=relative_noise,
            overall_dev_status=overall_dev_status,
            overall_dev_score=overall_dev_score,
            last_mean=last_mean,
            last_std=last_std,
            last_dev_status=last_dev_status,
            last_dev_score=last_dev_score,
            last_updated=Date.now(),
            values=values,
        )
        | scrub(sig)
    )

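# For comparison: the textbook way to map a segment onto a target mean and
# standard deviation. The snippet above uses the more compact expression
# (data + last_mean - np.mean(data)) * last_std / np.std(data), which shifts
# before scaling; this sketch shows the conventional shift-after-scale form.
import numpy as np

def normalize_segment(data, target_mean, target_std):
    # Shift to zero mean, scale to unit std, then map onto the target stats.
    data = np.asarray(data, dtype=float)
    return (data - data.mean()) / data.std() * target_std + target_mean
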
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2
            multi = 1
        else:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results.multi.value, 1))

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": Date.now()
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": Date.now()
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    now = Date.now()
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2

            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": [False, True],
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        else:
            es_query = {
                "aggs": {
                    "count": _counting_query(column),
                    "_filter": {
                        "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                        "filter": {"bool": {"should": [
                            {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                            {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                        ]}}
                    }
                },
                "size": 0
            }

            result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results._filter.multi.value, 1))

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        DEBUG and Log.note(
            "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
            id=id(column), column=column, time=now)
        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": now
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        elif "No field found for" in e:
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
        else:
            self.meta.columns.update({
                "set": {"last_updated": now},
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

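# Illustrative partitions request for a simple (non-nested) column whose
# cardinality is already known, matching the terms aggregation the code
# above builds ("build.type" is a stand-in field name):
query = {
    "aggs": {"_": {"terms": {"field": "build.type", "size": 3}}},
    "size": 0,
}
# The code then reads the bucket keys, roughly:
# parts = jx.sort(result.aggregations._.buckets.key)
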
def groupby(self, keys):
    return jx.groupby(self.__iter__(), keys)

def process(
    about_deviant,
    since,
    source,
    deviant_summary,
    show=False,
    show_limit=MAX_POINTS,
    show_old=False,
    show_distribution=None,
):
    """
    :param about_deviant: The performance signature being processed
    :param since: Only data after this date
    :param show:
    :param show_limit:
    :param show_old:
    :param show_distribution:
    :return:
    """
    sig_id = about_deviant.id
    if not isinstance(sig_id, int):
        Log.error("expecting id")

    # GET SIGNATURE DETAILS
    sig = get_signature(db_config=source, signature_id=sig_id)

    # GET THE DATA
    data = get_dataum(source, sig.id, since=since, limit=show_limit)

    min_date = since.unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = list(pushes.value)
    title = "-".join(
        map(
            str,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.repository,
                sig.platform,
                about_deviant.overall_dev_status,
            ],
        )
    )
    # EG https://treeherder.mozilla.org/perf.html#/graphs?highlightAlerts=1&series=mozilla-central,fee739b45f7960e4a520d8e0bd781dd9d0a3bec4,1,10&timerange=31536000
    url = "https://treeherder.mozilla.org/perf.html#/graphs?" + value2url_param({
        "highlightAlerts": 1,
        "series": [sig.repository, sig.id, 1, coalesce(sig.framework_id, sig.framework)],
        "timerange": Duration(TREEHERDER_RANGE).seconds,
    })
    Log.note("With {{title}}: {{url}}", title=title, url=url)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(
            values, sig.alert_change_type, sig.alert_threshold
        )

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)]
            )
        )
    )
    old_medians = [0.0] + [
        np.median(values[s:e]) for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0]
    )

    if len(new_segments) == 1:
        overall_dev_status = None
        overall_dev_score = None
        last_mean = None
        last_std = None
        last_dev_status = None
        last_dev_score = None
        relative_noise = None
        Log.note("not ")
    else:
        # NOISE OF LAST SEGMENT
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        ignore = IGNORE_TOP
        trimmed_segment = last_segment[np.argsort(last_segment)[ignore:-ignore]]
        last_mean = np.mean(trimmed_segment)
        last_std = np.std(trimmed_segment)
        last_dev_status, last_dev_score = deviance(trimmed_segment)
        relative_noise = last_std / last_mean

        # FOR EACH SEGMENT, NORMALIZE MEAN AND VARIANCE
        normalized = []
        for s, e in jx.pairs(new_segments):
            data = np.array(values[s:e])
            norm = (data + last_mean - np.mean(data)) * last_std / np.std(data)
            normalized.extend(norm)

        overall_dev_status, overall_dev_score = deviance(normalized)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}\n\tpushes={{pushes}}\n\tsegments={{num_segments}}",
            title=title,
            deviance=(overall_dev_status, overall_dev_score),
            std=relative_noise,
            pushes=len(values),
            num_segments=len(new_segments) - 1,
        )

        if show_distribution:
            histogram(trimmed_segment, title=last_dev_status + "=" + text(last_dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLERANCE <= o <= s + TOLERANCE) for o in old_segments)
        )
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLERANCE <= n <= s + TOLERANCE) for n in new_segments)
        )
        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
    else:
        Log.note("Agree")

    if show and len(pushes):
        show_old and assign_colors(values, old_segments, title="OLD " + title)
        assign_colors(values, new_segments, title="NEW " + title)
        if url:
            webbrowser.open(url)

    if isinstance(deviant_summary, bigquery.Table):
        Log.note("BigQuery summary not updated")
        return

    deviant_summary.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig_id,
            title=title,
            num_pushes=len(values),
            num_segments=len(new_segments) - 1,
            relative_noise=relative_noise,
            overall_dev_status=overall_dev_status,
            overall_dev_score=overall_dev_score,
            last_mean=last_mean,
            last_std=last_std,
            last_dev_status=last_dev_status,
            last_dev_score=last_dev_score,
            last_updated=Date.now(),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
        ),
    )

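# The boundary-matching rule behind max_extra_diff/max_missing_diff above,
# in isolation. The TOLERANCE value here is an assumption for illustration;
# the real constant is defined elsewhere in the module.
TOLERANCE = 2

def unmatched(boundaries, others, tolerance=TOLERANCE):
    # A boundary is unmatched when no counterpart lies within `tolerance`.
    return [
        b for b in boundaries
        if all(not (b - tolerance <= o <= b + tolerance) for o in others)
    ]
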
def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
        """,
        param=self.settings.database,
    )

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            lhs, rhs = map(strings.trim, r.split("->"))
            lhs = lhs.split(".")
            if len(lhs) == 2:
                lhs = [self.settings.database.schema] + lhs
            rhs = rhs.split(".")
            if len(rhs) == 2:
                rhs = [self.settings.database.schema] + rhs
            to_add = Data(
                ordinal_position=1,  # CAN ONLY HANDLE 1-COLUMN RELATIONS
                table_schema=lhs[0],
                table_name=lhs[1],
                column_name=lhs[2],
                referenced_table_schema=rhs[0],
                referenced_table_name=rhs[1],
                referenced_column_name=rhs[2],
            )

            # CHECK IF EXISTING
            if jx.filter(raw_relations, {"eq": to_add}):
                Log.note("Relation {{relation}} already exists", relation=r)
                continue

            to_add.constraint_name = Random.hex(20)
            raw_relations.append(to_add)
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(
        raw_relations,
        [
            {"name": "constraint.name", "value": "constraint_name"},
            {"name": "table.schema", "value": "table_schema"},
            {"name": "table.name", "value": "table_name"},
            {"name": "column.name", "value": "column_name"},
            {"name": "referenced.table.schema", "value": "referenced_table_schema"},
            {"name": "referenced.table.name", "value": "referenced_table_name"},
            {"name": "referenced.column.name", "value": "referenced_column_name"},
            {"name": "ordinal_position", "value": "ordinal_position"},
        ],
    )

    # GET ALL TABLES
    raw_tables = self.db.query("""
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c on c.table_name=t.table_name AND c.table_schema=t.table_schema and (constraint_type='UNIQUE' or constraint_type='PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k on k.constraint_name=c.constraint_name AND k.table_name=t.table_name and k.table_schema=t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
    """)

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if (
                    r.table.name == t.table_name
                    and r.table.schema == t.table_schema
                    and r.constraint.name == g.constraint_name
                ):
                    ref = True
            is_prime = w[0].constraint_type == "PRIMARY"

            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index),  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index],
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id,
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i,
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query("""
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
    """)

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema) for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples | referenced_column_table_schema_triples
    )

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,  # TRUE IF THIS COLUMN IS OUTPUTTED
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign,  # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if self.path_not_allowed(path):
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
            self.db.query(
                ConcatSQL(
                    SQL_SELECT,
                    SQL_STAR,
                    SQL_FROM,
                    quote_column(position.schema, position.name),
                    SQL_LIMIT,
                    SQL_ONE,
                )
            )

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        ###############################################################################
        # INNER OBJECTS
        ###############################################################################
        referenced_tables = list(
            sort_using_key(
                jx.groupby(
                    jx.filter(
                        relations,
                        {"eq": {"table.name": position.name, "table.schema": position.schema}},
                    ),
                    "constraint.name",
                ),
                key=lambda p: first(p[1]).column.name,
            )
        )
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path,
            })

            # HANDLE THE COMMON *id SUFFIX
            name = []
            for cname, tname in zip(
                constraint_columns.column.name,
                constraint_columns.referenced.table.name,
            ):
                if cname.startswith(tname):
                    name.append(tname)
                elif cname.endswith("_id"):
                    name.append(cname[:-3])
                else:
                    name.append(cname)

            relation_string = many_to_one_string(constraint_columns[0])
            step = "/".join(name)
            if len(constraint_columns) == 1:
                step = self.name_relations.get(relation_string, step)

            referenced_column_path = concat_field(path, step)
            if self.path_not_allowed(referenced_column_path):
                continue

            if referenced_column_path in reference_only_tables:
                continue

            col_pointer_name = relative_field(referenced_column_path, nested_path[0])
            for col in columns:
                if (
                    col.table.name == constraint_columns[0].referenced.table.name
                    and col.table.schema == constraint_columns[0].referenced.table.schema
                ):
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if (
                        col.is_id
                        and len(nested_path) == 1
                        and col.table.name == fact_table.name
                        and col.table.schema == fact_table.schema
                    ):
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None,
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name
                            if not self.settings.show_foreign_keys
                            else col_full_name,  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name,
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(
                Data(
                    position=copy(constraint_columns[0].referenced.table),
                    path=referenced_column_path,
                    nested_path=nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs,
                )
            )

        ###############################################################################
        # NESTED OBJECTS
        ###############################################################################
        if not no_nested_docs:
            nesting_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}},
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: [(r.table.name, r.column.name) for r in [first(p[1])]][0],
                )
            )

            for g, constraint_columns in nesting_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                relation_string = one_to_many_string(constraint_columns[0])
                step = "/".join(many_table)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                if referenced_column_path in nested_path_to_join:
                    Log.error(
                        "{{path}} already exists, try adding entry to name_relations",
                        path=referenced_column_path,
                    )
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(
                    set_default(
                        {},
                        g,
                        {
                            "children": True,
                            "join_columns": constraint_columns,
                            "path": path,
                            "nested_path": nested_path,
                        },
                    )
                )

                for col in columns:
                    if (
                        col.table.name == constraint_columns[0].table.name
                        and col.table.schema == constraint_columns[0].table.schema
                    ):
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                            + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None,
                            })

                todo.append(
                    Data(
                        position=constraint_columns[0].table,
                        path=referenced_column_path,
                        nested_path=new_nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    )
                )

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path,
    }]

    todo.append(
        Data(
            position=ids_table,
            path=path,
            nested_path=nested_path,
            done_relations=set(),
            no_nested_docs=False,
        )
    )

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns

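# The "common *id suffix" rule from follow_paths, isolated: a foreign-key
# column named after its target table keeps the table name, a trailing
# "_id" is dropped, and anything else is used as-is.
def relation_step(column_name, referenced_table_name):
    if column_name.startswith(referenced_table_name):
        return referenced_table_name
    if column_name.endswith("_id"):
        return column_name[:-3]
    return column_name

assert relation_step("person_id", "person") == "person"
assert relation_step("owner_id", "person") == "owner"
assert relation_step("code", "person") == "code"
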
def groupby(self, keys):
    with self.locker:
        self._update_meta()
        return jx.groupby(self.__iter__(), keys)

def process(sig_id, show=False, show_limit=MAX_POINTS, show_old=True, show_distribution=None):
    if not mo_math.is_integer(sig_id):
        Log.error("expecting integer id")
    sig = first(get_signature(config.database, sig_id))
    data = get_dataum(config.database, sig_id)

    min_date = (Date.today() - 3 * MONTH).unix
    pushes = jx.sort(
        [
            {
                "value": median(rows.value),
                "runs": rows,
                "push": {"time": unwrap(t)["push.time"]},
            }
            for t, rows in jx.groupby(data, "push.time")
            if t["push\\.time"] > min_date
        ],
        "push.time",
    )

    values = pushes.value
    title = "-".join(
        map(
            text,
            [
                sig.id,
                sig.framework,
                sig.suite,
                sig.test,
                sig.platform,
                sig.repository.name,
            ],
        )
    )
    Log.note("With {{title}}", title=title)

    with Timer("find segments"):
        new_segments, new_diffs = find_segments(values, sig.alert_change_type, sig.alert_threshold)

    # USE PERFHERDER ALERTS TO IDENTIFY OLD SEGMENTS
    old_segments = tuple(
        sorted(
            set(
                [i for i, p in enumerate(pushes) if any(r.alert.id for r in p.runs)]
                + [0, len(pushes)]
            )
        )
    )
    old_medians = [0] + [
        np.median(values[s:e]) for s, e in zip(old_segments[:-1], old_segments[1:])
    ]
    old_diffs = np.array(
        [b / a - 1 for a, b in zip(old_medians[:-1], old_medians[1:])] + [0]
    )

    if len(new_segments) == 1:
        dev_status = None
        dev_score = None
        relative_noise = None
    else:
        # MEASURE DEVIANCE (USE THE LAST SEGMENT)
        s, e = new_segments[-2], new_segments[-1]
        last_segment = np.array(values[s:e])
        trimmed_segment = last_segment[np.argsort(last_segment)[IGNORE_TOP:-IGNORE_TOP]]
        dev_status, dev_score = deviance(trimmed_segment)
        relative_noise = np.std(trimmed_segment) / np.mean(trimmed_segment)
        Log.note(
            "\n\tdeviance = {{deviance}}\n\tnoise={{std}}",
            title=title,
            deviance=(dev_status, dev_score),
            std=relative_noise,
        )

        if show_distribution:
            histogram(last_segment, title=dev_status + "=" + text(dev_score))

    max_extra_diff = None
    max_missing_diff = None
    _is_diff = is_diff(new_segments, old_segments)
    if _is_diff:
        # FOR MISSING POINTS, CALC BIGGEST DIFF
        max_extra_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(new_segments, new_diffs)
            if all(not (s - TOLLERANCE <= o <= s + TOLLERANCE) for o in old_segments)
        )
        max_missing_diff = mo_math.MAX(
            abs(d)
            for s, d in zip(old_segments, old_diffs)
            if all(not (s - TOLLERANCE <= n <= s + TOLLERANCE) for n in new_segments)
        )
        Log.alert(
            "Disagree max_extra_diff={{max_extra_diff|round(places=3)}}, max_missing_diff={{max_missing_diff|round(places=3)}}",
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
        )
        Log.note("old={{old}}, new={{new}}", old=old_segments, new=new_segments)
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)
    else:
        Log.note("Agree")
        if show and len(pushes):
            show_old and assign_colors(values, old_segments, title="OLD " + title)
            assign_colors(values, new_segments, title="NEW " + title)

    summary_table.upsert(
        where={"eq": {"id": sig.id}},
        doc=Data(
            id=sig.id,
            title=title,
            num_pushes=len(pushes),
            is_diff=_is_diff,
            max_extra_diff=max_extra_diff,
            max_missing_diff=max_missing_diff,
            num_new_segments=len(new_segments),
            num_old_segments=len(old_segments),
            relative_noise=relative_noise,
            dev_status=dev_status,
            dev_score=dev_score,
            last_updated=Date.now(),
        ),
    )

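# The outlier trim used above, isolated: drop the `ignore` smallest and
# largest samples before measuring noise. The IGNORE_TOP value here is an
# assumption for illustration; the real constant is defined in the module.
import numpy as np

IGNORE_TOP = 3

def trim_extremes(segment, ignore=IGNORE_TOP):
    segment = np.asarray(segment)
    return segment[np.argsort(segment)[ignore:-ignore]]
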
def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
    if position.name in self.settings.exclude:
        return
    if DEBUG:
        Log.note("Trace {{path}}", path=path)
    if position.name != "__ids__":
        # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
        self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

    if position.name in reference_all_tables:
        no_nested_docs = True
    if position.name in reference_only_tables:
        return

    curr_join_list = copy(nested_path_to_join[nested_path[0]])

    # INNER OBJECTS
    referenced_tables = list(
        jx.groupby(
            jx.filter(
                relations,
                {"eq": {"table.name": position.name, "table.schema": position.schema}}),
            "constraint.name"))
    for g, constraint_columns in referenced_tables:
        g = unwrap(g)
        constraint_columns = deepcopy(constraint_columns)
        if g["constraint.name"] in done_relations:
            continue
        if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
            continue

        done_relations.add(g["constraint.name"])

        many_to_one_joins = nested_path_to_join[nested_path[0]]
        index = len(many_to_one_joins)
        alias = "t" + text_type(index)
        for c in constraint_columns:
            c.referenced.table.alias = alias
            c.table = position
        many_to_one_joins.append({
            "join_columns": constraint_columns,
            "path": path,
            "nested_path": nested_path
        })

        # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
        # HANDLE THE COMMON *id SUFFIX
        name = []
        for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
            if a.startswith(b):
                name.append(b)
            elif a.endswith("_id"):
                name.append(a[:-3])
            else:
                name.append(a)

        referenced_column_path = join_field(split_field(path) + ["/".join(name)])
        col_pointer_name = relative_field(referenced_column_path, nested_path[0])
        # insert into nested1 VALUES (100, 10, 'aaa', -1);
        # id.about.time.nested1 .ref=10
        # id.about.time.nested1 .ref.name
        for col in columns:
            if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                    # ALWAYS SHOW THE ID OF THE FACT
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": True,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })
                elif col.column.name == constraint_columns[0].column.name:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.is_id:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name if self.settings.show_foreign_keys else None
                    })
                elif col.reference:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                    })
                elif col.include:
                    c_index = len(output_columns)
                    output_columns.append({
                        "table_alias": alias,
                        "column_alias": "c" + text_type(c_index),
                        "column": col,
                        "sort": False,
                        "path": referenced_column_path,
                        "nested_path": nested_path,
                        "put": col_full_name
                    })

        if position.name in reference_only_tables:
            continue

        todo.append(
            Data(position=copy(constraint_columns[0].referenced.table),
                 path=referenced_column_path,
                 nested_path=nested_path,
                 done_relations=copy(done_relations),
                 no_nested_docs=no_nested_docs))

    # NESTED OBJECTS
    if not no_nested_docs:
        for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}),
                "constraint.name"):
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            done_relations.add(g["constraint.name"])

            many_table = set(constraint_columns.table.name)
            if not (many_table - self.settings.exclude):
                continue

            referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
            new_nested_path = [referenced_column_path] + nested_path
            all_nested_paths.append(new_nested_path)

            # if new_path not in self.settings.include:
            #     Log.note("Exclude nested path {{path}}", path=new_path)
            #     continue
            one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
            index = len(one_to_many_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.table.alias = alias
                c.referenced.table = position
            one_to_many_joins.append(
                set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                    col_full_name = join_field(
                        split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                        + [literal_field(col.column.name)])

                    if col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    else:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": col.is_id,
                            "path": referenced_column_path,
                            "nested_path": new_nested_path,
                            "put": col_full_name if col.include else None
                        })

            todo.append(
                Data(position=constraint_columns[0].table,
                     path=referenced_column_path,
                     nested_path=new_nested_path,
                     done_relations=copy(done_relations),
                     no_nested_docs=no_nested_docs))

def _scan_database(self):
    # GET ALL RELATIONS
    raw_relations = self.db.query(
        """
        SELECT
            table_schema,
            table_name,
            referenced_table_schema,
            referenced_table_name,
            referenced_column_name,
            constraint_name,
            column_name,
            ordinal_position
        FROM
            information_schema.key_column_usage
        WHERE
            referenced_column_name IS NOT NULL
        """,
        param=self.settings.database
    )

    if not raw_relations:
        Log.error("No relations in the database")

    for r in self.settings.add_relations:
        try:
            a, b = map(strings.trim, r.split("->"))
            a = a.split(".")
            b = b.split(".")
            raw_relations.append(Data(
                table_schema=a[0],
                table_name=a[1],
                referenced_table_schema=b[0],
                referenced_table_name=b[1],
                referenced_column_name=b[2],
                constraint_name=Random.hex(20),
                column_name=a[2],
                ordinal_position=1
            ))
        except Exception as e:
            Log.error("Could not parse {{line|quote}}", line=r, cause=e)

    relations = jx.select(raw_relations, [
        {"name": "constraint.name", "value": "constraint_name"},
        {"name": "table.schema", "value": "table_schema"},
        {"name": "table.name", "value": "table_name"},
        {"name": "column.name", "value": "column_name"},
        {"name": "referenced.table.schema", "value": "referenced_table_schema"},
        {"name": "referenced.table.name", "value": "referenced_table_name"},
        {"name": "referenced.column.name", "value": "referenced_column_name"},
        {"name": "ordinal_position", "value": "ordinal_position"}
    ])

    # GET ALL TABLES
    raw_tables = self.db.query(
        """
        SELECT
            t.table_schema,
            t.table_name,
            c.constraint_name,
            c.constraint_type,
            k.column_name,
            k.ordinal_position
        FROM
            information_schema.tables t
        LEFT JOIN
            information_schema.table_constraints c ON c.table_name = t.table_name AND c.table_schema = t.table_schema AND (constraint_type = 'UNIQUE' OR constraint_type = 'PRIMARY KEY')
        LEFT JOIN
            information_schema.key_column_usage k ON k.constraint_name = c.constraint_name AND k.table_name = t.table_name AND k.table_schema = t.table_schema
        ORDER BY
            t.table_schema,
            t.table_name,
            c.constraint_name,
            k.ordinal_position,
            k.column_name
        """,
        param=self.settings.database
    )

    # ORGANIZE, AND PICK ONE UNIQUE CONSTRAINT FOR LINKING
    tables = UniqueIndex(keys=["name", "schema"])
    for t, c in jx.groupby(raw_tables, ["table_name", "table_schema"]):
        c = wrap(list(c))
        best_index = Null
        is_referenced = False
        is_primary = False
        for g, w in jx.groupby(c, "constraint_name"):
            if not g.constraint_name:
                continue
            w = list(w)
            ref = False
            for r in relations:
                if r.table.name == t.table_name and r.table.schema == t.table_schema and r.constraint.name == g.constraint_name:
                    ref = True
            # constraint_type IS 'PRIMARY KEY' OR 'UNIQUE' (SEE QUERY ABOVE)
            is_prime = w[0].constraint_type == "PRIMARY KEY"
            reasons_this_one_is_better = [
                best_index == None,  # WE DO NOT HAVE A CANDIDATE YET
                is_prime and not is_primary,  # PRIMARY KEYS ARE GOOD TO HAVE
                is_primary == is_prime and ref and not is_referenced,  # REFERENCED UNIQUE TUPLES ARE GOOD TOO
                is_primary == is_prime and ref == is_referenced and len(w) < len(best_index)  # THE SHORTER THE TUPLE, THE BETTER
            ]
            if any(reasons_this_one_is_better):
                is_primary = is_prime
                is_referenced = ref
                best_index = w

        tables.add({
            "name": t.table_name,
            "schema": t.table_schema,
            "id": [b.column_name for b in best_index]
        })

    fact_table = tables[self.settings.fact_table, self.settings.database.schema]
    ids_table = {
        "alias": "t0",
        "name": "__ids__",
        "schema": fact_table.schema,
        "id": fact_table.id
    }
    relations.extend(
        wrap({
            "constraint": {"name": "__link_ids_to_fact_table__"},
            "table": ids_table,
            "column": {"name": c},
            "referenced": {"table": fact_table, "column": {"name": c}},
            "ordinal_position": i
        })
        for i, c in enumerate(fact_table.id)
    )
    tables.add(ids_table)

    # GET ALL COLUMNS
    raw_columns = self.db.query(
        """
        SELECT
            column_name,
            table_schema,
            table_name,
            ordinal_position,
            data_type
        FROM
            information_schema.columns
        """,
        param=self.settings.database
    )

    reference_only_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 2
    ]
    reference_all_tables = [
        r.split(".")[0]
        for r in self.settings.reference_only
        if len(r.split(".")) == 1
    ]
    foreign_column_table_schema_triples = {
        (r.column.name, r.table.name, r.table.schema)
        for r in relations
    }
    referenced_column_table_schema_triples = {
        (r.referenced.column.name, r.referenced.table.name, r.referenced.table.schema)
        for r in relations
    }
    related_column_table_schema_triples = (
        foreign_column_table_schema_triples | referenced_column_table_schema_triples
    )

    columns = UniqueIndex(["column.name", "table.name", "table.schema"])
    for c in raw_columns:
        if c.table_name in reference_only_tables:
            if c.table_name + "." + c.column_name in self.settings.reference_only:
                include = True
                reference = True
                foreign = False
            elif c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = False
                foreign = False
            else:
                include = False
                reference = False
                foreign = False
        elif c.table_name in reference_all_tables:
            # TABLES USED FOR REFERENCE, NO NESTED DOCUMENTS EXPECTED
            if c.column_name in tables[(c.table_name, c.table_schema)].id:
                include = self.settings.show_foreign_keys
                reference = True
                foreign = False
            elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
                include = False
                reference = False
                foreign = True
            else:
                include = True
                reference = False
                foreign = False
        elif c.column_name in tables[(c.table_name, c.table_schema)].id:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        elif (c.column_name, c.table_name, c.table_schema) in foreign_column_table_schema_triples:
            include = False
            reference = False
            foreign = True
        elif (c.column_name, c.table_name, c.table_schema) in referenced_column_table_schema_triples:
            include = self.settings.show_foreign_keys
            reference = False
            foreign = False
        else:
            include = True
            reference = False
            foreign = False

        rel = {
            "column": {"name": c.column_name, "type": c.data_type},
            "table": {"name": c.table_name, "schema": c.table_schema},
            "ordinal_position": c.ordinal_position,
            "is_id": c.column_name in tables[(c.table_name, c.table_schema)].id,
            "include": include,      # TRUE IF THIS COLUMN IS OUTPUT
            "reference": reference,  # TRUE IF THIS COLUMN REPRESENTS THE ROW
            "foreign": foreign       # TRUE IF THIS COLUMN POINTS TO ANOTHER ROW
        }
        columns.add(rel)

    # ITERATE OVER ALL PATHS
    todo = FlatList()
    output_columns = FlatList()
    nested_path_to_join = {}
    all_nested_paths = [["."]]

    def follow_paths(position, path, nested_path, done_relations, no_nested_docs):
        if position.name in self.settings.exclude:
            return
        if DEBUG:
            Log.note("Trace {{path}}", path=path)
        if position.name != "__ids__":
            # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
            self.db.query("SELECT * FROM " + quote_column(position.name, position.schema) + " LIMIT 1")

        if position.name in reference_all_tables:
            no_nested_docs = True
        if position.name in reference_only_tables:
            return

        curr_join_list = copy(nested_path_to_join[nested_path[0]])

        # INNER OBJECTS
        referenced_tables = list(jx.groupby(
            jx.filter(
                relations,
                {"eq": {"table.name": position.name, "table.schema": position.schema}}
            ),
            "constraint.name"
        ))
        for g, constraint_columns in referenced_tables:
            g = unwrap(g)
            constraint_columns = deepcopy(constraint_columns)
            if g["constraint.name"] in done_relations:
                continue
            if any(cc for cc in constraint_columns if cc.referenced.table.name in self.settings.exclude):
                continue

            done_relations.add(g["constraint.name"])

            many_to_one_joins = nested_path_to_join[nested_path[0]]
            index = len(many_to_one_joins)
            alias = "t" + text_type(index)
            for c in constraint_columns:
                c.referenced.table.alias = alias
                c.table = position
            many_to_one_joins.append({
                "join_columns": constraint_columns,
                "path": path,
                "nested_path": nested_path
            })

            # referenced_table_path = join_field(split_field(path) + ["/".join(constraint_columns.referenced.table.name)])
            # HANDLE THE COMMON *id SUFFIX
            name = []
            for a, b in zip(constraint_columns.column.name, constraint_columns.referenced.table.name):
                if a.startswith(b):
                    name.append(b)
                elif a.endswith("_id"):
                    name.append(a[:-3])
                else:
                    name.append(a)
            referenced_column_path = join_field(split_field(path) + ["/".join(name)])
            col_pointer_name = relative_field(referenced_column_path, nested_path[0])

            # insert into nested1 VALUES (100, 10, 'aaa', -1);
            # id.about.time.nested1 .ref=10
            # id.about.time.nested1 .ref.name
            for col in columns:
                if col.table.name == constraint_columns[0].referenced.table.name and col.table.schema == constraint_columns[0].referenced.table.schema:
                    col_full_name = concat_field(col_pointer_name, literal_field(col.column.name))

                    if col.is_id and col.table.name == fact_table.name and col.table.schema == fact_table.schema:
                        # ALWAYS SHOW THE ID OF THE FACT
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": True,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })
                    elif col.column.name == constraint_columns[0].column.name:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.is_id:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name if self.settings.show_foreign_keys else None
                        })
                    elif col.reference:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name  # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                        })
                    elif col.include:
                        c_index = len(output_columns)
                        output_columns.append({
                            "table_alias": alias,
                            "column_alias": "c" + text_type(c_index),
                            "column": col,
                            "sort": False,
                            "path": referenced_column_path,
                            "nested_path": nested_path,
                            "put": col_full_name
                        })

            if position.name in reference_only_tables:
                continue

            todo.append(Data(
                position=copy(constraint_columns[0].referenced.table),
                path=referenced_column_path,
                nested_path=nested_path,
                done_relations=copy(done_relations),
                no_nested_docs=no_nested_docs
            ))

        # NESTED OBJECTS
        if not no_nested_docs:
            for g, constraint_columns in jx.groupby(
                jx.filter(
                    relations,
                    {"eq": {"referenced.table.name": position.name, "referenced.table.schema": position.schema}}
                ),
                "constraint.name"
            ):
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                done_relations.add(g["constraint.name"])

                many_table = set(constraint_columns.table.name)
                if not (many_table - self.settings.exclude):
                    continue

                referenced_column_path = join_field(split_field(path) + ["/".join(many_table)])
                new_nested_path = [referenced_column_path] + nested_path
                all_nested_paths.append(new_nested_path)

                # if new_path not in self.settings.include:
                #     Log.note("Exclude nested path {{path}}", path=new_path)
                #     continue
                one_to_many_joins = nested_path_to_join[referenced_column_path] = copy(curr_join_list)
                index = len(one_to_many_joins)
                alias = "t" + text_type(index)
                for c in constraint_columns:
                    c.table.alias = alias
                    c.referenced.table = position
                one_to_many_joins.append(set_default({}, g, {
                    "children": True,
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path
                }))

                # insert into nested1 VALUES (100, 10, 'aaa', -1);
                # id.about.time.nested1 .ref=10
                # id.about.time.nested1 .ref.name
                for col in columns:
                    if col.table.name == constraint_columns[0].table.name and col.table.schema == constraint_columns[0].table.schema:
                        col_full_name = join_field(
                            split_field(referenced_column_path)[len(split_field(new_nested_path[0])):]
                            + [literal_field(col.column.name)]
                        )

                        if col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None
                            })
                        else:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text_type(c_index),
                                "column": col,
                                "sort": col.is_id,
                                "path": referenced_column_path,
                                "nested_path": new_nested_path,
                                "put": col_full_name if col.include else None
                            })

                todo.append(Data(
                    position=constraint_columns[0].table,
                    path=referenced_column_path,
                    nested_path=new_nested_path,
                    done_relations=copy(done_relations),
                    no_nested_docs=no_nested_docs
                ))

    path = "."
    nested_path = [path]
    nested_path_to_join["."] = [{
        "path": path,
        "join_columns": [{"referenced": {"table": ids_table}}],
        "nested_path": nested_path
    }]
    todo.append(Data(
        position=ids_table,
        path=path,
        nested_path=nested_path,
        done_relations=set(),
        no_nested_docs=False
    ))

    while todo:
        item = todo.pop(0)
        follow_paths(**item)

    self.all_nested_paths = all_nested_paths
    self.nested_path_to_join = nested_path_to_join
    self.columns = output_columns
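# A minimal, standalone sketch of the "*id suffix" rule follow_paths uses above
# to pick the property name a single foreign-key column contributes to the
# document path. The column and table names here are hypothetical, for
# illustration only.
def derive_name(column_name, referenced_table_name):
    if column_name.startswith(referenced_table_name):
        return referenced_table_name  # "person_id" REFERENCING "person" -> "person"
    if column_name.endswith("_id"):
        return column_name[:-3]       # "owner_id" REFERENCING "person" -> "owner"
    return column_name                # ANY OTHER NAME IS KEPT AS-IS


assert derive_name("person_id", "person") == "person"
assert derive_name("owner_id", "person") == "owner"
assert derive_name("manager", "person") == "manager"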
def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if column.es_index in self.index_does_not_exist:
        return

    if column.type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        if column.es_index == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return

        es_index = column.es_index.split(".")[0]

        is_text = [
            cc
            for cc in self.abs_columns
            if cc.es_column == column.es_column and cc.type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = 1001
            multi = 1001
        elif column.es_column == "_id":
            result = self.default_es.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        else:
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {
                    "count": _counting_query(column),
                    "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                },
                "size": 0
            })
            r = result.aggregations.count
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, r.doc_count)
            multi = coalesce(r.multi.value, 1)

        if cardinality == None:
            Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            if DEBUG:
                Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            if DEBUG:
                Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        if DEBUG:
            Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SET UP LOGGING
        # from tests.test_jx import TEST_TABLE
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE])
        if is_missing_index and is_test_table:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
            self.index_does_not_exist.add(column.es_index)
        else:
            self.meta.columns.update({
                "set": {"last_updated": Date.now()},
                "clear": ["count", "cardinality", "multi", "partitions"],
                "where": {"eq": {"names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
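# A minimal sketch (not the method above) of the thresholds _update_cardinality
# uses to decide whether a column's partitions are worth fetching and storing.
# The function and parameter names are hypothetical, for illustration only.
def should_store_partitions(count, cardinality, is_numeric, is_id):
    if is_id:
        return False  # _id IS UNIQUE BY DEFINITION
    if cardinality > 1000:
        return False  # TOO MANY PARTS TO BE USEFUL
    if count >= 30 and cardinality == count:
        return False  # EVERY VALUE IS DISTINCT
    if count >= 1000 and cardinality / count > 0.99:
        return False  # EFFECTIVELY UNIQUE
    if is_numeric and cardinality > 30:
        return False  # CONTINUOUS VALUES, NOT CATEGORICAL
    return True


assert should_store_partitions(count=500, cardinality=12, is_numeric=False, is_id=False)
assert not should_store_partitions(count=50, cardinality=50, is_numeric=False, is_id=False)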