def check_data_valume_diff(db, table):
    """Record the row-count delta for `table` since the last recorded diff.

    Reads the timestamp of the most recent diff row from
    metrics_data_volume_diff, counts rows newer than that, and inserts a
    fresh diff row. Prefers the backend-specific `check_data_volume`;
    falls back to a generic parameterized count query.
    """
    last_run = metrics_db.execute(
        text("""
            SELECT max(created_at) as created_at
            FROM metrics_data_volume_diff
            WHERE table_id = :table_id
        """),
        {'table_id': table.id}).first()

    from_time = last_run.created_at if last_run else None
    if from_time is None:
        # No previous diff computed yet: start from midnight today,
        # mostly because this stat is shown daily.
        from_time = datetime.combine(date.today(), time())

    try:
        result = db.check_data_volume(table, where_timecol=f">= '{from_time}'")
    except AttributeError:
        # Backend has no specialized implementation; run a plain count.
        result = db.execute(
            text(f"""
                SELECT count(*) as count
                FROM {table.table_name}
                WHERE {table.time_column} >= :from_time
            """),
            {'from_time': from_time}).first()

    diff_table = metadata.tables['metrics_data_volume_diff']
    metrics_db.execute(
        diff_table.insert().values(table_id=table.id,
                                   from_time=from_time,
                                   count=result.count))
def insert_schema_changed_record(table, operation, column_name, column_type, column_count):
    """Append one row to metrics_table_schema_changes describing a schema event.

    :param table: monitored-table record (provides .id)
    :param operation: textual description of the schema change
    :param column_name: affected column's name
    :param column_type: affected column's type
    :param column_count: column count after the change
    """
    changes_table = metadata.tables['metrics_table_schema_changes']
    insert_stmt = changes_table.insert().values(table_id=table.id,
                                                operation=operation,
                                                column_name=column_name,
                                                column_type=column_type,
                                                column_count=column_count)
    metrics_db.execute(insert_stmt)
def create_sample_tables_in_redata():
    """Create demo tables in the `sample` schema and fill them with events.

    Generates 5000-10000 random events spread over roughly the last month
    (~3M seconds). Three time windows are special-cased to make the demo
    metrics interesting:
      * failure_range       - timestamps get re-rolled (volume dip)
      * higher_values_range - values are scaled up (value spike)
      * nulls_show_up       - uid becomes NULL (null-rate bump)
    """
    partners = [
        'team1', 'team2', 'team3', 'team4', 'team5', 'team6', 'team7',
        'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7',
    ]
    types = ['OPEN', 'CLICK', 'EXIT']

    metrics_db.execute("CREATE SCHEMA IF NOT EXISTS sample")
    for partner in partners:
        metrics_db.execute(EVENTS.substitute(partner=partner))
    metrics_db.execute(SESSION)
    metrics_db.execute(REPORT)

    num_events = random.randint(5000, 10000)
    utc_now = datetime.utcnow()

    secs_in_month = 3000000
    failure_range = [1200000, 1400000]
    higher_values_range = [200000, 600000]
    nulls_show_up = [800000, 1000000]

    for event in range(num_events):
        print("Creating event ", event)
        how_much_back = random.randint(0, secs_in_month)
        rand_value = random.random() * 100
        random_uid = 'uid_{}'.format(random.randint(0, 1000))

        if failure_range[0] < how_much_back < failure_range[1]:
            # Re-roll the timestamp so this window looks sparse.
            how_much_back = random.randint(0, secs_in_month)
        if higher_values_range[0] < how_much_back < higher_values_range[1]:
            rand_value = random.random() * 100 * 5 + 50
        if nulls_show_up[0] < how_much_back < nulls_show_up[1]:
            random_uid = None

        created_at = utc_now - timedelta(seconds=how_much_back)
        metrics_db.execute(
            insert_to_events.substitute(partner=random.choice(partners),
                                        event_type=random.choice(types),
                                        uid=random_uid,
                                        value=rand_value,
                                        created_at=created_at))
def check_data_delayed(db, table):
    """Record how far behind `table`'s newest row is, in seconds.

    Uses the backend's age function over max(time_column) and stores the
    resulting delay into metrics_data_delay.
    """
    age_fun = db.get_age_function()
    rows = db.execute(f"""
        SELECT {age_fun}(now(), max({table.time_column}))
        FROM {table.table_name}
    """).fetchall()
    delay_row = rows[0]

    delay_table = metadata.tables['metrics_data_delay']
    metrics_db.execute(
        delay_table.insert().values(table_id=table.id,
                                    value=delay_row[0].total_seconds()))
def check_count_per_value(db, table, checked_column, time_interval):
    """Record the most frequent values of `checked_column` in the time window.

    Prefers the backend-specific `check_count_per_value`. The generic
    fallback first checks cardinality and bails out when the column has
    more than 10 distinct values in the window (high-cardinality columns
    are not worth tracking per-value).
    """
    try:
        result = db.check_count_per_value(table, checked_column, time_interval)
    except AttributeError:
        sep = db.get_interval_sep()
        distinct_row = db.execute(f"""
            SELECT count(distinct({checked_column})) as count
            FROM {table.table_name}
            WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep}
        """).first()

        if distinct_row.count > 10:
            # Skipping if more than 10 different values showing up in column.
            return

        result = db.execute(f"""
            SELECT count(*) as count, {checked_column} as value
            FROM {table.table_name}
            WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep} and {checked_column} is not null
            GROUP BY {checked_column}
            ORDER BY count DESC
            LIMIT 10
        """)

    values_table = metadata.tables['metrics_data_values']
    for row in result:
        metrics_db.execute(
            values_table.insert().values(
                table_id=table.id,
                column_name=checked_column,
                column_value=row.value,
                check_name='check_count_per_value',
                check_value=row.count,
                time_interval=time_interval))
def check_generic(func_name, db, table, checked_column, time_interval):
    """Run SQL aggregate `func_name` over `checked_column` and store the result.

    The metric is recorded in metrics_data_values under the check name
    'check_<func_name>'.
    """
    sep = db.get_interval_sep()
    row = db.execute(f"""
        SELECT {func_name}({checked_column}) as value
        FROM {table.table_name}
        WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep}
    """).first()

    values_table = metadata.tables['metrics_data_values']
    metrics_db.execute(
        values_table.insert().values(table_id=table.id,
                                     column_name=checked_column,
                                     check_name=f'check_{func_name}',
                                     check_value=row.value,
                                     time_interval=time_interval))
def check_data_volume(db, table, time_interval):
    """Count rows in `table` newer than `time_interval` and record the count.

    NOTE(review): a second `check_data_volume` with a try/except fallback
    is defined later in this module and shadows this one — confirm whether
    this definition is still needed.
    """
    sep = db.get_interval_sep()
    count_row = db.execute(f"""
        SELECT count(*) as count
        FROM {table.table_name}
        WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep}
    """).first()

    volume_table = metadata.tables['metrics_data_volume']
    metrics_db.execute(
        volume_table.insert().values(
            table_id=table.id,
            time_interval=time_interval,
            count=count_row.count,
        ))
def check_count_nulls(db, table, checked_column, time_interval):
    """Count NULLs in `checked_column` within the time window and record it."""
    sep = db.get_interval_sep()
    null_row = db.execute(f"""
        SELECT count(*) as value
        FROM {table.table_name}
        WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep} and {checked_column} is null
    """).first()

    values_table = metadata.tables['metrics_data_values']
    metrics_db.execute(
        values_table.insert().values(table_id=table.id,
                                     column_name=checked_column,
                                     check_name='check_count_nulls',
                                     check_value=null_row.value,
                                     time_interval=time_interval))
def check_data_volume(db, table, time_interval):
    """Record the row count for `table` over the last `time_interval`.

    Prefers the backend-specific `check_data_volume` / `make_interval`;
    falls back to a generic count query when the backend does not
    implement them.
    """
    try:
        interval_part = db.make_interval(time_interval)
        result = db.check_data_volume(
            table, where_timecol=f"> now() - {interval_part}")
    except AttributeError:
        # Backend has no specialized implementation; run a plain count.
        sep = db.get_interval_sep()
        result = db.execute(f"""
            SELECT count(*) as count
            FROM {table.table_name}
            WHERE {table.time_column} > now() - INTERVAL {sep}{time_interval}{sep}
        """).first()

    volume_table = metadata.tables['metrics_data_volume']
    metrics_db.execute(
        volume_table.insert().values(table_id=table.id,
                                     time_interval=time_interval,
                                     count=result.count))
def setup_metrics():
    """Create all metrics tables in the metrics DB (idempotent, IF NOT EXISTS)."""
    ddl_statements = (
        """CREATE TABLE IF NOT EXISTS metrics_data_delay (
            table_id integer,
            value integer,
            created_at timestamp default now()
        );""",
        """CREATE TABLE IF NOT EXISTS metrics_table_schema_changes (
            table_id integer,
            column_name text,
            column_type text,
            column_count integer,
            operation text,
            created_at timestamp default now()
        )""",
        """CREATE TABLE IF NOT EXISTS metrics_data_volume (
            table_id integer,
            time_interval text,
            count bigint,
            created_at timestamp default now()
        );""",
        """CREATE TABLE IF NOT EXISTS metrics_data_volume_diff (
            table_id integer,
            from_time timestamp,
            count bigint,
            created_at timestamp default now()
        )""",
        """CREATE TABLE IF NOT EXISTS metrics_data_values (
            table_id integer,
            column_name text,
            column_value text,
            check_name text,
            check_value double precision,
            time_interval text,
            created_at timestamp default now()
        )""",
    )
    for ddl in ddl_statements:
        metrics_db.execute(ddl)
def check_data_volume_diff(db, table):
    """Compute and store row-count diffs for `table` since the last run.

    Delegates the diff computation to the backend's
    `check_data_volume_diff` and persists each returned (date, count)
    pair via the ORM session.
    """
    last_run = metrics_db.execute(
        text("""
            SELECT max(created_at) as created_at
            FROM metrics_data_volume_diff
            WHERE table_id = :table_id
        """),
        {'table_id': table.id}).first()

    from_time = last_run.created_at if last_run else None
    if from_time is None:
        # No previous diff computed yet: start from midnight today,
        # mostly because this stat is shown daily.
        from_time = datetime.combine(date.today(), time())

    diff_rows = db.check_data_volume_diff(table, from_time=from_time)
    for row in (diff_rows or []):
        metrics_session.add(
            MetricsDataVolumeDiff(table_id=table.id,
                                  date=row.date,
                                  count=row.count))
    metrics_session.commit()
def get_dashboard_for_table(db, table):
    """Build the Grafana dashboard JSON for a single monitored table.

    Starts from the template at TABLE_DASHBOARD_LOCATION, rewires its
    predefined panels for this table, then appends one row panel plus one
    graph panel per (column, check) pair found in metrics_data_values.
    Panels are laid out two per 24-unit grid row (each 12 wide).

    :param db: source-database wrapper (provides .name)
    :param table: monitored-table record (provides .id, .table_name)
    :return: dashboard dict ready for serialization
    """
    table_data = load_json_data(settings.TABLE_DASHBOARD_LOCATION)
    panels = table_data['panels']
    # Map each predefined panel class by its title for quick lookup.
    per_title = {panel.title(): panel for panel in ALL_PANELS}
    table_data[
        'title'] = f"source: {db.name} table: {table.table_name} (generated)"

    for panel in panels:
        if per_title.get(panel['title']):
            # NOTE(review): the return value is not written back into
            # `panels`; this is only correct if update_panel_element
            # mutates the panel dict in place — confirm.
            panel = update_panel_element(table, panel, per_title[panel['title']])

    all_checks = metrics_db.execute(f"""
        SELECT DISTINCT column_name, check_name
        FROM metrics_data_values
        WHERE table_id = '{table.id}'
    """)

    next_id = 20
    y_pos = 20
    x_pos = 0

    # Group the check names per column, preserving query order per column.
    check_per_column = {}
    for column_name, check_name in all_checks:
        check_per_column.setdefault(column_name, []).append(check_name)

    for column_name in sorted(check_per_column.keys()):
        # Start every column section at the left edge of a new grid row.
        if x_pos != 0:
            x_pos = 0
            y_pos += 7

        panel = load_json_data(settings.CUSTOM_ROW_LOCATION)
        panel['title'] = f"{column_name} column"
        panel['id'] = next_id
        panel['gridPos']["y"] = y_pos
        panel['gridPos']["x"] = x_pos
        next_id += 1
        y_pos += 1
        panels.append(panel)

        for check_name in check_per_column[column_name]:
            panel = load_json_data(settings.CUSTOM_PANEL_LOCATION)
            panel['title'] = f"{check_name}"
            panel['id'] = next_id
            panel['gridPos']["y"] = y_pos
            panel['gridPos']["x"] = x_pos
            next_id += 1
            # Advance to the second slot, or wrap to the next grid row.
            if x_pos == 0:
                x_pos += 12
            else:
                x_pos = 0
                y_pos += 7

            if check_name != 'check_count_per_value':
                panel = update_panel_element(table, panel, CheckForColumn,
                                             column_name=column_name,
                                             check_name=check_name)
            else:
                # Per-value checks get their own panel type with a fixed
                # daily window.
                panel = update_panel_element(table, panel,
                                             CheckForColumnByValue,
                                             column_name=column_name,
                                             check_name=check_name,
                                             time_interval='1 day')
            panels.append(panel)

    return table_data