def alert_on_z_score(df, check, alert_type, checked_txt, conf):
    """Create an Alert when the newest check result is a z-score outlier.

    Drops null results, computes the z-score of the latest value against
    the series, and persists a severity-2 Alert if it exceeds
    settings.ACCEPTABLE_Z_SCORE_DIFF in either direction.
    """
    df = df[df["result"].notnull()]
    # A single data point has no meaningful z-score.
    if len(df) <= 1:
        return

    z_scores = stats.zscore(df["result"])
    last_el_zscore = z_scores[-1]
    last_el = df["result"].iloc[-1]

    # zscore yields NaN for zero-variance series; nothing to alert on.
    if math.isnan(last_el_zscore):
        return
    if abs(last_el_zscore) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        return

    alert_desc = "above" if last_el_zscore > 0 else "below"
    alert = Alert(
        text=f""" {checked_txt}, {alert_desc} expected range, value: {last_el}, z_score: {last_el_zscore:.2f} """,
        severity=2,
        table_id=check.table_id,
        alert_type=alert_type,
        created_at=conf.for_time,
    )
    metrics_session.add(alert)
    metrics_session.commit()
def add_run():
    """Insert a fresh scheduled Run row in the 'not started' state."""
    new_run = Run(
        for_date=datetime.utcnow(),
        status='not started',
        run_type='scheduled',
    )
    metrics_session.add(new_run)
    metrics_session.commit()
def create_column_checks(db, table):
    """Create a single 'column_values' Check covering supported columns of *table*.

    Numeric columns are assigned Metric.FOR_NUMERICAL_COL, text columns
    Metric.FOR_TEXT_COL.  Columns named in settings.SKIP_COLUMNS or of an
    unsupported type are skipped.  The resulting Check is committed to the
    metrics DB.
    """
    metrics = {}
    # Hoist the type lookups out of the loop instead of calling
    # db.numeric_types()/db.character_types() repeatedly per column.
    numeric_types = db.numeric_types()
    character_types = db.character_types()
    supported_types = numeric_types + character_types

    for col in table.schema["columns"]:
        if col["name"] in settings.SKIP_COLUMNS:
            continue
        if col["type"] not in supported_types:
            continue
        # list(...) instead of a copying comprehension (ruff PERF402).
        if col["type"] in numeric_types:
            checks_for_col = list(Metric.FOR_NUMERICAL_COL)
        else:
            checks_for_col = list(Metric.FOR_TEXT_COL)
        metrics[col["name"]] = checks_for_col

    check = Check(
        table_id=table.id,
        name="column_values",
        metrics=metrics,
        query={
            "type": "standard",
            # plain string: the original f-string had no placeholders
            "path": "redata.checks.data_values.check_column_values",
            "params": {
                "time_interval": "1 day"
            },
        },
    )
    metrics_session.add(check)
    metrics_session.commit()
def alert_for_schema_change(db, check, conf):
    """Create an Alert for each schema change recorded in the last day.

    The initial 'table detected' event is not a real change and is skipped.
    """
    df = get_last_results(db, check, Metric.TABLE_METRIC, Metric.SCHEMA_CHANGE, conf, days=1)
    for _, row in df.iterrows():
        changes = json.loads(row[0])
        # First sighting of the table is not a schema change.
        if changes["operation"] == "table detected":
            continue
        metrics_session.add(Alert(
            text=f""" schema change detected - {changes['operation']}: {changes['column_name']} """,
            severity=2,
            table_id=check.table_id,
            alert_type=check.name,
            created_at=conf.for_time,
        ))
    metrics_session.commit()
def create_column_checks(db, table):
    """Create a single 'column_values' Check covering supported columns of *table*.

    Numeric columns are assigned Metric.FOR_NUMERICAL_COL, text columns
    Metric.FOR_TEXT_COL.  Columns named in settings.SKIP_COLUMNS or of an
    unsupported type are skipped.  The resulting Check is committed to the
    metrics DB.
    """
    metrics = {}
    # Hoist the type lookups out of the loop instead of calling
    # db.numeric_types()/db.character_types() repeatedly per column.
    numeric_types = db.numeric_types()
    character_types = db.character_types()
    supported_types = numeric_types + character_types

    for col in table.schema['columns']:
        if col['name'] in settings.SKIP_COLUMNS:
            continue
        if col['type'] not in supported_types:
            continue
        # list(...) instead of a copying comprehension (ruff PERF402).
        if col['type'] in numeric_types:
            checks_for_col = list(Metric.FOR_NUMERICAL_COL)
        else:
            checks_for_col = list(Metric.FOR_TEXT_COL)
        metrics[col['name']] = checks_for_col

    check = Check(
        table_id=table.id,
        name='column_values',
        metrics=metrics,
        query={
            'type': 'standard',
            # plain string: the original f-string had no placeholders
            'path': 'redata.checks.data_values.check_column_values',
            'params': {
                'time_interval': '1 day'
            }
        })
    metrics_session.add(check)
    metrics_session.commit()
def alert_on_z_score(df, table, check_col, alert_type, checked_txt):
    """Add an Alert for *table* when the newest value of *check_col* is a z-score outlier."""
    df = df[df[check_col].notnull()]
    # Need at least two points for a meaningful z-score.
    if len(df) <= 1:
        return

    scores = stats.zscore(df[check_col])
    latest_score = scores[-1]
    latest_value = df[check_col].iloc[-1]

    # Zero-variance series yield NaN; nothing to alert on.
    if math.isnan(latest_score):
        return
    if abs(latest_score) <= settings.ACCEPTABLE_Z_SCORE_DIFF:
        return

    alert_desc = 'above' if latest_score > 0 else 'below'
    print(f"Adding alert about table {table.table_name}")
    metrics_session.add(Alert(
        text=f""" {checked_txt}, {alert_desc} expected range, value: {latest_value}, z_score: {latest_score:.2f} """,
        severity=2,
        table_id=table.id,
        alert_type=alert_type,
    ))
    metrics_session.commit()
def insert_schema_changed_record(table, operation, column_name, column_type, column_count):
    """Persist one MetricsSchemaChanges row describing a detected schema change."""
    record = MetricsSchemaChanges(
        table_id=table.id,
        operation=operation,
        column_name=column_name,
        column_type=column_type,
        column_count=column_count,
    )
    metrics_session.add(record)
    metrics_session.commit()
def check_data_volume(db, table, time_interval):
    """Run the data-volume check for *table* and persist the resulting row count."""
    volume = db.check_data_volume(table, time_interval)
    metrics_session.add(MetricsDataVolume(
        table_id=table.id,
        time_interval=time_interval,
        count=volume.count,
    ))
    metrics_session.commit()
def setup_for_source_table(cls, db, db_table_name):
    """Create and persist a MonitoredTable for *db_table_name*, picking a time column.

    Heuristic: among date/timestamp columns whose name does not match the
    configured blacklist regex, keep those whose max value is not in the
    future, order them newest-max first, and prefer names containing
    'creat' (created_at and friends).

    Returns the persisted MonitoredTable, or None when no usable time
    column was found (the table is skipped for now).
    """
    print (f"Running setup for {db_table_name}")
    valid_types = db.datetime_types()
    schema_cols = get_current_table_schema(db, db_table_name)
    table = MonitoredTable(
        table_name=db_table_name,
        schema={'columns': schema_cols},
        source_db=db.name
    )

    # heuristics to find best column to sort by when computing stats about data
    # TODO: could probably look up in a provided table of regex + score, with higher scored matches being preferred

    # list all date/timestamp columns, filtering out anything that's blacklisted in configuration
    blacklist_regex = settings.REDATA_TIME_COL_BLACKLIST_REGEX
    matching_cols = [
        col['name'] for col in schema_cols
        if col['type'] in valid_types
        and re.search(blacklist_regex, col['name']) is None
    ]

    # from matches, collect time cols that have max values at or before "now"
    cols_by_ts = defaultdict(list)
    # BUG FIX: was datetime.datetime.now() — the rest of this module uses the
    # datetime *class* (datetime.utcnow(), datetime.combine(...)), so the
    # double-qualified form would raise AttributeError.
    now_ts = datetime.now()
    for col in matching_cols:
        max_ts = db.get_max_timestamp(table, col)
        if max_ts <= now_ts:
            cols_by_ts[max_ts].append(col)

    # list of all viable candidates, ordered by latest timestamp first
    candidates = list(itertools.chain.from_iterable(
        cols for ts, cols in sorted(cols_by_ts.items(), reverse=True)
    ))

    # list of preferred columns out of the viable ones, by name filtering
    preferred = [col for col in candidates if 'creat' in col.lower()]

    if not candidates:
        # no columns found? ignore table..
        # TODO: add it, but set to disabled, for screening via web UI when we have one
        print (f"Not found column to sort by for {db_table_name}, skipping it for now")
        return None

    # if multiple columns found, primarily select from 'preferred' if exists, then set up the table
    col_name = preferred[0] if preferred else candidates[0]
    col_type = next(col['type'] for col in schema_cols if col['name'] == col_name)
    if len(candidates) > 1:
        print (f"Found multiple columns to sort by {candidates}, choosing {col_name}, please update in DB if needed")
    else:
        print (f"Found column to sort by {col_name}")

    table.time_column = col_name
    table.time_column_type = col_type
    metrics_session.add(table)
    metrics_session.commit()
    return table
def add_run():
    """Insert a fresh scheduled Scan row in the 'not started' state."""
    now = datetime.utcnow()
    scan = Scan(
        start_date=now,
        end_date=datetime.utcnow(),
        status="not started",
        run_type="scheduled",
    )
    metrics_session.add(scan)
    metrics_session.commit()
def check_data_delayed(db, table, conf):
    """Record how delayed *table*'s data is, if the DB reports any delay."""
    delay = db.check_data_delayed(table, conf)
    # delay is a 1-tuple; a falsy first element means no delay measured.
    if not delay[0]:
        return
    metrics_session.add(MetricsDataDelay(
        table_id=table.id,
        value=delay[0].total_seconds(),
        created_at=conf.for_time,
    ))
    metrics_session.commit()
def check_data_delayed(db, table):
    """Record how delayed *table*'s data is, if the DB reports any delay."""
    delay = db.check_data_delayed(table)
    # delay is a 1-tuple; a falsy first element means no delay measured.
    if not delay[0]:
        return
    metric = MetricsDataDelay(
        table_id=table.id,
        value=delay[0].total_seconds(),
    )
    metrics_session.add(metric)
    metrics_session.commit()
def check_generic(func_name, db, table, checked_column, time_interval):
    """Run the generic per-column check *func_name* and persist its value."""
    outcome = db.check_generic(func_name, table, checked_column, time_interval)
    metrics_session.add(MetricsDataValues(
        table_id=table.id,
        column_name=checked_column,
        check_name=f'check_{func_name}',
        check_value=outcome.value,
        time_interval=time_interval,
    ))
    metrics_session.commit()
def check_count_nulls(db, table, checked_column, time_interval):
    """Count NULLs in *checked_column* over *time_interval* and persist the result."""
    nulls = db.check_count_nulls(table, checked_column, time_interval)
    metrics_session.add(MetricsDataValues(
        table_id=table.id,
        column_name=checked_column,
        check_name='check_count_nulls',
        check_value=nulls.value,
        time_interval=time_interval,
    ))
    metrics_session.commit()
def check_count_per_value(db, table, checked_column, time_interval):
    """Persist a per-distinct-value row count for *checked_column* over *time_interval*."""
    rows = db.check_count_per_value(table, checked_column, time_interval)
    # The DB adapter may return None when the column has no data.
    for row in rows or []:
        metrics_session.add(MetricsDataValues(
            table_id=table.id,
            column_name=checked_column,
            column_value=row.value,
            check_name='check_count_per_value',
            check_value=row.count,
            time_interval=time_interval,
        ))
    metrics_session.commit()
def create_admin_user_if_not_exist(cls):
    """Ensure the admin account from REDATA_ADMIN_USER / REDATA_ADMIN_PASSWORD exists.

    Reads both env variables once, asserts they are set, and creates the
    user (with a hashed password) only when no row with that login exists.
    """
    admin_login = os.environ.get('REDATA_ADMIN_USER')
    admin_password = os.environ.get('REDATA_ADMIN_PASSWORD')
    assert admin_login, 'please set env variable for admin user'
    assert admin_password, 'please set env variable for admin password'

    already_exists = metrics_session.query(cls).filter(
        cls.login == admin_login
    ).count()
    if not already_exists:
        user = cls(
            login=admin_login,
            password=generate_password_hash(admin_password),
        )
        metrics_session.add(user)
        metrics_session.commit()
        print("Created admin user")
def add_metrics(cls, results, check, conf):
    """Persist one MetricFromCheck row per (column, metric) pair in *results*.

    For table-level metrics (col == Metric.TABLE_METRIC) the result row is
    selected by the bare metric name; column metrics use '<column>_<metric>'.
    """
    print(f"Adding results for check: {check}")
    for row in results:
        for col, metrics in check.metrics.items():
            # BUG FIX: the original rebound the loop variable `m` to the
            # MetricFromCheck instance inside the loop body; use a distinct
            # name so the metric name and the ORM record don't shadow each
            # other.
            for metric in metrics:
                select_name = col + '_' + metric if col != Metric.TABLE_METRIC else metric
                record = MetricFromCheck(
                    check_id=check.id,
                    table_id=check.table.id,
                    table_column=col or None,
                    params=check.query['params'],
                    metric=metric,
                    result={'value': row[select_name]},
                    created_at=conf.for_time,
                )
                metrics_session.add(record)
    metrics_session.commit()
def create_for_detected_table(db, table):
    """Register every standard table-level check for a newly detected table,
    then create its per-column checks."""
    for check in table_checks:
        func = check['func']
        metrics_session.add(Check(
            table_id=table.id,
            name=check['metric'],
            metrics={Metric.TABLE_METRIC: [check['metric']]},
            query={
                'type': 'standard',
                'path': f'redata.checks.{func}',
                'params': check['params']
            },
        ))
        metrics_session.commit()
    create_column_checks(db, table)
def add_metrics(cls, results, check, conf):
    """Persist one MetricFromCheck row per (column, metric) pair in *results*.

    The select name for each metric is resolved via name_for(col, metric);
    query params default to an empty dict when the check has none.
    """
    print(f"Adding results for check: {check}")
    for row in results:
        for col, metrics in check.metrics.items():
            # BUG FIX: the original rebound the loop variable `m` to the
            # MetricFromCheck instance inside the loop body; use a distinct
            # name so the metric name and the ORM record don't shadow each
            # other.
            for metric in metrics:
                select_name = name_for(col, metric)
                record = MetricFromCheck(
                    check_id=check.id,
                    table_id=check.table.id,
                    table_column=col,
                    params=check.query.get("params", {}),
                    metric=metric,
                    result={"value": row[select_name]},
                    created_at=conf.for_time,
                )
                metrics_session.add(record)
    metrics_session.commit()
def setup_for_source_table(cls, db, db_table_name):
    """Create and persist a MonitoredTable for *db_table_name*, picking a time column.

    Heuristic: take all columns with a date/timestamp-like type, prefer
    those whose name contains 'creat' (created_at and friends).

    Returns the persisted MonitoredTable, or None when no time-typed column
    exists (the table is skipped for now).
    """
    print (f"Running setup for {db_table_name}")
    # Preferred time-column types, as reported by the source DB.
    preference = [
        'timestamp without time zone',
        'timestamp with time zone',
        'date',
        'datetime'  # mysql
    ]
    schema_cols = get_current_table_schema(db, db_table_name)

    # heuristics to find best column to sort by when computing stats about data
    proper_type = [col['name'] for col in schema_cols if col['type'] in preference]
    # Prefer creation-style names ('creat' matches created_at, create_time, ...).
    # BUG FIX: dropped the dead, misspelled `colname, col_type = None, None`
    # initializer (later code uses `col_name`, which is always assigned below).
    columns = [c for c in proper_type if 'creat' in c]

    if len(proper_type) == 0:
        print (f"Not found column to sort by for {db_table_name}, skipping it for now")
        return None

    if len(columns) > 1:
        print (f"Found multiple columns to sort by {columns}, choosing {columns[0]}, please update in DB if needed")
    col_name = columns[0] if columns else proper_type[0]
    col_type = next(col['type'] for col in schema_cols if col['name'] == col_name)
    print (f"Found column to sort by {col_name}")

    table = MonitoredTable(
        table_name=db_table_name,
        time_column=col_name,
        time_column_type=col_type,
        schema={'columns': schema_cols},
        source_db=db.name
    )
    metrics_session.add(table)
    metrics_session.commit()
    return table
def create_for_detected_table(db, table):
    """Register every standard table-level check for a newly detected table,
    then create its per-column checks."""
    for check in table_checks:
        func = check["func"]
        metrics_session.add(Check(
            table_id=table.id,
            name=check["metric"],
            metrics={Metric.TABLE_METRIC: [check["metric"]]},
            query={
                "type": "standard",
                "path": f"redata.checks.{func}",
                "params": check["params"],
            },
        ))
        metrics_session.commit()
    create_column_checks(db, table)
def check_data_volume_diff(db, table):
    """Compute and store daily data-volume diffs for *table*.

    Resumes from the newest previously recorded diff; when none exists,
    starts from midnight today (the stat is shown daily).
    """
    last_recorded = metrics_db.execute(
        text("""
            SELECT max(created_at) as created_at
            FROM metrics_data_volume_diff
            WHERE table_id = :table_id
        """),
        {'table_id': table.id},
    ).first()

    from_time = last_recorded.created_at if last_recorded else None
    if from_time is None:
        # if no previous diff computed, compute from start of day
        # mostly because we show that stat daily
        from_time = datetime.combine(date.today(), time())

    result = db.check_data_volume_diff(table, from_time=from_time)
    for r in result or []:
        metrics_session.add(MetricsDataVolumeDiff(
            table_id=table.id,
            date=r.date,
            count=r.count,
        ))
    metrics_session.commit()