def execute(self, context=None):
    """Run ``self.sql`` through a ``PrestoHook`` and check the record it
    returns against ``self.pass_value``.

    When ``self.is_numeric_value_check`` is false, every value of the
    record is compared as ``str(value) == self.pass_value``. Otherwise the
    values are converted to ``float`` and either compared for equality or,
    when ``self.has_tolerance`` is set, accepted if ``self.pass_value``
    lies between ``value / (1 + self.tol)`` and ``value / (1 - self.tol)``.

    :param context: not used by this method.
    :raises AirflowException: if the query returns nothing, if a value
        cannot be converted to ``float`` for a numeric check, or if any
        value fails the comparison.
    """
    logging.info('Executing SQL check: ' + self.sql)
    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    records = hook.get_first(hql=self.sql)
    if not records:
        raise AirflowException("The query returned None")

    except_temp = ("Test failed.\nPass value:{self.pass_value}\n"
                   "Query:\n{self.sql}\nResults:\n{records!s}")

    if not self.is_numeric_value_check:
        tests = [str(r) == self.pass_value for r in records]
    else:
        try:
            num_rec = [float(r) for r in records]
        except (ValueError, TypeError):
            cvestr = "Converting a result to float failed.\n"
            raise AirflowException(
                cvestr + except_temp.format(self=self, records=records))
        if self.has_tolerance:
            tests = []
            for r in num_rec:
                # For a negative result the two quotients come out in the
                # opposite order, so a fixed "low <= x <= high" chain could
                # never succeed. Ordering the bounds keeps the outcome
                # unchanged for non-negative results with tol < 1 and makes
                # negative results checkable.
                # NOTE(review): self.tol == 1 still divides by zero here.
                low, high = sorted(
                    (r / (1 + self.tol), r / (1 - self.tol)))
                tests.append(low <= self.pass_value <= high)
        else:
            tests = [r == self.pass_value for r in num_rec]

    if not all(tests):
        raise AirflowException(
            except_temp.format(self=self, records=records))
def execute(self, context=None):
    """Run ``self.sql`` through a ``PrestoHook`` and require every value of
    the returned record to be truthy.

    :param context: not used by this method.
    :raises AirflowException: if the query returns nothing, or if any
        value of the record evaluates to ``False``.
    """
    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + self.sql)
    first_row = presto.get_first(hql=self.sql)
    logging.info("Record: " + str(first_row))

    if not first_row:
        raise AirflowException("The query returned None")

    # Evaluate the truthiness of every value up front, then look for a
    # failure among the results.
    truthiness = list(map(bool, first_row))
    if False in truthiness:
        failure_template = "Test failed.\nQuery:\n{q}\nResults:\n{r!s}"
        raise AirflowException(
            failure_template.format(q=self.sql, r=first_row))

    logging.info("Success.")
def execute(self, context=None):
    """Compare the metrics returned by ``self.sql1`` with those returned by
    ``self.sql2`` and fail when any of them differ by too large a ratio.

    Both queries are run through a ``PrestoHook``. The values of each
    returned record are paired with the names in ``self.metrics_sorted``.
    For every metric the ratio ``max(a, b) / min(a, b)`` of the two values
    is computed and must be strictly below ``self.metrics_thresholds[m]``.
    If either value is zero the ratio is undefined; it is recorded as
    ``None`` and the metric is treated as passing.

    :param context: not used by this method.
    :raises AirflowException: if either query returns nothing, or if at
        least one metric reaches or exceeds its threshold.
    """
    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + self.sql2)
    row2 = hook.get_first(hql=self.sql2)
    logging.info('Executing SQL check: ' + self.sql1)
    row1 = hook.get_first(hql=self.sql1)

    # Format the message *before* building the exception; calling
    # ``.format`` on the exception instance raises AttributeError.
    if not row2:
        raise AirflowException(
            "The query {q} returned None".format(q=self.sql2))
    if not row1:
        raise AirflowException(
            "The query {q} returned None".format(q=self.sql1))

    current = dict(zip(self.metrics_sorted, row1))
    reference = dict(zip(self.metrics_sorted, row2))
    ratios = {}
    test_results = {}
    rlog = "Ratio for {0}: {1} \n Ratio threshold : {2}"
    fstr = "'{k}' check failed. {r} is above {tr}"
    estr = "The following tests have failed:\n {0}"
    countstr = "The following {j} tests out of {n} failed:"

    for m in self.metrics_sorted:
        threshold = self.metrics_thresholds[m]
        if current[m] == 0 or reference[m] == 0:
            ratio = None
        else:
            ratio = float(max(current[m], reference[m])) / \
                min(current[m], reference[m])
        logging.info(rlog.format(m, ratio, threshold))
        ratios[m] = ratio
        # ``None < threshold`` is True on Python 2 and a TypeError on
        # Python 3; spell the "undefined ratio passes" rule out explicitly.
        test_results[m] = ratio is None or ratio < threshold

    if not all(test_results.values()):
        failed_tests = [it[0] for it in test_results.items() if not it[1]]
        logging.warning(countstr.format(
            j=len(failed_tests), n=len(self.metrics_sorted)))
        for k in failed_tests:
            logging.warning(
                fstr.format(k=k, r=ratios[k], tr=self.metrics_thresholds[k]))
        raise AirflowException(estr.format(", ".join(failed_tests)))

    logging.info("All tests have passed")
def execute(self, context=None):
    """Check that the metrics from ``self.sql1`` and ``self.sql2`` stay
    within the configured ratio thresholds.

    Each query is run through a ``PrestoHook`` and the values of its
    returned record are keyed by ``self.metrics_sorted``. A metric passes
    when ``max(a, b) / min(a, b)`` of its two values is strictly below
    ``self.metrics_thresholds[metric]``. When either value is zero the
    ratio cannot be computed; it is stored as ``None`` and the metric
    counts as passing.

    :param context: not used by this method.
    :raises AirflowException: if either query returns nothing, or if one
        or more metrics reach or exceed their threshold.
    """
    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + self.sql2)
    row2 = hook.get_first(hql=self.sql2)
    logging.info('Executing SQL check: ' + self.sql1)
    row1 = hook.get_first(hql=self.sql1)

    # The message must be formatted before it is handed to the exception:
    # exception instances have no ``format`` method.
    if not row2:
        raise AirflowException(
            "The query {q} returned None".format(q=self.sql2))
    if not row1:
        raise AirflowException(
            "The query {q} returned None".format(q=self.sql1))

    current = dict(zip(self.metrics_sorted, row1))
    reference = dict(zip(self.metrics_sorted, row2))
    ratios = {}
    test_results = {}
    rlog = "Ratio for {0}: {1} \n Ratio threshold : {2}"
    fstr = "'{k}' check failed. {r} is above {tr}"
    estr = "The following tests have failed:\n {0}"
    countstr = "The following {j} tests out of {n} failed:"

    for m in self.metrics_sorted:
        cur = current[m]
        ref = reference[m]
        if cur == 0 or ref == 0:
            ratio = None
        else:
            ratio = float(max(cur, ref)) / min(cur, ref)
        logging.info(rlog.format(m, ratio, self.metrics_thresholds[m]))
        ratios[m] = ratio
        if ratio is None:
            # Undefined ratio: pass, as ``None < x`` did on Python 2,
            # instead of raising TypeError on Python 3.
            test_results[m] = True
        else:
            test_results[m] = ratio < self.metrics_thresholds[m]

    if not all(test_results.values()):
        failed_tests = [it[0] for it in test_results.items() if not it[1]]
        j = len(failed_tests)
        n = len(self.metrics_sorted)
        logging.warning(countstr.format(j=j, n=n))
        for k in failed_tests:
            logging.warning(
                fstr.format(k=k, r=ratios[k], tr=self.metrics_thresholds[k]))
        raise AirflowException(estr.format(", ".join(failed_tests)))

    logging.info("All tests have passed")
def execute(self, context=None):
    """Compute statistics for one partition of ``self.table`` with Presto
    and store them, one row per (column, metric), in the MySQL
    ``hive_stats`` table.

    Steps:

    1. Read the table's column names and types from the Hive metastore.
    2. Build one SQL expression per (column, metric) pair, using
       ``self.assignment_func`` when it is set and returns something other
       than ``None``, and ``self.get_default_exprs`` otherwise; then add
       ``self.extra_exprs``.
    3. Run a single ``SELECT`` restricted to ``self.partition`` through a
       ``PrestoHook``.
    4. Delete any ``hive_stats`` rows already present for the same table,
       partition and ``self.dttm``, then insert the new values.

    :param context: not used by this method.
    :raises AirflowException: if the Presto query returns nothing.
    """
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    exprs = {
        ('', 'count'): 'COUNT(*)'
    }
    for col, col_type in field_types.items():
        d = None
        if self.assignment_func:
            d = self.assignment_func(col, col_type)
        if d is None:
            # No custom function, or it declined this column.
            d = self.get_default_exprs(col, col_type)
        exprs.update(d)
    exprs.update(self.extra_exprs)
    # Freeze one iteration order: it is used both for the SELECT list and
    # for pairing the returned values back with their (col, metric) keys.
    exprs = OrderedDict(exprs)
    exprs_str = ",\n ".join([
        v + " AS " + k[0] + '__' + k[1]
        for k, v in exprs.items()])

    where_clause = [
        "{0} = '{1}'".format(k, v) for k, v in self.partition.items()]
    where_clause = " AND\n ".join(where_clause)
    # NOTE(review): every statement below is assembled by string
    # formatting; values containing quotes will break the SQL.
    sql = """ SELECT {exprs_str} FROM {self.table} WHERE {where_clause}; """.format(
        self=self, exprs_str=exprs_str, where_clause=where_clause)

    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = hook.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        # Same exception type as the other operators in this file.
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """ SELECT 1 FROM hive_stats WHERE table_name='{self.table}' AND partition_repr='{part_json}' AND dttm='{self.dttm}' LIMIT 1; """.format(
        self=self, part_json=part_json)
    if mysql.get_records(sql):
        sql = """ DELETE FROM hive_stats WHERE table_name='{self.table}' AND partition_repr='{part_json}' AND dttm='{self.dttm}'; """.format(
            self=self, part_json=part_json)
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    rows = [
        (self.ds, self.dttm, self.table, part_json) +
        (r[0][0], r[0][1], r[1])
        for r in zip(exprs, row)]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ]
    )
def execute(self, context=None):
    """Collect statistics for one partition of ``self.table`` through
    Presto and load them into the MySQL ``hive_stats`` table.

    The column list comes from the Hive metastore. Each column contributes
    the expressions returned by ``self.assignment_func`` (when set and not
    returning ``None``) or by ``self.get_default_exprs``;
    ``self.extra_exprs`` is merged in last. The resulting ``SELECT`` is
    filtered on ``self.partition``. Rows already stored for the same
    table, partition and ``self.dttm`` are deleted before the new values
    are inserted, one row per (column, metric).

    :param context: not used by this method.
    :raises AirflowException: if the Presto query returns nothing.
    """
    metastore_hook = HiveMetastoreHook(
        metastore_conn_id=self.metastore_conn_id)
    hive_table = metastore_hook.get_table(table_name=self.table)
    column_types = {c.name: c.type for c in hive_table.sd.cols}

    # Maps (column, metric) -> SQL expression.
    collected = {('', 'count'): 'COUNT(*)'}
    for name, type_name in column_types.items():
        column_exprs = None
        if self.assignment_func:
            column_exprs = self.assignment_func(name, type_name)
        if column_exprs is None:
            column_exprs = self.get_default_exprs(name, type_name)
        collected.update(column_exprs)
    collected.update(self.extra_exprs)
    # One fixed order serves both the SELECT list and the result pairing.
    ordered_exprs = OrderedDict(collected)

    select_items = []
    for key, expression in ordered_exprs.items():
        select_items.append(expression + " AS " + key[0] + '__' + key[1])
    select_list = ",\n ".join(select_items)

    partition_filter = " AND\n ".join(
        "{0} = '{1}'".format(part_key, part_value)
        for part_key, part_value in self.partition.items())

    stats_query = """ SELECT {exprs_str} FROM {self.table} WHERE {where_clause}; """.format(
        self=self, exprs_str=select_list, where_clause=partition_filter)

    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + stats_query)
    stats_row = presto.get_first(hql=stats_query)
    logging.info("Record: " + str(stats_row))
    if not stats_row:
        raise AirflowException("The query returned None")

    partition_repr = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql_hook = MySqlHook(self.mysql_conn_id)
    exists_query = """ SELECT 1 FROM hive_stats WHERE table_name='{self.table}' AND partition_repr='{part_json}' AND dttm='{self.dttm}' LIMIT 1; """.format(
        self=self, part_json=partition_repr)
    if mysql_hook.get_records(exists_query):
        delete_query = """ DELETE FROM hive_stats WHERE table_name='{self.table}' AND partition_repr='{part_json}' AND dttm='{self.dttm}'; """.format(
            self=self, part_json=partition_repr)
        mysql_hook.run(delete_query)

    logging.info("Pivoting and loading cells into the Airflow db")
    row_prefix = (self.ds, self.dttm, self.table, partition_repr)
    stat_rows = []
    for key, value in zip(ordered_exprs, stats_row):
        stat_rows.append(row_prefix + (key[0], key[1], value))

    destination_fields = [
        'ds',
        'dttm',
        'table_name',
        'partition_repr',
        'col',
        'metric',
        'value',
    ]
    mysql_hook.insert_rows(
        table='hive_stats',
        rows=stat_rows,
        target_fields=destination_fields)