def objects(self):
    # Optional whitelist/blacklist filters on the metastore database name.
    # Note: if both are set, the blacklist clause overwrites the whitelist clause.
    where_clause = ''
    if DB_WHITELIST:
        dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
        where_clause = "AND b.name IN ({})".format(dbs)
    if DB_BLACKLIST:
        dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
        where_clause = "AND b.name NOT IN ({})".format(dbs)
    # Skip anything that looks like a temporary table, in either the table
    # name or the database name.
    sql = """
    SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
    FROM TBLS a
    JOIN DBS b ON a.DB_ID = b.DB_ID
    WHERE
        a.TBL_NAME NOT LIKE '%tmp%' AND
        a.TBL_NAME NOT LIKE '%temp%' AND
        b.NAME NOT LIKE '%tmp%' AND
        b.NAME NOT LIKE '%temp%'
    {where_clause}
    LIMIT {LIMIT};
    """.format(where_clause=where_clause, LIMIT=TABLE_SELECTOR_LIMIT)
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    # Shape the records as id/text pairs for the table selector widget.
    d = [
        {'id': row[0], 'text': row[0]}
        for row in h.get_records(sql)]
    return json.dumps(d)
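# A standalone sketch (hypothetical helper, not in the original) that isolates
# the filter logic above so it can be exercised on its own; it mirrors the
# behavior of objects(), including the subtlety that a configured blacklist
# silently replaces a configured whitelist.
def build_where_clause(whitelist, blacklist):
    where_clause = ''
    if whitelist:
        dbs = ",".join("'" + db + "'" for db in whitelist)
        where_clause = "AND b.name IN ({})".format(dbs)
    if blacklist:
        dbs = ",".join("'" + db + "'" for db in blacklist)
        where_clause = "AND b.name NOT IN ({})".format(dbs)
    return where_clause

assert build_where_clause(['core'], []) == "AND b.name IN ('core')"
assert build_where_clause(['core'], ['staging']) == "AND b.name NOT IN ('staging')"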
import csv

from airflow.hooks.mysql_hook import MySqlHook


def simple_select_mysql(ds, **kwargs):
    mysql_hook = MySqlHook(mysql_conn_id="soletrade_localhost")
    # One row per user: email, total spend, and number of transactions.
    users_trend = """
        SELECT DISTINCT(email), sum(spent), count(transactions)
        FROM orders AS o
        GROUP BY 1;
    """
    with open('temp/output.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        # Write the header first, then one line per aggregated record.
        csvRow = ["user", "spent", "transactions"]
        wr.writerow(csvRow)
        for user, spent, transactions in mysql_hook.get_records(users_trend):
            wr.writerow([user, spent, transactions])
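# A minimal sketch (not part of the original) of wiring simple_select_mysql into
# a DAG via a PythonOperator; the dag_id, start_date, and schedule are assumptions,
# and provide_context=True matches the (ds, **kwargs) signature under Airflow 1.x.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id='simple_select_mysql_demo',  # hypothetical
    start_date=datetime(2020, 1, 1),
    schedule_interval='@daily',
)

export_users_trend = PythonOperator(
    task_id='export_users_trend',
    python_callable=simple_select_mysql,
    provide_context=True,  # pass ds and the context kwargs into the callable
    dag=dag,
)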
def execute(self, context=None):
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    # Build one SQL expression per (column, metric) pair, starting with a row count.
    exprs = {('', 'count'): 'COUNT(*)'}
    for col, col_type in field_types.items():
        d = {}
        if self.assignment_func:
            d = self.assignment_func(col, col_type)
            if d is None:
                d = self.get_default_exprs(col, col_type)
        else:
            d = self.get_default_exprs(col, col_type)
        exprs.update(d)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)
    exprs_str = ",\n        ".join(
        [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

    where_clause = [
        "{0} = '{1}'".format(k, v) for k, v in self.partition.items()
    ]
    where_clause = " AND\n        ".join(where_clause)
    sql = """
    SELECT
        {exprs_str}
    FROM {self.table}
    WHERE
        {where_clause};
    """.format(**locals())

    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = hook.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{self.table}' AND
        partition_repr='{part_json}' AND
        dttm='{self.dttm}'
    LIMIT 1;
    """.format(**locals())
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}';
        """.format(**locals())
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    # Pivot the single result row into one record per (col, metric) pair.
    rows = [
        (self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1])
        for r in zip(exprs, row)]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ])
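# A self-contained toy (assumed data, not from the original) showing the pivot
# performed at the end of execute(): zip() pairs each (col, metric) key of the
# OrderedDict of expressions with the matching value in the single result row.
from collections import OrderedDict

exprs = OrderedDict([
    (('', 'count'), 'COUNT(*)'),
    (('id', 'max'), 'MAX(id)'),
])
row = (1500, 42)  # stand-in for hook.get_first(hql=sql)

rows = [
    ('2015-01-01', '2015-01-01T00:00:00', 'db.tbl', '{}') + (k[0], k[1], v)
    for k, v in zip(exprs, row)]
# rows == [('2015-01-01', '2015-01-01T00:00:00', 'db.tbl', '{}', '', 'count', 1500),
#          ('2015-01-01', '2015-01-01T00:00:00', 'db.tbl', '{}', 'id', 'max', 42)]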