def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    logging.info("Dumping MySQL query results to local file")
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter)
        field_dict = OrderedDict()
        for field in cursor.description:
            field_dict[field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()

        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def objects(self):
    where_clause = ''
    if DB_WHITELIST:
        dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
        where_clause = "AND b.name IN ({})".format(dbs)
    if DB_BLACKLIST:
        dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
        where_clause = "AND b.name NOT IN ({})".format(dbs)
    sql = """
    SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
    FROM TBLS a
    JOIN DBS b ON a.DB_ID = b.DB_ID
    WHERE
        a.TBL_NAME NOT LIKE '%tmp%' AND
        a.TBL_NAME NOT LIKE '%temp%' AND
        b.NAME NOT LIKE '%tmp%' AND
        b.NAME NOT LIKE '%temp%'
    {where_clause}
    LIMIT {LIMIT};
    """.format(where_clause=where_clause, LIMIT=TABLE_SELECTOR_LIMIT)
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    d = [
        {'id': row[0], 'text': row[0]}
        for row in h.get_records(sql)]
    return json.dumps(d)
def partitions(self):
    schema, table = request.args.get("table").split('.')
    sql = """
    SELECT
        a.PART_NAME,
        a.CREATE_TIME,
        c.LOCATION,
        c.IS_COMPRESSED,
        c.INPUT_FORMAT,
        c.OUTPUT_FORMAT
    FROM PARTITIONS a
    JOIN TBLS b ON a.TBL_ID = b.TBL_ID
    JOIN DBS d ON b.DB_ID = d.DB_ID
    JOIN SDS c ON a.SD_ID = c.SD_ID
    WHERE
        b.TBL_NAME like '{table}' AND
        d.NAME like '{schema}'
    ORDER BY PART_NAME DESC
    """.format(**locals())
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    df = h.get_pandas_df(sql)
    return df.to_html(
        classes="table table-striped table-bordered table-hover",
        index=False,
        na_rep='',
    )
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

    logging.info("Extracting data from Hive")
    logging.info(self.sql)

    if self.bulk_load:
        tmpfile = NamedTemporaryFile()
        hive.to_csv(self.sql, tmpfile.name, delimiter='\t',
                    lineterminator='\n', output_header=False)
    else:
        results = hive.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        mysql.run(self.mysql_preoperator)

    logging.info("Inserting rows into MySQL")

    if self.bulk_load:
        mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
        tmpfile.close()
    else:
        mysql.insert_rows(table=self.mysql_table, rows=results)

    if self.mysql_postoperator:
        logging.info("Running MySQL postoperator")
        mysql.run(self.mysql_postoperator)

    logging.info("Done.")
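# A minimal standalone sketch of the bulk-load path used above, assuming a
# MySQL server that permits LOAD DATA LOCAL INFILE; the conn id
# 'mysql_default', the table name and the rows are hypothetical placeholders,
# not taken from the operator itself.
from tempfile import NamedTemporaryFile

from airflow.hooks import MySqlHook


def bulk_load_rows(rows, table='target_table', mysql_conn_id='mysql_default'):
    mysql = MySqlHook(mysql_conn_id=mysql_conn_id)
    with NamedTemporaryFile('w') as tmpfile:
        for row in rows:
            # bulk_load expects a flat file; tab-delimited matches hive.to_csv above
            tmpfile.write('\t'.join(str(col) for col in row) + '\n')
        tmpfile.flush()
        mysql.bulk_load(table=table, tmp_file=tmpfile.name)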
def insert_or_delete_task_state(*args, **kargs):
    ti = kargs["ti"]  # current task instance
    insert_sql = kargs["insert_sql"]  # insert upstream state sql
    delete_sql = kargs["delete_sql"]  # delete upstream state sql
    upstream_task_id = kargs["up_id"]  # upstream task id
    xcom_pull_key = kargs["xcom_key"]  # key for xcom_pull
    mysql_conn_id = kargs["mysql_conn_id"]  # conn id for the database that stores the state table
    actual_sql = ""

    # method 1 of getting the upstream task's state:
    # up_state = ti.xcom_pull(key=xcom_pull_key, task_ids=upstream_task_id)

    upstream_task_list = kargs["task"].upstream_list
    logging.info("%s upstream task list is %s" % (kargs["task"], upstream_task_list))
    upstream_task_state = (
        upstream_task_list[0]
        .get_task_instances(
            session=settings.Session(),
            start_date=kargs["execution_date"],
            end_date=kargs["execution_date"])[0]
        .state
    )
    logging.info("%s upstream task(%s) state is %s"
                 % (kargs["task"], upstream_task_list[0], upstream_task_state))

    if upstream_task_state == "success":
        actual_sql = insert_sql
    elif upstream_task_state == "failed":
        actual_sql = delete_sql
    else:
        # neither success nor failed: run a harmless query with no effect
        actual_sql = "show databases;"
    logging.info("upstream state is %s, actual update status sql is %s"
                 % (str(upstream_task_state), str(actual_sql)))

    # execute the chosen sql
    logging.info("Executing: %s" % (str(actual_sql)))
    hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    hook.run(actual_sql)
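# A minimal sketch of how a callable like the one above might be wired into a
# DAG with a PythonOperator; the dag id, SQL strings, task ids and conn id are
# hypothetical placeholders, not part of the original snippet.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('task_state_example', start_date=datetime(2023, 1, 1), schedule_interval=None)

update_state = PythonOperator(
    task_id='insert_or_delete_task_state',
    python_callable=insert_or_delete_task_state,
    provide_context=True,  # passes ti, task, execution_date into **kargs
    op_kwargs={
        'insert_sql': "INSERT INTO task_state (task_id, state) VALUES ('upstream_task', 'success');",
        'delete_sql': "DELETE FROM task_state WHERE task_id = 'upstream_task';",
        'up_id': 'upstream_task',
        'xcom_key': 'state',
        'mysql_conn_id': 'mysql_default',
    },
    dag=dag)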
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    logging.info("Dumping MySQL query results to local file")
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("wb") as f:
        # the encoding argument implies csv here is unicodecsv
        # (import unicodecsv as csv), which writes encoded bytes;
        # the stdlib csv module does not accept encoding=
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
        field_dict = OrderedDict()
        for field in cursor.description:
            field_dict[field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()

        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def execute(self, context):
    logging.info('Executing: ' + str(self.sql))
    hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    hook.run(
        self.sql,
        autocommit=self.autocommit,
        parameters=self.parameters)
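# A minimal sketch of calling MySqlHook.run directly with bind parameters,
# outside of an operator; the conn id, table and values are hypothetical
# placeholders.
from airflow.hooks import MySqlHook

hook = MySqlHook(mysql_conn_id='mysql_default')
hook.run(
    "INSERT INTO audit_log (event, created_at) VALUES (%s, NOW())",
    autocommit=True,
    parameters=('dag_finished',))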
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    results = hive.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    logging.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results)
def _query_mysql(self):
    """
    Queries mysql and returns a cursor to the results.
    """
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    return cursor
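# A hypothetical companion method, sketched here to show one way the cursor
# returned by _query_mysql() could be consumed; the method name, file path and
# CSV layout are illustrative assumptions, not part of the original operator.
import csv


def _write_local_data_file(self, cursor, filename='/tmp/mysql_export.csv'):
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow([col[0] for col in cursor.description])  # header from cursor metadata
        writer.writerows(cursor)  # DB-API cursors are iterable over result rows
    cursor.close()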
def __init__(self, sql, mysql_conn_id, *args, **kwargs):
    """
    Parameters:
    mysql_conn_id: reference to a specific mysql database
    sql: the sql code to be executed
    """
    super(MySqlOperator, self).__init__(*args, **kwargs)
    self.hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    self.sql = sql
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
    logging.info("Extracting data from Hive")
    logging.info(self.sql)
    results = hive.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        logging.info(self.mysql_preoperator)
        mysql.run(self.mysql_preoperator)

    logging.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results)
def execute(self, context):
    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info("Extracting data from Presto")
    logging.info(self.sql)
    results = presto.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        logging.info(self.mysql_preoperator)
        mysql.run(self.mysql_preoperator)

    logging.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results)
def post_execute(self, context):
    try:
        # drop the '.log' suffix (str.strip would remove characters, not the suffix)
        log_filepath = context["task_instance"].log_filepath
        if log_filepath.endswith('.log'):
            log_filepath = log_filepath[:-len('.log')]

        # parse the log file into a dict
        parsed = HiveLog.parselog(log_filepath)

        # captures the name of a parent dag in a subdag routine
        dag_ids = context['dag'].dag_id.split('.')
        if len(dag_ids) > 1:
            parsed['parent_dag_id'] = dag_ids[0]
            parsed['dag_id'] = '.'.join(dag_ids[1:])
        else:
            parsed['parent_dag_id'] = context['dag'].dag_id
            parsed['dag_id'] = parsed['parent_dag_id']

        # Capture extra information from context
        parsed['task_id'] = context['task'].task_id
        parsed['input_date'] = context['ds']
        parsed['owner'] = context['task'].owner

        # Create the table if it doesn't exist
        sql_hook = MySqlHook("airflow_db")
        sql_hook.run(table_creation_string())

        # Create sql insertion strings and insert
        strings = table_insertion_strings(parsed)
        logging.info('Inserting honeypot log metadata into database')
        for add_data_to_table_string in strings:
            sql_hook.run(add_data_to_table_string)
        logging.info('Completed log data insertion')
        sql_hook.run('COMMIT;')
    except Exception as e:
        logging.error("Honeypot post executor failed")
        logging.exception(e)
def simple_select_mysql(ds, **kwargs):
    mysql_hook = MySqlHook(mysql_conn_id="soletrade_localhost")
    users_trend = """SELECT DISTINCT(email), sum(spent), count(transactions)
                     FROM orders AS o
                     GROUP BY 1;"""
    with open('temp/output.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        csvRow = ["user", "spent", "transactions"]
        wr.writerow(csvRow)
        for user, spent, transactions in mysql_hook.get_records(users_trend):
            wr.writerow([user, spent, transactions])
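# An alternative sketch of the same export, assuming pandas is installed:
# MySqlHook.get_pandas_df returns the result set as a DataFrame, which handles
# the header row and quoting. The conn id, query and output path are
# placeholders mirroring the snippet above.
from airflow.hooks import MySqlHook


def users_trend_to_csv(ds, **kwargs):
    mysql_hook = MySqlHook(mysql_conn_id="soletrade_localhost")
    df = mysql_hook.get_pandas_df(
        "SELECT email, SUM(spent) AS spent, COUNT(*) AS transactions "
        "FROM orders GROUP BY email")
    df.to_csv('temp/output.csv', index=False)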
def get_sql_hook(self, sql_conn_id):
    if 'sqlite' in sql_conn_id:
        from airflow.hooks import SqliteHook
        return SqliteHook(sql_conn_id)
    else:
        from airflow.hooks import MySqlHook
        return MySqlHook(sql_conn_id)
def execute(self, context):
    hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.data == "" and self.redis_key != "" and self.redis_conn_id != "":
        self.data = eval(self.redis_hook.get(self.redis_key))
    # print type(self.data), len(self.data), self.data[0], type(self.data[0])
    if len(self.data) <= 0:
        logging.error("Not inserting data as the provided key is empty")
        return 1
    try:
        # print "Started Exec"
        conn = hook.get_conn()
        cursor = conn.cursor()
        cursor.executemany(self.sql, self.data)
    except Exception as e:
        logging.info("Exception")
        traceback.print_exc()
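# A minimal standalone sketch of the same bulk insert via executemany, with an
# explicit commit and cleanup (the snippet above relies on the connection's
# autocommit setting); the conn id, table and rows are hypothetical
# placeholders.
from airflow.hooks import MySqlHook

hook = MySqlHook(mysql_conn_id='mysql_default')
conn = hook.get_conn()
cursor = conn.cursor()
try:
    cursor.executemany(
        "INSERT INTO events (name, value) VALUES (%s, %s)",
        [('clicks', 10), ('views', 42)])
    conn.commit()  # persist the inserts; MySQLdb does not autocommit by default
finally:
    cursor.close()
    conn.close()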
def index(self):
    sql = """
    SELECT
        a.name as db, db_location_uri as location,
        count(1) as object_count, a.desc as description
    FROM DBS a
    JOIN TBLS b ON a.DB_ID = b.DB_ID
    GROUP BY a.name, db_location_uri, a.desc
    """.format(**locals())
    h = MySqlHook(METASTORE_MYSQL_CONN_ID)
    df = h.get_pandas_df(sql)
    df.db = (
        '<a href="/admin/metastorebrowserview/db/?db=' +
        df.db + '">' + df.db + '</a>')
    table = df.to_html(
        classes="table table-striped table-bordered table-hover",
        index=False,
        escape=False,
        na_rep='')
    return self.render("metastore_browser/dbs.html", table=table)
class MySqlOperator(BaseOperator):
    """
    Executes sql code in a specific mysql database.
    """

    __mapper_args__ = {'polymorphic_identity': 'MySqlOperator'}
    template_fields = ('sql',)
    template_ext = ('.sql',)

    @apply_defaults
    def __init__(self, sql, mysql_conn_id, *args, **kwargs):
        """
        Parameters:
        mysql_conn_id: reference to a specific mysql database
        sql: the sql code to be executed
        """
        super(MySqlOperator, self).__init__(*args, **kwargs)
        self.hook = MySqlHook(mysql_conn_id=mysql_conn_id)
        self.sql = sql

    def execute(self, execution_date):
        logging.info('Executing: ' + self.sql)
        self.hook.run(self.sql)
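# A minimal sketch of instantiating the operator above inside a DAG; the dag
# id, task id, connection id and SQL statement are hypothetical placeholders.
from datetime import datetime

from airflow import DAG

dag = DAG('mysql_example', start_date=datetime(2023, 1, 1), schedule_interval='@daily')

create_table = MySqlOperator(
    task_id='create_events_table',
    mysql_conn_id='mysql_default',
    sql="CREATE TABLE IF NOT EXISTS events (id INT PRIMARY KEY, name VARCHAR(64))",
    dag=dag)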
def execute(self, context):
    hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
    logging.info("Extracting data from Hive")
    logging.info(self.sql)
    results = hive.get_records(self.sql)

    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    if self.mysql_preoperator:
        logging.info("Running MySQL preoperator")
        logging.info(self.mysql_preoperator)
        mysql.run(self.mysql_preoperator)

    logging.info("Inserting rows into MySQL")
    mysql.insert_rows(table=self.mysql_table, rows=results)

    if self.mysql_postoperator:
        logging.info("Running MySQL postoperator")
        logging.info(self.mysql_postoperator)
        mysql.run(self.mysql_postoperator)
def execute(self, context=None):
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    exprs = {
        ('', 'count'): 'COUNT(*)'
    }
    for col, col_type in field_types.items():
        d = {}
        if self.assignment_func:
            d = self.assignment_func(col, col_type)
            if d is None:
                d = self.get_default_exprs(col, col_type)
        else:
            d = self.get_default_exprs(col, col_type)
        exprs.update(d)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)
    exprs_str = ",\n        ".join([
        v + " AS " + k[0] + '__' + k[1]
        for k, v in exprs.items()])

    where_clause = [
        "{0} = '{1}'".format(k, v) for k, v in self.partition.items()]
    where_clause = " AND\n        ".join(where_clause)
    sql = """
    SELECT
        {exprs_str}
    FROM {self.table}
    WHERE
        {where_clause};
    """.format(**locals())

    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = hook.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        raise Exception("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{self.table}' AND
        partition_repr='{part_json}' AND
        dttm='{self.dttm}'
    LIMIT 1;
    """.format(**locals())
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}';
        """.format(**locals())
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    rows = [
        (self.ds, self.dttm, self.table, part_json) +
        (r[0][0], r[0][1], r[1])
        for r in zip(exprs, row)]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ]
    )
def execute(self, context):
    logging.info('Executing: ' + self.sql)
    hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    hook.run(self.sql)
def execute(self, context): logging.info("Executing: " + self.sql) hook = MySqlHook(mysql_conn_id=self.mysql_conn_id) hook.run(self.sql)
def execute(self, context=None):
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    exprs = {('', 'count'): 'COUNT(*)'}
    for col, col_type in field_types.items():
        d = {}
        if self.assignment_func:
            d = self.assignment_func(col, col_type)
            if d is None:
                d = self.get_default_exprs(col, col_type)
        else:
            d = self.get_default_exprs(col, col_type)
        exprs.update(d)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)
    exprs_str = ",\n        ".join(
        [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

    where_clause = [
        "{0} = '{1}'".format(k, v) for k, v in self.partition.items()
    ]
    where_clause = " AND\n        ".join(where_clause)
    sql = """
    SELECT
        {exprs_str}
    FROM {self.table}
    WHERE
        {where_clause};
    """.format(**locals())

    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = hook.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{self.table}' AND
        partition_repr='{part_json}' AND
        dttm='{self.dttm}'
    LIMIT 1;
    """.format(**locals())
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}';
        """.format(**locals())
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    rows = [(self.ds, self.dttm, self.table, part_json) +
            (r[0][0], r[0][1], r[1])
            for r in zip(exprs, row)]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ])