def execute(self, context): hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) logging.info("Extracting data from Hive") logging.info(self.sql) if self.bulk_load: tmpfile = NamedTemporaryFile() hive.to_csv(self.sql, tmpfile.name, delimiter='\t', lineterminator='\n', output_header=False) else: results = hive.get_records(self.sql) mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id) if self.mysql_preoperator: logging.info("Running MySQL preoperator") mysql.run(self.mysql_preoperator) logging.info("Inserting rows into MySQL") if self.bulk_load: mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name) tmpfile.close() else: mysql.insert_rows(table=self.mysql_table, rows=results) if self.mysql_postoperator: logging.info("Running MySQL postoperator") mysql.run(self.mysql_postoperator) logging.info("Done.")
def execute(self, context): hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) logging.info("Extracting data from Hive") logging.info(self.sql) if self.bulk_load: tmpfile = NamedTemporaryFile() hive.to_csv(self.sql, tmpfile.name, delimiter='\t', lineterminator='\n', output_header=False) else: results = hive.get_records(self.sql) mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id) if self.mysql_preoperator: logging.info("Running MySQL preoperator") mysql.run(self.mysql_preoperator) logging.info("Inserting rows into MySQL") if self.bulk_load: mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name) tmpfile.close() else: mysql.insert_rows(table=self.mysql_table, rows=results) if self.mysql_postoperator: logging.info("Running MySQL postoperator") mysql.run(self.mysql_postoperator) logging.info("Done.")
def test_mysql_hook_test_bulk_load(self): records = ("foo", "bar", "baz") import tempfile with tempfile.NamedTemporaryFile() as t: t.write("\n".join(records).encode('utf8')) t.flush() from airflow.hooks.mysql_hook import MySqlHook h = MySqlHook('airflow_ci') with h.get_conn() as c: c.execute(""" CREATE TABLE IF NOT EXISTS test_airflow ( dummy VARCHAR(50) ) """) c.execute("TRUNCATE TABLE test_airflow") h.bulk_load("test_airflow", t.name) c.execute("SELECT dummy FROM test_airflow") results = tuple(result[0] for result in c.fetchall()) self.assertEqual(sorted(results), sorted(records))
def test_mysql_hook_test_bulk_load(self): records = ("foo", "bar", "baz") import tempfile with tempfile.NamedTemporaryFile() as t: t.write("\n".join(records).encode('utf8')) t.flush() from airflow.hooks.mysql_hook import MySqlHook h = MySqlHook('airflow_ci') with h.get_conn() as c: c.execute(""" CREATE TABLE IF NOT EXISTS test_airflow ( dummy VARCHAR(50) ) """) c.execute("TRUNCATE TABLE test_airflow") h.bulk_load("test_airflow", t.name) c.execute("SELECT dummy FROM test_airflow") results = tuple(result[0] for result in c.fetchall()) self.assertEqual(sorted(results), sorted(records))
def readS3FilesAndLoadtoMySql(**kwargs): """ Read Data from S3 Files and load to Mysql :param kwargs: :return: """ s3_files = fetchFilesBasedonPattern(**kwargs) tmp_trg_file_path = "/tmp/s3mysqlload_" + str(round(datetime.now().timestamp())) + "/" if s3_files is None: raise Exception("No Files are Available to process") else: files_df = pd.DataFrame() print(type(s3_files)) data = ",".join(s3_files) kwargs['ti'].xcom_push(key='s3_data_files', value=data) s3_client = generateS3Hook(kwargs["aws_conn_id"]) for path in s3_files: file_name = getFileName(path) if (file_name.lower().__contains__(".csv")): files_df = files_df.append(pd.read_csv( io.BytesIO(s3_client.get_key(key=path, bucket_name=kwargs['src_bucket']).get()['Body'].read()))) elif file_name.lower().__contains__(".json"): files_df = files_df.append(pd.read_json( io.BytesIO(s3_client.get_key(key=path, bucket_name=kwargs['src_bucket']).get()['Body'].read()))) if len(files_df) > 0: if not os.path.exists(tmp_trg_file_path): os.makedirs(tmp_trg_file_path) file_path = tmp_trg_file_path + str(round(datetime.now().timestamp())) + ".tsv" files_df.to_csv(file_path, sep="\t", index=False,header=False,line_terminator="\n") mysql_client = MySqlHook(mysql_conn_id=kwargs["mysql_conn"]) mysql_client.bulk_load(table=kwargs["schema"]+"."+kwargs["table"], tmp_file=file_path) shutil.rmtree(tmp_trg_file_path) else: raise Exception("Source Files are Empty")
def bulk_load_sql(table_name, **kwargs): local_filepath = kwargs['ti'].xcom_pull(task_ids='download_file') conn = MySqlHook(conn_name_attr='ib_sql') conn.bulk_load(table_name, local_filepath) return table_name
class LoadOltpOperator(BaseOperator): """ This operator is for updating/deleting/inserting records in MySQL oltp table First it finds keys for records from source .csv Then it deletes records from oltp table with those keys Then it removes all records from dataframe that have action flag "C" -> closed Then it writes modified dataframe to .csv file with tab-delimited format Then it bulk loads that .csv file into MySQL oltp table :param src_file: Source .csv file path :type src_file: str :param src_file_attr: source table metadata :type src_file_attr: dictionary :param tgt_table_conn_id: reference to the oltp MySQL database :type tgt_table_conn_id: string :param db_table: target table :type db_table: str :param df: Final dataframe which will be written to .csv :type df: pandas dataframe :param output_filename: Full name of output file :type output_filename: .csv """ #template_fields = ('sql', 'parameters', 'mysql_table') #template_ext = ('.sql',) ui_color = '#ededed' @apply_defaults def __init__(self, src_file="/usr/local/airflow/csv_lz_2/cleaned_depts.csv", src_file_attr={ "action": "action", "key": "department_id" }, tgt_table_conn_id='mysql_default', db_table=None, df=None, output_filename="/usr/local/airflow/csv_lz_2/depts_td.csv", *args, **kwargs): super(LoadOltpOperator, self).__init__(*args, **kwargs) self.src_file = src_file self.src_file_attr = src_file_attr self.tgt_table_conn_id = tgt_table_conn_id self.db_table = db_table self.conn = MySqlHook(conn_name_attr=tgt_table_conn_id) self.output_filename = output_filename def read_into_df(self, file_): """ Returns pandas df from .csv file """ return pd.read_csv(file_, sep=",", encoding="utf-8") def keys_as_tuples(self, df): """ Returns keys from dataframe as array of tuples This will be needed to delete records from MySql table """ keys_df = df[[self.src_file_attr["key"]]] return [tuple(x) for x in keys_df.values] def delete_db_records(self, keys): """ Deletes records from MySQL table """ delete_sql = """DELETE FROM {table_name} WHERE {key} = %s""".format( table_name=self.db_table, key=self.src_file_attr["key"]) for key in keys: self.conn.run(delete_sql, autocommit=True, parameters=key) def write_to_file(self, df): """ First drop "action" column, then write dataframe to .csv file """ df.drop(self.src_file_attr["action"], axis=1, inplace=True) df.to_csv(self.output_filename, sep='\t', header=False, index=False, encoding='utf-8') def execute(self, context): """ Read src file into data frame Return keys as an array of tuples Delete records by key from db Filter out closed records from dataframe """ logging.info('Executing: Reading ' + self.src_file + ' into dataframe') self.df = self.read_into_df(self.src_file) logging.info('Executing: Create tuples for keys ') keys = self.keys_as_tuples(self.df) logging.info('Executing: Deleting records from table ' + self.db_table) self.delete_db_records(keys) logging.info('Executing: Filter out closed records') self.df = self.df.loc[self.df[self.src_file_attr["action"]] != 'C'] # func: write dataframe into tab-delimited .csv file self.write_to_file(self.df) self.conn.bulk_load(self.db_table, self.output_filename)