def test_convert_type(self, value, schema_type, expected):
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        mysql_conn_id=MYSQL_CONN_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=JSON_FILENAME)
    self.assertEqual(op.convert_type(value, schema_type), expected)
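# A minimal sketch of the parametrization that typically drives the test
# above; the decorator and the sample (value, schema_type, expected) triples
# are illustrative assumptions, and the exact conversions vary across Airflow
# versions.
import decimal

from parameterized import parameterized

@parameterized.expand([
    (None, 'STRING', None),                   # NULLs pass through unchanged
    ('string', 'STRING', 'string'),           # plain strings pass through
    (decimal.Decimal('5.5'), 'FLOAT', 5.5),   # Decimals become floats
])
def test_convert_type(self, value, schema_type, expected):
    ...  # body as above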
def test_exec_success_csv_with_delimiter(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test successful run of execute function for CSV with a field delimiter"""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        mysql_conn_id=MYSQL_CONN_ID,
        sql=SQL,
        export_format='csv',
        field_delimiter='|',
        bucket=BUCKET,
        filename=CSV_FILENAME)

    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type=None):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(CSV_FILENAME.format(0), obj)
        self.assertEqual('text/csv', mime_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(b''.join(CSV_LINES_PIPE_DELIMITED), f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)

    mysql_hook_mock_class.assert_called_once_with(mysql_conn_id=MYSQL_CONN_ID)
    mysql_hook_mock.get_conn().cursor().execute.assert_called_once_with(SQL)
def test_exec_success_json(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test successful run of execute function for JSON"""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        mysql_conn_id=MYSQL_CONN_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=JSON_FILENAME)

    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type=None):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(JSON_FILENAME.format(0), obj)
        self.assertEqual('application/json', mime_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(b''.join(NDJSON_LINES), f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)

    mysql_hook_mock_class.assert_called_once_with(mysql_conn_id=MYSQL_CONN_ID)
    mysql_hook_mock.get_conn().cursor().execute.assert_called_once_with(SQL)
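# A minimal sketch of the module-level fixtures these tests lean on (TASK_ID,
# ROWS, NDJSON_LINES, and friends). All concrete values here are illustrative
# assumptions, not the original constants from the test module.
TASK_ID = 'test-mysql-to-gcs'
MYSQL_CONN_ID = 'mysql_conn_test'
SQL = 'select 1'
BUCKET = 'gs_bucket'
JSON_FILENAME = 'test_{}.ndjson'
CSV_FILENAME = 'test_{}.csv'
SCHEMA_FILENAME = 'schema_test.json'

ROWS = [('row1', 42), ('row2', 43), ('row3', 44)]
CURSOR_DESCRIPTION = (
    ('some_str', 0, 0, 0, 0, 0, False),   # DB-API 7-tuples; name comes first
    ('some_num', 3, 0, 0, 0, 0, False),
)
NDJSON_LINES = [
    b'{"some_num": 42, "some_str": "row1"}\n',
    b'{"some_num": 43, "some_str": "row2"}\n',
    b'{"some_num": 44, "some_str": "row3"}\n',
]
CSV_LINES = [
    b'some_str,some_num\r\n',
    b'row1,42\r\n',
    b'row2,43\r\n',
    b'row3,44\r\n',
]
CSV_LINES_PIPE_DELIMITED = [line.replace(b',', b'|') for line in CSV_LINES]
SCHEMA_JSON = [
    b'[{"mode": "NULLABLE", "name": "some_str", "type": "STRING"}, ',
    b'{"mode": "NULLABLE", "name": "some_num", "type": "INTEGER"}]',
]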
def test_write_local_data_files():
    # Configure
    task_id = "some_test_id"
    sql = "some_sql"
    bucket = "some_bucket"
    filename = "some_filename"
    row_iter = [[1, b'byte_str_1'], [2, b'byte_str_2']]
    schema = [
        {'name': 'location', 'type': 'STRING', 'mode': 'nullable'},
        {'name': 'uuid', 'type': 'BYTES', 'mode': 'nullable'},
    ]
    schema_str = json.dumps(schema)
    op = MySqlToGoogleCloudStorageOperator(
        task_id=task_id,
        sql=sql,
        bucket=bucket,
        filename=filename,
        schema=schema_str)
    cursor_mock = MagicMock()
    # __iter__ must return an iterator, not a bare list, or iterating over the
    # mock raises TypeError.
    cursor_mock.__iter__.return_value = iter(row_iter)

    # Run
    op._write_local_data_files(cursor_mock)
def test_schema_file(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test writing schema files."""
    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):  # pylint: disable=unused-argument
        if obj == SCHEMA_FILENAME:
            self.assertFalse(gzip)
            with open(tmp_filename, 'rb') as file:
                self.assertEqual(b''.join(SCHEMA_JSON), file.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=JSON_FILENAME,
        schema_filename=SCHEMA_FILENAME)
    op.execute(None)

    # once for the file and once for the schema
    self.assertEqual(2, gcs_hook_mock.upload.call_count)
def test_exec_success_csv_ensure_utc(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test successful run of execute function for CSV with ensure_utc set."""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        mysql_conn_id=MYSQL_CONN_ID,
        sql=SQL,
        export_format='CSV',
        bucket=BUCKET,
        filename=CSV_FILENAME,
        ensure_utc=True)

    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type=None, gzip=False):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(CSV_FILENAME.format(0), obj)
        self.assertEqual('text/csv', mime_type)
        self.assertFalse(gzip)
        with open(tmp_filename, 'rb') as file:
            self.assertEqual(b''.join(CSV_LINES), file.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)

    mysql_hook_mock_class.assert_called_once_with(mysql_conn_id=MYSQL_CONN_ID)
    mysql_hook_mock.get_conn().cursor().execute.assert_has_calls(
        [mock.call(TZ_QUERY), mock.call(SQL)])
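# TZ_QUERY is presumably the session-level statement the operator issues ahead
# of the export query when ensure_utc=True, so DATETIME columns come back in
# UTC. A likely definition (an assumption -- the constant is defined elsewhere
# in the test module):
TZ_QUERY = "SET time_zone = '+00:00'"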
def test_file_splitting(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test that ndjson is split by approx_max_file_size_bytes param."""
    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value
    expected_upload = {
        JSON_FILENAME.format(0): b''.join(NDJSON_LINES[:2]),
        JSON_FILENAME.format(1): NDJSON_LINES[2],
    }

    def _assert_upload(bucket, obj, tmp_filename, mime_type=None):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual('application/json', mime_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(expected_upload[obj], f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=JSON_FILENAME,
        approx_max_file_size_bytes=len(expected_upload[JSON_FILENAME.format(0)]))
    op.execute(None)
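# Why the split lands where it does: approx_max_file_size_bytes is set to the
# byte length of exactly the first two ndjson lines, so the temp file reaches
# the threshold after row two and the third row spills into a second file.
# A sketch of the arithmetic, using the illustrative NDJSON_LINES above:
chunk = len(b''.join(NDJSON_LINES[:2]))      # threshold passed to the operator
assert len(b''.join(NDJSON_LINES)) > chunk   # the third line must spill over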
def test_exec_success_csv(self, gcs_hook_mock_class, mysql_hook_mock_class):
    """Test the execute function in case where the run is successful."""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        mysql_conn_id=MYSQL_CONN_ID,
        sql=SQL,
        export_format='csv',
        bucket=BUCKET,
        filename=FILENAME)

    mysql_hook_mock = mysql_hook_mock_class.return_value
    mysql_hook_mock.get_conn().cursor().__iter__.return_value = iter(ROWS)
    mysql_hook_mock.get_conn().cursor().description = CURSOR_DESCRIPTION

    gcs_hook_mock = gcs_hook_mock_class.return_value

    def _assert_upload(bucket, obj, tmp_filename, mime_type):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(FILENAME.format(0), obj)
        self.assertEqual('text/csv', mime_type)
        with open(tmp_filename, 'rb') as f:
            self.assertEqual(b''.join(CSV_LINES), f.read())

    gcs_hook_mock.upload.side_effect = _assert_upload

    op.execute(None)

    mysql_hook_mock_class.assert_called_once_with(mysql_conn_id=MYSQL_CONN_ID)
    mysql_hook_mock.get_conn().cursor().execute.assert_called_once_with(SQL)
def gen_export_table_task(table_config):
    export_task = MySqlToGoogleCloudStorageOperator(
        task_id='export_{}'.format(table_config.params['export_table']),
        dag=dag,
        sql=table_config.params['export_query'],
        bucket=table_config.params['export_bucket'],
        filename="cloudsql_to_bigquery/{}/{}".format(
            table_config.params['export_database'],
            table_config.params['export_table']) + "_{}",
        schema_filename="cloudsql_to_bigquery/schema/{}/schema_raw".format(
            table_config.params['export_table']),
        mysql_conn_id="gcp_dvh_cloudsql")
    export_task.doc_md = """\
#### Export table from cloudsql to cloud storage
task documentation
"""
    return export_task
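# A hypothetical call site for the helper above; `table_configs` and the shape
# of its elements (objects exposing a `params` dict) are assumptions for
# illustration.
export_tasks = [gen_export_table_task(config) for config in table_configs]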
def test_init(self):
    """Test MySqlToGoogleCloudStorageOperator instance is properly initialized."""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID, sql=SQL, bucket=BUCKET, filename=FILENAME)
    self.assertEqual(op.task_id, TASK_ID)
    self.assertEqual(op.sql, SQL)
    self.assertEqual(op.bucket, BUCKET)
    self.assertEqual(op.filename, FILENAME)
def test_init(self):
    """Test MySqlToGoogleCloudStorageOperator instance is properly initialized."""
    op = MySqlToGoogleCloudStorageOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=JSON_FILENAME,
        export_format='CSV',
        field_delimiter='|')
    self.assertEqual(op.task_id, TASK_ID)
    self.assertEqual(op.sql, SQL)
    self.assertEqual(op.bucket, BUCKET)
    self.assertEqual(op.filename, JSON_FILENAME)
    self.assertEqual(op.export_format, 'csv')
    self.assertEqual(op.field_delimiter, '|')
def test_write_local_data_files(self):
    # Configure
    task_id = "some_test_id"
    sql = "some_sql"
    bucket = "some_bucket"
    filename = "some_filename"
    schema = "some_schema"
    description_list = [['col_integer'], ['col_byte']]
    row_iter = [[1, b'byte_str_1'], [2, b'byte_str_2']]
    op = MySqlToGoogleCloudStorageOperator(
        task_id=task_id,
        sql=sql,
        bucket=bucket,
        filename=filename,
        schema=schema)
    cursor_mock = MagicMock()
    cursor_mock.description = description_list
    # __iter__ must return an iterator, not a bare list, or iterating over the
    # mock raises TypeError.
    cursor_mock.__iter__.return_value = iter(row_iter)

    # Run
    op._write_local_data_files(cursor_mock)
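# The Run step above discards the return value. A possible assertion-bearing
# variant, assuming _write_local_data_files returns a dict mapping target GCS
# object names to local temp file handles (true in several Airflow releases;
# verify against the version under test):
files = op._write_local_data_files(cursor_mock)
assert filename in files  # single un-split output file, keyed by `filename`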
yesterday = datetime.combine(
    datetime.today() - timedelta(days=1),
    datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


def get_composer_gcs_bucket():
    return os.environ['GCS_BUCKET']


with DAG(dag_id='26_sql_operators',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:
    sql_operators = MySqlToGoogleCloudStorageOperator(
        task_id='sql_operators',
        sql='SELECT * FROM PERSONS;',
        bucket='{{ get_composer_gcs_bucket() }}',
        filename='data/26_sql_operators/{{ execution_date }}/persons.json',
        mysql_conn_id='workshop_sql_conn_id',
        dag=dag)
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("study_pn_campaign", default_args=default_args, schedule_interval=timedelta(1))

extract_recent_questions = MySqlToGoogleCloudStorageOperator(
    task_id="extract_recent_questions",
    dag=dag,
    sql="""
        SELECT qid, author_uid as uid, created
        FROM `qa_question`
        WHERE created >= 1605606503 AND created < 1605692903
    """,
    bucket="pipeline-pn-campaign",
    filename="{{ ds }}/recent_questions.csv",
    mysql_conn_id='gotit_study_mysql',
    google_cloud_storage_conn_id='gotit_analytics_gc',
    export_format='csv'
)

extract_recently_active_users = MySqlToGoogleCloudStorageOperator(
    task_id="extract_recently_active_users",
    dag=dag,
    sql="""
        SELECT DISTINCT vc_account.uid as uid, vc_account.balance as balance
        FROM qa_question
        JOIN vc_account ON qa_question.author_uid = vc_account.uid
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'cloud_sql_proxy_to_gcs',
    default_args=default_args,
    description='Cloud SQL Proxy to GCS',
    schedule_interval=timedelta(days=1),
)

SQL = 'select * from sundar.supplier'

t1 = MySqlToGoogleCloudStorageOperator(
    mysql_conn_id="sundar-sql-con",
    task_id="sample_test_conn",
    bucket="hackathon-demo-238019",
    filename="test.json",
    google_cloud_storage_conn_id="sundar-gcs-con",
    export_format="json",
    sql=SQL,
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
    bash_command='sleep 5',
    dag=dag)

t1 >> t2
slack_notify = SlackAPIPostOperator(
    task_id='slack_notify',
    token='xxxxxx',
    channel='data-status',
    username='******',
    text='Successfully performed sakila ETL operation',
    dag=dag)

for connection in sakila_connections:
    for table in sakila_tables:
        extract = MySqlToGoogleCloudStorageOperator(
            task_id="extract_mysql_%s_%s" % (connection, table),
            mysql_conn_id=connection,
            google_cloud_storage_conn_id='gcp_test',
            sql="SELECT *, '%s' as source FROM sakila.%s" % (connection, table),
            bucket='ghen-airflow',
            filename="%s/%s/%s{}.json" % (connection, table, table),
            schema_filename="%s/schemas/%s.json" % (connection, table),
            dag=dag)

        load = GoogleCloudStorageToBigQueryOperator(
            task_id="load_bq_%s_%s" % (connection, table),
            bigquery_conn_id='gcp_test',
            google_cloud_storage_conn_id='gcp_test',
            bucket='ghen-airflow',
            destination_project_dataset_table="spark-test-173322.%s.%s" % (connection, table),
            source_objects=["%s/%s/%s*.json" % (connection, table, table)],
            schema_object="%s/schemas/%s.json" % (connection, table),
            source_format='NEWLINE_DELIMITED_JSON',
gcp_connection_id = "google_cloud_default"

default_args = {'owner': 'airflow', 'start_date': datetime(2019, 3, 21)}


def query_to_join_posts_with_users():
    return ('SELECT p.owner_user_id as user_id, u.display_name as user_name, '
            'p.score as post_score, p.title as post_title '
            'FROM `demo_temp.stackoverflow_posts` as p '
            'join `demo_temp.stackoverflow_users` as u on p.owner_user_id = u.id')


with DAG('stackoverflow', default_args=default_args, schedule_interval="*/10 * * * *") as dag:
    extract_posts_data = MySqlToGoogleCloudStorageOperator(
        task_id='extract_posts_data',
        sql='SELECT * FROM stackoverflow_posts',
        export_format='JSON',
        bucket='demo-sample',
        filename='so_posts.json')

    extract_users_data = PostgresToGoogleCloudStorageOperator(
        task_id='extract_users_data',
        sql='SELECT * FROM stackoverflow_users',
        bucket='demo-sample',
        filename='so_users.json')

    move_users_data_to_staging = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='move_users_data_to_staging',
        bucket='demo-sample',
        source_objects=['so_users.json'],
        write_disposition='WRITE_TRUNCATE',
        schema_fields=[{
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

bucket_name = 'rendy-test'  # change to your bucket name here
sales_filename = 'sales_export.json'
sales_schema_filename = 'sales_schema.json'

with DAG('devfest2020', schedule_interval=timedelta(days=1), default_args=default_args) as dag:
    extract = MySqlToGoogleCloudStorageOperator(
        task_id='extract',
        sql='SELECT * FROM test_db.sales_table st',  # change to your mysql table
        bucket=bucket_name,
        filename=sales_filename,
        schema_filename=sales_schema_filename,
        mysql_conn_id='devfest2020',  # change to your mysql connection id
    )

    load = GoogleCloudStorageToBigQueryOperator(
        task_id='load',
        destination_project_dataset_table='project.rendy_test.sales',  # change to your bq
        bucket=bucket_name,
        source_objects=[sales_filename],
        schema_object=sales_schema_filename,
        write_disposition='WRITE_TRUNCATE',
        create_disposition='CREATE_IF_NEEDED',
        source_format='NEWLINE_DELIMITED_JSON')
def test_convert_type(self, value, schema_type, expected):
    self.assertEqual(
        MySqlToGoogleCloudStorageOperator._convert_type(value, schema_type),
        expected)
from airflow import DAG
from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'mikeghen',
    'start_date': datetime(2017, 8, 11),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('mysql_to_gcs', default_args=default_args)

export_actor = MySqlToGoogleCloudStorageOperator(
    task_id='extract_actors',
    mysql_conn_id='sakila_test',
    google_cloud_storage_conn_id='gcp_test',
    sql='SELECT * FROM sakila.actor',
    bucket='ghen-airflow',
    filename='sakila/actors/actors{}.json',
    schema_filename='sakila/schemas/actors.json',
    dag=dag)
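# The '{}' in `filename` above is a file-index placeholder: when an export
# grows past approx_max_file_size_bytes, the operator uploads actors0.json,
# actors1.json, and so on. A quick illustration:
print('sakila/actors/actors{}.json'.format(0))  # sakila/actors/actors0.json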