def test_execute_no_suffix_without_destination_object(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_2,
        destination_bucket=DESTINATION_BUCKET)

    operator.execute(None)
    mock_hook.return_value.copy.assert_has_calls(MOCK_CALLS_EMPTY)
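These test methods are excerpts from a unittest.TestCase and reference module-level constants and a patched hook that are not shown. A minimal sketch of plausible fixtures, assuming the Airflow 1.10 contrib module path; the names and values below are illustrative, not copied from the real test module.

# Hypothetical fixtures for the test excerpts; the real module defines its own.
import unittest
from datetime import datetime
from unittest import mock

TASK_ID = 'test-gcs-to-gcs-operator'
TEST_BUCKET = 'test-bucket'
DESTINATION_BUCKET = 'archive-bucket'
DESTINATION_OBJECT_PREFIX = 'foo/bar'
SOURCE_OBJECT_NO_WILDCARD = 'test_object.txt'
SOURCE_OBJECT_WILDCARD_FILENAME = 'test_object/*.txt'
SOURCE_OBJECT_WILDCARD_MIDDLE = 'test*object'
SOURCE_OBJECT_MULTIPLE_WILDCARDS = 'csv/*/test_*.csv'
SOURCE_FILES_LIST = [
    'test_object/file1.txt',
    'test_object/file2.txt',
    'test_object/file3.json',
]
MOD_TIME_1 = datetime(2016, 1, 1)
WILDCARD = '*'


# Each test method receives the patched hook class as `mock_hook`.
@mock.patch('airflow.contrib.operators.gcs_to_gcs.GoogleCloudStorageHook')
class GoogleCloudStorageToGoogleCloudStorageOperatorTest(unittest.TestCase):
    ...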
def test_execute_prefix_and_suffix(self, mock_hook):
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_MIDDLE,
        destination_bucket=DESTINATION_BUCKET)

    operator.execute(None)
    mock_hook.return_value.list.assert_called_once_with(
        TEST_BUCKET, prefix="test", delimiter="object"
    )
def test_no_prefix_with_last_modified_time_with_false_cond(self, mock_hook):
    mock_hook.return_value.is_updated_after.return_value = False
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        last_modified_time=MOD_TIME_1)

    operator.execute(None)
    mock_hook.return_value.rewrite.assert_not_called()
def test_execute_no_prefix_with_no_last_modified_time(self, mock_hook):
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=SOURCE_OBJECT_NO_WILDCARD,
        last_modified_time=None)

    operator.execute(None)
    mock_hook.return_value.rewrite.assert_called_once_with(
        TEST_BUCKET, 'test_object.txt', DESTINATION_BUCKET, 'test_object.txt')
def transfer_data_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    source_bucket,
    destination_bucket,
    destination_bucket_prefix,
    app_name,
    submission_date,
    server_id,
    public_key_hex_external,
    google_cloud_storage_conn_id,
):
    """Copy the partitioned data from the staging bucket into the
    corresponding receiving bucket of each processor. The job then writes a
    `_SUCCESS` marker file to indicate that the data is ready for processing.

    See the tests for the staging job for the preprocessing convention:
    https://github.com/mozilla/prio-processor/blob/3cdc368707f8dc0f917d7b3d537c31645f4260f7/processor/tests/test_staging.py#L190-L205

    See the `bin/generate` script for an example of the processing conventions:
    https://github.com/mozilla/prio-processor/blob/1a4a58a738c3d39bfb04bbaa33a323412f1398ec/bin/generate#L53-L67
    """
    with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
        prefix = "/".join([
            destination_bucket_prefix,
            public_key_hex_external,
            app_name,
            submission_date,
            "raw/shares",
        ])
        transfer_dataset = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id="transfer_dataset",
            source_bucket=source_bucket,
            source_object=f"staging/submission_date={submission_date}/server_id={server_id}/*",
            destination_bucket=destination_bucket,
            destination_object=f"{prefix}/",
            google_cloud_storage_conn_id=google_cloud_storage_conn_id,
            dag=dag,
        )
        mark_dataset_success = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id="mark_dataset_success",
            source_bucket=source_bucket,
            source_object="staging/_SUCCESS",
            destination_bucket=destination_bucket,
            destination_object=f"{prefix}/_SUCCESS",
            google_cloud_storage_conn_id=google_cloud_storage_conn_id,
            dag=dag,
        )
        transfer_dataset >> mark_dataset_success

    return dag
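A factory function like this would normally be mounted in a parent DAG via SubDagOperator. A minimal sketch of that wiring, assuming Airflow 1.10 imports; the parent DAG id, schedule, bucket names, and connection id below are illustrative, not taken from the source.

# Illustrative wiring only; real bucket names, schedule, and values will differ.
from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

default_args = {"owner": "airflow", "start_date": datetime(2019, 6, 17)}

with DAG("prio_processor",
         default_args=default_args,
         schedule_interval="@daily") as parent_dag:
    transfer_a = SubDagOperator(
        task_id="transfer_a",
        subdag=transfer_data_subdag(
            parent_dag_name=parent_dag.dag_id,
            child_dag_name="transfer_a",
            default_args=default_args,
            source_bucket="example-prio-admin",        # assumed bucket name
            destination_bucket="example-prio-a",       # assumed bucket name
            destination_bucket_prefix="data-share",
            app_name="origin-telemetry",
            submission_date="{{ ds }}",
            server_id="a",
            public_key_hex_external="{{ var.value.public_key_hex_external }}",
            google_cloud_storage_conn_id="google_cloud_prio_admin",
        ),
    )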
def test_wc_with_last_modified_time_with_one_true_cond(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    mock_hook.return_value.is_updated_after.side_effect = [True, False, False]
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET,
        last_modified_time=MOD_TIME_1)

    operator.execute(None)
    mock_hook.return_value.rewrite.assert_called_once_with(
        TEST_BUCKET, 'test_object/file1.txt',
        DESTINATION_BUCKET, 'test_object/file1.txt')
def test_execute_more_than_1_wildcard(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_MULTIPLE_WILDCARDS,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=DESTINATION_OBJECT_PREFIX)

    total_wildcards = operator.source_object.count(WILDCARD)
    error_msg = "Only one wildcard '[*]' is allowed in source_object parameter. " \
                "Found {}".format(total_wildcards)

    with self.assertRaisesRegexp(AirflowException, error_msg):
        operator.execute(None)
def test_execute_with_empty_destination_bucket(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_NO_WILDCARD,
        destination_bucket=None,
        destination_object=DESTINATION_OBJECT_PREFIX)

    with patch.object(operator.log, 'warning') as mock_warn:
        operator.execute(None)
        mock_warn.assert_called_with(
            'destination_bucket is None. Defaulting it to source_bucket (%s)',
            TEST_BUCKET
        )
        self.assertEqual(operator.destination_bucket, operator.source_bucket)
def test_execute_wildcard_without_destination_object(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET)

    operator.execute(None)

    mock_calls_none = [
        mock.call(TEST_BUCKET, 'test_object/file1.txt',
                  DESTINATION_BUCKET, 'test_object/file1.txt'),
        mock.call(TEST_BUCKET, 'test_object/file2.txt',
                  DESTINATION_BUCKET, 'test_object/file2.txt'),
    ]
    mock_hook.return_value.rewrite.assert_has_calls(mock_calls_none)
def test_execute_more_than_1_wildcard(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_MULTIPLE_WILDCARDS,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=DESTINATION_OBJECT_PREFIX)

    total_wildcards = operator.source_object.count(WILDCARD)
    error_msg = "Only one wildcard '[*]' is allowed in source_object parameter. " \
                "Found {}".format(total_wildcards)

    with six.assertRaisesRegex(self, AirflowException, error_msg):
        operator.execute(None)
def test_execute_wildcard_with_destination_object(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_4,
        destination_bucket=DESTINATION_BUCKET,
        destination_object=DESTINATION_OBJECT_PREFIX)

    operator.execute(None)

    mock_calls = [
        mock.call(TEST_BUCKET, 'test_object/file1.txt',
                  DESTINATION_BUCKET, 'foo/bar/file1.txt'),
        mock.call(TEST_BUCKET, 'test_object/file2.txt',
                  DESTINATION_BUCKET, 'foo/bar/file2.txt'),
    ]
    mock_hook.return_value.copy.assert_has_calls(mock_calls)
def test_wc_with_no_last_modified_time(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_4,
        destination_bucket=DESTINATION_BUCKET,
        last_modified_time=None)

    operator.execute(None)

    mock_calls_none = [
        mock.call(TEST_BUCKET, 'test_object/file1.txt',
                  DESTINATION_BUCKET, 'test_object/file1.txt'),
        mock.call(TEST_BUCKET, 'test_object/file2.txt',
                  DESTINATION_BUCKET, 'test_object/file2.txt'),
    ]
    mock_hook.return_value.rewrite.assert_has_calls(mock_calls_none)
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )
    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )
    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )
    t1 >> t2 >> t3 >> next_task
    return t1
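The `_create_merge_sql` helper referenced above is not part of this excerpt. A minimal sketch under the assumption that `schema` is a list of BigQuery schema dicts with a "name" key and that the first field serves as the merge key; the real helper may build its statement differently.

def _create_merge_sql(staging_table, warehouse_table, schema):
    # Hypothetical sketch: upsert every column, joining on the first column.
    columns = [field["name"] for field in schema]
    key = columns[0]
    update_clause = ", ".join(f"T.{col} = S.{col}" for col in columns)
    insert_columns = ", ".join(columns)
    insert_values = ", ".join(f"S.{col}" for col in columns)
    return f"""
    MERGE `{warehouse_table}` T
    USING `{staging_table}` S
    ON T.{key} = S.{key}
    WHEN MATCHED THEN
      UPDATE SET {update_clause}
    WHEN NOT MATCHED THEN
      INSERT ({insert_columns}) VALUES ({insert_values})
    """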
def test_execute_wildcard_with_destination_object_retained_prefix(self, mock_hook):
    mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
    operator = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=TASK_ID, source_bucket=TEST_BUCKET,
        source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
        destination_bucket=DESTINATION_BUCKET,
        destination_object='{}/{}'.format(DESTINATION_OBJECT_PREFIX,
                                          SOURCE_OBJECT_WILDCARD_SUFFIX[:-1])
    )

    operator.execute(None)

    mock_calls_retained = [
        mock.call(TEST_BUCKET, 'test_object/file1.txt',
                  DESTINATION_BUCKET, 'foo/bar/test_object/file1.txt'),
        mock.call(TEST_BUCKET, 'test_object/file2.txt',
                  DESTINATION_BUCKET, 'foo/bar/test_object/file2.txt'),
    ]
    mock_hook.return_value.rewrite.assert_has_calls(mock_calls_retained)
args = {
    'start_date': datetime(2019, 6, 17),
    'provide_context': True,
    'owner': 'airflow',
    'depends_on_past': True
}

dag = DAG(
    dag_id='create_file',
    default_args=args,
    schedule_interval=None,
)

with dag:
    createfile = BashOperator(
        task_id="create_file",
        bash_command="touch /home/airflow/gcs/data/trigger.trg")

    copytobucket = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id="copy_to_bucket",
        source_bucket="your_cloud_composer_bucket",
        source_object="data/trigger.trg",
        destination_bucket="your_cloud_composer_bucket",
        destination_object="data/sense_for_me.txt",
        google_cloud_storage_conn_id="google_cloud_storage_default",
        move_object=True)  # pass a boolean, not the string "true"

    deletefile = BashOperator(
        task_id="delete_file",
        bash_command="rm /home/airflow/gcs/data/sense_for_me.txt")

    createfile >> copytobucket >> deletefile
clean_processor_b = PythonOperator(
    task_id="clean_processor_b",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_B,
        "shared_bucket": BUCKET_SHARED_B,
        "google_cloud_storage_conn_id": PRIO_B_CONN,
    },
    dag=dag,
)

load_processor_a = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="load_processor_a",
    source_bucket=BUCKET_DATA_ADMIN,
    source_object="staging/submission_date={{ ds }}/server_id=a/*",
    destination_bucket=BUCKET_PRIVATE_A,
    destination_object="raw/submission_date={{ ds }}/",
    google_cloud_storage_conn_id=PRIO_ADMIN_CONN,
    dag=dag,
)

load_processor_b = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="load_processor_b",
    source_bucket=BUCKET_DATA_ADMIN,
    source_object="staging/submission_date={{ ds }}/server_id=b/*",
    destination_bucket=BUCKET_PRIVATE_B,
    destination_object="raw/submission_date={{ ds }}/",
    google_cloud_storage_conn_id=PRIO_ADMIN_CONN,
    dag=dag,
)
# scenario will start right from here!
# t2 = SFTPOperator(
#     task_id='transfer-to-incoming',
# )

# Listen to the incoming folder with a sensor
t3 = GoogleCloudStoragePrefixSensor(
    task_id='listen-incoming-file',
    bucket='datalake-datasets-123',
    prefix='incoming/sales_transactions_*')
# TODO: a better file structure could be defined, such as a monthly
# aggregation like datalake/sales/05/sales_transactions_*

# Copy from incoming to the datalake for raw data storage
t4 = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='copy-to-datalake',
    source_bucket=INGESTION_BUCKET_NAME,
    source_object='incoming/sales_transactions_*',
    destination_bucket=INGESTION_BUCKET_NAME,
    destination_object='datalake/sales_transactions_',
    move_object=False)

# Move from incoming to processing for analytical calculations
t5 = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='move-to-processing',
    source_bucket=INGESTION_BUCKET_NAME,
    source_object='incoming/sales_transactions_*',
    destination_bucket=INGESTION_BUCKET_NAME,
    destination_object='processing/sales_transactions_',
    move_object=True)

# git clone average-prices-by-product-enhanced.py file ?????
# deploy to GCP Dataflow as a Beam job, and check the Dataflow job status
        'usePublicIps': 'False'
    }
}

with models.DAG('DAG_WordCount',
                schedule_interval=timedelta(days=1),
                default_args=default_dag_args) as dag:

    start_task = DummyOperator(task_id="start")

    move_to_processing = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move_file_to_processing',
        source_bucket='{}_word_count_data_5696'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        source_object='inbox/*',
        destination_bucket='{}_word_count_data_5696'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        destination_object='processing/',
        move_object=True,
        google_cloud_storage_conn_id='etl_sa')

    dataflow_task = dataflow_operator.DataFlowJavaOperator(
        task_id='dataflow_java_app',
        jar='gs://{}/data/pipeline-example/word-count-beam-0.1.jar'.format(
            os.environ['COMPOSER_BUCKET']),
        options={
            'autoscalingAlgorithm': 'BASIC',
            'maxNumWorkers': '5'
        },
        gcp_conn_id='etl_sa')
def generate_dag(branch_name, params, dag):
    with dag:
        batch_id = params['batch_id']
        work_folder = params['%s_work_dir' % branch_name]
        inbound_bucket = params['inbound_bucket']
        inbound_dir = params['inbound_dir']
        offer_email_list_file_prefix = params['%s_file_prefix' % branch_name]
        inbound_full_path = 'gs://%s/%s' % (inbound_bucket, inbound_dir)

        move_files_to_work_directory = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id='move_%s_files_to_work_directory' % branch_name.lower(),
            source_bucket=inbound_bucket,
            source_object='%s/%s*.csv' % (inbound_dir, offer_email_list_file_prefix),
            destination_bucket=inbound_bucket,
            destination_object='%s/%s/%s' % (work_folder, batch_id, offer_email_list_file_prefix),
            google_cloud_storage_conn_id=params['google_cloud_conn_id'],
            dag=dag,
            move_object=params['move_object'])

        upload_to_bq_task = GoogleCloudStorageToBigQueryOperator(
            task_id='upload_%s_to_bq' % branch_name.lower(),
            bucket=get_bucket_from_name(inbound_full_path),
            source_objects=['%s/%s/*' % (work_folder, batch_id)],
            destination_project_dataset_table=params['%s_work_table' % branch_name],
            skip_leading_rows=0,
            source_format=SourceFormat.CSV,
            field_delimiter=',',
            schema_fields=params['%s_work_schema' % branch_name],
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            bigquery_conn_id=params['bigquery_conn_id'],
            google_cloud_storage_conn_id=params['google_cloud_conn_id'],
            autodetect=False,
            allow_jagged_rows=True,
            ignore_unknown_values=True,
            dag=dag)

        update_stage_table = BigQueryOperator(
            task_id='update_%s_stage_table' % branch_name.lower(),
            sql='sql/%s/%s_stage_table.sql' % (branch_name.lower(), branch_name.lower()),
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            destination_dataset_table=params['%s_stage_table' % branch_name],
            params=params,
            dag=dag)

        update_error_table = BigQueryOperator(
            task_id='update_%s_error_table' % branch_name.lower(),
            sql='sql/%s/%s_error_table.sql' % (branch_name.lower(), branch_name.lower()),
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            destination_dataset_table=params['%s_error_table' % branch_name],
            params=params,
            dag=dag)

        update_gold_table = BigQueryOperator(
            task_id='update_%s_gold_table' % branch_name.lower(),
            sql='sql/%s/%s_gold_table.sql' % (branch_name.lower(), branch_name.lower()),
            destination_dataset_table=params['%s_gold_table' % branch_name],
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            params=params,
            dag=dag)

        upload_to_bq_task >> update_error_table
        move_files_to_work_directory >> upload_to_bq_task >> update_stage_table >> update_gold_table

    return {
        'task_in': move_files_to_work_directory,
        'task_out': update_gold_table
    }
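The `get_bucket_from_name` helper used for the BigQuery load is not shown in this excerpt. A minimal sketch, assuming its job is simply to pull the bucket name out of a gs:// URI; the real implementation may differ.

from urllib.parse import urlparse

def get_bucket_from_name(gcs_path):
    # Assumed behaviour: 'gs://my-bucket/some/prefix' -> 'my-bucket'
    return urlparse(gcs_path).netloc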
selector_task = BranchPythonOperator(
    task_id='select_load',
    python_callable=select_load,
    provide_context=True)

list_task >> selector_task

delete_local_files = BashOperator(
    task_id='delete_local_files',
    bash_command=f'rm {DATA_PATH}/{dag.dag_id}/*',
    provide_context=True)

move_processed_files = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='move_processed_files',
    trigger_rule='none_failed',
    source_bucket='gcs-ipt-stackoss-business-intelligence',
    source_object='trafico_vendors/pendientes_procesar/*.csv',
    destination_bucket='gcs-ipt-stackoss-business-intelligence',
    destination_object='trafico_vendors/procesados/',
    move_object=True,
    google_cloud_storage_conn_id='gcs_ipt_prod',
    retries=2)

for name, procedures in bq_procedures.items():
    load_task = PythonOperator(
        task_id=f'load_{name}_to_bq',
        python_callable=load_data,
        retries=2,
        provide_context=True)

    selector_task >> load_task

    for procedure in procedures:
        call_procedure = PythonOperator(
            task_id=f'call_{name}_procedure',
            python_callable=call_bigquery,
Query = python_operator.PythonOperator(
    task_id='Query',
    python_callable=QueryToGCS,
    op_kwargs={'sql': sql})

CommentsExport = python_operator.PythonOperator(
    task_id='CommentsExport',
    python_callable=CommentsToGCS)

AnswerExport = python_operator.PythonOperator(
    task_id='AnswerExport',
    python_callable=AnswersToGCS)

comment_file = '{}_{}.json'.format(comment_export, yesterday_string)
answer_file = '{}_{}.json'.format(answer_export, yesterday_string)

CommentToGCS = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="Comment_to_GCS",
    source_bucket=source_bucket,
    source_object="data/{}".format(comment_file),
    destination_bucket=destination_bucket,
    destination_object=comment_file)

AnswerToGCS = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="Answer_to_GCS",
    source_bucket=source_bucket,
    source_object="data/{}".format(answer_file),
    destination_bucket=destination_bucket,
    destination_object=answer_file)

CommentToBQ = GoogleCloudStorageToBigQueryOperator(
    task_id="Comments_to_BigQuery",
    bucket=destination_bucket,
    source_objects=[comment_file],  # this needs to be a list of sources
    schema_object=schema_comment,
"total sulfur dioxide", "volatile acidity" ], "data": [[12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66], [12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66]] } dag = DAG('airflow-wine-from-yamls', default_args=default_args, schedule_interval=None) with dag: data_extraction = GoogleCloudStorageToGoogleCloudStorageOperator( task_id='data_extraction', google_cloud_storage_conn_id='wine_input', source_bucket=wine_bucket, destination_bucket=wine_bucket, source_object='input/*.csv', destination_object='data/', project_id=gcp_project, default_args=default_args) data_transformation = BashOperator( task_id='data_transformation', bash_command='echo "imagine that we transform a data"', default_args=default_args) odahuflow_conn = GcpConnectionToOdahuConnectionOperator( task_id='odahuflow_connection_creation', google_cloud_storage_conn_id='wine_input', api_connection_id=api_connection_id, conn_template=wine, default_args=default_args)
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

gcs_delete = GoogleCloudStorageDeleteOperator(
    task_id="gcs_delete",
    bucket_name=glam_bucket,
    prefix="glam-extract-fenix",
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

gcs_copy = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="gcs_copy",
    source_bucket="glam-fenix-dev",
    source_object="*.csv",
    destination_bucket=glam_bucket,
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

wait_for_copy_deduplicate >> run_sql >> export_csv >> gcs_delete >> gcs_copy