class DataFlowTemplateOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataflowTemplateOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataflowTemplateOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    def test_exec(self, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_template_dataflow.
        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f'
        }
        start_template_hook.assert_called_once_with(
            TASK_ID, expected_options, PARAMETERS, TEMPLATE)
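These tests reference module-level fixtures that are not shown. A sketch consistent with the assertions follows: DEFAULT_OPTIONS_TEMPLATE is pinned down by the expected_options assertion in test_exec, while the other values are assumptions.

# Assumed test fixtures; only DEFAULT_OPTIONS_TEMPLATE is fixed by the
# expected_options assertion above, the rest are plausible placeholders.
TASK_ID = 'test-dataflow-operator'
JOB_NAME = 'test-dataflow-pipeline'  # used by the job_name variant of this test below
TEMPLATE = 'gs://dataflow-templates/wordcount/template_file'
PARAMETERS = {
    'inputFile': 'gs://dataflow-samples/shakespeare/kinglear.txt',
    'output': 'gs://test/output/my_wordcount',
}
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
}
POLL_SLEEP = 10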
def __init__(self, project, config, task_id_suffix, parameters, *args, **kwargs):
    self.config = config
    template_location = 'gs://{}/templates/{}/v{}'.format(
        self.config['bucket_name'],
        self.config['template_name'],
        self.config['template_version'])
    parameters['project'] = project
    # Pop the custom kwarg so it is not forwarded to the base operator.
    self.extra_func_wrappers = kwargs.pop('extra_funcs', [])
    DataflowTemplateOperator.__init__(
        self,
        task_id='extract-{}'.format(task_id_suffix),
        template=template_location,
        parameters=parameters,
        poll_sleep=60,
        *args, **kwargs)
class DataFlowTemplateOperatorTest(unittest.TestCase):

    def setUp(self):
        self.dataflow = DataflowTemplateOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataflowTemplateOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch('airflow.contrib.operators.dataflow_operator.DataFlowHook')
    def test_exec(self, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_template_dataflow.
        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f'
        }
        start_template_hook.assert_called_once_with(
            JOB_NAME, expected_options, PARAMETERS, TEMPLATE)
        'project': PROJECT_ID,
    }
}

with models.DAG(dag_id=DAG_NAME,
                schedule_interval="@once",
                default_args=default_dag_args) as dag:
    t1 = DataflowTemplateOperator(
        task_id='task1',
        template='gs://dataflow-templates/latest/GCS_Text_to_BigQuery',
        parameters={
            'javascriptTextTransformFunctionName': 'transform',  # name of the function in udf.js to invoke
            'JSONPath': 'gs://{}/composer/schema/schema.json'.format(
                BUCKET_NAME),  # GCS path of the BigQuery schema definition file
            'javascriptTextTransformGcsPath': 'gs://{}/composer/udf/udf.js'.format(
                BUCKET_NAME),  # GCS path of the udf.js file
            'inputFilePattern': 'gs://{}/composer/csv/sample.csv'.format(
                BUCKET_NAME),  # GCS path of the CSV file to load into BigQuery
            'outputTable': '{}:my_dataset.sample'.format(
                PROJECT_ID),  # destination table as project_id:dataset.table
            'bigQueryLoadingTemporaryDirectory':
                'gs://{}/composer/temp'.format(BUCKET_NAME),  # temp directory used while loading into BigQuery
        },
    )
def subdag(parent_dag_name, child_dag_name, args, json_gs):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
        start_date=datetime.datetime(2021, 8, 5, 20, 0),
        schedule_interval='0 13,14,15,16,17,18,19,20,21,22,23,0,1 * * *',
    )
    connection_airflow_yas_sa_sii_de = BaseHook.get_connection(
        'google_cloud_yas_sa_sii_de')
    service_account_yas_sa_sii_de = ast.literal_eval(
        connection_airflow_yas_sa_sii_de
        .extra_dejson["extra__google_cloud_platform__keyfile_dict"])
    with gcsfs.GCSFileSystem(
            project='yas-dev-sii-pid',
            token=service_account_yas_sa_sii_de).open(json_gs) as f:
        jd = json.load(f)

    # Run-time variables read from the JSON file
    url_trn = jd['url_trn']
    # TRN data
    job_name_hom = jd['job_name_hom']
    url_hom = jd['url_hom']
    file_name_hom = jd['file_name_hom']
    template_location_hom = jd['template_location_hom']
    # General settings for the run
    temp_location = jd['temp_location']
    project = jd['project']
    region = jd['region']
    subnetwork = jd['subnetwork']
    service_account_email = jd['service_account_email']
    machine_type = jd['machine_type']
    max_num_workers = jd['max_num_workers']
    num_workers = jd['num_workers']

    folders = gcsfs.GCSFileSystem(
        project='yas-dev-sii-pid',
        token=service_account_yas_sa_sii_de).ls(url_trn)
    if len(folders) > 0:
        for folder in folders:
            date_folder = folder.split('/')[3]
            if len(date_folder) >= 10:
                url_source = 'gs://' + folder
                url_dest = url_hom + date_folder + '/' + file_name_hom
                parent_dag_name_for_id = parent_dag_name.lower()
                print('url_source: ' + url_source)
                print('url_dest: ' + url_dest)
                DataflowTemplateOperator(
                    template=template_location_hom,
                    job_name=f'{parent_dag_name_for_id}-{child_dag_name}-{date_folder}',
                    task_id=f'{parent_dag_name_for_id}-{child_dag_name}-{date_folder}',
                    location=region,
                    parameters={
                        'url_trn': url_source,
                        'url_hom': url_dest,
                    },
                    default_args=args,
                    dataflow_default_options={
                        'project': project,
                        'zone': 'us-east1-c',
                        'tempLocation': temp_location,
                        'machineType': machine_type,
                        'serviceAccountEmail': service_account_email,
                        'subnetwork': subnetwork,
                    },
                    gcp_conn_id='google_cloud_yas_sa_sii_de',
                    dag=dag_subdag,
                )
    return dag_subdag
DEFAULT_DAG_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': TODAY,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'schedule_interval': '@hourly',
    'dataflow_default_options': {
        'project': PROJECT,
        'zone': 'us-east1-b',
        'stagingLocation': TEMP_BUCKET
    }
}

dag = DAG('Demo-DAG-DataflowCronHourly',
          default_args=DEFAULT_DAG_ARGS,
          dagrun_timeout=timedelta(hours=1),
          schedule_interval='00 * * * *')

start = DummyOperator(task_id='inicio', dag=dag)
end = DummyOperator(task_id='fim', dag=dag)

t1 = DataflowTemplateOperator(task_id='dataflow_count_words_example',
                              template=TEMPLATE,
                              parameters=PARAMETERS,
                              dag=dag)

start >> t1 >> end
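TEMPLATE and PARAMETERS are defined outside this excerpt; for the word-count task above they could point at Google's provided Word_Count template, for example (all values assumed):

# Assumed definitions for the names referenced above; Word_Count is one
# of Google's provided batch templates and takes inputFile and output.
TEMPLATE = 'gs://dataflow-templates/latest/Word_Count'
PARAMETERS = {
    'inputFile': 'gs://dataflow-samples/shakespeare/kinglear.txt',
    'output': TEMP_BUCKET + '/wordcount/output',
}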
def execute(self, context):
    # Run each registered hook against the task context, then launch the
    # templated Dataflow job via the parent implementation.
    for extra_func_wrapper in self.extra_func_wrappers:
        extra_func_internal = extra_func_wrapper(self)
        extra_func_internal(context)
    DataflowTemplateOperator.execute(self, context)
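This execute override pairs with the __init__ shown earlier: each entry in extra_funcs takes the operator instance and returns a callable that receives the task context before the template is launched. A minimal sketch of how that hook could be used; the wrapper name, the subclass name ExtractTemplateOperator, and the config values are all hypothetical:

# Hypothetical wrapper: logs the operator's parameters for the run date
# before the parent execute() starts the Dataflow template.
def log_parameters(operator):
    def _inner(context):
        print('Launching {} on {} with parameters {}'.format(
            operator.task_id, context.get('ds'), operator.parameters))
    return _inner

# Hypothetical instantiation of the subclass, for illustration only.
extract = ExtractTemplateOperator(
    project='my-project',
    config={'bucket_name': 'my-bucket',
            'template_name': 'extract',
            'template_version': '3'},
    task_id_suffix='orders',
    parameters={},
    extra_funcs=[log_parameters],
    dag=dag)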
        'zone': ZONE,
        'stagingLocation': TEMP_BUCKET
    }
}

dag = DAG('CDC-DAG-v1',
          default_args=DEFAULT_DAG_ARGS,
          dagrun_timeout=timedelta(hours=3),
          schedule_interval='00 * * * *')

start = DummyOperator(task_id='Start', dag=dag)
end = DummyOperator(task_id='End', dag=dag)

dataflow_load_table1 = DataflowTemplateOperator(
    task_id='loadbq_table1_dataflow',
    template=TEMPLATE,
    parameters=PARAMETERS,
    environment=ENVIRONMENT,
    dag=dag)

# Merge the staged changes into the target table, classifying each row
# as an insert ("I") or delete ("D").
bq_merge_table1 = bigquery_operator.BigQueryOperator(
    task_id='bq_merge_table1',
    bql="""
    MERGE `{table}` T
    USING (
      SELECT
        CASE
          WHEN a.id IS NULL AND b.id IS NOT NULL THEN "I"
          WHEN a.id IS NOT NULL AND b.id IS NULL THEN "D"
stage_data = DataflowTemplateOperator(
    task_id='stage_data',
    template='{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["template"] }}',
    dataflow_default_options={
        'project': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["project"] }}',
        'region': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["region"] }}',
        'zone': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["zone"] }}',
        'network': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["network"] }}',
        'subnetwork': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["subnetwork"] }}',
        'tempLocation': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["options"]["tempLocation"] }}',
    },
    parameters={
        'driverJars': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["driverJars"] }}',
        'driverClassName': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["driverClassName"] }}',
        'connectionURL': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionURL"] }}',
        'query': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["query"] }}',
        'outputTable': '{{ (var.json|attr("config-{}".format(run_id)))["bigquery"]["staging_table"] }}',
        'bigQueryLoadingTemporaryDirectory': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["bigQueryLoadingTemporaryDirectory"] }}',
        'connectionProperties': '{{ (var.json|attr("config-{}".format(run_id)))["dataflow"]["parameters"]["connectionProperties"] }}',
        'username': '******',
        'password': '******',
    },
    dag=dag)
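Every Jinja expression above resolves against a per-run Airflow Variable named config-<run_id> holding a JSON document. A sketch of the shape that Variable would need; only the key layout is implied by the DAG, every value is a placeholder:

from airflow.models import Variable
import json

# Hypothetical config matching the Jinja lookups above.
config = {
    "dataflow": {
        "template": "gs://my-bucket/templates/jdbc_to_bigquery",
        "options": {
            "project": "my-project",
            "region": "us-central1",
            "zone": "us-central1-a",
            "network": "default",
            "subnetwork": "regions/us-central1/subnetworks/default",
            "tempLocation": "gs://my-bucket/tmp",
        },
        "parameters": {
            "driverJars": "gs://my-bucket/jars/postgresql.jar",
            "driverClassName": "org.postgresql.Driver",
            "connectionURL": "jdbc:postgresql://10.0.0.2:5432/db",
            "query": "SELECT * FROM source_table",
            "bigQueryLoadingTemporaryDirectory": "gs://my-bucket/bq-tmp",
            "connectionProperties": "ssl=false",
        },
    },
    "bigquery": {"staging_table": "my-project:staging.source_table"},
}

# The DAG expects this Variable to exist before the run, e.g.:
Variable.set("config-manual__2021-01-01T00:00:00+00:00", json.dumps(config))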
    source_bucket='{}_hr_data_8980'.format(os.environ['AIRFLOW_VAR_ENV']),
    source_object='inbox/*.csv',
    destination_bucket='{}_hr_data_8980'.format(
        os.environ['AIRFLOW_VAR_ENV']),
    destination_object='processing/',
    move_object=True,
    google_cloud_storage_conn_id='etl_sa')

dataflow_task = DataflowTemplateOperator(
    task_id="invoke_dataflow",
    template="gs://{}_df_templates_8987/pipeline1/pipeline1_template".format(
        os.environ['AIRFLOW_VAR_ENV']),
    job_name='sample_dataflow_example',
    poll_sleep=5,
    parameters={
        'input': 'gs://{}_hr_data_8980/processing/*.csv'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        'deIdentiyTemplateId': 'generic_deidentify_template'
    },
    dag=dag,
)

move_to_archive = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='move_file_to_archive',
    source_bucket='{}_hr_data_8980'.format(os.environ['AIRFLOW_VAR_ENV']),
    source_object='/processing/*.csv',
    destination_bucket='{}_hr_data_arc_7856'.format(
        os.environ['AIRFLOW_VAR_ENV']),
    destination_object='/',
    move_object=True,
        'serviceAccountEmail': '*****@*****.**',
        'subnetwork': "https://www.googleapis.com/compute/v1/projects/sha-net-dev-id/regions/us-east1/subnetworks/subnet-analytics-region-a"
    }
}

dag = DAG('dag-sii-bch-ing-ab-raw',
          default_args=default_args,
          schedule_interval='59 4 * * *',
          tags=['RAW'])

raw_estatus_cuentas = DataflowTemplateOperator(
    template='gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_ESTATUS_CUENTAS',
    job_name='sii-bch-ing-ab-raw-estatus-cuentas',
    task_id='sii-bch-ing-ab-raw-estatus-cuentas',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_servicios_cuenta = DataflowTemplateOperator(
    template='gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_SERVICIOS_CUENTA',
    job_name='sii-bch-ing-ab-raw-servicios-cuenta',
    task_id='sii-bch-ing-ab-raw-servicios-cuenta',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_tipos_cuentas = DataflowTemplateOperator(
    template=
    },
    'start_date': datetime.datetime(2019, 10, 15)
}

with models.DAG('start_dataflow',
                schedule_interval=None,
                default_args=default_args,
                catchup=False) as dag:

    def start_greeting():
        import logging
        logging.info('Hello! Welcome to AirFlow')

    def end_greeting():
        import logging
        logging.info('Thank you, Goodbye!')

    start = python_operator.PythonOperator(task_id='start',
                                           python_callable=start_greeting)
    end = python_operator.PythonOperator(task_id='end',
                                         python_callable=end_greeting)

    df_pipeline_mutation = DataflowTemplateOperator(
        task_id='df_pipeline_mutation',
        template='{{var.value.gcp_df_mutation_template}}',
        gcp_conn_id='google_cloud_default'
    )
    df_pipeline_import = DataflowTemplateOperator(
        task_id='df_pipeline_import',
        template='{{var.value.gcp_df_import_template}}',
        gcp_conn_id='google_cloud_default'
    )

    start >> df_pipeline_mutation >> df_pipeline_import >> end
dummy_start = DummyOperator(task_id='job_start', dag=dag)
dummy_end = DummyOperator(task_id='job_end', dag=dag,
                          trigger_rule='all_success')

# Fan out one templated Dataflow job per Pub/Sub subscription, bracketed
# by the start and end markers; the counter keeps each task_id unique.
cntr = 0
while cntr < count_of_jobs:
    dftask = DataflowTemplateOperator(
        task_id='dataflow_pubsub_to_gcs-' + str(cntr),
        template='gs://$YOUR-PROJ-poc_dataflow/templates/PubsubJsonToGcs_v1.0',
        job_name='csv-pubsub-to-gcs-' + str(cntr),
        dataflow_default_options=dataflow_default_options,
        parameters={
            'runner': 'DataflowRunner',
            'tempLocation': 'gs://$YOUR-PROJ-poc_dataflow/temp/',
            'inputPath': 'projects/$YOUR-PROJ--poc-proj/subscriptions/sub_csv' + str(cntr),
            'outputPath': 'gs://$YOUR-PROJ-poc_data/ingested/json/2020-10-21/run' + str(cntr) + '/',
            'jobName': 'pubsub-to-gcs-airflow'
        },
        dag=dag)
    dummy_start >> dftask
    dftask >> dummy_end
    cntr += 1
        'machineType': machineType_exe,
        'serviceAccountEmail': '*****@*****.**',
        'subnetwork': "https://www.googleapis.com/compute/v1/projects/sha-net-dev-id/regions/us-east1/subnetworks/subnet-analytics-region-a"
    }
}

dag = DAG('dag-sii-bch-ing-ab-raw-cue-mov',
          default_args=default_args,
          schedule_interval='0 13,14,15,16,17,18,19,20,21,22,23,0,1 * * *',
          tags=['RAW', 'Movimientos', 'Cuentas'])

raw_cuentas = DataflowTemplateOperator(
    template='gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_CUENTAS',
    job_name='sii-bch-ing-ab-raw-cuenta',
    task_id='sii-bch-ing-ab-raw-cuenta',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)

raw_movimientos = DataflowTemplateOperator(
    template='gs://yas-sii-int-des-dev/AB/templates/TPL_SII_BCH_ING_AB_RAW_MOVIMIENTOS',
    job_name='sii-bch-ing-ab-raw-movimientos',
    task_id='sii-bch-ing-ab-raw-movimientos',
    location=gce_region,
    gcp_conn_id='google_cloud_yas_sa_sii_de',
    dag=dag)
    python_callable=get_nodash_date,
    provide_context=True)

dataflow_job = DataflowTemplateOperator(
    # The task id of your job
    task_id="dataflow_operator_transform_csv_to_bq",
    # The name of the template that you're using.
    # Below is a list of all the templates you can use.
    # For versions in non-production environments, use the subfolder 'latest'
    # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
    template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
    # Use the link above to specify the correct parameters for your template.
    parameters={
        "javascriptTextTransformFunctionName": "transformCSVtoJSON",
        "JSONPath": bucket_path + "/schema.json",
        "javascriptTextTransformGcsPath": bucket_path + "/transform.js",
        "inputFilePattern": "gs://week_2_bs/keyword_search/search_"
            + '{{ ti.xcom_pull("get_execution_date") }}' + ".csv",
        "outputTable": project_id + ":searched_keyword.searched_keyword",
        "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
    },
)

t2 = PythonOperator(task_id='get_dash_date',
                    python_callable=get_dash_date,
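The truncated task at the top of this excerpt presumably has task_id "get_execution_date" and pushes the date string that xcom_pull retrieves for the input file name. A guess at the helper, based only on its name and the file-name pattern above:

# Hypothetical implementation: return the execution date without dashes
# (e.g. '20211015'); a PythonOperator pushes its return value to XCom,
# where ti.xcom_pull("get_execution_date") picks it up.
def get_nodash_date(**context):
    return context['execution_date'].strftime('%Y%m%d')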
"region": gce_region, # Set to your zone "zone": gce_zone, # This is a subfolder for storing temporary files, like the staged pipeline job. "temp_location": bucket_path + "/tmp/", }, } # Define a DAG (directed acyclic graph) of tasks. # Any task you create within the context manager is automatically added to the # DAG object. with models.DAG( # The id you will see in the DAG airflow page "dataflow_template_composer_dataflow_dag", default_args=default_args, # The interval with which to schedule the DAG schedule_interval=datetime.timedelta( days=1), # Override to match your needs ) as dag: start_template_job = DataflowTemplateOperator( # The task id of your job task_id="dataflow_template_operator_run", # The name of the template that you're using. # Below is a list of all the templates you can use. # For versions in non-production environments, use the subfolder 'latest' # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery template="gs://dataflow_cicd_test/templates/test_beam", # Use the link above to specify the correct parameters for your template. parameters={})
with models.DAG(
    # The id you will see in the DAG airflow page
    "composer_dataflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:
    start_template_job = DataflowTemplateOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # Below is a list of all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName": "transformCSVtoJSON",
            "JSONPath": bucket_path + "/jsonSchema.json",
            "javascriptTextTransformGcsPath": bucket_path + "/transformCSVtoJSON.js",
            "inputFilePattern": bucket_path + "/inputFile.txt",
            "outputTable": project_id + ":average_weather.average_weather",
            "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
        },
    )
# [END composer_dataflow_dag]
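For GCS_Text_to_BigQuery, JSONPath points at a JSON file describing the destination schema; the documented layout wraps the column list in a "BigQuery Schema" key. A sketch of what jsonSchema.json could contain, with the column names assumed to match the average_weather table:

import json

# Assumed schema file contents; only the "BigQuery Schema" wrapper key
# is fixed by the template, the columns are illustrative.
json_schema = {
    "BigQuery Schema": [
        {"name": "location", "type": "STRING"},
        {"name": "average_temperature", "type": "FLOAT"},
    ]
}
print(json.dumps(json_schema, indent=2))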
# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
template_path = 'gs://{}/template/GCS_TO_GCS_5'.format(project_id)

with models.DAG('composer_dataflowtemplate1016_3',
                schedule_interval='@once',
                default_args=default_dag_args,
                concurrency=1,
                max_active_runs=1) as dag:
    execute_dataflow_1 = DataflowTemplateOperator(
        task_id='dataflow_example1',
        template=template_path,
        parameters={
            'inputFile': "gs://{}/sample.csv".format(project_id),
            'outputFile': "gs://{}/composer_output/sample_1.csv".format(project_id),
        },
        dag=dag)
    execute_dataflow_2 = DataflowTemplateOperator(
        task_id='dataflow_example2',
        template=template_path,
        parameters={
            'inputFile': "gs://{}/composer_output/sample_1.csv".format(project_id),
            'outputFile': "gs://{}/composer_output/sample_2.csv".format(project_id),
        },
        dag=dag)
    execute_dataflow_3 = DataflowTemplateOperator(