def test_execute_timeout(self, mock_hook):
    task = GoogleCloudStoragePrefixSensor(
        task_id="task-id",
        bucket=TEST_BUCKET,
        prefix=TEST_PREFIX,
        poke_interval=0,
        timeout=1)
    mock_hook.return_value.list.return_value = []
    with self.assertRaises(AirflowSensorTimeout):
        task.execute(mock.MagicMock)
        mock_hook.return_value.list.assert_called_once_with(
            TEST_BUCKET, prefix=TEST_PREFIX)
def test_should_return_false_on_empty_list(self, mock_hook):
    task = GoogleCloudStoragePrefixSensor(
        task_id="task-id",
        bucket=TEST_BUCKET,
        prefix=TEST_PREFIX,
        google_cloud_conn_id=TEST_GCP_CONN_ID,
        delegate_to=TEST_DELEGATE_TO,
    )
    mock_hook.return_value.list.return_value = []

    result = task.poke(mock.MagicMock)

    self.assertEqual(False, result)
def test_should_pass_arguments_to_hook(self, mock_hook):
    task = GoogleCloudStoragePrefixSensor(
        task_id="task-id",
        bucket=TEST_BUCKET,
        prefix=TEST_PREFIX,
        google_cloud_conn_id=TEST_GCP_CONN_ID,
        delegate_to=TEST_DELEGATE_TO,
    )
    mock_hook.return_value.list.return_value = ["NOT_EMPTY_LIST"]

    result = task.poke(mock.MagicMock)

    mock_hook.assert_called_once_with(
        delegate_to=TEST_DELEGATE_TO,
        google_cloud_storage_conn_id=TEST_GCP_CONN_ID
    )
    mock_hook.return_value.list.assert_called_once_with(TEST_BUCKET, prefix=TEST_PREFIX)
    self.assertEqual(True, result)
def test_execute(self, mock_hook):
    task = GoogleCloudStoragePrefixSensor(
        task_id="task-id",
        bucket=TEST_BUCKET,
        prefix=TEST_PREFIX,
        google_cloud_conn_id=TEST_GCP_CONN_ID,
        delegate_to=TEST_DELEGATE_TO,
        poke_interval=0)
    generated_messages = ['test-prefix/obj%s' % i for i in range(5)]
    mock_hook.return_value.list.return_value = generated_messages

    response = task.execute(None)

    mock_hook.assert_called_once_with(
        delegate_to=TEST_DELEGATE_TO,
        google_cloud_storage_conn_id=TEST_GCP_CONN_ID)
    mock_hook.return_value.list.assert_called_once_with(TEST_BUCKET, prefix=TEST_PREFIX)
    self.assertEqual(response, generated_messages)
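# For orientation: a minimal sketch of the behaviour the mocked tests above assert, assuming the
# Airflow 1.10 contrib hook path. This is illustrative, not the library source; the real sensor
# lives in airflow.contrib.sensors.gcs_sensor, and its execute() also returns the matched names.
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class PrefixSensorSketch(BaseSensorOperator):
    """Pokes GCS until at least one object name starts with the given prefix."""

    @apply_defaults
    def __init__(self, bucket, prefix,
                 google_cloud_conn_id='google_cloud_default',
                 delegate_to=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bucket = bucket
        self.prefix = prefix
        self.google_cloud_conn_id = google_cloud_conn_id
        self.delegate_to = delegate_to

    def poke(self, context):
        # The tests assert exactly this hook construction and list() call
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_conn_id,
            delegate_to=self.delegate_to)
        # A non-empty listing makes the sensor succeed
        return bool(hook.list(self.bucket, prefix=self.prefix))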
    False,
    'start_date': datetime(2021, 2, 22, 0, 0, tzinfo=pendulum.timezone('Asia/Tokyo')),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
}

dag = DAG(DAG_NAME, schedule_interval=None, default_args=default_args, catchup=False)

# Task definitions
sensor = GoogleCloudStoragePrefixSensor(
    task_id='sensor',
    # Bucket to watch
    bucket=consts.PROJECT_ID + '-' + consts.CSV_BUCKET,
    # Files under the watched bucket, matched by prefix
    prefix='{}/{}'.format(consts.FOLDER_NAME, consts.DATA_NAME),
    # How long to keep poking, in seconds (default is 60 * 60 * 24 * 7)
    timeout=60 * 60,
    # Whether to fail this task when no file is detected before the timeout;
    # with soft_fail=True the task is marked Skipped instead of Failed
    soft_fail=True,
    dag=dag,
)
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
}


def demo(argument, **context):
    print(argument)
    print('input parameters:')
    print(context['dag_run'].conf)


with DAG('demo_dag', schedule_interval=None, default_args=default_args) as dag:
    wait_task = GoogleCloudStoragePrefixSensor(
        task_id='filesensor',
        bucket='{{var.value.gcs_bucket}}',
        prefix='{{var.value.gcs_file}}',
        google_cloud_conn_id='google_cloud_default',
        dag=dag)

    transform_file = GCSFileTransformOperator(
        task_id="transform_file",
        source_bucket='{{var.value.gcs_bucket}}',
        source_object='devfest',
        destination_bucket='{{var.value.gcs_bucket}}',
        destination_object='new_file',
        transform_script=["cp"],
        google_cloud_conn_id='google_cloud_default',
    )

    wait_task >> transform_file
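    # The demo() callable above is not attached to any task in the visible part of this snippet.
    # A plausible wiring (an assumption, not part of the original) is a PythonOperator in the same DAG,
    # assuming: from airflow.operators.python_operator import PythonOperator
    demo_task = PythonOperator(
        task_id='demo',
        python_callable=demo,
        op_args=['hello'],
        provide_context=True)

    wait_task >> demo_task

    # The dag_run.conf that demo() prints can be supplied when the DAG is triggered, e.g.:
    #     airflow trigger_dag demo_dag --conf '{"param": "value"}'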
bash_command=f"mkdir -p {staging_directory}") # Deleting the working dir cleanup_staging_dir = BashOperator( task_id=f"cleanup-staging-dir", bash_command=f"rm -rf {staging_directory}") # Below loop iterates through each of the dicts inside the list. i.e. Iterates over files to be ingested into BQ for config in file_configurations: filename = f"{config['base_filename']}" # Out of the box Sensor for checking inside the gcs bucket against the prefix provided. file_sensor = GoogleCloudStoragePrefixSensor( task_id=f"wait-for-{filename}", bucket=f'{variables["input_bucket"]}', prefix=filename, timeout=10000, poke_interval=2000, mode="reschedule", ) """ Once the above task is finished and sensor has verified the file, then below custom op will copy the files to staging dir inside airflow bucket. """ copy_file_to_staging = GcstoGcsWithDestFilename( task_id=f"copy-{filename}-to-staging", source_bucket=variables["input_bucket"], source_object=filename + ".csv", destination_bucket=variables["airflow_home_bucket"], destination_object="data/tempfiles/vehicle_options_ingest/" + filename + ".csv", )
date_offset) + ")).strftime(\"%Y\") }}" return {"date": date_config, "month": month_config, "year": year_config} def gcs_prefix_check(date_offset): ''' returns string in format YYYY/MM/DD emulating sample directory structure in GCS''' date_dict = dynamic_date(date_offset) return date_dict["year"] + "/" + date_dict["month"] + "/" + date_dict[ "date"] gcs_prefix_check = GoogleCloudStoragePrefixSensor( dag=dag, task_id="gcs_prefix_check", bucket="example-bucket", prefix="dir1/dir2" + gcs_prefix_check(3) ) # GoogleCloudStoragePrefixSensor checks GCS for the existence of any BLOB which matches operator's prefix start_cluster_example = DataprocClusterCreateOperator( dag=dag, task_id='start_cluster_example', cluster_name='example-{{ ds }}', project_id="your-project-id", num_workers=2, num_preemptible_workers=4, master_machine_type='n1-standard-4', worker_machine_type='n1-standard-4c', worker_disk_size=300, master_disk_size=300, image_version='1.4-debian9',
params = params.generate_params(properties)

with DAG('bq_customer_loyalty',
         default_args=properties.default_args,
         schedule_interval=datetime.timedelta(days=1),
         catchup=False) as dag:

    inbound_bucket = params['inbound_bucket']
    inbound_dir = params['inbound_dir']
    google_cloud_conn_id = params['google_cloud_conn_id']
    inbound_full_path = 'gs://%s/%s' % (inbound_bucket, inbound_dir)

    files_sensor_task = GoogleCloudStoragePrefixSensor(
        task_id="gs_sensor",
        google_cloud_conn_id=google_cloud_conn_id,
        bucket=get_bucket_from_name(inbound_full_path),
        prefix=inbound_dir)

    generate_batch_id = PythonOperator(
        task_id='batch-id-gen',
        provide_context=True,
        python_callable=generate_batch_id)

    clear_xcom = PythonOperator(
        task_id='delete_xcom_bach_id',
        provide_context=True,
        python_callable=clear_xcom_bach_id,
        trigger_rule=TriggerRule.ALL_DONE)

    # OFFER_EMAIL_LIST
    offer_and_email_list = dag_generator.generate_dag(
        branch_name='offer_email_list',
        params=params,
        dag=dag)
# Listen to the customer's FTP server folder with a sensor
# (define the sftp-default connection in the Airflow UI)
# t1 = SFTPSensor(
#     task_id='listen-sftp-server',
# )  # TODO: not implemented

# Copy from SFTP to the GCS incoming folder
# The scenario will start right from here!
# t2 = SFTPOperator(
#     task_id='transfer-to-incoming',
# )

# Listen to the incoming folder with a sensor.
# Note: the sensor matches a literal prefix, so no wildcard is used here.
t3 = GoogleCloudStoragePrefixSensor(
    task_id='listen-incoming-file',
    bucket='datalake-datasets-123',
    prefix='incoming/sales_transactions_')
# TODO: a better file structure could be defined, such as monthly aggregation:
# datalake/sales/05/sales_transactions_*

# Copy from GCS to the datalake for raw data storage
t4 = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='copy-to-datalake',
    source_bucket=INGESTION_BUCKET_NAME,
    source_object='incoming/sales_transactions_*',
    destination_bucket=INGESTION_BUCKET_NAME,
    destination_object='datalake/sales_transactions_',
    move_object=False)

# Copy from incoming to processing for analytical calculations
t5 = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='move-to-processing',
def cleanup_all_blobs(**context):
    blob_names = context['task_instance'].xcom_pull(task_ids='list_blobs_task')
    for blob_name in blob_names:
        logging.info('Deleting blob : %s', blob_name)
        delete_blob(bucket_name=watch_bucket, blob_name=blob_name)


start_task = DummyOperator(task_id="Start", retries=0, dag=dag)
end_task = DummyOperator(task_id="End", retries=0, dag=dag)

blob_exists_check = GoogleCloudStoragePrefixSensor(
    task_id='blob_exist_check',
    bucket=watch_bucket,
    prefix="rtf_",
    poke_interval=30,
    timeout=10,
    soft_fail=True,
    provide_context=True,
    dag=dag)

blob_list_task = PythonOperator(
    task_id='list_blobs_task',
    python_callable=fetch_blobs_list,
    dag=dag,
    provide_context=True)

blob_move_task = PythonOperator(
    task_id='copy_blobs_task',
    python_callable=copy_all_blobs,
    dag=dag,
    provide_context=True)

blob_delete_task = PythonOperator(
    task_id='delete_blobs_task',
    python_callable=cleanup_all_blobs,
    dag=dag,
    provide_context=True)
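# fetch_blobs_list, copy_all_blobs and delete_blob are defined elsewhere in the original DAG file.
# A minimal sketch of what fetch_blobs_list might look like, assuming the google-cloud-storage
# client library; the returned list is what cleanup_all_blobs above pulls back via XCom.
def fetch_blobs_list(**context):
    from google.cloud import storage
    client = storage.Client()
    # List every object in the watched bucket that matches the sensor's "rtf_" prefix
    blob_names = [blob.name for blob in client.list_blobs(watch_bucket, prefix="rtf_")]
    # A PythonOperator callable's return value is pushed to XCom automatically,
    # so downstream tasks can fetch it with xcom_pull(task_ids='list_blobs_task')
    return blob_names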
def __init__(self, prefixes: list = None, *args, **kwargs):
    # Initialise the parent with a placeholder prefix; matching is driven by self.prefixes instead
    GoogleCloudStoragePrefixSensor.__init__(self, prefix='*', *args, **kwargs)
    # Avoid a mutable default argument; fall back to a single catch-all prefix
    self.prefixes = prefixes if prefixes is not None else ['*']
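# The subclass above stores a list of prefixes. A plausible poke() override (an illustrative
# sketch, not the original implementation) would succeed as soon as any prefix matches at least
# one object, assuming the Airflow 1.10 contrib GCS hook:
def poke(self, context):
    from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    # True if any configured prefix has at least one matching blob
    return any(hook.list(self.bucket, prefix=p) for p in self.prefixes)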