Example 1
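A unit test for GCSSynchronizeBucketsOperator.execute: it constructs the operator with every argument set, runs it with an empty context, and asserts that the GCS hook is created with the connection settings and that the hook's sync method receives the remaining arguments unchanged. The surrounding test class, the mock.patch target, and the IMPERSONATION_CHAIN placeholder were not part of the excerpt; they are reconstructed assumptions added so the snippet runs on its own.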
from unittest import TestCase, mock

from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator

# Placeholder value; the original test module defines this constant elsewhere.
IMPERSONATION_CHAIN = ["impersonated_account@example-project.iam.gserviceaccount.com"]


# The class name and the patch target below are reconstructed to make the excerpt
# self-contained; the original module may name them differently.
class TestGCSSynchronizeBucketsOperator(TestCase):
    @mock.patch("airflow.providers.google.cloud.operators.gcs.GCSHook")
    def test_execute(self, mock_hook):
        task = GCSSynchronizeBucketsOperator(
            task_id="task-id",
            source_bucket="SOURCE_BUCKET",
            destination_bucket="DESTINATION_BUCKET",
            source_object="SOURCE_OBJECT",
            destination_object="DESTINATION_OBJECT",
            recursive=True,
            delete_extra_files=True,
            allow_overwrite=True,
            gcp_conn_id="GCP_CONN_ID",
            delegate_to="DELEGATE_TO",
            impersonation_chain=IMPERSONATION_CHAIN,
        )
        task.execute({})
        # The operator should build its hook from the connection settings it was given ...
        mock_hook.assert_called_once_with(
            google_cloud_storage_conn_id="GCP_CONN_ID",
            delegate_to="DELEGATE_TO",
            impersonation_chain=IMPERSONATION_CHAIN,
        )
        # ... and delegate the copy itself to GCSHook.sync with the remaining arguments.
        mock_hook.return_value.sync.assert_called_once_with(
            source_bucket="SOURCE_BUCKET",
            source_object="SOURCE_OBJECT",
            destination_bucket="DESTINATION_BUCKET",
            destination_object="DESTINATION_OBJECT",
            delete_extra_files=True,
            recursive=True,
            allow_overwrite=True,
        )
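Note that the hook is asserted with google_cloud_storage_conn_id even though the operator is configured via gcp_conn_id: in the provider version this test targets, the operator forwards its connection id under that older hook parameter name. Newer releases of the Google provider pass gcp_conn_id to GCSHook directly, and later ones remove delegate_to altogether, so the exact expected kwargs depend on the installed provider version.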
Example 2
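An Airflow example DAG showing three flavors of bucket synchronization: a plain source-to-destination sync, a full sync that also deletes extra files and overwrites existing ones, and a sync into a subdirectory of the destination bucket. The imports, the BUCKET_1_* variables, and the tail of the truncated sync_to_subdirectory task are reconstructed below, following the pattern of the surrounding code, so the excerpt is self-contained.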
import os

from airflow import models
from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator
from airflow.utils.dates import days_ago

# BUCKET_1_SRC / BUCKET_1_DST are defined earlier in the original example; they are
# reconstructed here following the same pattern as the other bucket variables.
BUCKET_1_SRC = os.environ.get("GCP_GCS_BUCKET_1_SRC", "test-gcs-sync-1-src")
BUCKET_1_DST = os.environ.get("GCP_GCS_BUCKET_1_DST", "test-gcs-sync-1-dst")

BUCKET_2_SRC = os.environ.get("GCP_GCS_BUCKET_2_SRC", "test-gcs-sync-2-src")
BUCKET_2_DST = os.environ.get("GCP_GCS_BUCKET_2_DST", "test-gcs-sync-2-dst")

BUCKET_3_SRC = os.environ.get("GCP_GCS_BUCKET_3_SRC", "test-gcs-sync-3-src")
BUCKET_3_DST = os.environ.get("GCP_GCS_BUCKET_3_DST", "test-gcs-sync-3-dst")

OBJECT_1 = os.environ.get("GCP_GCS_OBJECT_1", "test-gcs-to-gcs-1")
OBJECT_2 = os.environ.get("GCP_GCS_OBJECT_2", "test-gcs-to-gcs-2")

with models.DAG("example_gcs_to_gcs",
                schedule_interval='@once',
                start_date=days_ago(1),
                tags=['example']) as dag:
    # [START howto_synch_bucket]
    sync_bucket = GCSSynchronizeBucketsOperator(
        task_id="sync_bucket",
        source_bucket=BUCKET_1_SRC,
        destination_bucket=BUCKET_1_DST,
    )
    # [END howto_synch_bucket]

    # [START howto_synch_full_bucket]
    sync_full_bucket = GCSSynchronizeBucketsOperator(
        task_id="sync_full_bucket",
        source_bucket=BUCKET_1_SRC,
        destination_bucket=BUCKET_1_DST,
        delete_extra_files=True,
        allow_overwrite=True,
    )
    # [END howto_synch_full_bucket]

    # [START howto_synch_to_subdir]
    sync_to_subdirectory = GCSSynchronizeBucketsOperator(
        task_id="sync_to_subdirectory",
        source_bucket=BUCKET_1_SRC,
        destination_bucket=BUCKET_1_DST,
        # The excerpt was cut off here; the remaining arguments are reconstructed.
        # Setting destination_object to a prefix places the synced files under a
        # subdirectory of the destination bucket.
        destination_object="subdir/",
    )
    # [END howto_synch_to_subdir]
Example 3
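An excerpt from an ETL DAG that uses the operator as a transfer step in a larger pipeline: a PythonOperator lands the source data, GCSSynchronizeBucketsOperator copies that run's files from a landing bucket to a working bucket (the {{ run_id }} template scopes the copy to the current run), and a Dataproc workflow template then runs the Spark ETL job over the working bucket. The imports are reconstructed assumptions (Airflow 2.x paths with the Google provider installed), and the excerpt is cut off partway through the Dataproc parameters, so only the dangling brackets are closed.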
# Imports reconstructed for a self-contained excerpt; the paths assume Airflow 2.x
# with the Google provider installed.
import os

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)
from airflow.providers.google.cloud.operators.gcs import GCSSynchronizeBucketsOperator
from airflow.utils.dates import days_ago

GCP_BQ_TABLE_AUTHORS = os.environ['GCP_BQ_TABLE_AUTHORS']
GCP_BQ_TABLE_REVIEWS = os.environ['GCP_BQ_TABLE_REVIEWS']
GCP_BQ_TABLE_BOOKS = os.environ['GCP_BQ_TABLE_BOOKS']

with DAG(dag_id='etl_pipeline',
         schedule_interval=None,
         start_date=days_ago(200),
         default_args={}) as dag:

    # create_source_from_airflow is a project-specific callable defined elsewhere
    # in the original module.
    grab_data = PythonOperator(task_id='grab_data',
                               python_callable=create_source_from_airflow)

    transfer_grabbed_data = GCSSynchronizeBucketsOperator(
        task_id='transfer_grabbed_data',
        source_bucket=os.environ['GCP_GCS_BUCKET_LANDING'],
        destination_bucket=os.environ['GCP_GCS_BUCKET_WORKING'],
        gcp_conn_id='gr_storage_conn',
        source_object='{{ run_id }}',
        destination_object='{{ run_id }}',
    )

    spark_etl = DataprocInstantiateWorkflowTemplateOperator(
        task_id='spark_etl',
        template_id=os.environ['GCP_DATAPROC_TEMPLATE_ID'],
        project_id=os.environ['GCP_PROJECT_ID'],
        region=os.environ['GCP_REGION'],
        parameters={
            'PATH_TO_ETL_FILE': f"gs://{os.environ['GCP_GCS_BUCKET_ROUTINE']}/etl/etl.py",
            'CLUSTER_NAME': 'goodreads-etl',
            'ARG_SOURCE_BUCKET': os.environ['GCP_GCS_BUCKET_WORKING'],
            'ARG_DESTINATION_BUCKET': os.environ['GCP_GCS_BUCKET_PROCESSED'],
            # The original excerpt is cut off here; any remaining parameters are omitted.
        },
    )
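
The excerpt stops before the task dependencies are declared. A minimal sketch of how these three tasks would typically be wired, assuming they run in the order they are defined (the original DAG may chain them differently or include further downstream tasks):

    grab_data >> transfer_grabbed_data >> spark_etl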