    def test_execute_no_suffix_without_destination_object(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_2,
            destination_bucket=DESTINATION_BUCKET)

        operator.execute(None)
        mock_hook.return_value.copy.assert_has_calls(MOCK_CALLS_EMPTY)
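The test methods in this listing take a `mock_hook` argument and refer to module-level fixtures (TASK_ID, TEST_BUCKET, SOURCE_FILES_LIST, and so on) that the excerpt does not show. A minimal sketch of the assumed scaffolding, with made-up fixture values and a patch target matching the contrib operator module, might look like this:

import unittest
from unittest import mock

from airflow.contrib.operators.gcs_to_gcs import \
    GoogleCloudStorageToGoogleCloudStorageOperator

# Hypothetical fixture values; the real test module defines its own.
TASK_ID = 'test-gcs-to-gcs-operator'
TEST_BUCKET = 'test-bucket'
DESTINATION_BUCKET = 'archive-bucket'
SOURCE_FILES_LIST = [
    'test_object/file1.txt',
    'test_object/file2.txt',
    'test_object/file3.json',
]


class TestGoogleCloudStorageToGoogleCloudStorageOperator(unittest.TestCase):

    # Each test patches the GCS hook so no real API call is made; the patched
    # class is handed to the method as `mock_hook`.
    @mock.patch('airflow.contrib.operators.gcs_to_gcs.GoogleCloudStorageHook')
    def test_execute_copies_listed_objects(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID,
            source_bucket=TEST_BUCKET,
            source_object='test_object/*',
            destination_bucket=DESTINATION_BUCKET)
        operator.execute(None)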
    def test_execute_prefix_and_suffix(self, mock_hook):
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_WILDCARD_MIDDLE,
            destination_bucket=DESTINATION_BUCKET)

        operator.execute(None)
        mock_hook.return_value.list.assert_called_once_with(
            TEST_BUCKET, prefix="test", delimiter="object"
        )
    def test_no_prefix_with_last_modified_time_with_false_cond(self, mock_hook):
        mock_hook.return_value.is_updated_after.return_value = False
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_NO_WILDCARD,
            destination_bucket=DESTINATION_BUCKET,
            destination_object=SOURCE_OBJECT_NO_WILDCARD,
            last_modified_time=MOD_TIME_1)

        operator.execute(None)
        mock_hook.return_value.rewrite.assert_not_called()
    def test_execute_no_prefix_with_no_last_modified_time(self, mock_hook):
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_NO_WILDCARD,
            destination_bucket=DESTINATION_BUCKET,
            destination_object=SOURCE_OBJECT_NO_WILDCARD,
            last_modified_time=None)

        operator.execute(None)
        mock_hook.return_value.rewrite.assert_called_once_with(
            TEST_BUCKET, 'test_object.txt', DESTINATION_BUCKET, 'test_object.txt')
def transfer_data_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    source_bucket,
    destination_bucket,
    destination_bucket_prefix,
    app_name,
    submission_date,
    server_id,
    public_key_hex_external,
    google_cloud_storage_conn_id,
):
    """Copy the partitioned data from the staging bucket into the corresponding
    receiving buckets in each processor. The job then submits a `_SUCCESS` file
    which indicates the data is ready for processing.

    See the tests for the staging job for preprocessing convention:
    https://github.com/mozilla/prio-processor/blob/3cdc368707f8dc0f917d7b3d537c31645f4260f7/processor/tests/test_staging.py#L190-L205

    See the `bin/generate` script for an example of the processing conventions:
    https://github.com/mozilla/prio-processor/blob/1a4a58a738c3d39bfb04bbaa33a323412f1398ec/bin/generate#L53-L67
    """
    with DAG(f"{parent_dag_name}.{child_dag_name}",
             default_args=default_args) as dag:
        prefix = "/".join([
            destination_bucket_prefix,
            public_key_hex_external,
            app_name,
            submission_date,
            "raw/shares",
        ])
        transfer_dataset = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id="transfer_dataset",
            source_bucket=source_bucket,
            source_object=
            f"staging/submission_date={submission_date}/server_id={server_id}/*",
            destination_bucket=destination_bucket,
            destination_object=f"{prefix}/",
            google_cloud_storage_conn_id=google_cloud_storage_conn_id,
            dag=dag,
        )
        mark_dataset_success = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id="mark_dataset_success",
            source_bucket=source_bucket,
            source_object="staging/_SUCCESS",
            destination_bucket=destination_bucket,
            destination_object=f"{prefix}/_SUCCESS",
            google_cloud_storage_conn_id=google_cloud_storage_conn_id,
            dag=dag,
        )
        transfer_dataset >> mark_dataset_success
        return dag
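Because `transfer_data_subdag` returns a DAG, it is presumably attached to the parent DAG through a `SubDagOperator`; a hedged sketch of that wiring (all argument values below are placeholders, and `dag`/`default_args` are assumed to exist in the parent module) might look like this:

from airflow.operators.subdag_operator import SubDagOperator

# Placeholder values for illustration only; `dag` and `default_args` come
# from the parent DAG module.
transfer_a = SubDagOperator(
    task_id="transfer_a",
    subdag=transfer_data_subdag(
        parent_dag_name=dag.dag_id,
        child_dag_name="transfer_a",
        default_args=default_args,
        source_bucket="prio-staging-bucket",
        destination_bucket="prio-server-a-ingest",
        destination_bucket_prefix="data-share",
        app_name="example-app",
        submission_date="{{ ds }}",
        server_id="a",
        public_key_hex_external="EXTERNAL_PUBLIC_KEY_HEX",
        google_cloud_storage_conn_id="google_cloud_prio_admin",
    ),
    dag=dag,
)

Note that the subdag's DAG id (`parent_dag_name.child_dag_name`) must match the parent DAG id plus the SubDagOperator's task_id, which is why `child_dag_name` and `task_id` are the same here.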
Example #10
    def test_wc_with_last_modified_time_with_one_true_cond(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        mock_hook.return_value.is_updated_after.side_effect = [True, False, False]
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
            destination_bucket=DESTINATION_BUCKET,
            last_modified_time=MOD_TIME_1)

        operator.execute(None)
        mock_hook.return_value.rewrite.assert_called_once_with(
            TEST_BUCKET, 'test_object/file1.txt',
            DESTINATION_BUCKET, 'test_object/file1.txt')
    def test_execute_more_than_1_wildcard(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_MULTIPLE_WILDCARDS,
            destination_bucket=DESTINATION_BUCKET,
            destination_object=DESTINATION_OBJECT_PREFIX)

        total_wildcards = operator.source_object.count(WILDCARD)

        error_msg = "Only one wildcard '[*]' is allowed in source_object parameter. " \
                    "Found {}".format(total_wildcards, SOURCE_OBJECT_MULTIPLE_WILDCARDS)

        with self.assertRaisesRegexp(AirflowException, error_msg):
            operator.execute(None)
    def test_execute_with_empty_destination_bucket(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_NO_WILDCARD,
            destination_bucket=None,
            destination_object=DESTINATION_OBJECT_PREFIX)

        with patch.object(operator.log, 'warning') as mock_warn:
            operator.execute(None)
            mock_warn.assert_called_with(
                'destination_bucket is None. Defaulting it to source_bucket (%s)',
                TEST_BUCKET
            )
            self.assertEqual(operator.destination_bucket, operator.source_bucket)
    def test_execute_wildcard_without_destination_object(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
            destination_bucket=DESTINATION_BUCKET)

        operator.execute(None)
        mock_calls_none = [
            mock.call(TEST_BUCKET, 'test_object/file1.txt',
                      DESTINATION_BUCKET, 'test_object/file1.txt'),
            mock.call(TEST_BUCKET, 'test_object/file2.txt',
                      DESTINATION_BUCKET, 'test_object/file2.txt'),
        ]
        mock_hook.return_value.rewrite.assert_has_calls(mock_calls_none)
    def test_execute_wildcard_with_destination_object(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_4,
            destination_bucket=DESTINATION_BUCKET,
            destination_object=DESTINATION_OBJECT_PREFIX)

        operator.execute(None)
        mock_calls = [
            mock.call(TEST_BUCKET, 'test_object/file1.txt',
                      DESTINATION_BUCKET, 'foo/bar/file1.txt'),
            mock.call(TEST_BUCKET, 'test_object/file2.txt',
                      DESTINATION_BUCKET, 'foo/bar/file2.txt'),
        ]
        mock_hook.return_value.copy.assert_has_calls(mock_calls)
    def test_wc_with_no_last_modified_time(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_4,
            destination_bucket=DESTINATION_BUCKET,
            last_modified_time=None)

        operator.execute(None)
        mock_calls_none = [
            mock.call(TEST_BUCKET, 'test_object/file1.txt',
                      DESTINATION_BUCKET, 'test_object/file1.txt'),
            mock.call(TEST_BUCKET, 'test_object/file2.txt',
                      DESTINATION_BUCKET, 'test_object/file2.txt'),
        ]
        mock_hook.return_value.rewrite.assert_has_calls(mock_calls_none)
Example #20
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )

    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )

    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )

    t1 >> t2 >> t3 >> next_task

    return t1
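`gc_tasks` builds its operators without an explicit `dag=` argument, so it presumably relies on being called inside a `with DAG(...)` context so the tasks attach to the active DAG. A sketch of that usage, with a made-up DAG id, schedule, and schema, might be:

from datetime import datetime

from airflow import DAG

# Hypothetical schema for a single table.
orders_schema = [
    {"name": "order_id", "type": "INTEGER", "mode": "REQUIRED"},
    {"name": "amount", "type": "FLOAT", "mode": "NULLABLE"},
]

with DAG(dag_id="gcs_to_warehouse",
         start_date=datetime(2020, 1, 1),
         schedule_interval="@daily") as dag:
    # Each call builds the staging -> merge -> archive chain for one table;
    # the tasks attach to `dag` through the active context.
    gc_tasks("orders", orders_schema)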
    def test_execute_wildcard_with_destination_object_retained_prefix(self, mock_hook):
        mock_hook.return_value.list.return_value = SOURCE_FILES_LIST
        operator = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id=TASK_ID, source_bucket=TEST_BUCKET,
            source_object=SOURCE_OBJECT_WILDCARD_FILENAME,
            destination_bucket=DESTINATION_BUCKET,
            destination_object='{}/{}'.format(DESTINATION_OBJECT_PREFIX,
                                              SOURCE_OBJECT_WILDCARD_SUFFIX[:-1])
        )

        operator.execute(None)
        mock_calls_retained = [
            mock.call(TEST_BUCKET, 'test_object/file1.txt',
                      DESTINATION_BUCKET, 'foo/bar/test_object/file1.txt'),
            mock.call(TEST_BUCKET, 'test_object/file2.txt',
                      DESTINATION_BUCKET, 'foo/bar/test_object/file2.txt'),
        ]
        mock_hook.return_value.rewrite.assert_has_calls(mock_calls_retained)
Example #25
args = {
    'start_date': datetime(2019, 6, 17),
    'provide_context': True,
    'owner': 'airflow',
    'depends_on_past': True
}

dag = DAG(
    dag_id='create_file',
    default_args=args,
    schedule_interval=None,
)

with dag:
    createfile = BashOperator(
        task_id="create_file",
        bash_command="touch /home/airflow/gcs/data/trigger.trg")

    copytobucket = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id="copy_to_bucket",
        source_bucket="your_cloud_composer_bucket",
        source_object="data/trigger.trg",
        destination_bucket="your_cloud_composer_bucket",
        destination_object="data/sense_for_me.txt",
        google_cloud_storage_conn_id="google_cloud_storage_default",
        move_object="true")

    deletefile = BashOperator(
        task_id="delete_file",
        bash_command="rm /home/airflow/gcs/data/sense_for_me.txt")

    createfile >> copytobucket >> deletefile
clean_processor_b = PythonOperator(
    task_id="clean_processor_b",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_B,
        "shared_bucket": BUCKET_SHARED_B,
        "google_cloud_storage_conn_id": PRIO_B_CONN,
    },
    dag=dag,
)

load_processor_a = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="load_processor_a",
    source_bucket=BUCKET_DATA_ADMIN,
    source_object="staging/submission_date={{ ds }}/server_id=a/*",
    destination_bucket=BUCKET_PRIVATE_A,
    destination_object="raw/submission_date={{ ds }}/",
    google_cloud_storage_conn_id=PRIO_ADMIN_CONN,
    dag=dag,
)

load_processor_b = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="load_processor_b",
    source_bucket=BUCKET_DATA_ADMIN,
    source_object="staging/submission_date={{ ds }}/server_id=b/*",
    destination_bucket=BUCKET_PRIVATE_B,
    destination_object="raw/submission_date={{ ds }}/",
    google_cloud_storage_conn_id=PRIO_ADMIN_CONN,
    dag=dag,
)
Example #27
    # scenario will start right from here !
    # t2 = SFTPOperator(
    #     task_id='transfer-to-incoming',
    # )

    # Listen incoming folder w/ sensor
    t3 = GoogleCloudStoragePrefixSensor(task_id='listen-incoming-file',
                                        bucket='datalake-datasets-123',
                                        prefix='incoming/sales_transactions_*')

    # TODO: better file structure can be defined, such as monthly aggregation datalake/sales/05/sales_transactions_*
    # copy from gcs to the datalake for raw data storage
    t4 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='copy-to-datalake',
        source_bucket=INGESTION_BUCKET_NAME,
        source_object='incoming/sales_transactions_*',
        destination_bucket=INGESTION_BUCKET_NAME,
        destination_object='datalake/sales_transactions_',
        move_object=False)

    # move from incoming to processing for analytical calculations
    t5 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move-to-processing',
        source_bucket=INGESTION_BUCKET_NAME,
        source_object='incoming/sales_transactions_*',
        destination_bucket=INGESTION_BUCKET_NAME,
        destination_object='processing/sales_transactions_',
        move_object=True)

    # git clone average-prices-by-product-enhanced.py file ?????
    # deploy to GCP dataflow as a beam job, and check GCP dataflow job status
Example #28
        'usePublicIps':
        'False'
    }
}

with models.DAG('DAG_WordCount',
                schedule_interval=timedelta(days=1),
                default_args=default_dag_args) as dag:

    start_task = DummyOperator(task_id="start")

    move_to_processing = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move_file_to_processing',
        source_bucket='{}_word_count_data_5696'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        source_object='inbox/*',
        destination_bucket='{}_word_count_data_5696'.format(
            os.environ['AIRFLOW_VAR_ENV']),
        destination_object='processing/',
        move_object=True,
        google_cloud_storage_conn_id='etl_sa')

    dataflow_task = dataflow_operator.DataFlowJavaOperator(
        task_id='dataflow_java_app',
        jar='gs://{}/data/pipeline-example/word-count-beam-0.1.jar'.format(
            os.environ['COMPOSER_BUCKET']),
        options={
            'autoscalingAlgorithm': 'BASIC',
            'maxNumWorkers': '5'
        },
        gcp_conn_id='etl_sa')
Example #29
def generate_dag(branch_name, params, dag):
    with dag:
        batch_id = params['batch_id']

        work_folder = params['%s_work_dir' % branch_name]
        inbound_bucket = params['inbound_bucket']
        inbound_dir = params['inbound_dir']
        offer_email_list_file_prefix = params['%s_file_prefix' % branch_name]

        inbound_full_path = 'gs://%s/%s' % (inbound_bucket, inbound_dir)

        move_files_to_work_directory = GoogleCloudStorageToGoogleCloudStorageOperator(
            task_id='move_%s_files_to_work_directory' % branch_name.lower(),
            source_bucket=inbound_bucket,
            source_object='%s/%s*.csv' %
            (inbound_dir, offer_email_list_file_prefix),
            destination_bucket=inbound_bucket,
            destination_object='%s/%s/%s' %
            (work_folder, batch_id, offer_email_list_file_prefix),
            google_cloud_storage_conn_id=params['google_cloud_conn_id'],
            dag=dag,
            move_object=params['move_object'])

        upload_to_bq_task = GoogleCloudStorageToBigQueryOperator(
            task_id='upload_%s_to_bq' % branch_name.lower(),
            bucket=get_bucket_from_name(inbound_full_path),
            source_objects=['%s/%s/*' % (work_folder, batch_id)],
            destination_project_dataset_table=params['%s_work_table' %
                                                     branch_name],
            skip_leading_rows=0,
            source_format=SourceFormat.CSV,
            field_delimiter=',',
            schema_fields=params['%s_work_schema' % branch_name],
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            bigquery_conn_id=params['bigquery_conn_id'],
            google_cloud_storage_conn_id=params['google_cloud_conn_id'],
            autodetect=False,
            allow_jagged_rows=True,
            ignore_unknown_values=True,
            dag=dag)

        update_stage_table = BigQueryOperator(
            task_id='update_%s_stage_table' % branch_name.lower(),
            sql='sql/%s/%s_stage_table.sql' %
            (branch_name.lower(), branch_name.lower()),
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            destination_dataset_table=params['%s_stage_table' % branch_name],
            params=params,
            dag=dag)

        update_error_table = BigQueryOperator(
            task_id='update_%s_error_table' % branch_name.lower(),
            sql='sql/%s/%s_error_table.sql' %
            (branch_name.lower(), branch_name.lower()),
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            create_disposition=CreateDisposition.CREATE_IF_NEEDED,
            destination_dataset_table=params['%s_error_table' % branch_name],
            params=params,
            dag=dag)

        update_gold_table = BigQueryOperator(
            task_id='update_%s_gold_table' % branch_name.lower(),
            sql='sql/%s/%s_gold_table.sql' %
            (branch_name.lower(), branch_name.lower()),
            destination_dataset_table=params['%s_gold_table' % branch_name],
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            params=params,
            dag=dag)

        upload_to_bq_task >> update_error_table

        move_files_to_work_directory >> upload_to_bq_task >> update_stage_table >> update_gold_table

        return {
            'task_in': move_files_to_work_directory,
            'task_out': update_gold_table
        }
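Since `generate_dag` returns the first and last tasks of the branch it builds, callers presumably use that dict to splice each branch between surrounding tasks. A sketch with placeholder names (the `start`/`finish` tasks, `params`, and the branch name are assumptions) might be:

from airflow.operators.dummy_operator import DummyOperator

# Hypothetical wiring of one branch built by generate_dag.
start = DummyOperator(task_id='start', dag=dag)
finish = DummyOperator(task_id='finish', dag=dag)

offers_branch = generate_dag('OFFERS', params, dag)

start >> offers_branch['task_in']
offers_branch['task_out'] >> finish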
Example #30
    selector_task = BranchPythonOperator(task_id='select_load',
                                         python_callable=select_load,
                                         provide_context=True)

    list_task >> selector_task

    delete_local_files = BashOperator(
        task_id='delete_local_files',
        bash_command=f'rm {DATA_PATH}/{dag.dag_id}/*',
        provide_context=True)

    move_processed_files = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='move_processed_files',
        trigger_rule='none_failed',
        source_bucket='gcs-ipt-stackoss-business-intelligence',
        source_object='trafico_vendors/pendientes_procesar/*.csv',
        destination_bucket='gcs-ipt-stackoss-business-intelligence',
        destination_object='trafico_vendors/procesados/',
        move_object=True,
        google_cloud_storage_conn_id='gcs_ipt_prod',
        retries=2)

    for name, procedures in bq_procedures.items():
        load_task = PythonOperator(task_id=f'load_{name}_to_bq',
                                   python_callable=load_data,
                                   retries=2,
                                   provide_context=True)
        selector_task >> load_task
        for procedure in procedures:
            call_procedure = PythonOperator(
                task_id=f'call_{name}_procedure',
                python_callable=call_bigquery,
Example #31
    Query = python_operator.PythonOperator(task_id='Query',
                                           python_callable=QueryToGCS,
                                           op_kwargs={'sql': sql})

    CommentsExport = python_operator.PythonOperator(
        task_id='CommentsExport', python_callable=CommentsToGCS)
    AnswerExport = python_operator.PythonOperator(task_id='AnswerExport',
                                                  python_callable=AnswersToGCS)

    comment_file = '{}_{}.json'.format(comment_export, yesterday_string)
    answer_file = '{}_{}.json'.format(answer_export, yesterday_string)

    CommentToGCS = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id="Comment_to_GSC",
        source_bucket=source_bucket,
        source_object="data/{}".format(comment_file),
        destination_bucket=destination_bucket,
        destination_object=comment_file)

    AnswerToGCS = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id="Answer_to_GSC",
        source_bucket=source_bucket,
        source_object="data/{}".format(answer_file),
        destination_bucket=destination_bucket,
        destination_object=answer_file)

    CommentToBQ = GoogleCloudStorageToBigQueryOperator(
        task_id="Comments_to_BigQuery",
        bucket=destination_bucket,
        source_objects=[comment_file],  # This needs to be a list of sources
        schema_object=schema_comment)
        "total sulfur dioxide", "volatile acidity"
    ],
    "data": [[12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66],
             [12.8, 0.029, 0.48, 0.98, 6.2, 29, 3.33, 1.2, 0.39, 75, 0.66]]
}

dag = DAG('airflow-wine-from-yamls',
          default_args=default_args,
          schedule_interval=None)

with dag:
    data_extraction = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='data_extraction',
        google_cloud_storage_conn_id='wine_input',
        source_bucket=wine_bucket,
        destination_bucket=wine_bucket,
        source_object='input/*.csv',
        destination_object='data/',
        project_id=gcp_project,
        default_args=default_args)
    data_transformation = BashOperator(
        task_id='data_transformation',
        bash_command='echo "imagine that we transform a data"',
        default_args=default_args)
    odahuflow_conn = GcpConnectionToOdahuConnectionOperator(
        task_id='odahuflow_connection_creation',
        google_cloud_storage_conn_id='wine_input',
        api_connection_id=api_connection_id,
        conn_template=wine,
        default_args=default_args)
Example #33
)

export_csv = gke_command(
    task_id="export_csv",
    cmds=["bash"],
    env_vars={"DATASET": "glam_etl"},
    command=["script/glam/export_csv"],
    docker_image="mozilla/bigquery-etl:latest",
    gcp_conn_id="google_cloud_derived_datasets",
    dag=dag,
)

gcs_delete = GoogleCloudStorageDeleteOperator(
    task_id="gcs_delete",
    bucket_name=glam_bucket,
    prefix="glam-extract-fenix",
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

gcs_copy = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id="gcs_copy",
    source_bucket="glam-fenix-dev",
    source_object="*.csv",
    destination_bucket=glam_bucket,
    gcp_conn_id="google_cloud_airflow_dataproc",
    dag=dag,
)

wait_for_copy_deduplicate >> run_sql >> export_csv >> gcs_delete >> gcs_copy