def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
        """Test the execute function when the run is successful."""

        operator = S3ToGoogleCloudStorageOperator(task_id=TASK_ID,
                                                  bucket=S3_BUCKET,
                                                  prefix=S3_PREFIX,
                                                  delimiter=S3_DELIMITER,
                                                  dest_gcs_conn_id=GCS_CONN_ID,
                                                  dest_gcs=GCS_PATH_PREFIX)

        s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
        s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES
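        # Both patched S3Hook targets return the same listing; presumably one
        # patch covers the operator's own module and the other its
        # S3ListOperator parent, which performs the actual key listing.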

        uploaded_files = operator.execute(None)
        gcs_mock_hook.return_value.upload.assert_has_calls([
            mock.call('gcs-bucket', 'data/TEST1.csv', mock.ANY, gzip=False),
            mock.call('gcs-bucket', 'data/TEST3.csv', mock.ANY, gzip=False),
            mock.call('gcs-bucket', 'data/TEST2.csv', mock.ANY, gzip=False)
        ], any_order=True)

        s3_one_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID,
                                                 verify=None)
        s3_two_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID,
                                                 verify=None)
        gcs_mock_hook.assert_called_once_with(
            google_cloud_storage_conn_id=GCS_CONN_ID, delegate_to=None)

        # we expect MOCK_FILES to be uploaded
        self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))
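
    # A second variant of test_execute: instead of asserting the expected
    # upload calls explicitly, it installs a side effect on the mocked upload
    # and validates each call against the parsed GCS destination.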
    def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
        """Test the execute function when the run is successful."""

        operator = S3ToGoogleCloudStorageOperator(task_id=TASK_ID,
                                                  bucket=S3_BUCKET,
                                                  prefix=S3_PREFIX,
                                                  delimiter=S3_DELIMITER,
                                                  dest_gcs_conn_id=GCS_CONN_ID,
                                                  dest_gcs=GCS_PATH_PREFIX)

        s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
        s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES

        def _assert_upload(bucket, object, tmp_filename):
            gcs_bucket, gcs_object_path = _parse_gcs_url(GCS_PATH_PREFIX)
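            # _parse_gcs_url splits a gs:// URL into (bucket, object path),
            # e.g. 'gs://gcs-bucket/data/' -> ('gcs-bucket', 'data/'); the
            # example value is assumed, consistent with the assertions above.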

            self.assertEqual(gcs_bucket, bucket)
            self.assertIn(object[len(gcs_object_path):], MOCK_FILES)

        gcs_mock_hook.return_value.upload.side_effect = _assert_upload

        uploaded_files = operator.execute(None)

        s3_one_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID,
                                                 verify=None)
        s3_two_mock_hook.assert_called_once_with(aws_conn_id=AWS_CONN_ID,
                                                 verify=None)
        gcs_mock_hook.assert_called_once_with(
            google_cloud_storage_conn_id=GCS_CONN_ID, delegate_to=None)

        # we expect MOCK_FILES to be uploaded
        self.assertEqual(sorted(MOCK_FILES), sorted(uploaded_files))

    def test_init(self):
        """Test S3ToGoogleCloudStorageOperator instance is properly initialized."""

        operator = S3ToGoogleCloudStorageOperator(task_id=TASK_ID,
                                                  bucket=S3_BUCKET,
                                                  prefix=S3_PREFIX,
                                                  delimiter=S3_DELIMITER,
                                                  gcp_conn_id=GCS_CONN_ID,
                                                  dest_gcs=GCS_PATH_PREFIX)

        self.assertEqual(operator.task_id, TASK_ID)
        self.assertEqual(operator.bucket, S3_BUCKET)
        self.assertEqual(operator.prefix, S3_PREFIX)
        self.assertEqual(operator.delimiter, S3_DELIMITER)
        self.assertEqual(operator.gcp_conn_id, GCS_CONN_ID)
        self.assertEqual(operator.dest_gcs, GCS_PATH_PREFIX)

    def test_execute_with_gzip(self, gcs_mock_hook, s3_one_mock_hook,
                               s3_two_mock_hook):
        """Test the execute function when the run is successful."""

        operator = S3ToGoogleCloudStorageOperator(task_id=TASK_ID,
                                                  bucket=S3_BUCKET,
                                                  prefix=S3_PREFIX,
                                                  delimiter=S3_DELIMITER,
                                                  dest_gcs_conn_id=GCS_CONN_ID,
                                                  dest_gcs=GCS_PATH_PREFIX,
                                                  gzip=True)

        s3_one_mock_hook.return_value.list_keys.return_value = MOCK_FILES
        s3_two_mock_hook.return_value.list_keys.return_value = MOCK_FILES

        operator.execute(None)
        gcs_mock_hook.return_value.upload.assert_has_calls([
            mock.call('gcs-bucket', 'data/TEST2.csv', mock.ANY, gzip=True),
            mock.call('gcs-bucket', 'data/TEST1.csv', mock.ANY, gzip=True),
            mock.call('gcs-bucket', 'data/TEST3.csv', mock.ANY, gzip=True)
        ], any_order=True)
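
The test methods above receive mocked hook objects as arguments, so they rely on mock.patch decorators, imports, and module-level constants that are not shown in the snippets. A minimal sketch of that scaffolding follows; the constant values are inferred from the assertions ('gcs-bucket', 'data/TEST*.csv'), and the exact patch targets are assumptions that depend on the Airflow version in use.

import unittest
from unittest import mock

from airflow.contrib.operators.s3_to_gcs_operator import S3ToGoogleCloudStorageOperator
# _parse_gcs_url is used by the side-effect variant of test_execute (assumed location).
from airflow.contrib.hooks.gcs_hook import _parse_gcs_url

# Values below are assumptions consistent with the assertions in the tests above.
TASK_ID = 'test-s3-to-gcs'
S3_BUCKET = 'test-bucket'
S3_PREFIX = 'TEST'
S3_DELIMITER = '/'
GCS_PATH_PREFIX = 'gs://gcs-bucket/data/'
MOCK_FILES = ['TEST1.csv', 'TEST2.csv', 'TEST3.csv']
AWS_CONN_ID = 'aws_default'
GCS_CONN_ID = 'google_cloud_default'


class S3ToGoogleCloudStorageOperatorTest(unittest.TestCase):
    # With stacked patches, the bottom-most decorator supplies the first mock
    # argument after self; the module paths are Airflow 1.10-era assumptions.
    @mock.patch('airflow.contrib.operators.s3_list_operator.S3Hook')
    @mock.patch('airflow.contrib.operators.s3_to_gcs_operator.S3Hook')
    @mock.patch('airflow.contrib.operators.s3_to_gcs_operator.GoogleCloudStorageHook')
    def test_execute(self, gcs_mock_hook, s3_one_mock_hook, s3_two_mock_hook):
        ...  # body as shown above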
Example #5
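The task definitions below sit inside a DAG context that is not shown in this snippet. A minimal scaffold they could live under is sketched here; every constant (connection IDs, bucket names, destination table) is a placeholder, and the import paths are Airflow 1.x contrib-style assumptions.

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.operators.s3_to_gcs_operator import S3ToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

# Placeholder values; replace with real connections, buckets and tables.
S3_BUCKET = 'source-bucket'
AWS_CONNECTION = 'aws_default'
GCS_BUCKET_CONNECTION = 'google_cloud_default'
GCS_BUCKET_NAME_S3_TASK = 'gs://landing-bucket/'
GCS_BUCKET_NAME_BQ_TASK = 'landing-bucket'
DESTINATION_TABLE = 'project.dataset.table'
BQ_CONNECTION = 'bigquery_default'

with DAG(dag_id='s3_to_gcs_to_bq',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    pass  # the task definitions shown below would go inside this block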
    start = DummyOperator(
        task_id='start',
        trigger_rule='all_success'
    )

    end = DummyOperator(
        task_id='end',
        trigger_rule='one_success'
    )

    # Task to copy files from S3 to GCS
    s3_email_to_gcs = S3ToGoogleCloudStorageOperator(
        task_id='s3_to_gcs',
        bucket=S3_BUCKET,
        aws_conn_id=AWS_CONNECTION,
        dest_gcs_conn_id=GCS_BUCKET_CONNECTION,
        dest_gcs=GCS_BUCKET_NAME_S3_TASK
    )
    s3_email_to_gcs.set_upstream(start)

    # Task to load files from GCS into BQ 
    gcs_to_bq_task = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq',
        bucket=GCS_BUCKET_NAME_BQ_TASK,
        source_objects=['*.parquet'],  # assumes parquet files; other file types are supported
        destination_project_dataset_table=DESTINATION_TABLE,
        source_format='parquet',
        skip_leading_rows=1,
        max_bad_records=10,
        bigquery_conn_id=BQ_CONNECTION,