def test_move_to_storage_with_conflict_with_file_types(self) -> None:
    dt = datetime.datetime.now()
    self.fully_process_file(
        dt,
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
        file_type_differentiation_on=True)

    # Try uploading a file with a duplicate name that has already been
    # moved to storage
    self.fully_process_file(
        dt,
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
        file_type_differentiation_on=True)

    # pylint: disable=protected-access
    storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, '', file_type_filter=None)
    self.assertEqual(len(storage_paths), 4)

    found_first_file = False
    found_second_file = False
    for path in storage_paths:
        if path.abs_path().endswith('test_file.csv'):
            found_first_file = True
        if path.abs_path().endswith('test_file-(1).csv'):
            found_second_file = True

    self.assertTrue(found_first_file)
    self.assertTrue(found_second_file)
def test_move_to_storage_with_conflict(self) -> None:
    dt = datetime.datetime.now()
    self.fully_process_file(
        dt, GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv")
    )

    # Try uploading a file with a duplicate name that has already been
    # moved to storage
    self.fully_process_file(
        dt, GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv")
    )

    # pylint: disable=protected-access
    storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, "", file_type_filter=None
    )
    self.assertEqual(len(storage_paths), 2)

    found_first_file = False
    found_second_file = False
    for path in storage_paths:
        self.assertTrue(filename_parts_from_path(path))
        if path.abs_path().endswith("test_file.csv"):
            found_first_file = True
        if path.abs_path().endswith("test_file-(1).csv"):
            found_second_file = True

    self.assertTrue(found_first_file)
    self.assertTrue(found_second_file)
def test_direct_ingest_multiple_file_moves(self) -> None:
    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'))
def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
        file_type_differentiation_on=True)

    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'),
        file_type_differentiation_on=True)
def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
    )

    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file_2.csv"),
    )
def test_cache_ingest_file_as_parquet_malformed(self) -> None:
    path = GcsfsFilePath(
        bucket_name="test-bucket",
        blob_name="storage_bucket/raw/2021/04/20/processed_2021-05-03T00:00:00:000000_raw_test_file-(1).csv",
    )

    with open(fixture_path, "r") as f:
        self.cache_ingest_file(path, f.read(), separator="|")

    cache = SingleIngestFileParquetCache(self.fakeredis, path)
    self.assertEqual(1, self.fakeredis.llen(cache.cache_key))

    expected = pandas.DataFrame(
        data=[["val1", "val2", "", "val4", "", "05/03/21"]],
        columns=[
            "col1",
            "col2",
            "col3",
            "col4",
            "col5",
            "ingest_processing_date",
        ],
    )
    actual = [
        pandas.read_parquet(parquet_file)
        for parquet_file in cache.get_parquet_files()
    ][0]

    self.assertTrue(expected.compare(actual).empty, expected.compare(actual))
def test_cache_ingest_file_as_parquet(self) -> None:
    path = GcsfsFilePath(
        bucket_name="test-bucket",
        blob_name="storage_bucket/raw/2021/04/20/processed_2021-05-03T00:00:00:000000_raw_test_file-(1).csv",
    )
    input_df = pandas.DataFrame(data=[[1, 2], [2, 3]], columns=["x", "y"])
    self.cache_ingest_file(path, input_df.to_csv(index=False))

    cache = SingleIngestFileParquetCache(self.fakeredis, path)
    self.assertEqual(1, self.fakeredis.llen(cache.cache_key))

    expected = pandas.DataFrame(
        data=[
            ["1", "2", "05/03/21"],
            ["2", "3", "05/03/21"],
        ],
        columns=["x", "y", "ingest_processing_date"],
    )
    actual = [
        pandas.read_parquet(parquet_file)
        for parquet_file in cache.get_parquet_files()
    ][0]

    self.assertTrue(expected.compare(actual).empty)
def test_contents_of_lock_set(self) -> None:
    """Locks with pre-specified contents and asserts the lockfile contains those contents"""
    lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
    lock_manager.lock(self.LOCK_NAME, self.CONTENTS)
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                         blob_name=self.LOCK_NAME)
    actual_contents = self.fs.download_as_string(path)
    self.assertEqual(self.CONTENTS, actual_contents)
def test_contents_of_lock_default(self) -> None:
    """Locks with default contents and asserts the lockfile contains correct time"""
    lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
    lock_manager.lock(self.LOCK_NAME)
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                         blob_name=self.LOCK_NAME)
    actual_body = GCSPseudoLockBody.from_json_string(
        self.fs.download_as_string(path))
    self.assertIsNotNone(actual_body)
def test_contents_of_lock_default(self) -> None:
    """Locks with default contents and asserts the lockfile contains correct time"""
    lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
    lock_manager.lock(self.LOCK_NAME)
    correct_contents = datetime.now().strftime(self.TIME_FORMAT)
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                         blob_name=self.LOCK_NAME)
    actual_contents = self.fs.download_as_string(path)
    self.assertEqual(correct_contents, actual_contents)
def unlock(self, name: str) -> None:
    """Unlocks @param name by deleting file with name"""
    if self.is_locked(name):
        path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
        self.fs.delete(path)
    else:
        raise GCSPseudoLockDoesNotExist(
            f"Lock with the name {name} does not yet exist in the bucket "
            f"{self.bucket_name}"
        )
def test_acquire_two_locks_different_schemas(self) -> None:
    self.lock_manager.acquire_lock(lock_id="lock1", schema_type=SchemaType.STATE)
    expected_paths = [
        GcsfsFilePath(bucket_name=self.lock_bucket,
                      blob_name="EXPORT_PROCESS_RUNNING_STATE")
    ]
    self.assertEqual(expected_paths, self.fake_fs.all_paths)

    self.lock_manager.acquire_lock(lock_id="lock1", schema_type=SchemaType.JAILS)
    expected_paths.append(
        GcsfsFilePath(bucket_name=self.lock_bucket,
                      blob_name="EXPORT_PROCESS_RUNNING_JAILS"))
    self.assertEqual(expected_paths, self.fake_fs.all_paths)

    self.lock_manager.release_lock(schema_type=SchemaType.STATE)
    self.lock_manager.release_lock(schema_type=SchemaType.JAILS)
    self.assertEqual([], self.fake_fs.all_paths)
def get_lock_contents(self, name: str) -> str:
    """Returns contents of specified lock as string"""
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    if not self.fs.exists(path):
        raise GCSPseudoLockDoesNotExist(
            f"Lock with the name {name} does not yet exist in the bucket "
            f"{self.bucket_name}"
        )
    contents = self.fs.download_as_string(path)
    return contents
def test_contents_of_unlocked_and_relocked(self) -> None:
    """Locks with pre-specified contents and asserts the lockfile contains those contents"""
    lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
    lock_manager.lock(self.LOCK_NAME, self.CONTENTS)
    lock_manager.unlock(self.LOCK_NAME)
    lock_manager.lock(self.LOCK_NAME, self.CONTENTS2)
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                         blob_name=self.LOCK_NAME)
    actual_body = GCSPseudoLockBody.from_json_string(
        self.fs.download_as_string(path))
    assert actual_body is not None
    self.assertEqual(self.CONTENTS2, actual_body.payload)
def lock(self, name: str, contents: Optional[str] = None) -> None:
    """Locks @param name by generating a new file. If @param contents is
    provided, it becomes the body of the new file. Otherwise the body is set
    to the formatted current time.
    """
    if self.is_locked(name):
        raise GCSPseudoLockAlreadyExists(
            f"Lock with the name {name} already exists in the bucket "
            f"{self.bucket_name}"
        )
    if contents is None:
        contents = datetime.now().strftime(self._TIME_FORMAT)
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    self.fs.upload_from_string(path, contents, "text/plain")
def test_withManifest_succeeds(self, mock_ingest: unittest.mock.MagicMock) -> None:
    # Act
    request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get(
        "/ingest", query_string=request_args, headers=headers
    )

    # Assert
    self.assertEqual(200, response.status_code)
    mock_ingest.assert_called_with(
        ANY, GcsfsFilePath(bucket_name="fake-bucket", blob_name="foo/manifest.yaml")
    )
def _upload_fake_expired_lock(self, lock_manager: GCSPseudoLockManager,
                              lock_name: str) -> None:
    now = datetime.now()
    yesterday = now - timedelta(days=1)
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name, blob_name=lock_name)
    self.fs.upload_from_string(
        path,
        json.dumps(
            GCSPseudoLockBody(lock_time=yesterday,
                              expiration_in_seconds=3600).to_json(),
            default=str,
        ),
        content_type="text/text",
    )
def test_lock_expiration_not_met(self) -> None:
    now = datetime.now()
    lock_manager = GCSPseudoLockManager()
    path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                         blob_name=self.LOCK_NAME)
    self.fs.upload_from_string(
        path,
        json.dumps(
            GCSPseudoLockBody(lock_time=now, expiration_in_seconds=60).to_json(),
            default=str,
        ),
        content_type="text/text",
    )

    self.assertTrue(lock_manager.is_locked(self.LOCK_NAME))
def test_ingestFails_raisesError(
    self, mock_ingest: unittest.mock.MagicMock
) -> None:
    # Arrange
    mock_ingest.side_effect = ValueError("Malformed manifest")

    # Act
    request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
    headers = {"X-Appengine-Cron": "test-cron"}
    with self.assertRaisesRegex(ValueError, "Malformed manifest"):
        self.client.get("/ingest", query_string=request_args, headers=headers)

    # Assert
    mock_ingest.assert_called_with(
        ANY, GcsfsFilePath(bucket_name="fake-bucket", blob_name="foo/manifest.yaml")
    )
def upload_raw_file_to_gcs(fs: GCSFileSystem, local_filepath: str,
                           bucket_name: str) -> None:
    """Upload raw Sendgrid CSV to GCS"""
    fs.upload_from_contents_handle_stream(
        path=GcsfsFilePath(
            bucket_name=bucket_name,
            blob_name=date.today().strftime(DATE_FORMAT),
        ),
        contents_handle=GcsfsFileContentsHandle(local_file_path=local_filepath,
                                                cleanup_file=False),
        content_type="text/csv",
    )
    logging.info(
        "Uploaded file [%s] to Google Cloud Storage bucket name=[%s] blob name=[%s]",
        local_filepath,
        bucket_name,
        date.today().strftime(DATE_FORMAT),
    )
def _move_files_for_date(self, subdir_path_str: str) -> None:
    """Function that loops through each subdirectory and moves files in each
    subdirectory using the from path and to path specified."""
    from_dir_path = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip("/"))

    previous_date_format = from_dir_path.relative_path.rstrip("/").split("/")[-1]
    new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")

    from_paths = gsutil_ls(f"{subdir_path_str}*.csv")

    for from_path in from_paths:
        file_name = GcsfsFilePath(
            bucket_name=self.region_storage_dir_path.bucket_name,
            blob_name=from_path,
        ).file_name

        to_file_path = os.path.join(
            "gs://",
            self.region_storage_dir_path.bucket_name,
            self.region_code,
            GcsfsDirectIngestFileType.RAW_DATA.value,
            new_date_format,
            file_name,
        )

        normalized_to_file_path = (
            to_normalized_processed_file_path_from_normalized_path(
                to_file_path,
                file_type_override=GcsfsDirectIngestFileType.RAW_DATA))

        to_path = normalized_to_file_path

        if not self.dry_run:
            gsutil_mv(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.move_list.append((from_path, to_path))
            if self.move_progress:
                self.move_progress.next()
def test_ingestFails_raisesError(
        self, mock_ingest: unittest.mock.MagicMock) -> None:
    # Arrange
    mock_ingest.side_effect = ValueError("Malformed manifest")

    # Act
    request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/ingest", query_string=request_args, headers=headers)

    # Assert
    self.assertEqual(500, response.status_code)
    self.assertEqual("Error ingesting data: 'Malformed manifest'",
                     response.get_data().decode())
    mock_ingest.assert_called_with(
        ANY,
        GcsfsFilePath(bucket_name="fake-bucket", blob_name="foo/manifest.yaml"))
def unlock(self, name: str) -> None:
    """Unlocks @param name by deleting file with name"""
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)

    # We are not using `is_locked` here because we want to delete expired
    # locks explicitly.
    if not self.fs.exists(path):
        raise GCSPseudoLockDoesNotExist(
            f"Lock with the name {name} does not yet exist in the bucket "
            f"{self.bucket_name}")

    for i in range(MAX_UNLOCK_ATTEMPTS):
        logging.debug("Deleting lock file with name: %s (attempt %s)", name, i)
        self.fs.delete(path)
        if not self.fs.exists(path):
            logging.debug("Successfully deleted lock file with name: %s", name)
            return

    raise GCSPseudoLockFailedUnlock(
        f"Failed to unlock lock file with name: {name}")
def lock(
    self,
    name: str,
    payload: Optional[str] = None,
    expiration_in_seconds: Optional[int] = None,
) -> None:
    """Locks @param name by generating new file. The body of the lock is
    json-encoded and contains the lock time, the caller's custom @param payload
    (if provided), and the @param expiration_in_seconds (if provided).
    """
    if self.is_locked(name):
        raise GCSPseudoLockAlreadyExists(
            f"Lock with the name {name} already exists in the bucket "
            f"{self.bucket_name}")

    lock_body = GCSPseudoLockBody(
        lock_time=datetime.now(),
        payload=payload,
        expiration_in_seconds=expiration_in_seconds,
    )
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    self.fs.upload_from_string(
        path, json.dumps(lock_body.to_json(), default=str), "text/plain")
    logging.debug("Created lock file with name: %s", name)
def test_export_happy_path(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()

    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )

    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()

    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )

    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)
    mock_fs.exists.return_value = True

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                             [delegate_one, delegate_two])
    exporter.export_and_validate([export_config_one, export_config_two])

    # Assert all mocks called as expected
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])

    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt'))
    ])

    mock_fs.delete.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'))
    ])

    mock_fs.exists.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt')),
    ])
def _lock_body_for_lock(self, name: str) -> Optional[GCSPseudoLockBody]:
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    return self._lock_body_for_path(path)
def test_direct_ingest_file_moves_with_file_types(self) -> None:
    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
        file_type_differentiation_on=True,
    )
def is_locked(self, name: str) -> bool:
    """Checks if @param name is locked by checking if file exists.
    Returns true if locked, false if unlocked"""
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    return self.fs.exists(path)
def test_export_final_existence_validation_failed(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()

    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )

    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()

    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )

    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)

    # This should cause export_and_validate to raise a ValueError
    mock_fs.exists.return_value = False

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                             [delegate_one, delegate_two])
    with pytest.raises(ViewExportValidationError) as e:
        exporter.export_and_validate([export_config_one, export_config_two])

    # We get an error at the very end of the export chain because even though
    # delegate validations passed, the final validation failed
    self.assertIn(
        'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
        'Stopping execution here.',
        str(e.value))

    # The delegate exporters validations all passed so we still copy from
    # staging to final
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])

    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt'))
    ])

    mock_fs.delete.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'))
    ])

    # Only one call to the Exists validation made because the first one failed
    mock_fs.exists.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
    ])
def test_export_happy_path(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()

    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path("gs://bucket1/US_XX"),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"
        ),
    )

    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()

    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path("gs://bucket2/US_XX"),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"
        ),
    )

    mock_fs = create_autospec(GCSFileSystem)
    mock_fs.exists.return_value = True

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path("json"),
        export_config_two_staging.output_path("json"),
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path("txt"),
        export_config_two_staging.output_path("txt"),
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    export_views_with_exporters(
        mock_fs,
        [export_config_one, export_config_two],
        {
            ExportOutputFormatType.JSON: delegate_one,
            ExportOutputFormatType.METRIC: delegate_two,
        },
    )

    # Assert all mocks called as expected
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])

    mock_fs.copy.assert_has_calls(
        [
            call(
                GcsfsFilePath(
                    bucket_name="bucket1", blob_name="staging/US_XX/view1.json"
                ),
                GcsfsFilePath(bucket_name="bucket1", blob_name="US_XX/view1.json"),
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket2", blob_name="staging/US_XX/view2.json"
                ),
                GcsfsFilePath(bucket_name="bucket2", blob_name="US_XX/view2.json"),
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket1", blob_name="staging/US_XX/view1.txt"
                ),
                GcsfsFilePath(bucket_name="bucket1", blob_name="US_XX/view1.txt"),
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket2", blob_name="staging/US_XX/view2.txt"
                ),
                GcsfsFilePath(bucket_name="bucket2", blob_name="US_XX/view2.txt"),
            ),
        ],
        any_order=True,
    )

    mock_fs.delete.assert_has_calls(
        [
            call(
                GcsfsFilePath(
                    bucket_name="bucket1", blob_name="staging/US_XX/view1.json"
                )
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket2", blob_name="staging/US_XX/view2.json"
                )
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket1", blob_name="staging/US_XX/view1.txt"
                )
            ),
            call(
                GcsfsFilePath(
                    bucket_name="bucket2", blob_name="staging/US_XX/view2.txt"
                )
            ),
        ],
        any_order=True,
    )