    def test_move_to_storage_with_conflict_with_file_types(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
    def test_move_to_storage_with_conflict(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt, GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv")
        )

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt, GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv")
        )

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, "", file_type_filter=None
        )
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith("test_file.csv"):
                found_first_file = True
            if path.abs_path().endswith("test_file-(1).csv"):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
    def test_direct_ingest_multiple_file_moves(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))
    def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file_2.csv'),
                                file_type_differentiation_on=True)
Example #5
    def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
        )

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name="my_bucket",
                          blob_name="test_file_2.csv"),
        )
Example #6
    def test_cache_ingest_file_as_parquet_malformed(self) -> None:
        path = GcsfsFilePath(
            bucket_name="test-bucket",
            blob_name=
            "storage_bucket/raw/2021/04/20/processed_2021-05-03T00:00:00:000000_raw_test_file-(1).csv",
        )

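        # NOTE: `fixture_path` is not defined in this excerpt; it is assumed to be set up
        # elsewhere in the test (e.g., pointing at a local fixture CSV with a "|" separator).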
        with open(fixture_path, "r") as f:
            self.cache_ingest_file(path, f.read(), separator="|")

        cache = SingleIngestFileParquetCache(self.fakeredis, path)
        self.assertEqual(1, self.fakeredis.llen(cache.cache_key))

        expected = pandas.DataFrame(
            data=[["val1", "val2", "", "val4", "", "05/03/21"]],
            columns=[
                "col1",
                "col2",
                "col3",
                "col4",
                "col5",
                "ingest_processing_date",
            ],
        )
        actual = [
            pandas.read_parquet(parquet_file)
            for parquet_file in cache.get_parquet_files()
        ][0]

        self.assertTrue(
            expected.compare(actual).empty, expected.compare(actual))
Example #7
    def test_cache_ingest_file_as_parquet(self) -> None:
        path = GcsfsFilePath(
            bucket_name="test-bucket",
            blob_name=
            "storage_bucket/raw/2021/04/20/processed_2021-05-03T00:00:00:000000_raw_test_file-(1).csv",
        )
        input_df = pandas.DataFrame(data=[[1, 2], [2, 3]], columns=["x", "y"])

        self.cache_ingest_file(path, input_df.to_csv(index=False))

        cache = SingleIngestFileParquetCache(self.fakeredis, path)
        self.assertEqual(1, self.fakeredis.llen(cache.cache_key))

        expected = pandas.DataFrame(
            data=[
                ["1", "2", "05/03/21"],
                ["2", "3", "05/03/21"],
            ],
            columns=["x", "y", "ingest_processing_date"],
        )
        actual = [
            pandas.read_parquet(parquet_file)
            for parquet_file in cache.get_parquet_files()
        ][0]

        self.assertTrue(expected.compare(actual).empty)
 def test_contents_of_lock_set(self) -> None:
     """Locks with pre-specified contents and asserts the lockfile contains those contents"""
     lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
     lock_manager.lock(self.LOCK_NAME, self.CONTENTS)
     path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                          blob_name=self.LOCK_NAME)
     actual_contents = self.fs.download_as_string(path)
     self.assertEqual(self.CONTENTS, actual_contents)
 def test_contents_of_lock_default(self) -> None:
     """Locks with default contents and asserts the lockfile contains correct time"""
     lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
     lock_manager.lock(self.LOCK_NAME)
     path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                          blob_name=self.LOCK_NAME)
     actual_body = GCSPseudoLockBody.from_json_string(
         self.fs.download_as_string(path))
     self.assertIsNotNone(actual_body)
 def test_contents_of_lock_default(self) -> None:
     """Locks with default contents and asserts the lockfile contains correct time"""
     lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
     lock_manager.lock(self.LOCK_NAME)
     correct_contents = datetime.now().strftime(self.TIME_FORMAT)
     path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                          blob_name=self.LOCK_NAME)
     actual_contents = self.fs.download_as_string(path)
     self.assertEqual(correct_contents, actual_contents)
Example #11
 def unlock(self, name: str) -> None:
     """Unlocks @param name by deleting file with name"""
     if self.is_locked(name):
         path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
         self.fs.delete(path)
     else:
         raise GCSPseudoLockDoesNotExist(
             f"Lock with the name {name} does not yet exist in the bucket "
             f"{self.bucket_name}"
         )
    def test_acquire_two_locks_different_schemas(self) -> None:
        self.lock_manager.acquire_lock(lock_id="lock1",
                                       schema_type=SchemaType.STATE)
        expected_paths = [
            GcsfsFilePath(bucket_name=self.lock_bucket,
                          blob_name="EXPORT_PROCESS_RUNNING_STATE")
        ]
        self.assertEqual(expected_paths, self.fake_fs.all_paths)

        self.lock_manager.acquire_lock(lock_id="lock1",
                                       schema_type=SchemaType.JAILS)
        expected_paths.append(
            GcsfsFilePath(bucket_name=self.lock_bucket,
                          blob_name="EXPORT_PROCESS_RUNNING_JAILS"))
        self.assertEqual(expected_paths, self.fake_fs.all_paths)

        self.lock_manager.release_lock(schema_type=SchemaType.STATE)
        self.lock_manager.release_lock(schema_type=SchemaType.JAILS)
        self.assertEqual([], self.fake_fs.all_paths)
Example #13
 def get_lock_contents(self, name: str) -> str:
     """Returns contents of specified lock as string"""
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     if not self.fs.exists(path):
         raise GCSPseudoLockDoesNotExist(
             f"Lock with the name {name} does not yet exist in the bucket "
             f"{self.bucket_name}"
         )
     contents = self.fs.download_as_string(path)
     return contents
    def test_contents_of_unlocked_and_relocked(self) -> None:
        """Locks with pre-specified contents and asserts the lockfile contains those contents"""
        lock_manager = GCSPseudoLockManager(self.PROJECT_ID)
        lock_manager.lock(self.LOCK_NAME, self.CONTENTS)
        lock_manager.unlock(self.LOCK_NAME)
        lock_manager.lock(self.LOCK_NAME, self.CONTENTS2)
        path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                             blob_name=self.LOCK_NAME)
        actual_body = GCSPseudoLockBody.from_json_string(
            self.fs.download_as_string(path))

        assert actual_body is not None
        self.assertEqual(self.CONTENTS2, actual_body.payload)
Example #15
 def lock(self, name: str, contents: Optional[str] = None) -> None:
     """ "Locks @param name by generating new file. If has @param contents, body of new file is contents.
     Otherwise sets body of file to json formatted time and uuid.
     """
     if self.is_locked(name):
         raise GCSPseudoLockAlreadyExists(
             f"Lock with the name {name} already exists in the bucket "
             f"{self.bucket_name}"
         )
     if contents is None:
         contents = datetime.now().strftime(self._TIME_FORMAT)
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     self.fs.upload_from_string(path, contents, "text/plain")
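
A minimal usage sketch of the contents-based variant above (an illustration, not code from the original source): it assumes a GCSPseudoLockManager constructed with a project id, and the project id, lock name, and contents strings below are placeholders.

lock_manager = GCSPseudoLockManager("my-project-id")  # hypothetical project id
LOCK_NAME = "nightly_ingest_running"  # hypothetical lock name

if not lock_manager.is_locked(LOCK_NAME):
    # Creates the lock file; the string becomes the body of the lock file.
    lock_manager.lock(LOCK_NAME, "started by nightly cron")
    try:
        ...  # work guarded by the lock
    finally:
        # Deletes the lock file; raises GCSPseudoLockDoesNotExist if it was already removed.
        lock_manager.unlock(LOCK_NAME)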
Example #16
    def test_withManifest_succeeds(self, mock_ingest: unittest.mock.MagicMock) -> None:
        # Act
        request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get(
            "/ingest", query_string=request_args, headers=headers
        )

        # Assert
        self.assertEqual(200, response.status_code)
        mock_ingest.assert_called_with(
            ANY, GcsfsFilePath(bucket_name="fake-bucket", blob_name="foo/manifest.yaml")
        )
 def _upload_fake_expired_lock(self, lock_manager: GCSPseudoLockManager,
                               lock_name: str) -> None:
     now = datetime.now()
     yesterday = now - timedelta(days=1)
     path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                          blob_name=lock_name)
     self.fs.upload_from_string(
         path,
         json.dumps(
             GCSPseudoLockBody(lock_time=yesterday,
                               expiration_in_seconds=3600).to_json(),
             default=str,
         ),
         content_type="text/text",
     )
    def test_lock_expiration_not_met(self) -> None:
        now = datetime.now()
        lock_manager = GCSPseudoLockManager()

        path = GcsfsFilePath(bucket_name=lock_manager.bucket_name,
                             blob_name=self.LOCK_NAME)
        self.fs.upload_from_string(
            path,
            json.dumps(
                GCSPseudoLockBody(lock_time=now,
                                  expiration_in_seconds=60).to_json(),
                default=str,
            ),
            content_type="text/text",
        )
        self.assertTrue(lock_manager.is_locked(self.LOCK_NAME))
Example #19
    def test_ingestFails_raisesError(
        self, mock_ingest: unittest.mock.MagicMock
    ) -> None:
        # Arrange
        mock_ingest.side_effect = ValueError("Malformed manifest")

        # Act
        request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
        headers = {"X-Appengine-Cron": "test-cron"}
        with self.assertRaisesRegex(ValueError, "Malformed manifest"):
            self.client.get("/ingest", query_string=request_args, headers=headers)

        # Assert
        mock_ingest.assert_called_with(
            ANY, GcsfsFilePath(bucket_name="fake-bucket", blob_name="foo/manifest.yaml")
        )
def upload_raw_file_to_gcs(fs: GCSFileSystem, local_filepath: str,
                           bucket_name: str) -> None:
    """Upload raw Sendgrid CSV to GCS"""

    fs.upload_from_contents_handle_stream(
        path=GcsfsFilePath(
            bucket_name=bucket_name,
            blob_name=date.today().strftime(DATE_FORMAT),
        ),
        contents_handle=GcsfsFileContentsHandle(local_file_path=local_filepath,
                                                cleanup_file=False),
        content_type="text/csv",
    )
    logging.info(
        "Uploaded file [%s] to Google Cloud Storage bucket name=[%s] blob name=[%s]",
        local_filepath,
        bucket_name,
        date.today().strftime(DATE_FORMAT),
    )
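
A minimal call sketch for the helper above, assuming an already-constructed GCSFileSystem instance named fs; the local file path and bucket name below are placeholder values.

upload_raw_file_to_gcs(
    fs=fs,  # an existing GCSFileSystem implementation
    local_filepath="/tmp/sendgrid_export.csv",  # hypothetical local CSV downloaded from Sendgrid
    bucket_name="my-sendgrid-raw-bucket",  # hypothetical destination bucket
)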
    def _move_files_for_date(self, subdir_path_str: str) -> None:
        """Function that loops through each subdirectory and moves files in each subdirectory using the from path
        and to path specified."""

        from_dir_path = GcsfsDirectoryPath.from_absolute_path(
            subdir_path_str.rstrip("/"))

        previous_date_format = from_dir_path.relative_path.rstrip("/").split(
            "/")[-1]
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        from_paths = gsutil_ls(f"{subdir_path_str}*.csv")
        for from_path in from_paths:
            file_name = GcsfsFilePath(
                bucket_name=self.region_storage_dir_path.bucket_name,
                blob_name=from_path,
            ).file_name

            to_file_path = os.path.join(
                "gs://",
                self.region_storage_dir_path.bucket_name,
                self.region_code,
                GcsfsDirectIngestFileType.RAW_DATA.value,
                new_date_format,
                file_name,
            )

            normalized_to_file_path = (
                to_normalized_processed_file_path_from_normalized_path(
                    to_file_path,
                    file_type_override=GcsfsDirectIngestFileType.RAW_DATA))

            to_path = normalized_to_file_path

            if not self.dry_run:
                gsutil_mv(from_path=from_path, to_path=to_path)
            with self.mutex:
                self.move_list.append((from_path, to_path))

        if self.move_progress:
            self.move_progress.next()
Example #22
    def test_ingestFails_raisesError(
            self, mock_ingest: unittest.mock.MagicMock) -> None:
        # Arrange
        mock_ingest.side_effect = ValueError("Malformed manifest")

        # Act
        request_args = {"manifest_path": "gs://fake-bucket/foo/manifest.yaml"}
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/ingest",
                                   query_string=request_args,
                                   headers=headers)

        # Assert
        self.assertEqual(500, response.status_code)
        self.assertEqual("Error ingesting data: 'Malformed manifest'",
                         response.get_data().decode())
        mock_ingest.assert_called_with(
            ANY,
            GcsfsFilePath(bucket_name="fake-bucket",
                          blob_name="foo/manifest.yaml"))
Example #23
    def unlock(self, name: str) -> None:
        """Unlocks @param name by deleting file with name"""
        path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)

        # We are not using `is_locked` here because we want to delete expired
        # locks explicitly.
        if not self.fs.exists(path):
            raise GCSPseudoLockDoesNotExist(
                f"Lock with the name {name} does not yet exist in the bucket "
                f"{self.bucket_name}")

        for i in range(MAX_UNLOCK_ATTEMPTS):
            logging.debug("Deleting lock file with name: %s (attempt %s)",
                          name, i)
            self.fs.delete(path)
            if not self.fs.exists(path):
                logging.debug("Successfully deleted lock file with name: %s",
                              name)
                return

        raise GCSPseudoLockFailedUnlock(
            f"Failed to unlock lock file with name: {name}")
Example #24
    def lock(
        self,
        name: str,
        payload: Optional[str] = None,
        expiration_in_seconds: Optional[int] = None,
    ) -> None:
        """Locks @param name by generating new file. The body of the lock is json-encoded and contains
        the lock time, the caller's custom @param payload (if provided), and the
        @param expiration_in_seconds (if provided).
        """
        if self.is_locked(name):
            raise GCSPseudoLockAlreadyExists(
                f"Lock with the name {name} already exists in the bucket "
                f"{self.bucket_name}")

        lock_body = GCSPseudoLockBody(
            lock_time=datetime.now(),
            payload=payload,
            expiration_in_seconds=expiration_in_seconds,
        )
        path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
        self.fs.upload_from_string(
            path, json.dumps(lock_body.to_json(), default=str), "text/plain")
        logging.debug("Created lock file with name: %s", name)
Example #25
    def test_export_happy_path(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        mock_fs.exists.return_value = True

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])
        exporter.export_and_validate([export_config_one, export_config_two])

        # Assert all mocks called as expected
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt')),
        ])
Example #26
 def _lock_body_for_lock(self, name: str) -> Optional[GCSPseudoLockBody]:
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     return self._lock_body_for_path(path)
 def test_direct_ingest_file_moves_with_file_types(self) -> None:
     self.fully_process_file(
         datetime.datetime.now(),
         GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
         file_type_differentiation_on=True,
     )
Example #28
 def is_locked(self, name: str) -> bool:
     """Checks if @param name is locked by checking if file exists. Returns true if locked, false if unlocked"""
     path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
     return self.fs.exists(path)
Example #29
    def test_export_final_existence_validation_failed(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        # This should cause export_and_validate to raise a ValueError
        mock_fs.exists.return_value = False

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])

        with pytest.raises(ViewExportValidationError) as e:
            exporter.export_and_validate(
                [export_config_one, export_config_two])

        # We get an error at the very end of the export chain because even though delegate validations passed, the
        # final validation failed
        self.assertIn(
            'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
            'Stopping execution here.', str(e.value))

        # The delegate exporters' validations all passed, so we still copy from staging to final
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        # Only one call to the exists() validation is made because the first one failed
        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
        ])
Example #30
    def test_export_happy_path(self) -> None:
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/US_XX"),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/US_XX"),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        mock_fs = create_autospec(GCSFileSystem)

        mock_fs.exists.return_value = True

        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path("json"),
            export_config_two_staging.output_path("json"),
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path("txt"),
            export_config_two_staging.output_path("txt"),
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        export_views_with_exporters(
            mock_fs,
            [export_config_one, export_config_two],
            {
                ExportOutputFormatType.JSON: delegate_one,
                ExportOutputFormatType.METRIC: delegate_two,
            },
        )

        # Assert all mocks called as expected
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        mock_fs.copy.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.txt"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.txt"),
                ),
            ],
            any_order=True,
        )

        mock_fs.delete.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt")),
            ],
            any_order=True,
        )