Code example #1
 def __init__(
     self,
     region_code: str,
     dry_run: bool,
 ):
     self.region_code = region_code
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     self.dry_run = dry_run
     self.project_id = 'recidiviz-123'
     self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_directory_path_for_region(
             region_code, SystemLevel.STATE, project_id=self.project_id))
     self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             GcsfsDirectIngestFileType.RAW_DATA,
             project_id=self.project_id))
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
         f'{datetime.datetime.now().isoformat()}.txt')
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
Code example #2
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self._get_file_tag_rank_list())

        self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
Code example #3
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Code example #4
 def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
              start_date_bound: Optional[str],
              end_date_bound: Optional[str], dry_run: bool, project_id: str,
              file_filter: Optional[str]):
     self.file_type = file_type
     self.region_code = region_code
     self.start_date_bound = start_date_bound
     self.end_date_bound = end_date_bound
     self.dry_run = dry_run
     self.file_filter = file_filter
     self.project_id = project_id
     self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             self.file_type,
             project_id=self.project_id))
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
         f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
     )
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
Code example #5
    def _split_file(self, path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> None:

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        upload_paths_and_df = []
        for i, df in enumerate(
                pd.read_csv(file_contents_handle.local_file_path,
                            dtype=str,
                            chunksize=self.file_split_line_limit,
                            keep_default_na=False)):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)
            upload_paths_and_df.append((upload_path, df))

        for output_path, df in upload_paths_and_df:
            logging.info("Writing file split [%s] to Cloud Storage.",
                         output_path.abs_path())

            self.fs.upload_from_string(output_path, df.to_csv(index=False),
                                       'text/csv')

        logging.info("Done splitting file [%s] into [%s] paths, returning.",
                     path.abs_path(), len(upload_paths_and_df))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)
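The split above relies on pandas returning an iterator of DataFrame chunks when chunksize is set, one chunk per future split file. A minimal standalone sketch of that pattern, with a hypothetical local CSV path and chunk size standing in for file_contents_handle.local_file_path and self.file_split_line_limit:

import pandas as pd

# Hypothetical file and limit; each df_chunk holds at most `chunksize` rows.
for i, df_chunk in enumerate(
        pd.read_csv('rows.csv', dtype=str, chunksize=2500,
                    keep_default_na=False)):
    print(f'split {i}: {len(df_chunk)} rows')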
Code example #6
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
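The move above rewrites the flat ISO date taken from the normalized file name into the nested year/month/day storage prefix. A standalone sketch of just that conversion, using a hypothetical date string in place of filename_parts_from_path(...).date_str:

from datetime import date

previous_date_format = '2019-01-02'  # hypothetical date_str value
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
# new_date_format == '2019/01/02/'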
Code example #7
 def create_export_manager(self, region):
     metadata_manager = PostgresDirectIngestFileMetadataManager(region.region_code)
     return DirectIngestIngestViewExportManager(
         region=region,
         fs=FakeDirectIngestGCSFileSystem(),
         ingest_directory_path=GcsfsDirectoryPath.from_absolute_path('ingest_bucket'),
         big_query_client=self.mock_client,
         file_metadata_manager=metadata_manager,
         view_collector=_ViewCollector(region, controller_file_tags=['ingest_view']))
Code example #8
File: export_utils.py Project: pnchbck/pulse-data
def gcs_export_directory(bucket_name: str, today: datetime.date,
                         state_code: str) -> GcsfsDirectoryPath:
    """Returns a GCS directory to export files into, of the format:
    gs://{bucket_name}/ingested_state_data/{state_code}/{YYYY}/{MM}/{DD}
    """
    path = GcsfsDirectoryPath.from_bucket_and_blob_name(
        bucket_name=bucket_name,
        blob_name=
        f'ingested_state_data/{state_code}/{today.year:04}/{today.month:02}/{today.day:02}/'
    )
    return cast(GcsfsDirectoryPath, path)
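A minimal usage sketch for gcs_export_directory, with a hypothetical bucket name and state code, showing the path shape the docstring describes:

import datetime

export_dir = gcs_export_directory(
    bucket_name='my-export-bucket',    # hypothetical bucket
    today=datetime.date(2020, 7, 15),
    state_code='US_XX',                # hypothetical state code
)
# Per the docstring, export_dir corresponds to:
# gs://my-export-bucket/ingested_state_data/US_XX/2020/07/15/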
Code example #9
    def _copy_files_for_date(self, subdir_path_str: str):
        dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path_str.rstrip('/'))

        from_path = f'gs://{self.prod_region_storage_dir_path.bucket_name}/{dir_path.relative_path}*'
        to_path = f'gs://{self.staging_region_storage_dir_path.bucket_name}/{dir_path.relative_path}'

        if not self.dry_run:
            gsutil_cp(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.copy_list.append((from_path, to_path))
            if self.copy_progress:
                self.copy_progress.next()
Code example #10
    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code='us_xx', are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(self.fs)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
Code example #11
    def __init__(self, project_id: str, region: str,
                 file_type_to_move: GcsfsDirectIngestFileType,
                 destination_file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool,
                 file_filter: Optional[str]):

        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        if self.file_type_to_move != self.destination_file_type and \
                self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
            raise ValueError(
                'Args file_type_to_move and destination_file_type must match unless the type to move is UNSPECIFIED'
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
Code example #12
    def __init__(self, region_code: str, file_type: GcsfsDirectIngestFileType,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool):
        self.file_type = file_type
        self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id='recidiviz-123'))
        self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE,
                project_id='recidiviz-staging'))
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
Code example #13
    def __init__(self, paths: str, project_id: str, region: str, date: str,
                 dry_run: bool):

        self.paths = paths
        self.project_id = project_id
        self.region = region.lower()
        self.datetime = datetime.datetime.fromisoformat(date)
        self.dry_run = dry_run

        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))

        self.mutex = threading.Lock()
        self.move_progress: Optional[Bar] = None
        self.copies_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}'
            f'_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
Code example #14
    def _move_files_for_date(self, subdir_path_str: str):
        """Function that loops through each subdirectory and moves files in each subdirectory using the from path
        and to path specified."""

        from_dir_path = GcsfsDirectoryPath.from_absolute_path(
            subdir_path_str.rstrip('/'))

        previous_date_format = from_dir_path.relative_path.rstrip('/').split(
            '/')[-1]
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        from_paths = gsutil_ls(f'{subdir_path_str}*.csv')
        for from_path in from_paths:
            file_name = GcsfsFilePath(
                bucket_name=self.region_storage_dir_path.bucket_name,
                blob_name=from_path).file_name

            to_file_path = os.path.join(
                'gs://', self.region_storage_dir_path.bucket_name,
                self.region_code, GcsfsDirectIngestFileType.RAW_DATA.value,
                new_date_format, file_name)

            normalized_to_file_path = to_normalized_processed_file_path_from_normalized_path(
                to_file_path,
                file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

            to_path = normalized_to_file_path

            if not self.dry_run:
                gsutil_mv(from_path=from_path, to_path=to_path)
            with self.mutex:
                self.move_list.append((from_path, to_path))

        if self.move_progress:
            self.move_progress.next()
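The destination URI above is assembled with os.path.join, which works here because 'gs://' already ends in a separator. A sketch with hypothetical values (the bucket name and the stand-in for GcsfsDirectIngestFileType.RAW_DATA.value are assumptions) showing the resulting layout before the final normalization step:

import os

bucket_name = 'recidiviz-123-direct-ingest-state-storage'  # hypothetical
file_type_value = 'raw'  # hypothetical stand-in for GcsfsDirectIngestFileType.RAW_DATA.value
to_file_path = os.path.join(
    'gs://', bucket_name, 'us_xx', file_type_value, '2019/01/02/', 'myfile.csv')
# to_file_path == 'gs://recidiviz-123-direct-ingest-state-storage/us_xx/raw/2019/01/02/myfile.csv'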
Code example #15
class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""
    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code='us_xx', are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(self.fs)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()

    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None)

    def _check_no_temp_files_remain(self):
        for path in self.fs.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f'Expected temp path {path.abs_path()} to be cleaned up')

    def test_get_unprocessed_raw_files_to_import(self):
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_second.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        self.fs.test_add_path(raw_unprocessed)
        self.fs.test_add_path(ingest_view_unprocessed)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='this_path_tag_not_in_yaml.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_unspecified_type_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_feature_not_released_throws(self):
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code='us_xx',
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_raw_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagC.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))

        path = one(self.fs.uploaded_test_path_to_actual.keys())
        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagC',
            destination_table_schema=[
                bigquery.SchemaField('COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))

        path = one(self.fs.uploaded_test_path_to_actual.keys())
        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagPipeSeparatedNonUTF8',
            destination_table_schema=[
                bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self):

        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME',
                                         'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self):

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME',
                                         'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
Code example #16
    def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        subdir_path = \
            path_for_fixture_file(controller, f'subdir/',
                                  should_normalize=False)
        paths = [
            subdir_path,
            path_for_fixture_file(controller,
                                  f'subdir/Unexpected_Tag.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  f'tagA.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  f'tagB.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  f'tagC.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  f'subdir/tagC_2.csv',
                                  should_normalize=False),
        ]

        for path in paths:
            controller.fs.test_add_path(path)

        run_task_queues_to_empty(controller)

        dir_paths_found = []
        storage_file_paths = []
        ingest_file_paths = []

        for path in controller.fs.all_paths:
            if isinstance(path, GcsfsDirectoryPath):
                dir_paths_found.append(path)
                continue

            if path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()):
                storage_file_paths.append(path)
            else:
                self.assertTrue(path.abs_path().startswith(
                    controller.ingest_directory_path.abs_path()))
                ingest_file_paths.append(path)

        self.assertEqual(1, len(dir_paths_found))
        self.assertEqual(subdir_path, dir_paths_found[0])

        self.assertEqual(3, len(storage_file_paths))
        storage_tags = {
            filename_parts_from_path(path).file_tag
            for path in storage_file_paths
        }
        self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)

        for path in storage_file_paths:
            self.assertTrue(controller.fs.is_normalized_file_path(path))
            self.assertTrue(controller.fs.is_processed_file(path))

        self.assertEqual(2, len(ingest_file_paths))
        ingest_tags = {
            filename_parts_from_path(path).file_tag
            for path in ingest_file_paths
        }
        self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)

        for path in ingest_file_paths:
            self.assertTrue(controller.fs.is_normalized_file_path(path))
            self.assertTrue(controller.fs.is_seen_unprocessed_file(path))
            self.assertEqual(subdir_path,
                             GcsfsDirectoryPath.from_file_path(path))
Code example #17
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata,
                                  DirectIngestIngestFileMetadata):
                    raise ValueError(
                        'Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())
            self.fs.mv(split_contents_path, upload_path)

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(
                        f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]'
                    )

                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
Code example #18
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    'Threw error while copying split files from temp bucket - attempting to clean up before rethrowing.'
                    ' [%s]', e)
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')

            logging.info(
                'Registering [%s] split files with the metadata manager.',
                len(upload_paths))

            for upload_path in upload_paths:
                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
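Both versions of _split_file_if_necessary above return True when the file was split and False when no split was needed. A hedged sketch of how a caller might branch on that contract; everything here other than _split_file_if_necessary (the function name, the path iterable, and the follow-up call) is an illustrative assumption, not the project's actual controller code:

def process_unprocessed_paths(controller, unprocessed_paths):
    for path in unprocessed_paths:
        if controller._split_file_if_necessary(path):
            # The original file was split and moved to storage; its split parts
            # will be picked up as new unprocessed files on a later pass.
            continue
        controller.run_ingest_job(path)  # hypothetical next step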
Code example #19
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    _DAY_1_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=7789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(year=2019,
                                      month=1,
                                      day=2,
                                      hour=10,
                                      minute=4,
                                      second=5,
                                      microsecond=678,
                                      tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(year=2019,
                                      month=1,
                                      day=3,
                                      hour=3,
                                      minute=4,
                                      second=5,
                                      microsecond=6789,
                                      tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def _normalized_path_for_filename(self, filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                             filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename('tagA.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):

        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename('tagB.csv',
                                                  self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a gap
        in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv', self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
Code example #20
class TestFakeDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')

    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()

    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        self.fs.test_add_path(path)

        start_num_total_files = len(self.fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.
                        INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                        '',
                                                        file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                         '',
                                                         file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.all_paths), expected_final_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) +
                len(end_ingest_view_storage_paths), len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {
                    p.abs_path()
                    for p in start_storage_paths
            }:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self):
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)

    def test_direct_ingest_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file_2.csv'),
                                file_type_differentiation_on=True)

    def test_move_to_storage_with_conflict_with_file_types(self):
        dt = datetime.datetime.now()
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
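
The two conflict tests above expect a second upload of test_file.csv to land in storage as test_file-(1).csv. The snippet below is a minimal, self-contained sketch of that de-duplication convention; next_available_name is a hypothetical helper, not the actual renaming logic inside DirectIngestGCSFileSystem.

import os
from typing import Set


def next_available_name(file_name: str, existing: Set[str]) -> str:
    # Illustrative only: append "-(N)" before the extension until the name is
    # free, mirroring the "test_file-(1).csv" pattern the conflict tests check.
    if file_name not in existing:
        return file_name
    base, ext = os.path.splitext(file_name)
    i = 1
    while f'{base}-({i}){ext}' in existing:
        i += 1
    return f'{base}-({i}){ext}'


# A second copy of test_file.csv gets the "-(1)" suffix.
assert next_available_name('test_file.csv', {'test_file.csv'}) == 'test_file-(1).csv'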
Code example #21
0
class TestDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')

    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def fully_process_file(self, test_fs: FakeDirectIngestGCSFileSystem,
                           dt: datetime.datetime, path: GcsfsFilePath):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""

        test_fs.test_add_path(path)

        start_num_total_files = len(test_fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        start_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        # File is renamed to normalized path
        test_fs.mv_path_to_normalized_path(path, dt)

        unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 1)
        self.assertTrue(test_fs.is_seen_unprocessed_file(unprocessed[0]))

        # ... file is processed

        # File is moved to processed path
        test_fs.mv_path_to_processed_path(unprocessed[0])
        processed = test_fs.get_processed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(processed), 1)
        self.assertTrue(test_fs.is_processed_file(processed[0]))

        unprocessed = test_fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        test_fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            dt.date().isoformat(),
            include_bound=True)

        end_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        end_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        self.assertEqual(len(test_fs.all_paths), start_num_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths), len(start_storage_paths) + 1)

        for sp in end_storage_paths:
            if sp.abs_path() not in \
                    {p.abs_path() for p in start_storage_paths}:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                _, storage_file_name = \
                    os.path.split(sp.abs_path())
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        dt = datetime.datetime.now()
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = test_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
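
Both versions of fully_process_file exercise a FakeDirectIngestGCSFileSystem rather than real GCS. The sketch below is a rough, hypothetical stand-in for that kind of in-memory test double, covering only the add / move / list behavior the assertions above rely on; the real fake has a much richer API (mv_path_to_normalized_path, get_unprocessed_file_paths, and so on).

from typing import List


class InMemoryBucketSketch:
    """Illustrative only: an in-memory bucket that models moves as delete + add,
    roughly how a copy-then-delete behaves on GCS."""

    def __init__(self) -> None:
        self.all_paths: List[str] = []

    def test_add_path(self, path: str) -> None:
        # New files appear in the ingest bucket.
        self.all_paths.append(path)

    def mv(self, src: str, dst: str) -> None:
        self.all_paths.remove(src)
        self.all_paths.append(dst)

    def ls_with_prefix(self, prefix: str) -> List[str]:
        return [p for p in self.all_paths if p.startswith(prefix)]


fs = InMemoryBucketSketch()
fs.test_add_path('my_bucket/test_file.csv')
fs.mv('my_bucket/test_file.csv', 'my_bucket/processed_test_file.csv')
fs.mv('my_bucket/processed_test_file.csv',
      'storage_bucket/region_subdir/processed_test_file.csv')
assert fs.ls_with_prefix('my_bucket/') == []
assert len(fs.ls_with_prefix('storage_bucket/region_subdir/')) == 1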
Code example #22
0
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks whether the given file needs to be split according to this
        controller's |file_split_line_limit| and, if so, performs the split.

        Returns True if the file was split, False otherwise.
        """
        parts = filename_parts_from_path(path)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
                parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(f'Should not be attempting to split files other than ingest view files, found path with '
                             f'file type: {parts.file_type}')

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info("File tag [%s] for path [%s] not in rank list - "
                         "not splitting.",
                         parts.file_tag,
                         path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        split_contents_handles = self._split_file(path, file_contents_handle)

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)
        for i, split_contents_handle in enumerate(split_contents_handles):
            upload_path = self._create_split_file_path(path, output_dir, split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
                    raise ValueError('Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(original_metadata,
                                                                                             upload_path)
            logging.info("Writing file split [%s] to Cloud Storage.", upload_path.abs_path())
            self.fs.upload_from_contents_handle(upload_path, split_contents_handle, self._contents_type())

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]')

                self.file_metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info("Done splitting file [%s] into [%s] paths, moving it to storage.",
                     path.abs_path(), len(split_contents_handles))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
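
The splitting work itself is delegated to self._split_file above; conceptually it amounts to chunking a CSV's data rows so that no chunk exceeds file_split_line_limit while the header row is repeated in every chunk. The sketch below shows that idea on plain text and is only an assumption about the shape of the logic, not the controller's actual _split_file.

import csv
import io
from typing import Iterator, List


def split_csv_text(csv_text: str, line_limit: int) -> Iterator[str]:
    # Illustrative only: yield CSV chunks whose data-row count is <= line_limit,
    # copying the header row into every chunk.
    rows = list(csv.reader(io.StringIO(csv_text)))
    header, data = rows[0], rows[1:]
    for start in range(0, len(data), line_limit):
        out = io.StringIO()
        writer = csv.writer(out)
        writer.writerow(header)
        writer.writerows(data[start:start + line_limit])
        yield out.getvalue()


chunks: List[str] = list(split_csv_text('id,name\n1,a\n2,b\n3,c\n', line_limit=2))
assert len(chunks) == 2  # 3 data rows with a limit of 2 -> two split files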