Example #1
    def __init__(
        self,
        *,
        region: Region,
        fs: DirectIngestGCSFileSystem,
        ingest_bucket_path: GcsfsBucketPath,
        temp_output_directory_path: GcsfsDirectoryPath,
        big_query_client: BigQueryClient,
        region_raw_file_config: Optional[
            DirectIngestRegionRawFileConfig] = None,
        upload_chunk_size: int = _DEFAULT_BQ_UPLOAD_CHUNK_SIZE,
    ):

        self.region = region
        self.fs = fs
        self.ingest_bucket_path = ingest_bucket_path
        self.temp_output_directory_path = temp_output_directory_path
        self.big_query_client = big_query_client
        self.region_raw_file_config = (
            region_raw_file_config
            if region_raw_file_config else DirectIngestRegionRawFileConfig(
                region_code=self.region.region_code,
                region_module=self.region.region_module,
            ))
        self.upload_chunk_size = upload_chunk_size
        self.csv_reader = GcsfsCsvReader(fs)
        self.raw_table_migrations = DirectIngestRawTableMigrationCollector(
            region_code=self.region.region_code,
            regions_module_override=self.region.region_module,
        ).collect_raw_table_migration_queries()
Example #2
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str],
                 storage_directory_path: Optional[str],
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level, ingest_directory_path,
                         storage_directory_path, max_delay_sec_between_files)
        self.csv_reader = GcsfsCsvReader(
            gcsfs.GCSFileSystem(project=metadata.project_id(),
                                cache_timeout=GCSFS_NO_CACHING))
Example #3
    def _cache_ingest_file_as_parquet_task() -> Tuple[str, int]:
        """Downloads a GCS file and stores it to our Redis cache in Parquet format

         Example:
             POST /admin/data_discovery/cache_ingest_file_as_parquet_task
         Request Body:
             gcs_file_uri: (string) The `gs://` URI of the file to cache
             file_encoding: (string) The encoding of said file
             file_separator: (string) The value delimiter of side file
             file_quoting: (int) A `csv.QUOTE_*` value for the parser i.e. 3 (csv.QUOTE_NONE)
        Args:
             N/A
         Returns:
             Cache hit/miss result
        """
        cache = get_data_discovery_cache()
        body = get_cloud_task_json_body()
        path = GcsfsFilePath.from_absolute_path(body["gcs_file_uri"])
        parquet_path = SingleIngestFileParquetCache.parquet_cache_key(path)

        if not cache.exists(parquet_path):
            fs = GcsfsFactory.build()
            parquet_cache = SingleIngestFileParquetCache(
                get_data_discovery_cache(), path, expiry=DataDiscoveryTTL.PARQUET_FILES
            )
            csv_reader = GcsfsCsvReader(fs)
            csv_reader.streaming_read(
                path,
                CacheIngestFileAsParquetDelegate(parquet_cache, path),
                encodings_to_try=list(
                    {
                        body["file_encoding"],
                        *COMMON_RAW_FILE_ENCODINGS,
                    }
                ),
                delimiter=body["file_separator"],
                quoting=body["file_quoting"],
                lineterminator=body.get("file_custom_line_terminator"),
                chunk_size=75000,
                index_col=False,
                keep_default_na=False,
            )

            return CACHE_MISS, HTTPStatus.CREATED

        return CACHE_HIT, HTTPStatus.OK
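
Based on the docstring above, the JSON body for this task might look roughly like the following sketch; only the field names come from the handler itself, and every value shown is an illustrative assumption.

# Hypothetical request body for cache_ingest_file_as_parquet_task; all values
# below are placeholders chosen for illustration.
example_request_body = {
    "gcs_file_uri": "gs://my-bucket/raw_file.csv",
    "file_encoding": "utf-8",
    "file_separator": ",",
    "file_quoting": 3,  # csv.QUOTE_NONE
    # Optional; only needed when the file uses a non-standard line terminator.
    "file_custom_line_terminator": None,
}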
Example #4
    def run_single_line_gcs_csv_reader_test(
        self,
        input_file_path: str,
        expected_result_path: str,
        encoding: str,
        delimiter: str,
        line_terminator: str,
    ) -> None:
        """Runs a test reads a single line of a normalized stream using the csv reader,
        mimicking the way we read the columns from each file.
        """
        fake_fs = FakeGCSFileSystem()
        input_gcs_path = GcsfsFilePath.from_absolute_path(
            "gs://my-bucket/input.csv")
        fake_fs.test_add_path(path=input_gcs_path, local_path=input_file_path)
        input_delegate = _FakeDfCapturingDelegate()
        csv_reader = GcsfsCsvReader(fake_fs)
        csv_reader.streaming_read(
            path=input_gcs_path,
            dtype=str,
            delegate=input_delegate,
            chunk_size=1,
            encodings_to_try=[encoding],
            nrows=1,
            sep=delimiter,
            quoting=csv.QUOTE_NONE,
            lineterminator=line_terminator,
            engine="python",
        )

        expected_gcs_path = GcsfsFilePath.from_absolute_path(
            "gs://my-bucket/expected.csv")
        fake_fs.test_add_path(path=expected_gcs_path,
                              local_path=expected_result_path)
        expected_delegate = _FakeDfCapturingDelegate()
        csv_reader.streaming_read(
            path=expected_gcs_path,
            delegate=expected_delegate,
            dtype=str,
            chunk_size=1,
            nrows=1,
        )

        self.assertEqual(len(expected_delegate.dfs), len(input_delegate.dfs))
        for i, expected_df in enumerate(expected_delegate.dfs):
            self.assertTrue(expected_df.equals(input_delegate.dfs[i]))
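
For context, a concrete test case might call this helper roughly as in the sketch below; the fixture paths and parameter values are illustrative assumptions, not taken from the original test suite.

    def test_read_single_line(self) -> None:
        # Hypothetical usage of the helper above; fixture paths, encoding,
        # delimiter, and line terminator are placeholder values.
        self.run_single_line_gcs_csv_reader_test(
            input_file_path="path/to/input_fixture.csv",
            expected_result_path="path/to/expected_fixture.csv",
            encoding="utf-8",
            delimiter=",",
            line_terminator="\n",
        )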
Example #5
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(region_code="us_xx",
                                       region_module=fake_regions_module)

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_bucket_path = GcsfsBucketPath(
            bucket_name="my_ingest_bucket")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx", region_module=fake_regions_module)

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = GcsfsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
Example #6
    def __init__(self,
                 *,
                 region: Region,
                 fs: DirectIngestGCSFileSystem,
                 ingest_directory_path: GcsfsDirectoryPath,
                 temp_output_directory_path: GcsfsDirectoryPath,
                 big_query_client: BigQueryClient,
                 region_raw_file_config: Optional[
                     DirectIngestRegionRawFileConfig] = None,
                 upload_chunk_size: int = _DEFAULT_BQ_UPLOAD_CHUNK_SIZE):

        self.region = region
        self.fs = fs
        self.ingest_directory_path = ingest_directory_path
        self.temp_output_directory_path = temp_output_directory_path
        self.big_query_client = big_query_client
        self.region_raw_file_config = region_raw_file_config \
            if region_raw_file_config else DirectIngestRegionRawFileConfig(region_code=self.region.region_code)
        self.upload_chunk_size = upload_chunk_size
        self.csv_reader = GcsfsCsvReader(
            gcsfs.GCSFileSystem(project=metadata.project_id(),
                                cache_timeout=GCSFS_NO_CACHING))
Example #7
    def setUp(self) -> None:

        self.mock_gcsfs = create_autospec(gcsfs.GCSFileSystem)
        self.mock_gcsfs.open = _fake_gcsfs_open
        self.reader = GcsfsCsvReader(self.mock_gcsfs)
Example #8
def build_gcsfs_controller_for_tests(
    controller_cls: Type[CsvGcsfsDirectIngestController],
    ingest_instance: DirectIngestInstance,
    run_async: bool,
    can_start_ingest: bool = True,
    regions_module: ModuleType = fake_regions_module,
) -> BaseDirectIngestController:
    """Builds an instance of |controller_cls| for use in tests with several internal classes mocked properly. """
    fake_fs = FakeGCSFileSystem()

    def mock_build_fs() -> FakeGCSFileSystem:
        return fake_fs

    if "TestGcsfsDirectIngestController" in controller_cls.__name__:
        view_collector_cls: Type[
            BigQueryViewCollector] = FakeDirectIngestPreProcessedIngestViewCollector
    else:
        view_collector_cls = DirectIngestPreProcessedIngestViewCollector

    with patch(
            f"{BaseDirectIngestController.__module__}.DirectIngestCloudTaskManagerImpl"
    ) as mock_task_factory_cls:
        with patch(
                f"{BaseDirectIngestController.__module__}.BigQueryClientImpl"
        ) as mock_big_query_client_cls:
            with patch(
                    f"{BaseDirectIngestController.__module__}.DirectIngestRawFileImportManager",
                    FakeDirectIngestRawFileImportManager,
            ):
                with patch(
                        f"{BaseDirectIngestController.__module__}.DirectIngestPreProcessedIngestViewCollector",
                        view_collector_cls,
                ):
                    task_manager = (
                        FakeAsyncDirectIngestCloudTaskManager() if run_async
                        else FakeSynchronousDirectIngestCloudTaskManager())
                    mock_task_factory_cls.return_value = task_manager
                    mock_big_query_client_cls.return_value = (
                        FakeDirectIngestBigQueryClient(
                            project_id=metadata.project_id(),
                            fs=fake_fs,
                            region_code=controller_cls.region_code(),
                        ))
                    with patch.object(GcsfsFactory, "build",
                                      new=mock_build_fs):
                        with patch.object(
                                direct_ingest_raw_table_migration_collector,
                                "regions",
                                new=regions_module,
                        ):
                            controller = controller_cls(
                                ingest_bucket_path=
                                gcsfs_direct_ingest_bucket_for_region(
                                    region_code=controller_cls.region_code(),
                                    system_level=SystemLevel.for_region_code(
                                        controller_cls.region_code(),
                                        is_direct_ingest=True,
                                    ),
                                    ingest_instance=ingest_instance,
                                    project_id="recidiviz-xxx",
                                ))
                            controller.csv_reader = GcsfsCsvReader(fake_fs)
                            controller.raw_file_import_manager.csv_reader = (
                                controller.csv_reader)

                            task_manager.set_controller(controller)
                            fake_fs.test_set_delegate(
                                DirectIngestFakeGCSFileSystemDelegate(
                                    controller,
                                    can_start_ingest=can_start_ingest))
                            return controller
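
A test might obtain a fully mocked controller from this builder roughly as sketched below; the controller class name and the DirectIngestInstance value are placeholders, not taken from the source.

# Hypothetical usage of build_gcsfs_controller_for_tests; UsXxController and
# DirectIngestInstance.PRIMARY stand in for whatever the concrete test uses.
controller = build_gcsfs_controller_for_tests(
    UsXxController,
    ingest_instance=DirectIngestInstance.PRIMARY,
    run_async=False,
)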
Example #9
    def __init__(self, ingest_bucket_path: GcsfsBucketPath):
        super().__init__(ingest_bucket_path)
        self.csv_reader = GcsfsCsvReader(GcsfsFactory.build())
Example #10
    def setUp(self) -> None:
        self.fake_gcs = FakeGCSFileSystem()
        self.reader = GcsfsCsvReader(self.fake_gcs)
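
A test building on this setUp could exercise the reader roughly as in the sketch below; the uploaded file contents, the upload_from_string call, and the _FakeDfCapturingDelegate class (borrowed from Example #4) are assumptions for illustration.

    def test_streaming_read_single_chunk(self) -> None:
        # Hypothetical test body; the file contents are placeholders and the
        # delegate class is assumed to be the one shown in Example #4.
        path = GcsfsFilePath.from_absolute_path("gs://my-bucket/data.csv")
        self.fake_gcs.upload_from_string(path, "col_a,col_b\n1,2\n", "text/csv")
        delegate = _FakeDfCapturingDelegate()
        self.reader.streaming_read(path, delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.dfs))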