    def test_read_completely_empty_file(self) -> None:
        empty_file_path = fixtures.as_filepath("tagA.csv")

        delegate = _TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=1,
        )
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = _TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=10,
        )
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def test_load_files_from_storage(self, mock_gcs_factory: MagicMock) -> None:
        """Test that load_files_from_storage returns files for the current batch and bucket name"""
        bucket_name = "bucket-name"
        self.mock_utils.get_email_content_bucket_name.return_value = bucket_name

        email_path = GcsfsFilePath.from_absolute_path(
            f"gs://{bucket_name}/{self.state_code}/{self.batch_id}/{self.to_address}.html"
        )
        other_path = GcsfsFilePath.from_absolute_path(
            f"gs://{bucket_name}/excluded/exclude.json"
        )

        fake_gcs_file_system = FakeGCSFileSystem()
        fake_gcs_file_system.upload_from_string(
            path=email_path, contents="<html>", content_type="text/html"
        )
        fake_gcs_file_system.upload_from_string(
            path=other_path, contents="{}", content_type="text/json"
        )

        mock_gcs_factory.return_value = fake_gcs_file_system

        files = email_delivery.load_files_from_storage(
            bucket_name, f"{self.state_code}/{self.batch_id}"
        )

        self.assertEqual(files, {f"{self.to_address}": "<html>"})
    def update_data_freshness_results(self) -> None:
        """Refreshes information in the metadata store about freshness of ingested data
        for all states."""
        bq_export_config = CloudSqlToBQConfig.for_schema_type(
            SchemaType.STATE,
            yaml_path=GcsfsFilePath.from_absolute_path(
                f"gs://{self.project_id}-configs/cloud_sql_to_bq_config.yaml"
            ),
        )
        if bq_export_config is None:
            raise ValueError("STATE CloudSqlToBQConfig unexpectedly None.")

        regions_paused = bq_export_config.region_codes_to_exclude

        latest_upper_bounds_path = GcsfsFilePath.from_absolute_path(
            f"gs://{self.project_id}-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
        )
        latest_upper_bounds_json = self.gcs_fs.download_as_string(
            latest_upper_bounds_path
        )
        latest_upper_bounds = []

        for line in latest_upper_bounds_json.splitlines():
            line = line.strip()
            if not line:
                continue
            struct = json.loads(line)
            latest_upper_bounds.append(
                {
                    "state": struct["state_code"],
                    "date": struct.get("processed_date"),
                    "ingestPaused": struct["state_code"] in regions_paused,
                }
            )
        self.data_freshness_results = latest_upper_bounds
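
The loop above is plain newline-delimited JSON handling from the standard library. A minimal, self-contained sketch of the same pattern, with the GCS download replaced by an inline string and an assumed paused-regions value:

import json

# Stand-in for the string returned by gcs_fs.download_as_string(latest_upper_bounds_path).
latest_upper_bounds_json = """
{"state_code": "US_PA", "processed_date": "2020-11-25"}

{"state_code": "US_ND", "processed_date": "2020-12-16"}
"""
regions_paused = {"US_ND"}  # assumed value of bq_export_config.region_codes_to_exclude

latest_upper_bounds = []
for line in latest_upper_bounds_json.splitlines():
    line = line.strip()
    if not line:  # skip blank lines instead of letting json.loads fail
        continue
    struct = json.loads(line)
    latest_upper_bounds.append(
        {
            "state": struct["state_code"],
            "date": struct.get("processed_date"),
            "ingestPaused": struct["state_code"] in regions_paused,
        }
    )

print(latest_upper_bounds)  # two entries; only US_ND has ingestPaused=True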
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
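
The only date manipulation in this helper is turning the ISO date embedded in the normalized file name into a `YYYY/MM/DD/` storage subdirectory. A standalone sketch of that conversion, using a made-up date string in place of `filename_parts_from_path(...).date_str`:

from datetime import date

previous_date_format = "2021-01-11"  # assumed output of filename_parts_from_path(...).date_str
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
assert new_date_format == "2021/01/11/"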
Example #5
    def test_is_task_queued_has_tasks(self):
        # Arrange
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        full_task_name = \
            _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name',
            task_names=[
                'projects/path/to/random_task',
                f'projects/path/to/{full_task_name}'
            ])
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        # Act
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertTrue(gcsfs_args_queued)
Example #6
    def test_read_file_with_columns_no_contents(self):
        empty_file_path = fixtures.as_filepath('tagB.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(empty_file_path),
            delegate=delegate,
            chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0],
                         delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def setUp(self) -> None:
        self.gcs_factory_patcher = mock.patch(
            "recidiviz.admin_panel.dataset_metadata_store.GcsfsFactory.build")

        fake_gcs = FakeGCSFileSystem()
        fake_gcs.upload_from_string(
            path=GcsfsFilePath.from_absolute_path(
                "gs://recidiviz-456-configs/cloud_sql_to_bq_config.yaml"),
            contents="""
region_codes_to_exclude:
  - US_ND
state_history_tables_to_include:
  - state_person_history
county_columns_to_exclude:
  person:
    - full_name
    - birthdate_inferred_from_age
""",
            content_type="text/yaml",
        )
        fake_gcs.upload_from_string(
            path=GcsfsFilePath.from_absolute_path(
                "gs://recidiviz-456-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
            ),
            contents="""
{"state_code":"US_PA","processed_date":"2020-11-25"}
{"state_code":"US_ID","processed_date":"2021-01-04"}
{"state_code":"US_MO","processed_date":"2020-12-21"}
{"state_code":"US_ND","processed_date":"2020-12-16"}
""",
            content_type="text/text",
        )

        fixture_folder = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "fixtures",
        )
        self.table_column_map: Dict[str, List[str]] = defaultdict(list)
        for f in os.listdir(fixture_folder):
            _, table, col = f.split("__")
            self.table_column_map[table].append(col[:-len(".json")])
            path = GcsfsFilePath.from_absolute_path(
                f"gs://recidiviz-456-ingest-metadata/{f}")
            fake_gcs.test_add_path(path,
                                   local_path=os.path.join(fixture_folder, f))

        self.gcs_factory_patcher.start().return_value = fake_gcs
        self.store = DatasetMetadataCountsStore(
            dataset_nickname="ingest",
            metadata_file_prefix="ingest_state_metadata",
            override_project_id="recidiviz-456",
        )
        self.store.recalculate_store()
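
The fixture loop above assumes metadata file names of the form `<prefix>__<table>__<column>.json`. A tiny sketch of that split, with invented file names:

from collections import defaultdict
from typing import Dict, List

fixture_files = [
    "ingest_state_metadata__state_person__state_code.json",
    "ingest_state_metadata__state_person__full_name.json",
]

table_column_map: Dict[str, List[str]] = defaultdict(list)
for f in fixture_files:
    _, table, col = f.split("__")
    table_column_map[table].append(col[: -len(".json")])  # drop the extension

assert table_column_map == {"state_person": ["state_code", "full_name"]}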
def _retrieve_data_for_po_monthly_report(
    state_code: StateCode, batch_id: str
) -> List[Recipient]:
    """Retrieves the data if the report type is POMonthlyReport."""
    data_bucket = utils.get_data_storage_bucket_name()
    data_filename = ""
    gcs_file_system = GcsfsFactory.build()
    try:
        data_filename = utils.get_data_filename(state_code, ReportType.POMonthlyReport)
        path = GcsfsFilePath.from_absolute_path(f"gs://{data_bucket}/{data_filename}")
        file_contents = gcs_file_system.download_as_string(path)
    except BaseException:
        logging.info("Unable to load data file %s/%s", data_bucket, data_filename)
        raise

    archive_bucket = utils.get_data_archive_bucket_name()
    archive_filename = ""
    try:
        archive_filename = utils.get_data_archive_filename(batch_id, state_code)
        archive_path = GcsfsFilePath.from_absolute_path(
            f"gs://{archive_bucket}/{archive_filename}"
        )
        gcs_file_system.upload_from_string(
            path=archive_path, contents=file_contents, content_type="text/json"
        )
    except Exception:
        logging.error(
            "Unable to archive the data file to %s/%s", archive_bucket, archive_filename
        )
        raise

    json_list = file_contents.splitlines()

    recipient_data: List[dict] = []
    for json_str in json_list:
        try:
            item = json.loads(json_str)
        except Exception as err:
            logging.error(
                "Unable to parse JSON found in the file %s. Offending json string is: '%s'. <%s> %s",
                data_filename,
                json_str,
                type(err).__name__,
                err,
            )
        else:
            if email := item.get("email_address"):
                mismatches = _get_mismatch_data_for_officer(email)
                if mismatches is not None:
                    item["mismatches"] = mismatches
            recipient_data.append(item)
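
The per-line parsing at the end tolerates malformed rows and only enriches items that carry an email address. A self-contained sketch of that try/except/else plus walrus-operator pattern, with invented rows and a placeholder standing in for _get_mismatch_data_for_officer:

import json
import logging

json_list = [
    '{"email_address": "po1@example.com", "officer_given_name": "Alex"}',
    "not valid json",
    '{"officer_given_name": "Blake"}',  # no email_address key
]

recipient_data = []
for json_str in json_list:
    try:
        item = json.loads(json_str)
    except Exception as err:
        # Malformed lines are logged and skipped; the rest of the batch still goes out.
        logging.error("Unable to parse JSON: '%s' (%s)", json_str, err)
    else:
        if email := item.get("email_address"):
            item["mismatches"] = {"note": f"placeholder mismatch data for {email}"}
        recipient_data.append(item)

assert len(recipient_data) == 2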
Example #9
    def test_metadata_added(self) -> None:
        """Tests that the metadata.json file is correctly added."""
        with open(
            os.path.join(
                f"{os.path.dirname(__file__)}/context/po_monthly_report", FIXTURE_FILE
            )
        ) as fixture_file:
            # Remove newlines
            self._write_test_data(json.dumps(json.loads(fixture_file.read())))

        result = start(
            batch_id="fake-batch-id",
            state_code=StateCode.US_ID,
            report_type=ReportType.POMonthlyReport,
            region_code="US_ID_D3",
        )
        self.assertEqual(len(result.successes), 1)

        # Test that metadata file is created correctly
        metadata_file = self.gcs_file_system.download_as_string(
            GcsfsFilePath.from_absolute_path(
                "gs://recidiviz-test-report-html/US_ID/fake-batch-id/metadata.json"
            )
        )
        self.assertEqual(
            json.loads(metadata_file),
            {
                "report_type": ReportType.POMonthlyReport.value,
                "review_month": "5",
                "review_year": "2021",
            },
        )

        # Try again for Top Opps email
        result = start(
            batch_id="fake-batch-id-2",
            state_code=StateCode.US_ID,
            report_type=ReportType.TopOpportunities,
            region_code="US_ID_D3",
        )

        metadata_file = self.gcs_file_system.download_as_string(
            GcsfsFilePath.from_absolute_path(
                "gs://recidiviz-test-report-html/US_ID/fake-batch-id-2/metadata.json"
            )
        )
        self.assertEqual(
            json.loads(metadata_file),
            {"report_type": ReportType.TopOpportunities.value},
        )
    def mock_import_raw_file_to_big_query(
        self,
        *,
        source_uri: str,
        destination_table_schema: List[bigquery.SchemaField],
        **_kwargs: Any,
    ) -> mock.MagicMock:
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.gcs_file_system.real_absolute_path_for_path(
            temp_path)

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            "Did not strip white space from raw data cell")

                if cell in col_names:
                    raise ValueError(
                        f"Wrote column row to output file: {value}")
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()
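
The validation in this mock needs nothing beyond pandas; here is the same check run against an in-memory CSV, with invented column names and rows:

import io

import pandas as pd

col_names = ["person_id", "first_name"]  # as if taken from destination_table_schema
csv_contents = "123,Alex\n456,Blake\n"   # header-less raw file, as in the mock

df = pd.read_csv(io.StringIO(csv_contents), header=None, dtype=str)
for value in df.values:
    for cell in value:
        if isinstance(cell, str) and cell.strip() != cell:
            raise ValueError("Did not strip white space from raw data cell")
        if cell in col_names:
            raise ValueError(f"Wrote column row to output file: {value}")

print(len(df), "rows passed validation")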
Example #11
def _run_gcs_imports() -> Tuple[str, HTTPStatus]:
    """Exposes an endpoint to trigger standard GCS imports."""
    body = get_cloud_task_json_body()
    filename = body.get("filename")
    if not filename:
        return "Must include `filename` in the json payload", HTTPStatus.BAD_REQUEST
    for builder in CASE_TRIAGE_EXPORTED_VIEW_BUILDERS:
        if f"{builder.view_id}.csv" != filename:
            continue

        csv_path = GcsfsFilePath.from_absolute_path(
            os.path.join(
                CASE_TRIAGE_VIEWS_OUTPUT_DIRECTORY_URI.format(
                    project_id=metadata.project_id()),
                filename,
            ))

        import_gcs_csv_to_cloud_sql(
            SchemaType.CASE_TRIAGE,
            builder.view_id,
            csv_path,
            builder.columns,
            seconds_to_wait=180,
        )
        logging.info("View (%s) successfully imported", builder.view_id)

    return "", HTTPStatus.OK
Example #12
    def __init__(self) -> None:
        prefix = "" if not in_gcp() else f"{project_id()}-"
        self.allowlist_path = GcsfsFilePath.from_absolute_path(
            f"{prefix}case-triage-data/allowlist_v2.json"
        )
        self.allowed_users: List[str] = []
        self.admin_users: List[str] = []
Example #13
    def setUp(self) -> None:
        self.redis = fakeredis.FakeRedis()
        path = GcsfsFilePath.from_absolute_path(
            "gs://test_bucket/us_id/ingest_view/2021/01/11/unprocessed_2021-01-11T00:00:00:000000_ingest_view_test_view.csv"
        )
        self.cache = mock.MagicMock()
        self.delegate = CacheIngestFileAsParquetDelegate(self.cache, path)
Example #14
    def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
        """Returns the paths to upload along with the timestamp each should be
        normalized with. Skips any files whose extension is not supported."""
        path_candidates = []
        for path, timestamp in self.paths_with_timestamps:
            if self.gcsfs.is_dir(path):
                directory = GcsfsDirectoryPath.from_absolute_path(path)
                files_in_directory = self.gcsfs.ls_with_blob_prefix(
                    bucket_name=directory.bucket_name,
                    blob_prefix=directory.relative_path,
                )
                for file in files_in_directory:
                    if self._is_supported_extension(file.abs_path()):
                        path_candidates.append((file.abs_path(), timestamp))
                    else:
                        self.skipped_files.append(file.abs_path())
            elif self.gcsfs.is_file(path):
                file = GcsfsFilePath.from_absolute_path(path)
                if self._is_supported_extension(file.abs_path()):
                    path_candidates.append((file.abs_path(), timestamp))
                else:
                    self.skipped_files.append(file.abs_path())
            else:
                logging.warning(
                    "Could not identify %s as a directory or a file in %s. Skipping",
                    path,
                    self.destination_ingest_bucket.uri(),
                )
                self.unable_to_upload_files.append(path)
                continue
        return path_candidates
Example #15
def _to_normalized_file_path_from_normalized_path(
    original_normalized_file_path: str,
    build_function: Callable,
    file_type_override: Optional[GcsfsDirectIngestFileType] = None,
) -> str:
    """Moves any normalized path back to a unprocessed/processed path with the same information embedded in the file
    name. If |file_type_override| is provided, we will always overwrite the original path file type with the override
    file type."""

    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime(
        "%Y-%m-%dT%H:%M:%S:%f")

    suffix_str = f"_{parts.filename_suffix}" if parts.filename_suffix else ""
    base_file_name = f"{parts.file_tag}{suffix_str}"

    path_to_return = build_function(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension,
    )

    return os.path.join(directory, path_to_return)
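
The normalized-name round trip relies on the `%Y-%m-%dT%H:%M:%S:%f` timestamp format and an os.path re-join. A short standalone sketch with made-up values; the real file-name layout is produced by the injected build_function:

import datetime
import os

utc_upload_datetime = datetime.datetime(2021, 1, 11, 0, 0, 0)
utc_iso_timestamp_str = utc_upload_datetime.strftime("%Y-%m-%dT%H:%M:%S:%f")
assert utc_iso_timestamp_str == "2021-01-11T00:00:00:000000"

directory, _ = os.path.split("gs://bucket/raw/old_name.csv")
new_name = f"unprocessed_{utc_iso_timestamp_str}_my_tag.csv"  # illustrative layout only
print(os.path.join(directory, new_name))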
    def set_config_yaml(self, contents: str) -> None:
        path = GcsfsFilePath.from_absolute_path(
            f"gs://{self.mock_project_id}-configs/cloud_sql_to_bq_config.yaml"
        )
        self.fake_gcs.upload_from_string(
            path=path, contents=contents, content_type="text/yaml"
        )
Example #17
    def setUp(self) -> None:
        self.user_1_email = "*****@*****.**"
        self.mock_instance_id = "mock_instance_id"
        self.cloud_sql_client_patcher = patch(
            "recidiviz.cloud_sql.gcs_import_to_cloud_sql.CloudSQLClientImpl")
        self.mock_cloud_sql_client = MagicMock()
        self.cloud_sql_client_patcher.start(
        ).return_value = self.mock_cloud_sql_client

        self.mock_sqlalchemy_engine_manager = SQLAlchemyEngineManager
        setattr(
            self.mock_sqlalchemy_engine_manager,
            "get_stripped_cloudsql_instance_id",
            Mock(return_value=self.mock_instance_id),
        )
        self.database_key = SQLAlchemyDatabaseKey.for_schema(
            SchemaType.CASE_TRIAGE)
        local_postgres_helpers.use_on_disk_postgresql_database(
            self.database_key)

        self.table_name = DashboardUserRestrictions.__tablename__
        self.columns = [
            col.name for col in DashboardUserRestrictions.__table__.columns
        ]
        self.gcs_uri = GcsfsFilePath.from_absolute_path(
            "US_MO/dashboard_user_restrictions.csv")
    def test_read_with_exception(self) -> None:
        class _TestException(ValueError):
            pass

        class _ExceptionDelegate(_TestGcsfsCsvReaderDelegate):
            def on_dataframe(self, encoding: str, chunk_num: int,
                             df: pd.DataFrame) -> bool:
                should_continue = super().on_dataframe(encoding, chunk_num, df)
                if chunk_num > 0:
                    raise _TestException("We crashed processing!")
                return should_continue

        file_path = fixtures.as_filepath("encoded_utf_8.csv")
        delegate = _ExceptionDelegate()

        with self.assertRaises(_TestException):
            self.reader.streaming_read(
                GcsfsFilePath.from_absolute_path(file_path),
                delegate=delegate,
                chunk_size=1,
            )

        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual("UTF-8", delegate.encodings_attempted[0])
        self.assertIsNone(delegate.successful_encoding)
        self.assertEqual(2, len(delegate.dataframes))
        self.assertEqual({"UTF-8"},
                         {encoding
                          for encoding, df in delegate.dataframes})
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(1, delegate.exceptions)
Example #19
    def test_raw_data_import(self, mock_supported, mock_region,
                             mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/raw_data_import',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)
Example #20
    def test_handle_file_start_ingest_unsupported_region(
            self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'False',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_region.assert_called_with('us_nd', is_direct_ingest=True)
        mock_controller.handle_file.assert_called_with(path, False)

        # Even though the region isn't supported, we don't crash - the
        # controller handles not starting ingest, and if it does by accident,
        # the actual schedule/process_job endpoints handle the unlaunched
        # region check.
        self.assertEqual(200, response.status_code)
Example #21
    def test_handle_file_start_ingest(self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='production',
                                               ingestor=mock_controller)
        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'True',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_controller.handle_file.assert_called_with(path, True)

        # Even though the region isn't supported, we don't crash
        self.assertEqual(200, response.status_code)
Example #22
    def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
        """Returns the appropriate paths to upload and the proper associated timestamp that
        it is to be normalized with. Skips any files that are not properly supported."""
        path_candidates = []
        for path, timestamp in self.paths_with_timestamps:
            if self.gcsfs.is_dir(path):
                directory = GcsfsDirectoryPath.from_absolute_path(path)
                files_in_directory = self.gcsfs.ls_with_blob_prefix(
                    bucket_name=directory.bucket_name,
                    blob_prefix=directory.relative_path,
                )
                for file in files_in_directory:
                    path_candidates.append((file.abs_path(), timestamp))
            elif self.gcsfs.is_file(path):
                file = GcsfsFilePath.from_absolute_path(path)
                path_candidates.append((file.abs_path(), timestamp))
            else:
                logging.warning(
                    "Could not indicate %s as a directory or a file in %s. Skipping",
                    path,
                    self.gcs_destination_path.uri(),
                )
                self.unable_to_upload_files.append(path)
                continue

        result = []
        for path, timestamp in path_candidates:
            _, ext = os.path.splitext(path)
            if not ext or ext not in self.SUPPORTED_EXTENSIONS:
                logging.info("Skipping file [%s] - invalid extension %s", path,
                             ext)
                continue
            result.append((path, timestamp))

        return result
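
The second pass filters purely on file extension via os.path.splitext. A tiny sketch; the supported-extension set is an assumption, not necessarily the class's real SUPPORTED_EXTENSIONS:

import os

SUPPORTED_EXTENSIONS = {".csv", ".txt"}  # assumed for illustration
paths = ["gs://bucket/a.csv", "gs://bucket/b.pdf", "gs://bucket/no_extension"]

kept = []
for path in paths:
    _, ext = os.path.splitext(path)
    if not ext or ext not in SUPPORTED_EXTENSIONS:
        continue  # skipped: missing or unsupported extension
    kept.append(path)

assert kept == ["gs://bucket/a.csv"]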
    def test_handle_file_start_ingest_unsupported_region(
            self, mock_region: mock.MagicMock,
            mock_environment: mock.MagicMock) -> None:
        region_code = "us_nd"

        mock_environment.return_value = "production"
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment="staging",
                                               ingestor=mock_controller)

        path = GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/elite_offenders.csv")

        request_args = {
            "region": region_code,
            "bucket": path.bucket_name,
            "relative_file_path": path.blob_name,
            "start_ingest": "False",
        }
        headers = {"X-Appengine-Cron": "test-cron"}

        response = self.client.get("/handle_direct_ingest_file",
                                   query_string=request_args,
                                   headers=headers)

        mock_region.assert_called_with("us_nd", is_direct_ingest=True)
        mock_controller.handle_file.assert_called_with(path, False)

        # Even though the region isn't supported, we don't crash - the
        # controller handles not starting ingest, and if it does by accident,
        # the actual schedule/process_job endpoints handle the unlaunched
        # region check.
        self.assertEqual(200, response.status_code)
Example #24
    def test_retrieve_data_po_monthly_report(self) -> None:
        batch_id = "123"
        test_data = "\n".join(
            [
                build_report_json_fixture("*****@*****.**"),
                "my invalid json",
                build_report_json_fixture("*****@*****.**"),
            ]
        )
        self._write_test_data(test_data)

        recipients = retrieve_data(
            state_code=self.state_code,
            report_type=ReportType.POMonthlyReport,
            batch_id=batch_id,
        )

        # Invalid JSON lines are ignored; warnings are logged
        self.assertEqual(len(recipients), 2)
        self.assertEqual(recipients[0].email_address, "*****@*****.**")
        self.assertEqual(recipients[1].email_address, "*****@*****.**")

        # An archive of report JSON is stored
        self.assertEqual(
            self.gcs_file_system.download_as_string(
                GcsfsFilePath.from_absolute_path(
                    f"gs://recidiviz-test-report-data-archive/{self.state_code.value}/123.json"
                )
            ),
            test_data,
        )
    def test_normalize_file_path(self, mock_fs_factory: mock.MagicMock,
                                 mock_environment: mock.MagicMock) -> None:

        mock_environment.return_value = "production"
        mock_fs = FakeGCSFileSystem()
        mock_fs_factory.return_value = mock_fs

        path = GcsfsFilePath.from_absolute_path("bucket-us-xx/file-tag.csv")

        mock_fs.test_add_path(path, local_path=None)

        request_args = {
            "bucket": path.bucket_name,
            "relative_file_path": path.blob_name,
        }

        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/normalize_raw_file_path",
                                   query_string=request_args,
                                   headers=headers)

        self.assertEqual(200, response.status_code)

        self.assertEqual(1, len(mock_fs.all_paths))
        registered_path = mock_fs.all_paths[0]
        if not isinstance(registered_path, GcsfsFilePath):
            self.fail(f"Unexpected type for path [{type(registered_path)}]")
        self.assertTrue(
            DirectIngestGCSFileSystem.is_normalized_file_path(registered_path))
Example #26
    def mv_path_to_normalized_path(
        self,
        path: GcsfsFilePath,
        file_type: GcsfsDirectIngestFileType,
        dt: Optional[datetime.datetime] = None,
    ) -> GcsfsFilePath:
        """Renames a file with an unnormalized file name to a file with a normalized file name in the same directory. If
        |dt| is specified, the file will contain that timestamp, otherwise will contain the current timestamp.

        Returns the new normalized path location of this file after the move completes.
        """
        updated_file_path = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(path.abs_path(), file_type,
                                                dt))

        if self.exists(updated_file_path):
            raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                             f"already exists, returning")

        logging.info(
            "Moving [%s] to normalized path [%s].",
            path.abs_path(),
            updated_file_path.abs_path(),
        )
        self.mv(path, updated_file_path)
        return updated_file_path
Example #27
def collect_file_paths(
    data_discovery_args: DataDiscoveryArgs,
    configs: ConfigsByFileType,
    gcs_files: List[str],
) -> FilesByFileType:
    """ Given a set of configs configs, filter the listed GCS files to only those that match our search filters """
    collected_files = defaultdict(list)

    for found_file in gcs_files:
        try:
            path = GcsfsFilePath.from_absolute_path(found_file)
            file_parts = filename_parts_from_path(path)
        except DirectIngestError as e:
            if e.error_type == DirectIngestErrorType.INPUT_ERROR:
                continue

            logger.exception(e)
            continue

        if (not data_discovery_args.start_date <=
                file_parts.utc_upload_datetime.date() <=
                data_discovery_args.end_date):
            continue

        if file_parts.is_file_split:
            continue

        if file_parts.file_tag in configs[file_parts.file_type]:
            collected_files[file_parts.file_type].append(path)

    return collected_files
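
The date-window test above is an ordinary chained comparison on datetime.date values; a self-contained sketch with invented dates:

import datetime

start_date = datetime.date(2021, 1, 1)
end_date = datetime.date(2021, 1, 31)
utc_upload_datetime = datetime.datetime(2021, 1, 15, 12, 30)

# Same predicate as above, written without the leading negation.
in_window = start_date <= utc_upload_datetime.date() <= end_date
assert in_window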
Example #28
    def setUp(self) -> None:
        self.metadata_patcher = mock.patch(
            "recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "recidiviz-staging"

        test_secrets = {
            # pylint: disable=protected-access
            SQLAlchemyEngineManager._get_cloudsql_instance_id_key(schema_type):
            f"test-project:us-east2:{schema_type.value}-data"
            for schema_type in SchemaType
        }
        self.get_secret_patcher = mock.patch(
            "recidiviz.utils.secrets.get_secret")

        self.get_secret_patcher.start().side_effect = test_secrets.get

        self.gcs_factory_patcher = mock.patch(
            "recidiviz.admin_panel.dataset_metadata_store.GcsfsFactory.build")

        self.fake_fs = FakeGCSFileSystem()
        self.gcs_factory_patcher.start().return_value = self.fake_fs

        self.fake_config_path = GcsfsFilePath.from_absolute_path(
            "gs://recidiviz-staging-configs/cloud_sql_to_bq_config.yaml")
    def test_handle_file_start_ingest(
            self, mock_region: mock.MagicMock,
            mock_environment: mock.MagicMock) -> None:
        region_code = "us_nd"

        mock_environment.return_value = "production"
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment="production",
                                               ingestor=mock_controller)
        path = GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/elite_offenders.csv")

        request_args = {
            "region": region_code,
            "bucket": path.bucket_name,
            "relative_file_path": path.blob_name,
            "start_ingest": "True",
        }
        headers = {"X-Appengine-Cron": "test-cron"}
        response = self.client.get("/handle_direct_ingest_file",
                                   query_string=request_args,
                                   headers=headers)

        mock_controller.handle_file.assert_called_with(path, True)

        # Even though the region isn't supported, we don't crash
        self.assertEqual(200, response.status_code)
    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f"Raw data imports not enabled for region [{self.region.region_code}]"
            )

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(#3162) This is a stopgap measure for regions that have only partially launched. Delete once SQL
            #  pre-processing is enabled for all direct ingest regions.
            parts = filename_parts_from_path(
                data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(
                        to_normalized_unprocessed_file_path_from_normalized_path(
                            data_import_args.raw_data_file_path.abs_path(),
                            file_type_override=GcsfsDirectIngestFileType.
                            INGEST_VIEW,
                        )),
                )

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)