Example #1
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
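
For reference, a quick worked example of the date reformat used in the helper above: the ISO date_str parsed from the normalized file name becomes the year/month/day storage subdirectory. The sample date below is hypothetical.

from datetime import date

# '2019-08-07' stands in for a date_str pulled from a normalized file name.
assert date.fromisoformat('2019-08-07').strftime('%Y/%m/%d/') == '2019/08/07/'
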
    def test_is_task_queued_has_tasks(self):
        # Arrange
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        full_task_name = \
            _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name',
            task_names=[
                'projects/path/to/random_task',
                f'projects/path/to/{full_task_name}'
            ])
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        # Act
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertTrue(gcsfs_args_queued)
Example #3
    def test_read_file_with_columns_no_contents(self):
        empty_file_path = fixtures.as_filepath('tagB.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def test_handle_file_start_ingest_unsupported_region(
            self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'False',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_region.assert_called_with('us_nd', is_direct_ingest=True)
        mock_controller.handle_file.assert_called_with(path, False)

        # Even though the region isn't supported, we don't crash - the
        # controller handles not starting ingest, and if it does by accident,
        # the actual schedule/process_job endpoints handle the unlaunched
        # region check.
        self.assertEqual(200, response.status_code)
    def test_handle_file_start_ingest(self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='production',
                                               ingestor=mock_controller)
        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'True',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_controller.handle_file.assert_called_with(path, True)

        # The region is launched in this environment, so we expect ingest to be
        # started and the request to succeed.
        self.assertEqual(200, response.status_code)
    def _storage_path(self, storage_directory_path: GcsfsDirectoryPath,
                      opt_storage_subdir: Optional[str], date_str: str,
                      file_name: str) -> GcsfsFilePath:
        """Returns the storage file path for the input |file_name|,
        |storage_directory_path|, |opt_storage_subdir|, and |date_str|."""
        if opt_storage_subdir is None:
            opt_storage_subdir = ''

        for file_num in range(self._RENAME_RETRIES):
            name, ext = file_name.split('.')
            actual_file_name = \
                file_name if file_num == 0 else f'{name}-({file_num}).{ext}'

            storage_path_str = os.path.join(
                storage_directory_path.bucket_name,
                storage_directory_path.relative_path, date_str,
                opt_storage_subdir, actual_file_name)
            storage_path = GcsfsFilePath.from_absolute_path(storage_path_str)

            if not self.exists(storage_path):
                return storage_path

            logging.error(
                "Storage path [%s] already exists, attempting rename",
                storage_path.abs_path())

        raise ValueError(
            f'Could not find valid storage path for file {file_name}.')
    def test_raw_data_import(self, mock_supported, mock_region,
                             mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/raw_data_import',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)
 def _normalized_path_for_filename(self, filename: str,
                                   dt: datetime.datetime) -> GcsfsFilePath:
     normalized_path = \
         to_normalized_unprocessed_file_path(
             os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                          filename), dt)
     return GcsfsFilePath.from_absolute_path(normalized_path)
 def _get_files_to_move(self) -> List[str]:
     """Returns the paths of files to move to the deprecated folder, filtered by
     |file_filter| and the start/end date bounds, if specified."""
     subdirs = dfs_get_date_subdirs([self.region_storage_dir_path.uri()])
     result = []
     for yr_mth_day_subdir_path in subdirs:
         dir_path_blob = GcsfsFilePath.from_absolute_path(
             yr_mth_day_subdir_path).blob_name
         search_date = DATE_SUBDIR_REGEX.search(dir_path_blob)
         if search_date is None:
             raise ValueError(
                 "No match found. File paths should have the format YYYY/MM/DD. "
                 f"Instead we found {dir_path_blob}.")
         match_date = search_date.group()
         date_of_interest = datetime.datetime.strptime(
             match_date, '%Y/%m/%d').date().isoformat()
         if is_between_date_strs_inclusive(
                 upper_bound_date=self.end_date_bound,
                 lower_bound_date=self.start_date_bound,
                 date_of_interest=date_of_interest):
             from_paths = gsutil_ls(f'{yr_mth_day_subdir_path}*.csv')
             for from_path in from_paths:
                 _, file_name = os.path.split(from_path)
                 if re.match(INGESTED_FILE_REGEX, file_name):
                     if not self.file_filter or re.search(
                             self.file_filter, file_name):
                         result.append(from_path)
     return result
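
The date window check above delegates to is_between_date_strs_inclusive. Below is a minimal sketch of that check, assuming None bounds are treated as open-ended and all dates are ISO-format strings; the exact semantics of the real helper are an assumption here.

import datetime
from typing import Optional

def sketch_is_between_date_strs_inclusive(date_of_interest: str,
                                          lower_bound_date: Optional[str],
                                          upper_bound_date: Optional[str]) -> bool:
    # Parse the ISO date strings and compare inclusively; a None bound never excludes.
    d = datetime.date.fromisoformat(date_of_interest)
    lower_ok = (lower_bound_date is None
                or datetime.date.fromisoformat(lower_bound_date) <= d)
    upper_ok = (upper_bound_date is None
                or d <= datetime.date.fromisoformat(upper_bound_date))
    return lower_ok and upper_ok

# e.g. sketch_is_between_date_strs_inclusive('2019-08-07', '2019-08-01', None) -> True
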
def to_normalized_unprocessed_file_path_from_normalized_path(
        original_normalized_file_path: str,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None
) -> str:
    """Moves any normalized path back to an unprocessed path with the same information embedded in the file name. If
    |file_type_override| is provided, we will always overwrite the original path file type with the override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime('%Y-%m-%dT%H:%M:%S:%f')

    suffix_str = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    base_file_name = f'{parts.file_tag}{suffix_str}'

    path_as_unprocessed = _build_unprocessed_file_name(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension)

    return os.path.join(directory, path_as_unprocessed)
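
Below is a hypothetical, self-contained sketch (not the library implementation) of the file-name rewrite this function performs, assuming the normalized layout seen throughout these examples: <state>_<utc timestamp>_<file type>_<file tag>[_<suffix>].<ext>.

import os
from typing import Optional

_FILE_TYPE_PREFIXES = ('ingest_view', 'raw')  # assumed prefixes, matching the parsing tests below

def sketch_to_unprocessed(normalized_path: str,
                          file_type_override: Optional[str] = None) -> str:
    # Assumes the normalized name embeds a file type prefix after the timestamp.
    directory, file_name = os.path.split(normalized_path)
    _state, remainder = file_name.split('_', 1)        # drop 'processed'/'unprocessed'
    timestamp, remainder = remainder.split('_', 1)      # keep the upload timestamp
    file_type = next(p for p in _FILE_TYPE_PREFIXES if remainder.startswith(p + '_'))
    tag_and_extension = remainder[len(file_type) + 1:]
    new_type = file_type_override or file_type
    return os.path.join(directory, f'unprocessed_{timestamp}_{new_type}_{tag_and_extension}')

# e.g. sketch_to_unprocessed(
#     'bucket/processed_2020-04-29T18:02:41:789323_raw_test_file.csv', 'ingest_view')
# -> 'bucket/unprocessed_2020-04-29T18:02:41:789323_ingest_view_test_file.csv'
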
    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()
Example #12
 def export_query_results_to_cloud_storage(
         self, export_configs: List[ExportQueryConfig]) -> None:
     for export_config in export_configs:
         export_path = GcsfsFilePath.from_absolute_path(
             export_config.output_uri)
         self.fs.test_add_path(export_path)
         self.exported_file_tags.append(
             filename_parts_from_path(export_path).file_tag)
Example #13
 def _make_unprocessed_path(
     path_str: str,
     file_type: GcsfsDirectIngestFileType,
     dt=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
 ) -> GcsfsFilePath:
     normalized_path_str = to_normalized_unprocessed_file_path(
         original_file_path=path_str, file_type=file_type, dt=dt)
     return GcsfsFilePath.from_absolute_path(normalized_path_str)
Example #14
    def test_read_completely_empty_file(self):
        empty_file_path = fixtures.as_filepath('tagA.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
Example #15
 def _normalized_path_for_filename(self,
                                   filename: str,
                                   file_type: GcsfsDirectIngestFileType,
                                   dt: datetime.datetime) -> GcsfsFilePath:
     normalized_path = \
         to_normalized_unprocessed_file_path(
             original_file_path=os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename),
             file_type=file_type,
             dt=dt)
     return GcsfsFilePath.from_absolute_path(normalized_path)
    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f'Raw data imports not enabled for region [{self.region.region_code}]'
            )

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted", data_import_args.raw_data_file_path)
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                'File [%s] is already marked as processed. Skipping file processing.',
                data_import_args.raw_data_file_path.file_name)
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(3162) This is a stopgap measure for regions that have only partially launched. Delete once SQL
            #  pre-processing is enabled for all direct ingest regions.
            parts = filename_parts_from_path(
                data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(
                        to_normalized_unprocessed_file_path_from_normalized_path(
                            data_import_args.raw_data_file_path.abs_path(),
                            file_type_override=GcsfsDirectIngestFileType.
                            INGEST_VIEW)))

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)
Example #17
    def test_read_with_no_failure(self):
        file_path = fixtures.as_filepath('encoded_utf_8.csv')
        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(file_path), delegate=delegate, chunk_size=1)

        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual('UTF-8', delegate.encodings_attempted[0])
        self.assertEqual('UTF-8', delegate.successful_encoding)
        self.assertEqual(4, len(delegate.dataframes))
        self.assertEqual({'UTF-8'}, {encoding for encoding, df in delegate.dataframes})
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
Example #18
    def test_read_with_failure_first(self):
        file_path = fixtures.as_filepath('encoded_latin_1.csv')
        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(file_path), delegate=delegate, chunk_size=1)

        index = COMMON_RAW_FILE_ENCODINGS.index('ISO-8859-1')
        self.assertEqual(index + 1, len(delegate.encodings_attempted))
        self.assertEqual(COMMON_RAW_FILE_ENCODINGS[:(index+1)], delegate.encodings_attempted)
        self.assertEqual('ISO-8859-1', delegate.successful_encoding)
        self.assertEqual(4, len(delegate.dataframes))
        self.assertEqual({'ISO-8859-1'}, {encoding for encoding, df in delegate.dataframes})
        self.assertEqual(1, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
Example #19
 def test_read_no_encodings_match(self):
     file_path = fixtures.as_filepath('encoded_latin_1.csv')
     delegate = TestGcsfsCsvReaderDelegate()
     encodings_to_try = ['UTF-8', 'UTF-16']
     with self.assertRaises(ValueError):
         self.reader.streaming_read(GcsfsFilePath.from_absolute_path(file_path),
                                    delegate=delegate, chunk_size=10, encodings_to_try=encodings_to_try)
     self.assertEqual(encodings_to_try, delegate.encodings_attempted)
     self.assertEqual(2, len(delegate.encodings_attempted))
     self.assertIsNone(delegate.successful_encoding)
     self.assertEqual(0, len(delegate.dataframes))
     self.assertEqual(2, delegate.decode_errors)
     self.assertEqual(0, delegate.exceptions)
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   dt: Optional[datetime.datetime] = None):
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), dt))

        if self.exists(updated_file_path):
            raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                             f"already exists, refusing to overwrite")

        logging.info("Moving [%s] to normalized path [%s].", path.abs_path(),
                     updated_file_path.abs_path())
        self.mv(path, updated_file_path)
    def _storage_path(self,
                      storage_directory_path: GcsfsDirectoryPath,
                      path: GcsfsFilePath) -> GcsfsFilePath:
        """Returns the storage file path for the input |path| within
        |storage_directory_path|."""

        parts = filename_parts_from_path(path)

        if self.is_split_file(path):
            opt_storage_subdir = SPLIT_FILE_STORAGE_SUBDIR
        else:
            opt_storage_subdir = ''

        if parts.file_type is None or parts.file_type == GcsfsDirectIngestFileType.UNSPECIFIED:
            file_type_subdir = ''
            date_subdir = parts.date_str
        else:
            file_type_subdir = parts.file_type.value
            date_subdir = os.path.join(
                f'{parts.utc_upload_datetime.year:04}',
                f'{parts.utc_upload_datetime.month:02}',
                f'{parts.utc_upload_datetime.day:02}'
            )

        for file_num in range(self._RENAME_RETRIES):
            name, ext = path.file_name.split('.')
            actual_file_name = \
                path.file_name if file_num == 0 else f'{name}-({file_num}).{ext}'

            storage_path_str = os.path.join(
                storage_directory_path.bucket_name,
                storage_directory_path.relative_path,
                file_type_subdir,
                date_subdir,
                opt_storage_subdir,
                actual_file_name)
            storage_path = GcsfsFilePath.from_absolute_path(storage_path_str)

            if not self.exists(storage_path):
                return storage_path

            logging.error(
                "Storage path [%s] already exists, attempting rename",
                storage_path.abs_path())

        raise ValueError(
            f'Could not find valid storage path for file {path.file_name}.')
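
The retry loop above implements a simple collision-rename scheme: try the plain file name first, then name-(1), name-(2), and so on up to the retry limit. A self-contained sketch of that pattern follows, where the existing_paths set stands in for the GCS existence check; the parameter names here are illustrative only.

import os
from typing import Set

def sketch_storage_path(storage_root: str, file_type_subdir: str, date_subdir: str,
                        split_subdir: str, file_name: str, existing_paths: Set[str],
                        rename_retries: int = 10) -> str:
    name, ext = file_name.split('.')
    for file_num in range(rename_retries):
        # The first attempt keeps the original name; later attempts append '-(n)'.
        candidate_name = file_name if file_num == 0 else f'{name}-({file_num}).{ext}'
        candidate = os.path.join(storage_root, file_type_subdir, date_subdir,
                                 split_subdir, candidate_name)
        if candidate not in existing_paths:
            return candidate
    raise ValueError(f'Could not find valid storage path for file {file_name}.')

# e.g. sketch_storage_path('storage_bucket', 'raw', '2020/04/29', '',
#                          'processed_2020-04-29T18:02:41:789323_raw_test_file.csv',
#                          existing_paths=set())
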
Example #22
    def test_serialize_gcsfs_ingest_args(self):
        now = datetime.datetime.now()

        str_now = datetime_to_serializable(now)
        now_converted = serializable_to_datetime(str_now)

        self.assertEqual(now, now_converted)

        args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path('foo/bar.csv'),
        )

        args_dict = attr_to_json_dict(args)
        serialized = json.dumps(args_dict).encode()
        args_dict = json.loads(serialized)
        result_args = attr_from_json_dict(args_dict)
        self.assertEqual(args, result_args)
    def test_create_direct_ingest_process_job_task_gcsfs_args(
            self, mock_client, mock_uuid, mock_datetime):
        # Arrange
        project_id = 'recidiviz-456'
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        ingest_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime(year=2019, month=7, day=20),
                file_path=GcsfsFilePath.from_absolute_path(file_path))
        body = {
            'cloud_task_args': ingest_args.to_serializable(),
            'args_type': 'GcsfsIngestArgs'
        }
        body_encoded = json.dumps(body).encode()
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid
        date = '2019-07-20'
        mock_datetime.date.today.return_value = date
        queue_path = _REGION.shared_queue + '-path'

        task_name = _REGION.shared_queue + '/{}-{}-{}'.format(
            _REGION.region_code, date, uuid)
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                'http_method': 'POST',
                'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
                'body': body_encoded
            })

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(project_id=project_id).\
            create_direct_ingest_process_job_task(_REGION, ingest_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, _REGION.shared_queue)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
    def test_create_direct_ingest_raw_data_import_task(self, mock_client,
                                                       mock_uuid):
        # Arrange
        project_id = 'recidiviz-456'
        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs'
        }
        body_encoded = json.dumps(body).encode()
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid
        date = '2019-07-20'
        queue_path = _REGION.shared_queue + '-path'

        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
            _REGION.region_code, date, uuid)
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                'http_method': 'POST',
                'relative_uri':
                f'/direct/raw_data_import?region={_REGION.region_code}',
                'body': body_encoded
            })

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
            project_id=project_id).create_direct_ingest_raw_data_import_task(
                _REGION, import_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            queue_path, task)
    def test_is_task_queued_no_tasks(self):
        # Arrange
        info = CloudTaskQueueInfo(queue_name='queue_name', task_names=[])

        file_path = to_normalized_unprocessed_file_path('bucket/file_path.csv')
        args = IngestArgs(ingest_time=datetime.datetime.now())
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        # Act
        basic_args_queued = info.is_task_queued(_REGION, args)
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertFalse(basic_args_queued)
        self.assertFalse(gcsfs_args_queued)

        self.assertFalse(info.is_task_queued(_REGION, gcsfs_args))
 def _move_files_for_date(self, from_uri: str):
     """Moves a single file to the deprecated storage folder, organized by the date it was
     received and the date it was deprecated."""
     curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
     previous_date_format = filename_parts_from_path(
         curr_gcsfs_file_path).date_str
     new_date_format = date.fromisoformat(previous_date_format).strftime(
         "%Y/%m/%d/")
     to_uri = os.path.join('gs://',
                           self.region_storage_dir_path.bucket_name,
                           self.region_code, 'deprecated',
                           f'deprecated_on_{date.today()}',
                           str(self.file_type.value), new_date_format,
                           curr_gcsfs_file_path.file_name)
     if not self.dry_run:
         gsutil_mv(from_path=from_uri, to_path=to_uri)
     with self.mutex:
         self.move_list.append((from_uri, to_uri))
         if self.move_progress:
             self.move_progress.next()
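
An illustration of the deprecated destination URI the helper above builds; the bucket, region, and file type values below are assumed example values, and the join mirrors the code above.

import datetime
import os

bucket_name = 'my-storage-bucket'   # assumed example value
region_code = 'us_xx'               # assumed example value
file_type_value = 'raw'             # assumed example value
new_date_format = '2019/08/07/'
file_name = 'unprocessed_2019-08-07T22:09:18:770655_raw_elite_offenders.csv'

to_uri = os.path.join('gs://', bucket_name, region_code, 'deprecated',
                      f'deprecated_on_{datetime.date.today()}',
                      file_type_value, new_date_format, file_name)
# -> gs://my-storage-bucket/us_xx/deprecated/deprecated_on_<today>/raw/2019/08/07/
#    unprocessed_2019-08-07T22:09:18:770655_raw_elite_offenders.csv
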
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   file_type: GcsfsDirectIngestFileType,
                                   dt: Optional[datetime.datetime] = None) -> GcsfsFilePath:
        """Renames a file with an unnormalized file name to a file with a normalized file name in the same directory. If
        |dt| is specified, the normalized file name will contain that timestamp; otherwise it
        will contain the current timestamp.

        Returns the new normalized path location of this file after the move completes.
        """
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), file_type, dt))

        if self.exists(updated_file_path):
            raise ValueError(
                f"Desired path [{updated_file_path.abs_path()}] "
                f"already exists, refusing to overwrite")

        logging.info("Moving [%s] to normalized path [%s].",
                     path.abs_path(), updated_file_path.abs_path())
        self.mv(path, updated_file_path)
        return updated_file_path
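
For reference, a hypothetical before/after of the rename performed above (assuming file_type=RAW_DATA and the timestamp shown), reusing the unnormalized path and normalized naming pattern that appear elsewhere in these examples:

# before: bucket-us-nd/elite_offenders.csv
# after:  bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_raw_elite_offenders.csv
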
Example #28
    def test_read_with_exception(self):
        class _TestException(ValueError):
            pass

        class _ExceptionDelegate(TestGcsfsCsvReaderDelegate):
            def on_dataframe(self, encoding: str, chunk_num: int, df: pd.DataFrame) -> bool:
                should_continue = super().on_dataframe(encoding, chunk_num, df)
                if chunk_num > 0:
                    raise _TestException('We crashed processing!')
                return should_continue

        file_path = fixtures.as_filepath('encoded_utf_8.csv')
        delegate = _ExceptionDelegate()

        with self.assertRaises(_TestException):
            self.reader.streaming_read(GcsfsFilePath.from_absolute_path(file_path), delegate=delegate, chunk_size=1)

        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual('UTF-8', delegate.encodings_attempted[0])
        self.assertIsNone(delegate.successful_encoding)
        self.assertEqual(2, len(delegate.dataframes))
        self.assertEqual({'UTF-8'}, {encoding for encoding, df in delegate.dataframes})
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(1, delegate.exceptions)
Example #29
    def test_filename_parts_from_path(self):
        with self.assertRaises(DirectIngestError):
            filename_parts_from_path(
                GcsfsFilePath.from_absolute_path(
                    'bucket/us_ca_sf/elite_offenders.csv'))

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_'
                'elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-08-07T22:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-08-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_1split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '1split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        # Needs the literal 'file_split' suffix to be treated as a file split
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_002_file_split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'elite_offenders_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'BrazosCounty_2019_09_25.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix, '2019_09_25')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'BrazosCounty_2019_09_25_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix,
                         '2019_09_25_002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'tak001_offender_identification.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'tak001_offender_identification_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)
Example #30
    def test_filename_parts_from_path_with_file_type(self):
        with self.assertRaises(DirectIngestError):
            filename_parts_from_path(
                GcsfsFilePath.from_absolute_path(
                    'bucket/us_ca_sf/elite_offenders.csv'))

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_'
                'raw_elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-08-07T22:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-08-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'ingest_view_elite_offenders.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'raw_elite_offenders_1split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '1split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        # Needs the literal 'file_split' suffix to be treated as a file split
        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'ingest_view_elite_offenders_002_file_split.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'raw_elite_offenders_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'elite_offenders')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'ingest_view_BrazosCounty_2019_09_25.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix, '2019_09_25')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
                'raw_BrazosCounty_2019_09_25_002_file_split_size300.csv'))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'BrazosCounty')
        self.assertEqual(parts.filename_suffix,
                         '2019_09_25_002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'ingest_view_tak001_offender_identification.csv'))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type,
                         GcsfsDirectIngestFileType.INGEST_VIEW)
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, False)
        self.assertEqual(parts.file_split_size, None)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'raw_tak001_offender_identification_002_file_split_size300.csv'
            ))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'storage_bucket/raw/2020/04/29/processed_2020-04-29T18:02:41:789323_raw_test_file-(1).csv'
            ))

        self.assertEqual(parts.processed_state, 'processed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'test_file')
        self.assertEqual(parts.filename_suffix, None)
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2020-04-29T18:02:41:789323'))
        self.assertEqual(parts.date_str, '2020-04-29')

        self.assertEqual(parts.is_file_split, False)

        parts = filename_parts_from_path(
            GcsfsFilePath.from_absolute_path(
                'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
                'raw_tak001_offender_identification_002_file_split_size300-(5).csv'
            ))

        self.assertEqual(parts.processed_state, 'unprocessed')
        self.assertEqual(parts.extension, 'csv')
        self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(parts.file_tag, 'tak001_offender_identification')
        self.assertEqual(parts.filename_suffix, '002_file_split_size300')
        self.assertEqual(
            parts.utc_upload_datetime,
            datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
        self.assertEqual(parts.date_str, '2019-09-07')

        self.assertEqual(parts.is_file_split, True)
        self.assertEqual(parts.file_split_size, 300)