    def test_is_task_queued_has_tasks(self):
        # Arrange
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        full_task_name = \
            _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name',
            task_names=[
                'projects/path/to/random_task',
                f'projects/path/to/{full_task_name}'
            ])

        # Act
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertTrue(gcsfs_args_queued)
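
A minimal standalone sketch of the membership check this test exercises: given the fully-qualified task names returned by a queue listing, decide whether one of them corresponds to a locally built task id. The helper and task names below are hypothetical and only illustrate the pattern, not the library's API.

def _is_task_queued_sketch(task_names, task_id):
    # Fully-qualified task names end in the task id (here abbreviated to
    # 'projects/path/to/<task_id>'), so matching on the trailing path
    # component is enough for this sketch.
    return any(name.split('/')[-1] == task_id for name in task_names)

# Hypothetical usage:
assert _is_task_queued_sketch(
    ['projects/path/to/random_task', 'projects/path/to/ingest_job-us_xx-abc'],
    'ingest_job-us_xx-abc')
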
    def test_move_to_storage_with_conflict(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        dt = datetime.datetime.now()
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = test_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
    def import_raw_file_to_big_query(
            self, path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata) -> None:
        """Import a raw data file at the given path to the appropriate raw data table in BigQuery."""

        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f'Cannot import raw files for region [{self.region.region_code}]'
            )

        parts = filename_parts_from_path(path)
        if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
            raise ValueError(
                f'Attempting to import raw file with tag [{parts.file_tag}] unspecified by [{self.region.region_code}] '
                f'config.')

        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise ValueError(
                f'Unexpected file type [{parts.file_type}] for path [{path.abs_path()}].'
            )

        logging.info('Beginning BigQuery upload of raw file [%s]',
                     path.abs_path())

        temp_output_paths = self._upload_contents_to_temp_gcs_paths(
            path, file_metadata)
        self._load_contents_to_bigquery(path, temp_output_paths)

        logging.info('Completed BigQuery import of [%s]', path.abs_path())
Example #4
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
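
The date handling in _move_files turns the ISO date embedded in a normalized file name into a YYYY/MM/DD/ storage subdirectory. A minimal standalone sketch of that conversion, using only the standard library; the bucket and file names are made up for illustration.

import posixpath
from datetime import date

def _storage_subdir_for_date_sketch(storage_root: str, iso_date_str: str,
                                    file_name: str) -> str:
    # '2020-01-02' -> '2020/01/02/'
    date_subdir = date.fromisoformat(iso_date_str).strftime('%Y/%m/%d/')
    return posixpath.join(storage_root, date_subdir, file_name)

# Hypothetical usage:
# _storage_subdir_for_date_sketch('gs://my-storage-bucket/raw', '2020-01-02',
#                                 'myfile.csv')
# -> 'gs://my-storage-bucket/raw/2020/01/02/myfile.csv'
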
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit| and splits it if so.

        Returns True if the file was split, False otherwise.
        """
        parts = filename_parts_from_path(path)

        if parts.file_tag not in self._get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - "
                "not splitting.", parts.file_tag, path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        self._split_file(path, file_contents_handle)
        return True
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool):
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)
            if parts.is_file_split and \
                    parts.file_split_size and \
                    parts.file_split_size <= self.file_split_line_limit:
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split "
                    "with correct size, kicking scheduler.", path.abs_path())
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)
Example #7
    def test_read_file_with_columns_no_contents(self):
        empty_file_path = fixtures.as_filepath('tagB.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
Example #8
    def test_move_to_storage_with_conflict_with_file_types(self):
        dt = datetime.datetime.now()
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
Example #9
    def test_direct_ingest_multiple_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))
    def test_direct_ingest_multiple_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))
Example #11
    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file_2.csv'),
                                file_type_differentiation_on=True)
    def _upload_contents_to_temp_gcs_paths(
            self,
            path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata,
            contents_handle: GcsfsFileContentsHandle) -> List[Tuple[GcsfsFilePath, List[str]]]:
        """Uploads the contents of the file at the provided path to one or more GCS files, with whitespace stripped and
        additional metadata columns added.
        """

        logging.info('Starting chunked upload of contents to GCS')

        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[parts.file_tag]
        for encoding in file_config.encodings_to_try():
            logging.info('Attempting to do chunked upload of [%s] with encoding [%s]', path.abs_path(), encoding)
            temp_paths_with_columns = []
            try:
                for i, raw_data_df in enumerate(self._read_contents_into_dataframes(encoding,
                                                                                    contents_handle,
                                                                                    file_config)):
                    logging.info('Loaded DataFrame chunk [%d] has [%d] rows', i, raw_data_df.shape[0])

                    # Stripping white space from all fields
                    raw_data_df = raw_data_df.applymap(lambda x: x.strip())

                    augmented_df = self._augment_raw_data_with_metadata_columns(path=path,
                                                                                file_metadata=file_metadata,
                                                                                raw_data_df=raw_data_df)
                    logging.info('Augmented DataFrame chunk [%d] has [%d] rows', i, augmented_df.shape[0])
                    temp_output_path = self._get_temp_df_output_path(path, chunk_num=i)

                    logging.info('Writing DataFrame chunk [%d] to temporary output path [%s]',
                                 i, temp_output_path.abs_path())
                    self.fs.upload_from_string(temp_output_path,
                                               augmented_df.to_csv(header=False, index=False, quoting=csv.QUOTE_ALL),
                                               'text/csv')
                    logging.info('Done writing to temporary output path')

                    temp_paths_with_columns.append((temp_output_path, augmented_df.columns))
                logging.info('Successfully read file [%s] with encoding [%s]', path.abs_path(), encoding)
                return temp_paths_with_columns
            except UnicodeDecodeError:
                logging.info('Unable to read file [%s] with encoding [%s]', path.abs_path(), encoding)
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                temp_paths_with_columns.clear()
                continue
            except Exception as e:
                logging.error('Failed to upload to GCS - cleaning up temp paths')
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                raise e

        raise ValueError(
            f'Unable to read path [{path.abs_path()}] for any of these encodings: {file_config.encodings_to_try()}')
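
The loop in _upload_contents_to_temp_gcs_paths is an encoding-fallback pattern: try the read with each candidate encoding, discard any partial output on a UnicodeDecodeError, and fail only after every encoding has been tried. A minimal local-file sketch of the same pattern, assuming a plain CSV on disk rather than a GCS object; the encoding list is a placeholder.

import pandas as pd

def read_csv_with_encoding_fallback_sketch(local_path,
                                           encodings=('utf-8', 'latin-1')):
    for encoding in encodings:
        try:
            # chunksize keeps memory bounded for large files, mirroring the
            # chunked upload above; the chunks are concatenated here instead
            # of being uploaded one by one.
            return pd.concat(
                pd.read_csv(local_path, dtype=str, encoding=encoding,
                            chunksize=100_000))
        except UnicodeDecodeError:
            # Partial results from a failed encoding are simply dropped; the
            # real code also deletes any temp GCS files it already wrote.
            continue
    raise ValueError(
        f'Unable to read [{local_path}] with any of: {list(encodings)}')
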
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   dt: Optional[datetime.datetime] = None):
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), dt))

        if self.exists(updated_file_path):
            raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                             f"already exists, returning")

        logging.info("Moving [%s] to normalized path [%s].", path.abs_path(),
                     updated_file_path.abs_path())
        self.mv(path, updated_file_path)
    def _get_files_to_move(self) -> List[str]:
        """Gets the files to move to deprecated storage based on the
        file_filter and start/end dates specified."""
        subdirs = dfs_get_date_subdirs([self.region_storage_dir_path.uri()])
        result = []
        for yr_mth_day_subdir_path in subdirs:
            dir_path_blob = GcsfsFilePath.from_absolute_path(
                yr_mth_day_subdir_path).blob_name
            search_date = DATE_SUBDIR_REGEX.search(dir_path_blob)
            if search_date is None:
                raise ValueError(
                    "No match found. File paths should have the format "
                    f"YYYY/MM/DD. Instead we found {dir_path_blob}.")
            match_date = search_date.group()
            date_of_interest = datetime.datetime.strptime(
                match_date, '%Y/%m/%d').date().isoformat()
            if is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=date_of_interest):
                from_paths = gsutil_ls(f'{yr_mth_day_subdir_path}*.csv')
                for from_path in from_paths:
                    _, file_name = os.path.split(from_path)
                    if re.match(INGESTED_FILE_REGEX, file_name):
                        if not self.file_filter or re.search(
                                self.file_filter, file_name):
                            result.append(from_path)
        return result
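
The filter in _get_files_to_move pulls a YYYY/MM/DD segment out of each storage subdirectory and keeps only paths whose date falls inside an inclusive [start, end] window. A small standalone sketch of that date-window check; the regex and bound handling are simplified stand-ins for the module's own helpers.

import re
from datetime import date, datetime
from typing import Optional

_DATE_SUBDIR_RE = re.compile(r'\d{4}/\d{2}/\d{2}')

def _subdir_in_date_window_sketch(subdir_path: str,
                                  start: Optional[date],
                                  end: Optional[date]) -> bool:
    match = _DATE_SUBDIR_RE.search(subdir_path)
    if match is None:
        raise ValueError(f'No YYYY/MM/DD segment in [{subdir_path}]')
    d = datetime.strptime(match.group(), '%Y/%m/%d').date()
    # A missing bound is treated as unbounded, like the optional
    # start/end dates in the real helper.
    return (start is None or start <= d) and (end is None or d <= end)

# Hypothetical usage:
assert _subdir_in_date_window_sketch(
    'bucket/storage/us_xx/2020/01/02/', date(2020, 1, 1), date(2020, 1, 31))
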
def to_normalized_unprocessed_file_path_from_normalized_path(
        original_normalized_file_path: str,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None
) -> str:
    """Moves any normalized path back to an unprocessed path with the same information embedded in the file name. If
    |file_type_override| is provided, we will always overwrite the original path file type with the override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime('%Y-%m-%dT%H:%M:%S:%f')

    suffix_str = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    base_file_name = f'{parts.file_tag}{suffix_str}'

    path_as_unprocessed = _build_unprocessed_file_name(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension)

    return os.path.join(directory, path_as_unprocessed)
    @staticmethod
    def _to_processed_file_path(
            unprocessed_file_path: GcsfsFilePath) -> GcsfsFilePath:
        processed_file_name = unprocessed_file_path.file_name.replace(
            DIRECT_INGEST_UNPROCESSED_PREFIX, DIRECT_INGEST_PROCESSED_PREFIX)

        return GcsfsFilePath.with_new_file_name(unprocessed_file_path,
                                                processed_file_name)
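
_to_processed_file_path swaps the unprocessed marker for the processed one with str.replace, which rewrites every occurrence of the substring in the name. A small defensive sketch that only touches the leading prefix; the prefix strings here are illustrative assumptions, not the real constants.

UNPROCESSED_PREFIX = 'unprocessed'
PROCESSED_PREFIX = 'processed'

def _swap_leading_prefix_sketch(file_name: str) -> str:
    if not file_name.startswith(UNPROCESSED_PREFIX):
        raise ValueError(
            f'Expected an unprocessed file name, got [{file_name}]')
    # Only rewrite the leading prefix, leaving any later occurrence of the
    # word in the file tag alone.
    return PROCESSED_PREFIX + file_name[len(UNPROCESSED_PREFIX):]

# Hypothetical usage:
# _swap_leading_prefix_sketch(
#     'unprocessed_2020-01-02T03:03:03:000003_raw_tagA.csv')
# -> 'processed_2020-01-02T03:03:03:000003_raw_tagA.csv'
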
    def test_handle_file_start_ingest_unsupported_region(
            self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'False',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_region.assert_called_with('us_nd', is_direct_ingest=True)
        mock_controller.handle_file.assert_called_with(path, False)

        # Even though the region isn't supported, we don't crash - the
        # controller handles not starting ingest, and if it does by accident,
        # the actual schedule/process_job endpoints handle the unlaunched
        # region check.
        self.assertEqual(200, response.status_code)
    def test_handle_file_start_ingest(self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='production',
                                               ingestor=mock_controller)
        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'True',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_controller.handle_file.assert_called_with(path, True)

        # The region is launched in production, so the request succeeds.
        self.assertEqual(200, response.status_code)
    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()
    def _storage_path(self, storage_directory_path: GcsfsDirectoryPath,
                      opt_storage_subdir: Optional[str], date_str: str,
                      file_name: str) -> GcsfsFilePath:
        """Returns the storage file path for the input |file_name|,
        |storage_bucket|, and |ingest_date_str|"""
        if opt_storage_subdir is None:
            opt_storage_subdir = ''

        for file_num in range(self._RENAME_RETRIES):
            name, ext = file_name.split('.')
            actual_file_name = \
                file_name if file_num == 0 else f'{name}-({file_num}).{ext}'

            storage_path_str = os.path.join(
                storage_directory_path.bucket_name,
                storage_directory_path.relative_path, date_str,
                opt_storage_subdir, actual_file_name)
            storage_path = GcsfsFilePath.from_absolute_path(storage_path_str)

            if not self.exists(storage_path):
                return storage_path

            logging.error(
                "Storage path [%s] already exists, attempting rename",
                storage_path.abs_path())

        raise ValueError(
            f'Could not find valid storage path for file {file_name}.')
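
The retry loop in _storage_path resolves name collisions by appending '-(N)' before the extension until an unused name is found, which is what produces the 'test_file-(1).csv' paths asserted in the storage-conflict tests above. A local-filesystem sketch of the same pattern; the retry limit is an arbitrary choice here.

import os

def _collision_free_path_sketch(directory: str, file_name: str,
                                max_retries: int = 10) -> str:
    name, ext = file_name.rsplit('.', maxsplit=1)
    for attempt in range(max_retries):
        candidate = file_name if attempt == 0 else f'{name}-({attempt}).{ext}'
        candidate_path = os.path.join(directory, candidate)
        if not os.path.exists(candidate_path):
            return candidate_path
    raise ValueError(f'Could not find an unused name for {file_name}')

# Hypothetical usage: once 'report.csv' exists in /tmp/storage, a second call
# for the same name yields '/tmp/storage/report-(1).csv'.
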
Example #21
    def _split_file(self, path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> None:

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        upload_paths_and_df = []
        for i, df in enumerate(
                pd.read_csv(file_contents_handle.local_file_path,
                            dtype=str,
                            chunksize=self.file_split_line_limit,
                            keep_default_na=False)):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)
            upload_paths_and_df.append((upload_path, df))

        for output_path, df in upload_paths_and_df:
            logging.info("Writing file split [%s] to Cloud Storage.",
                         output_path.abs_path())

            self.fs.upload_from_string(output_path, df.to_csv(index=False),
                                       'text/csv')

        logging.info("Done splitting file [%s] into [%s] paths, returning.",
                     path.abs_path(), len(upload_paths_and_df))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)
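
The splitter above relies on pandas' chunksize argument to read the source CSV in fixed-size pieces and write each piece out as its own file. A minimal local-only sketch of that chunked split; the paths and line limit are placeholders.

import pandas as pd

def split_csv_sketch(source_path: str, output_prefix: str,
                     line_limit: int = 2500) -> int:
    num_chunks = 0
    for i, df in enumerate(pd.read_csv(source_path, dtype=str,
                                       chunksize=line_limit,
                                       keep_default_na=False)):
        # Each chunk is written with its own header row so it can be
        # ingested independently of the others.
        df.to_csv(f'{output_prefix}_{i}.csv', index=False)
        num_chunks += 1
    return num_chunks
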
    def _normalized_path_for_filename(self, filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                             filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)
    def test_raw_data_import(self, mock_supported, mock_region,
                             mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/raw_data_import',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)
    def real_absolute_path_for_path(self, path: GcsfsFilePath) -> str:
        if path.abs_path() in self.uploaded_test_path_to_actual:
            return self.uploaded_test_path_to_actual[path.abs_path()]

        directory_path, _ = os.path.split(path.abs_path())

        parts = filename_parts_from_path(path)
        suffix = f'_{parts.filename_suffix}' if parts.filename_suffix else ''
        fixture_filename = f'{parts.file_tag}{suffix}.{parts.extension}'

        actual_fixture_file_path = \
            fixtures.file_path_from_relative_path(
                os.path.join(directory_path, fixture_filename))

        tempfile_path = self.generate_random_temp_path()
        return shutil.copyfile(actual_fixture_file_path, tempfile_path)
Example #25
    @staticmethod
    def _make_unprocessed_path(
            path_str: str,
            file_type: GcsfsDirectIngestFileType,
            dt=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    ) -> GcsfsFilePath:
        normalized_path_str = to_normalized_unprocessed_file_path(
            original_file_path=path_str, file_type=file_type, dt=dt)
        return GcsfsFilePath.from_absolute_path(normalized_path_str)
Example #26
    def export_query_results_to_cloud_storage(
            self, export_configs: List[ExportQueryConfig]) -> None:
        for export_config in export_configs:
            export_path = GcsfsFilePath.from_absolute_path(
                export_config.output_uri)
            self.fs.test_add_path(export_path)
            self.exported_file_tags.append(
                filename_parts_from_path(export_path).file_tag)
Example #27
    def _path_in_split_file_storage_subdir(
            self, path: GcsfsFilePath,
            controller: GcsfsDirectIngestController) -> bool:
        if self._path_in_storage_dir(path, controller):
            directory, _ = os.path.split(path.abs_path())
            if SPLIT_FILE_STORAGE_SUBDIR in directory:
                return True
        return False
Example #28
    def test_read_completely_empty_file(self):
        empty_file_path = fixtures.as_filepath('tagA.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def upload_from_contents_handle(self,
                                    path: GcsfsFilePath,
                                    contents_handle: GcsfsFileContentsHandle,
                                    content_type: str):

        temp_path = self.generate_random_temp_path()
        shutil.copyfile(contents_handle.local_file_path, temp_path)
        self.uploaded_test_path_to_actual[path.abs_path()] = temp_path
        self._add_path(path)
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = \
                GcsfsFilePath.from_directory_and_file_name(dst_path,
                                                           src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        if src_path.abs_path() in self.uploaded_test_path_to_actual:
            # Key by the resolved destination file path so later lookups by
            # that path (rather than by a destination directory) succeed.
            self.uploaded_test_path_to_actual[path.abs_path()] = \
                self.uploaded_test_path_to_actual[src_path.abs_path()]

        self._add_path(path)