    def test_is_task_queued_has_tasks(self):
        # Arrange
        file_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        gcsfs_args = \
            GcsfsIngestArgs(
                ingest_time=datetime.datetime.now(),
                file_path=GcsfsFilePath.from_absolute_path(file_path))

        full_task_name = \
            _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name',
            task_names=[
                'projects/path/to/random_task',
                f'projects/path/to/{full_task_name}'
            ])

        # Act
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertTrue(gcsfs_args_queued)
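
A minimal standalone sketch of the membership check this test exercises: given the fully-qualified task names returned by a queue listing, decide whether one of them corresponds to a locally built task id. The helper and task names below are hypothetical and only illustrate the pattern, not the library's API.

def _is_task_queued_sketch(task_names, task_id):
    # Fully-qualified task names end in the task id (here abbreviated to
    # 'projects/path/to/<task_id>'), so matching on the trailing path
    # component is enough for this sketch.
    return any(name.split('/')[-1] == task_id for name in task_names)

# Hypothetical usage:
assert _is_task_queued_sketch(
    ['projects/path/to/random_task', 'projects/path/to/ingest_job-us_xx-abc'],
    'ingest_job-us_xx-abc')
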
    def test_move_to_storage_with_conflict(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        dt = datetime.datetime.now()
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = test_fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, '')
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
    def import_raw_file_to_big_query(
            self, path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata) -> None:
        """Import a raw data file at the given path to the appropriate raw data table in BigQuery."""

        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f'Cannot import raw files for region [{self.region.region_code}]'
            )

        parts = filename_parts_from_path(path)
        if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
            raise ValueError(
                f'Attempting to import raw file with tag [{parts.file_tag}] unspecified by [{self.region.region_code}] '
                f'config.')

        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise ValueError(
                f'Unexpected file type [{parts.file_type}] for path [{path.abs_path()}].'
            )

        logging.info('Beginning BigQuery upload of raw file [%s]',
                     path.abs_path())

        temp_output_paths = self._upload_contents_to_temp_gcs_paths(
            path, file_metadata)
        self._load_contents_to_bigquery(path, temp_output_paths)

        logging.info('Completed BigQuery import of [%s]', path.abs_path())
Example #4
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
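
The date handling in _move_files turns the ISO date embedded in a normalized file name into a YYYY/MM/DD/ storage subdirectory. A minimal standalone sketch of that conversion, using only the standard library; the bucket and file names are made up for illustration.

import posixpath
from datetime import date

def _storage_subdir_for_date_sketch(storage_root: str, iso_date_str: str,
                                    file_name: str) -> str:
    # '2020-01-02' -> '2020/01/02/'
    date_subdir = date.fromisoformat(iso_date_str).strftime('%Y/%m/%d/')
    return posixpath.join(storage_root, date_subdir, file_name)

# Hypothetical usage:
# _storage_subdir_for_date_sketch('gs://my-storage-bucket/raw', '2020-01-02',
#                                 'myfile.csv')
# -> 'gs://my-storage-bucket/raw/2020/01/02/myfile.csv'
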
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit| and splits it if so.

        Returns True if the file was split, False otherwise.
        """
        parts = filename_parts_from_path(path)

        if parts.file_tag not in self._get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - "
                "not splitting.", parts.file_tag, path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        self._split_file(path, file_contents_handle)
        return True
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool):
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)
            if parts.is_file_split and \
                    parts.file_split_size and \
                    parts.file_split_size <= self.file_split_line_limit:
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split "
                    "with correct size, kicking scheduler.", path.abs_path())
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)
Example #7
    def test_read_file_with_columns_no_contents(self):
        empty_file_path = fixtures.as_filepath('tagB.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(1, len(delegate.dataframes))
        encoding, df = delegate.dataframes[0]
        self.assertEqual(encoding, delegate.successful_encoding)
        self.assertEqual(0, df.shape[0])  # No rows
        self.assertEqual(7, df.shape[1])  # 7 columns
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
Example #8
    def test_move_to_storage_with_conflict_with_file_types(self):
        dt = datetime.datetime.now()
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(dt,
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
Example #9
    def test_direct_ingest_multiple_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))
    def test_direct_ingest_multiple_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))
Example #11
    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file.csv'),
                                file_type_differentiation_on=True)

        self.fully_process_file(datetime.datetime.now(),
                                GcsfsFilePath(bucket_name='my_bucket',
                                              blob_name='test_file_2.csv'),
                                file_type_differentiation_on=True)
    def _upload_contents_to_temp_gcs_paths(
            self,
            path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata,
            contents_handle: GcsfsFileContentsHandle) -> List[Tuple[GcsfsFilePath, List[str]]]:
        """Uploads the contents of the file at the provided path to one or more GCS files, with whitespace stripped and
        additional metadata columns added.
        """

        logging.info('Starting chunked upload of contents to GCS')

        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[parts.file_tag]
        for encoding in file_config.encodings_to_try():
            logging.info('Attempting to do chunked upload of [%s] with encoding [%s]', path.abs_path(), encoding)
            temp_paths_with_columns = []
            try:
                for i, raw_data_df in enumerate(self._read_contents_into_dataframes(encoding,
                                                                                    contents_handle,
                                                                                    file_config)):
                    logging.info('Loaded DataFrame chunk [%d] has [%d] rows', i, raw_data_df.shape[0])

                    # Stripping white space from all fields
                    raw_data_df = raw_data_df.applymap(lambda x: x.strip())

                    augmented_df = self._augment_raw_data_with_metadata_columns(path=path,
                                                                                file_metadata=file_metadata,
                                                                                raw_data_df=raw_data_df)
                    logging.info('Augmented DataFrame chunk [%d] has [%d] rows', i, augmented_df.shape[0])
                    temp_output_path = self._get_temp_df_output_path(path, chunk_num=i)

                    logging.info('Writing DataFrame chunk [%d] to temporary output path [%s]',
                                 i, temp_output_path.abs_path())
                    self.fs.upload_from_string(temp_output_path,
                                               augmented_df.to_csv(header=False, index=False, quoting=csv.QUOTE_ALL),
                                               'text/csv')
                    logging.info('Done writing to temporary output path')

                    temp_paths_with_columns.append((temp_output_path, augmented_df.columns))
                logging.info('Successfully read file [%s] with encoding [%s]', path.abs_path(), encoding)
                return temp_paths_with_columns
            except UnicodeDecodeError:
                logging.info('Unable to read file [%s] with encoding [%s]', path.abs_path(), encoding)
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                temp_paths_with_columns.clear()
                continue
            except Exception as e:
                logging.error('Failed to upload to GCS - cleaning up temp paths')
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                raise e

        raise ValueError(
            f'Unable to read path [{path.abs_path()}] for any of these encodings: {file_config.encodings_to_try()}')
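
The loop in _upload_contents_to_temp_gcs_paths is an encoding-fallback pattern: try the read with each candidate encoding, discard any partial output on a UnicodeDecodeError, and fail only after every encoding has been tried. A minimal local-file sketch of the same pattern, assuming a plain CSV on disk rather than a GCS object; the encoding list is a placeholder.

import pandas as pd

def read_csv_with_encoding_fallback_sketch(local_path,
                                           encodings=('utf-8', 'latin-1')):
    for encoding in encodings:
        try:
            # chunksize keeps memory bounded for large files, mirroring the
            # chunked upload above; the chunks are concatenated here instead
            # of being uploaded one by one.
            return pd.concat(
                pd.read_csv(local_path, dtype=str, encoding=encoding,
                            chunksize=100_000))
        except UnicodeDecodeError:
            # Partial results from a failed encoding are simply dropped; the
            # real code also deletes any temp GCS files it already wrote.
            continue
    raise ValueError(
        f'Unable to read [{local_path}] with any of: {list(encodings)}')
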
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   dt: Optional[datetime.datetime] = None):
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), dt))

        if self.exists(updated_file_path):
            raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                             f"already exists, returning")

        logging.info("Moving [%s] to normalized path [%s].", path.abs_path(),
                     updated_file_path.abs_path())
        self.mv(path, updated_file_path)
    def _get_files_to_move(self) -> List[str]:
        """Gets the files to move to deprecated storage based on the
        file_filter and start/end dates specified."""
        subdirs = dfs_get_date_subdirs([self.region_storage_dir_path.uri()])
        result = []
        for yr_mth_day_subdir_path in subdirs:
            dir_path_blob = GcsfsFilePath.from_absolute_path(
                yr_mth_day_subdir_path).blob_name
            search_date = DATE_SUBDIR_REGEX.search(dir_path_blob)
            if search_date is None:
                raise ValueError(
                    "No match found. File paths should have the format "
                    f"YYYY/MM/DD. Instead we found {dir_path_blob}.")
            match_date = search_date.group()
            date_of_interest = datetime.datetime.strptime(
                match_date, '%Y/%m/%d').date().isoformat()
            if is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=date_of_interest):
                from_paths = gsutil_ls(f'{yr_mth_day_subdir_path}*.csv')
                for from_path in from_paths:
                    _, file_name = os.path.split(from_path)
                    if re.match(INGESTED_FILE_REGEX, file_name):
                        if not self.file_filter or re.search(
                                self.file_filter, file_name):
                            result.append(from_path)
        return result
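
The filter in _get_files_to_move pulls a YYYY/MM/DD segment out of each storage subdirectory and keeps only paths whose date falls inside an inclusive [start, end] window. A small standalone sketch of that date-window check; the regex and bound handling are simplified stand-ins for the module's own helpers.

import re
from datetime import date, datetime
from typing import Optional

_DATE_SUBDIR_RE = re.compile(r'\d{4}/\d{2}/\d{2}')

def _subdir_in_date_window_sketch(subdir_path: str,
                                  start: Optional[date],
                                  end: Optional[date]) -> bool:
    match = _DATE_SUBDIR_RE.search(subdir_path)
    if match is None:
        raise ValueError(f'No YYYY/MM/DD segment in [{subdir_path}]')
    d = datetime.strptime(match.group(), '%Y/%m/%d').date()
    # A missing bound is treated as unbounded, like the optional
    # start/end dates in the real helper.
    return (start is None or start <= d) and (end is None or d <= end)

# Hypothetical usage:
assert _subdir_in_date_window_sketch(
    'bucket/storage/us_xx/2020/01/02/', date(2020, 1, 1), date(2020, 1, 31))
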
def to_normalized_unprocessed_file_path_from_normalized_path(
        original_normalized_file_path: str,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None
) -> str:
    """Moves any normalized path back to an unprocessed path with the same information embedded in the file name. If
    |file_type_override| is provided, we will always overwrite the original path file type with the override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime('%Y-%m-%dT%H:%M:%S:%f')

    suffix_str = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    base_file_name = f'{parts.file_tag}{suffix_str}'

    path_as_unprocessed = _build_unprocessed_file_name(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension)

    return os.path.join(directory, path_as_unprocessed)
    @staticmethod
    def _to_processed_file_path(
            unprocessed_file_path: GcsfsFilePath) -> GcsfsFilePath:
        processed_file_name = unprocessed_file_path.file_name.replace(
            DIRECT_INGEST_UNPROCESSED_PREFIX, DIRECT_INGEST_PROCESSED_PREFIX)

        return GcsfsFilePath.with_new_file_name(unprocessed_file_path,
                                                processed_file_name)
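
_to_processed_file_path swaps the unprocessed marker for the processed one with str.replace, which rewrites every occurrence of the substring in the name. A small defensive sketch that only touches the leading prefix; the prefix strings here are illustrative assumptions, not the real constants.

UNPROCESSED_PREFIX = 'unprocessed'
PROCESSED_PREFIX = 'processed'

def _swap_leading_prefix_sketch(file_name: str) -> str:
    if not file_name.startswith(UNPROCESSED_PREFIX):
        raise ValueError(
            f'Expected an unprocessed file name, got [{file_name}]')
    # Only rewrite the leading prefix, leaving any later occurrence of the
    # word in the file tag alone.
    return PROCESSED_PREFIX + file_name[len(UNPROCESSED_PREFIX):]

# Hypothetical usage:
# _swap_leading_prefix_sketch(
#     'unprocessed_2020-01-02T03:03:03:000003_raw_tagA.csv')
# -> 'processed_2020-01-02T03:03:03:000003_raw_tagA.csv'
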
    def test_handle_file_start_ingest_unsupported_region(
            self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'False',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_region.assert_called_with('us_nd', is_direct_ingest=True)
        mock_controller.handle_file.assert_called_with(path, False)

        # Even though the region isn't supported, we don't crash - the
        # controller handles not starting ingest, and if it does by accident,
        # the actual schedule/process_job endpoints handle the unlaunched
        # region check.
        self.assertEqual(200, response.status_code)
    def test_handle_file_start_ingest(self, mock_region, mock_environment):
        region_code = 'us_nd'

        mock_environment.return_value = 'production'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='production',
                                               ingestor=mock_controller)
        path = GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/elite_offenders.csv')

        request_args = {
            'region': region_code,
            'bucket': path.bucket_name,
            'relative_file_path': path.blob_name,
            'start_ingest': 'True',
        }
        headers = {'X-Appengine-Cron': 'test-cron'}
        response = self.client.get('/handle_direct_ingest_file',
                                   query_string=request_args,
                                   headers=headers)

        mock_controller.handle_file.assert_called_with(path, True)

        # The region is launched in production, so the request succeeds.
        self.assertEqual(200, response.status_code)
    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()
    def _storage_path(self, storage_directory_path: GcsfsDirectoryPath,
                      opt_storage_subdir: Optional[str], date_str: str,
                      file_name: str) -> GcsfsFilePath:
        """Returns the storage file path for the input |file_name|,
        |storage_bucket|, and |ingest_date_str|"""
        if opt_storage_subdir is None:
            opt_storage_subdir = ''

        for file_num in range(self._RENAME_RETRIES):
            name, ext = file_name.split('.')
            actual_file_name = \
                file_name if file_num == 0 else f'{name}-({file_num}).{ext}'

            storage_path_str = os.path.join(
                storage_directory_path.bucket_name,
                storage_directory_path.relative_path, date_str,
                opt_storage_subdir, actual_file_name)
            storage_path = GcsfsFilePath.from_absolute_path(storage_path_str)

            if not self.exists(storage_path):
                return storage_path

            logging.error(
                "Storage path [%s] already exists, attempting rename",
                storage_path.abs_path())

        raise ValueError(
            f'Could not find valid storage path for file {file_name}.')
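
The retry loop in _storage_path resolves name collisions by appending '-(N)' before the extension until an unused name is found, which is what produces the 'test_file-(1).csv' paths asserted in the storage-conflict tests above. A local-filesystem sketch of the same pattern; the retry limit is an arbitrary choice here.

import os

def _collision_free_path_sketch(directory: str, file_name: str,
                                max_retries: int = 10) -> str:
    name, ext = file_name.rsplit('.', maxsplit=1)
    for attempt in range(max_retries):
        candidate = file_name if attempt == 0 else f'{name}-({attempt}).{ext}'
        candidate_path = os.path.join(directory, candidate)
        if not os.path.exists(candidate_path):
            return candidate_path
    raise ValueError(f'Could not find an unused name for {file_name}')

# Hypothetical usage: once 'report.csv' exists in /tmp/storage, a second call
# for the same name yields '/tmp/storage/report-(1).csv'.
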
Example #21
    def _split_file(self, path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> None:

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        upload_paths_and_df = []
        for i, df in enumerate(
                pd.read_csv(file_contents_handle.local_file_path,
                            dtype=str,
                            chunksize=self.file_split_line_limit,
                            keep_default_na=False)):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)
            upload_paths_and_df.append((upload_path, df))

        for output_path, df in upload_paths_and_df:
            logging.info("Writing file split [%s] to Cloud Storage.",
                         output_path.abs_path())

            self.fs.upload_from_string(output_path, df.to_csv(index=False),
                                       'text/csv')

        logging.info("Done splitting file [%s] into [%s] paths, returning.",
                     path.abs_path(), len(upload_paths_and_df))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)
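
The splitter above relies on pandas' chunksize argument to read the source CSV in fixed-size pieces and write each piece out as its own file. A minimal local-only sketch of that chunked split; the paths and line limit are placeholders.

import pandas as pd

def split_csv_sketch(source_path: str, output_prefix: str,
                     line_limit: int = 2500) -> int:
    num_chunks = 0
    for i, df in enumerate(pd.read_csv(source_path, dtype=str,
                                       chunksize=line_limit,
                                       keep_default_na=False)):
        # Each chunk is written with its own header row so it can be
        # ingested independently of the others.
        df.to_csv(f'{output_prefix}_{i}.csv', index=False)
        num_chunks += 1
    return num_chunks
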
    def _normalized_path_for_filename(self, filename: str,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(),
                             filename), dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)
    def test_raw_data_import(self, mock_supported, mock_region,
                             mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/raw_data_import',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)
    def real_absolute_path_for_path(self, path: GcsfsFilePath) -> str:
        if path.abs_path() in self.uploaded_test_path_to_actual:
            return self.uploaded_test_path_to_actual[path.abs_path()]

        directory_path, _ = os.path.split(path.abs_path())

        parts = filename_parts_from_path(path)
        suffix = f'_{parts.filename_suffix}' if parts.filename_suffix else ''
        fixture_filename = f'{parts.file_tag}{suffix}.{parts.extension}'

        actual_fixture_file_path = \
            fixtures.file_path_from_relative_path(
                os.path.join(directory_path, fixture_filename))

        tempfile_path = self.generate_random_temp_path()
        return shutil.copyfile(actual_fixture_file_path, tempfile_path)
Example #25
    @staticmethod
    def _make_unprocessed_path(
            path_str: str,
            file_type: GcsfsDirectIngestFileType,
            dt=datetime.datetime(2015, 1, 2, 3, 3, 3, 3)
    ) -> GcsfsFilePath:
        normalized_path_str = to_normalized_unprocessed_file_path(
            original_file_path=path_str, file_type=file_type, dt=dt)
        return GcsfsFilePath.from_absolute_path(normalized_path_str)
Example #26
    def export_query_results_to_cloud_storage(
            self, export_configs: List[ExportQueryConfig]) -> None:
        for export_config in export_configs:
            export_path = GcsfsFilePath.from_absolute_path(
                export_config.output_uri)
            self.fs.test_add_path(export_path)
            self.exported_file_tags.append(
                filename_parts_from_path(export_path).file_tag)
Example #27
    def _path_in_split_file_storage_subdir(
            self, path: GcsfsFilePath,
            controller: GcsfsDirectIngestController) -> bool:
        if self._path_in_storage_dir(path, controller):
            directory, _ = os.path.split(path.abs_path())
            if SPLIT_FILE_STORAGE_SUBDIR in directory:
                return True
        return False
Example #28
    def test_read_completely_empty_file(self):
        empty_file_path = fixtures.as_filepath('tagA.csv')

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=1)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)

        delegate = TestGcsfsCsvReaderDelegate()
        self.reader.streaming_read(GcsfsFilePath.from_absolute_path(empty_file_path), delegate=delegate, chunk_size=10)
        self.assertEqual(1, len(delegate.encodings_attempted))
        self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
        self.assertEqual(0, len(delegate.dataframes))
        self.assertEqual(0, delegate.decode_errors)
        self.assertEqual(0, delegate.exceptions)
    def upload_from_contents_handle(self,
                                    path: GcsfsFilePath,
                                    contents_handle: GcsfsFileContentsHandle,
                                    content_type: str):

        temp_path = self.generate_random_temp_path()
        shutil.copyfile(contents_handle.local_file_path, temp_path)
        self.uploaded_test_path_to_actual[path.abs_path()] = temp_path
        self._add_path(path)
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = \
                GcsfsFilePath.from_directory_and_file_name(dst_path,
                                                           src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        if src_path.abs_path() in self.uploaded_test_path_to_actual:
            # Key by the resolved destination file path so later lookups by
            # that path (rather than by a destination directory) succeed.
            self.uploaded_test_path_to_actual[path.abs_path()] = \
                self.uploaded_test_path_to_actual[src_path.abs_path()]

        self._add_path(path)