    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """
        parts = filename_parts_from_path(path)

        if parts.file_tag not in self._get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - "
                "not splitting.", parts.file_tag, path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        self._split_file(path, file_contents_handle)
        return True
    def import_raw_file_to_big_query(
            self, path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata) -> None:
        """Import a raw data file at the given path to the appropriate raw data table in BigQuery."""

        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f'Cannot import raw files for region [{self.region.region_code}]'
            )

        parts = filename_parts_from_path(path)
        if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
            raise ValueError(
                f'Attempting to import raw file with tag [{parts.file_tag}] that is not specified in the '
                f'[{self.region.region_code}] config.')

        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise ValueError(
                f'Unexpected file type [{parts.file_type}] for file tag [{parts.file_tag}].'
            )

        logging.info('Beginning BigQuery upload of raw file [%s]',
                     path.abs_path())

        temp_output_paths = self._upload_contents_to_temp_gcs_paths(
            path, file_metadata)
        self._load_contents_to_bigquery(path, temp_output_paths)

        logging.info('Completed BigQuery import of [%s]', path.abs_path())
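
The _load_contents_to_bigquery helper is not included in these examples. As a rough, hypothetical sketch of what loading the temporary CSV outputs into a raw data table could look like with the BigQuery client library (the project, dataset, and table names below are made up, not taken from the code above):

import logging

from google.cloud import bigquery

def load_csv_uris_to_table(gcs_uris, table_id):
    """Loads CSV files at the given gs:// URIs into a BigQuery table.

    A minimal sketch only; the real controller may manage schemas, temporary
    tables, and write dispositions differently.
    """
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        # The temp files written by the controller have no header row.
        skip_leading_rows=0,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    )
    load_job = client.load_table_from_uri(gcs_uris, table_id, job_config=job_config)
    load_job.result()  # Blocks until the load job completes or raises.
    logging.info('Loaded [%d] rows', load_job.output_rows)

# Example call with placeholder names:
# load_csv_uris_to_table(['gs://my-bucket/temp_chunk_0.csv'],
#                        'my-project.us_xx_raw_data.my_file_tag')
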
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool):
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)
            if parts.is_file_split and \
                    parts.file_split_size and \
                    parts.file_split_size <= self.file_split_line_limit:
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split "
                    "with correct size, kicking scheduler.", path.abs_path())
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)
    def _upload_contents_to_temp_gcs_paths(
            self,
            path: GcsfsFilePath,
            file_metadata: DirectIngestFileMetadata,
            contents_handle: GcsfsFileContentsHandle) -> List[Tuple[GcsfsFilePath, List[str]]]:
        """Uploads the contents of the file at the provided path to one or more GCS files, with whitespace stripped and
        additional metadata columns added.
        """

        logging.info('Starting chunked upload of contents to GCS')

        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[parts.file_tag]
        for encoding in file_config.encodings_to_try():
            logging.info('Attempting to do chunked upload of [%s] with encoding [%s]', path.abs_path(), encoding)
            temp_paths_with_columns = []
            try:
                for i, raw_data_df in enumerate(self._read_contents_into_dataframes(encoding,
                                                                                    contents_handle,
                                                                                    file_config)):
                    logging.info('Loaded DataFrame chunk [%d] has [%d] rows', i, raw_data_df.shape[0])

                    # Stripping white space from all fields
                    raw_data_df = raw_data_df.applymap(lambda x: x.strip())

                    augmented_df = self._augment_raw_data_with_metadata_columns(path=path,
                                                                                file_metadata=file_metadata,
                                                                                raw_data_df=raw_data_df)
                    logging.info('Augmented DataFrame chunk [%d] has [%d] rows', i, augmented_df.shape[0])
                    temp_output_path = self._get_temp_df_output_path(path, chunk_num=i)

                    logging.info('Writing DataFrame chunk [%d] to temporary output path [%s]',
                                 i, temp_output_path.abs_path())
                    self.fs.upload_from_string(temp_output_path,
                                               augmented_df.to_csv(header=False, index=False, quoting=csv.QUOTE_ALL),
                                               'text/csv')
                    logging.info('Done writing to temporary output path')

                    temp_paths_with_columns.append((temp_output_path, augmented_df.columns))
                logging.info('Successfully read file [%s] with encoding [%s]', path.abs_path(), encoding)
                return temp_paths_with_columns
            except UnicodeDecodeError:
                logging.info('Unable to read file [%s] with encoding [%s]', path.abs_path(), encoding)
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                temp_paths_with_columns.clear()
                continue
            except Exception as e:
                logging.error('Failed to upload to GCS - cleaning up temp paths')
                self._delete_temp_output_paths([path for path, _ in temp_paths_with_columns])
                raise e

        raise ValueError(
            f'Unable to read path [{path.abs_path()}] for any of these encodings: {file_config.encodings_to_try()}')
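
The try-each-encoding loop above can be illustrated in isolation against a local file, without any of the GCS or metadata handling. A minimal, hypothetical helper (the candidate encodings and chunk size are whatever the caller supplies):

import pandas as pd

def read_csv_with_encoding_fallback(local_path, encodings, chunk_size=10000):
    """Tries each candidate encoding in order and returns the parsed chunks for
    the first encoding that decodes the whole file cleanly."""
    for encoding in encodings:
        chunks = []
        try:
            for chunk in pd.read_csv(local_path,
                                      dtype=str,
                                      encoding=encoding,
                                      chunksize=chunk_size,
                                      keep_default_na=False):
                chunks.append(chunk)
            return chunks
        except UnicodeDecodeError:
            # Decoding failed partway through - discard partial chunks and try
            # the next encoding.
            continue
    raise ValueError(
        f'Unable to read [{local_path}] with any of these encodings: {encodings}')

# Example: chunks = read_csv_with_encoding_fallback('data.csv', ['utf-8', 'latin-1'])
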
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   dt: Optional[datetime.datetime] = None):
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), dt))

        if self.exists(updated_file_path):
            raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                             f"already exists, returning")

        logging.info("Moving [%s] to normalized path [%s].", path.abs_path(),
                     updated_file_path.abs_path())
        self.mv(path, updated_file_path)
    def real_absolute_path_for_path(self, path: GcsfsFilePath) -> str:
        if path.abs_path() in self.uploaded_test_path_to_actual:
            return self.uploaded_test_path_to_actual[path.abs_path()]

        directory_path, _ = os.path.split(path.abs_path())

        parts = filename_parts_from_path(path)
        suffix = f'_{parts.filename_suffix}' if parts.filename_suffix else ''
        fixture_filename = f'{parts.file_tag}{suffix}.{parts.extension}'

        actual_fixture_file_path = \
            fixtures.file_path_from_relative_path(
                os.path.join(directory_path, fixture_filename))

        tempfile_path = self.generate_random_temp_path()
        return shutil.copyfile(actual_fixture_file_path, tempfile_path)
    def _split_file(self, path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> None:
        """Splits the file at the given path into chunks of at most
        |file_split_line_limit| rows each, uploads each chunk to Cloud Storage,
        then moves the original file to storage."""

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        upload_paths_and_df = []
        for i, df in enumerate(
                pd.read_csv(file_contents_handle.local_file_path,
                            dtype=str,
                            chunksize=self.file_split_line_limit,
                            keep_default_na=False)):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)
            upload_paths_and_df.append((upload_path, df))

        for output_path, df in upload_paths_and_df:
            logging.info("Writing file split [%s] to Cloud Storage.",
                         output_path.abs_path())

            self.fs.upload_from_string(output_path, df.to_csv(index=False),
                                       'text/csv')

        logging.info("Done splitting file [%s] into [%s] paths, returning.",
                     path.abs_path(), len(upload_paths_and_df))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)
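
The same chunked pd.read_csv idea, stripped of the GCS-specific pieces, can be sketched against the local filesystem (the file names here are illustrative only):

import pandas as pd

def split_csv_by_line_limit(input_path, line_limit, output_prefix):
    """Splits a local CSV into multiple smaller CSVs of at most |line_limit|
    data rows each, repeating the header row in every output file."""
    output_paths = []
    for i, df in enumerate(pd.read_csv(input_path,
                                       dtype=str,
                                       chunksize=line_limit,
                                       keep_default_na=False)):
        output_path = f'{output_prefix}_{i}.csv'
        df.to_csv(output_path, index=False)
        output_paths.append(output_path)
    return output_paths

# Example: split_csv_by_line_limit('big_file.csv', 2500, '/tmp/big_file_split')
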
    def _path_in_split_file_storage_subdir(
            self, path: GcsfsFilePath,
            controller: GcsfsDirectIngestController):
        if self._path_in_storage_dir(path, controller):
            directory, _ = os.path.split(path.abs_path())
            if SPLIT_FILE_STORAGE_SUBDIR in directory:
                return True
        return False
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:

        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = \
                GcsfsFilePath.from_directory_and_file_name(dst_path,
                                                           src_path.file_name)
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        if src_path.abs_path() in self.uploaded_test_path_to_actual:
            self.uploaded_test_path_to_actual[path.abs_path()] = \
                self.uploaded_test_path_to_actual[src_path.abs_path()]

        self._add_path(path)
    def upload_from_contents_handle(self,
                                    path: GcsfsFilePath,
                                    contents_handle: GcsfsFileContentsHandle,
                                    content_type: str):

        temp_path = self.generate_random_temp_path()
        shutil.copyfile(contents_handle.local_file_path, temp_path)
        self.uploaded_test_path_to_actual[path.abs_path()] = temp_path
        self._add_path(path)
    def upload_from_string(self,
                           path: GcsfsFilePath,
                           contents: str,
                           content_type: str):
        temp_path = self.generate_random_temp_path()
        with open(temp_path, 'w') as f:
            f.write(contents)

        self.uploaded_test_path_to_actual[path.abs_path()] = temp_path
        self._add_path(path)
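
In a test, the string-upload and download helpers above can be exercised as a round trip. A minimal sketch, assuming fake_fs is an instance of the fake file system class whose methods are shown in these examples (its class name and constructor are not included here):

# Hypothetical round trip against the fake GCS file system above.
path = GcsfsFilePath.from_absolute_path('my-bucket/test_file.csv')

fake_fs.upload_from_string(path, 'col1,col2\na,b\n', 'text/csv')
assert fake_fs.exists(path)

handle = fake_fs.download_to_temp_file(path)
with open(handle.local_file_path) as f:
    assert f.read() == 'col1,col2\na,b\n'
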
    def mv_path_to_storage(self,
                           path: GcsfsFilePath,
                           storage_directory_path: GcsfsDirectoryPath):
        """Moves a normalized path to it's appropriate storage location based on the date and file type information
        embedded in the file name."""
        storage_path = self._storage_path(storage_directory_path, path)

        logging.info("Moving [%s] to storage path [%s].",
                     path.abs_path(), storage_path.abs_path())
        self.mv(path, storage_path)
    def mv_path_to_processed_path(self, path: GcsfsFilePath) -> GcsfsFilePath:
        """Renames file with an unprocessed file prefix to a path in the same directory with a 'processed' prefix.

        Returns the new processed path location of this file after the move completes.
        """

        processed_path = self._to_processed_file_path(path)
        logging.info("Moving [%s] to processed path [%s].",
                     path.abs_path(), processed_path.abs_path())
        self.mv(path, processed_path)
        return processed_path
    def _get_contents_handle_from_path(
            self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        if not self.fs.exists(path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted", path)
            return None

        logging.info("Starting download of file [{%s}].", path.abs_path())
        temp_file_path = self.fs.download_to_temp_file(path)

        if not temp_file_path:
            logging.warning("Download of file [{%s}] to local file failed.",
                            path.abs_path())
            return None

        logging.info("Completed download of file [{%s}] to local file [%s].",
                     path.abs_path(), temp_file_path)

        return GcsfsFileContentsHandle(temp_file_path)
    def delete(self, path: GcsfsFilePath) -> None:
        if not isinstance(path, GcsfsFilePath):
            raise ValueError(f'Unexpected path type [{type(path)}]')

        bucket = self.storage_client.get_bucket(path.bucket_name)
        blob = bucket.get_blob(path.blob_name)

        if not blob:
            logging.warning("Path [%s] already does not exist, returning.",
                            path.abs_path())
            return

        blob.delete(self.storage_client)
    def download_to_temp_file(self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        bucket = self.storage_client.get_bucket(path.bucket_name)
        blob = bucket.get_blob(path.blob_name)
        if not blob:
            raise ValueError(f'Blob at path [{path.abs_path()}] does not exist')

        temp_file_path = self.generate_random_temp_path()

        try:
            logging.info(
                "Started download of file [%s] to local file [%s].",
                path.abs_path(), temp_file_path)
            blob.download_to_filename(temp_file_path)
            logging.info(
                "Completed download of file [%s] to local file [%s].",
                path.abs_path(), temp_file_path)
            return GcsfsFileContentsHandle(temp_file_path)
        except NotFound:
            logging.info(
                "File path [%s] no longer exists - might have already "
                "been processed or deleted", path.abs_path())
            return None
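
Outside of the GcsfsFilePath abstractions, the same download-to-temp-file pattern can be sketched directly against the google-cloud-storage client (bucket and blob names below are placeholders):

import logging
import os
import tempfile

from google.api_core.exceptions import NotFound
from google.cloud import storage

def download_blob_to_temp_file(bucket_name, blob_name):
    """Downloads a GCS object to a local temp file and returns the local path,
    or None if the object does not (or no longer) exist."""
    client = storage.Client()
    blob = client.bucket(bucket_name).get_blob(blob_name)
    if not blob:
        return None

    fd, temp_file_path = tempfile.mkstemp()
    os.close(fd)
    try:
        blob.download_to_filename(temp_file_path)
        return temp_file_path
    except NotFound:
        # The object was deleted between the metadata fetch and the download.
        logging.info('Blob [%s/%s] no longer exists.', bucket_name, blob_name)
        return None

# Example (placeholder names): download_blob_to_temp_file('my-bucket', 'raw/file.csv')
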
    def mv_path_to_normalized_path(self,
                                   path: GcsfsFilePath,
                                   file_type: GcsfsDirectIngestFileType,
                                   dt: Optional[datetime.datetime] = None) -> GcsfsFilePath:
        """Renames a file with an unnormalized file name to a file with a normalized file name in the same directory. If
        |dt| is specified, the file will contain that timestamp, otherwise will contain the current timestamp.

        Returns the new normalized path location of this file after the move completes.
        """
        updated_file_path = \
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(path.abs_path(), file_type, dt))

        if self.exists(updated_file_path):
            raise ValueError(
                f"Desired path [{updated_file_path.abs_path()}] "
                f"already exists - refusing to move")

        logging.info("Moving [%s] to normalized path [%s].",
                     path.abs_path(), updated_file_path.abs_path())
        self.mv(path, updated_file_path)
        return updated_file_path
    def mv_path_to_storage(self, path: GcsfsFilePath,
                           storage_directory_path: GcsfsDirectoryPath):
        optional_storage_subdir = None
        if self.is_split_file(path):
            optional_storage_subdir = SPLIT_FILE_STORAGE_SUBDIR

        parts = filename_parts_from_path(path)
        storage_path = self._storage_path(storage_directory_path,
                                          optional_storage_subdir,
                                          parts.date_str, path.file_name)

        logging.info("Moving [%s] to storage path [%s].", path.abs_path(),
                     storage_path.abs_path())
        self.mv(path, storage_path)
    def download_to_temp_file(self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        """Downloads file contents into local temporary_file, returning path to
        temp file, or None if the path no-longer exists in the GCS file system.
        """
        if not self.exists(path):
            return None

        if path.abs_path() in self.uploaded_test_path_to_actual:
            return GcsfsFileContentsHandle(self.uploaded_test_path_to_actual[path.abs_path()])

        directory_path, _ = os.path.split(path.abs_path())

        parts = filename_parts_from_path(path)
        suffix = f'_{parts.filename_suffix}' if parts.filename_suffix else ''
        fixture_filename = f'{parts.file_tag}{suffix}.{parts.extension}'

        actual_fixture_file_path = \
            fixtures.file_path_from_relative_path(
                os.path.join(directory_path, fixture_filename))

        tempfile_path = self.generate_random_temp_path()

        return GcsfsFileContentsHandle(shutil.copyfile(actual_fixture_file_path, tempfile_path))
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool):
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)

            if self.region.is_raw_vs_ingest_file_name_detection_enabled():
                if parts.file_type == GcsfsDirectIngestFileType.RAW_DATA or (
                        parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW and
                        self.region.are_ingest_view_exports_enabled_in_env()
                ):
                    # TODO(3020): Design/handle/write tests for case where this is a file we've moved from storage for a
                    #  rerun. Right now we will crash here because we'll try to set a discovery time that comes after
                    #  the processed time.
                    self.file_metadata_manager.register_new_file(path)

            if parts.is_file_split and \
                    parts.file_split_size and \
                    parts.file_split_size <= self.file_split_line_limit:
                self.kick_scheduler(just_finished_job=False)
                logging.info("File [%s] is already normalized and split split "
                             "with correct size, kicking scheduler.",
                             path.abs_path())
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region,
            can_start_ingest=start_ingest)
    def _should_split_file(self, path: GcsfsFilePath) -> bool:
        """Returns a handle to the contents of this path if this file should be split, None otherwise."""
        parts = filename_parts_from_path(path)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
                parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(
                f'Should not be attempting to split files other than ingest view files, found path with '
                f'file type: {parts.file_type}')

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - not splitting.",
                parts.file_tag, path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.ingest_file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        return self._must_split_contents(parts.file_type, path)
    def mv_path_to_processed_path(self, path: GcsfsFilePath):
        processed_path = self._to_processed_file_path(path)
        logging.info("Moving [%s] to processed path [%s].", path.abs_path(),
                     processed_path.abs_path())
        self.mv(path, processed_path)
    def _read_contents_into_dataframes(self,
                                       path: GcsfsFilePath,
                                       contents_handle: GcsfsFileContentsHandle) -> Iterator[pd.DataFrame]:
        parts = filename_parts_from_path(path)
        file_config = self.region_raw_file_config.raw_file_configs[parts.file_tag]

        columns = self._get_validated_columns(file_config, contents_handle)
        try:
            for df in pd.read_csv(
                    contents_handle.local_file_path,
                    sep=file_config.separator,
                    dtype=str,
                    index_col=False,
                    header=None,
                    skiprows=1,
                    encoding=file_config.encoding,
                    quoting=(csv.QUOTE_NONE if file_config.ignore_quotes else csv.QUOTE_MINIMAL),
                    usecols=columns,
                    names=columns,
                    chunksize=self.upload_chunk_size,
                    keep_default_na=False):
                yield df
        except Exception as e:
            logging.error('Failed to parse DataFrame for path [%s] with config [%s]', path.abs_path(), file_config)
            raise e
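
A standalone sketch of the same pandas read configuration against a local file, with illustrative column names and separator (nothing below is taken from a real raw file config):

import csv

import pandas as pd

def read_raw_csv_chunks(local_path, columns, separator=',', chunk_size=50000):
    """Reads a delimited file in chunks, skipping the file's own header row and
    forcing the caller-supplied column names, with every value kept as a string."""
    return pd.read_csv(local_path,
                       sep=separator,
                       dtype=str,
                       index_col=False,
                       header=None,  # Don't infer a header from the file...
                       skiprows=1,   # ...and skip the header row it contains.
                       quoting=csv.QUOTE_MINIMAL,
                       usecols=columns,
                       names=columns,
                       chunksize=chunk_size,
                       keep_default_na=False)

# Example: for df in read_raw_csv_chunks('file.csv', ['ID', 'NAME']): ...
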
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    'Threw error while copying split files from temp bucket - attempting to clean up before rethrowing.'
                    ' [%s]', e)
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')

            logging.info(
                'Registering [%s] split files with the metadata manager.',
                len(upload_paths))

            for upload_path in upload_paths:
                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
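
The clean-up-before-rethrowing step in the copy loop above follows a general all-or-nothing pattern. A rough local-filesystem analogue, with made-up paths, might look like:

import os
import shutil

def move_all_or_nothing(src_to_dst_pairs):
    """Moves each (src, dst) pair in order; if any move fails, deletes the
    destinations created so far before re-raising, mirroring the cleanup above."""
    completed_dsts = []
    try:
        for src, dst in src_to_dst_pairs:
            shutil.move(src, dst)
            completed_dsts.append(dst)
    except Exception:
        for dst in completed_dsts:
            if os.path.exists(dst):
                os.remove(dst)
        raise

# Example with placeholder paths:
# move_all_or_nothing([('/tmp/split_0.csv', '/ingest/split_0.csv'),
#                      ('/tmp/split_1.csv', '/ingest/split_1.csv')])
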
    def _path_in_storage_dir(self, path: GcsfsFilePath,
                             controller: GcsfsDirectIngestController):
        return path.abs_path().startswith(
            controller.storage_directory_path.abs_path())
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """
        parts = filename_parts_from_path(path)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
                parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(f'Should not be attempting to split files other than ingest view files, found path with '
                             f'file type: {parts.file_type}')

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info("File tag [%s] for path [%s] not in rank list - "
                         "not splitting.",
                         parts.file_tag,
                         path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        split_contents_handles = self._split_file(path, file_contents_handle)

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)
        for i, split_contents_handle in enumerate(split_contents_handles):
            upload_path = self._create_split_file_path(path, output_dir, split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
                    raise ValueError('Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(original_metadata,
                                                                                             upload_path)
            logging.info("Writing file split [%s] to Cloud Storage.", upload_path.abs_path())
            self.fs.upload_from_contents_handle(upload_path, split_contents_handle, self._contents_type())

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]')

                self.file_metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info("Done splitting file [%s] into [%s] paths, moving it to storage.",
                     path.abs_path(), len(split_contents_handles))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata,
                                  DirectIngestIngestFileMetadata):
                    raise ValueError(
                        'Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())
            self.fs.mv(split_contents_path, upload_path)

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(
                        f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]'
                    )

                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True