Example #1
    def _split_file(self, path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> None:

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        upload_paths_and_df = []
        for i, df in enumerate(
                pd.read_csv(file_contents_handle.local_file_path,
                            dtype=str,
                            chunksize=self.file_split_line_limit,
                            keep_default_na=False)):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)
            upload_paths_and_df.append((upload_path, df))

        for output_path, df in upload_paths_and_df:
            logging.info("Writing file split [%s] to Cloud Storage.",
                         output_path.abs_path())

            self.fs.upload_from_string(output_path, df.to_csv(index=False),
                                       'text/csv')

        logging.info("Done splitting file [%s] into [%s] paths, returning.",
                     path.abs_path(), len(upload_paths_and_df))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)
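The splitting in this example is driven by pandas' chunked CSV reading. Below is a minimal, self-contained sketch of that pattern using plain local file paths instead of the GCS-specific GcsfsFilePath/GcsfsDirectoryPath types; the line_limit default is a hypothetical stand-in for the controller's file_split_line_limit.

import pandas as pd

def split_csv_locally(input_path: str, line_limit: int = 2500) -> list:
    """Splits a local CSV into chunks of at most line_limit rows each."""
    output_paths = []
    # With chunksize set, read_csv returns an iterator of DataFrames rather than
    # loading the whole file into memory at once.
    for i, df in enumerate(pd.read_csv(input_path, dtype=str,
                                       chunksize=line_limit,
                                       keep_default_na=False)):
        output_path = f'{input_path}_split_{i}.csv'
        df.to_csv(output_path, index=False)
        output_paths.append(output_path)
    return output_paths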
Example #2
    def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        subdir_path = \
            path_for_fixture_file(controller, 'subdir/',
                                  should_normalize=False)
        paths = [
            subdir_path,
            path_for_fixture_file(controller,
                                  'subdir/Unexpected_Tag.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  'tagA.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  'tagB.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  'tagC.csv',
                                  should_normalize=False),
            path_for_fixture_file(controller,
                                  'subdir/tagC_2.csv',
                                  should_normalize=False),
        ]

        for path in paths:
            controller.fs.test_add_path(path)

        run_task_queues_to_empty(controller)

        dir_paths_found = []
        storage_file_paths = []
        ingest_file_paths = []

        for path in controller.fs.all_paths:
            if isinstance(path, GcsfsDirectoryPath):
                dir_paths_found.append(path)
                continue

            if path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()):
                storage_file_paths.append(path)
            else:
                self.assertTrue(path.abs_path().startswith(
                    controller.ingest_directory_path.abs_path()))
                ingest_file_paths.append(path)

        self.assertEqual(1, len(dir_paths_found))
        self.assertEqual(subdir_path, dir_paths_found[0])

        self.assertEqual(3, len(storage_file_paths))
        storage_tags = {
            filename_parts_from_path(path).file_tag
            for path in storage_file_paths
        }
        self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)

        for path in storage_file_paths:
            self.assertTrue(controller.fs.is_normalized_file_path(path))
            self.assertTrue(controller.fs.is_processed_file(path))

        self.assertEqual(2, len(ingest_file_paths))
        ingest_tags = {
            filename_parts_from_path(path).file_tag
            for path in ingest_file_paths
        }
        self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)

        for path in ingest_file_paths:
            self.assertTrue(controller.fs.is_normalized_file_path(path))
            self.assertTrue(controller.fs.is_seen_unprocessed_file(path))
            self.assertEqual(subdir_path,
                             GcsfsDirectoryPath.from_file_path(path))
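The assertions in this test hinge on partitioning the fake filesystem's paths by prefix and then comparing sets of file tags. A stripped-down sketch of that verification pattern follows, using plain strings and a trivial tag extractor in place of GcsfsFilePath and filename_parts_from_path (both stand-ins for illustration only).

import os

def file_tag(path: str) -> str:
    # Stand-in for filename_parts_from_path(...).file_tag: here we simply take
    # the file name without its extension.
    return os.path.splitext(os.path.basename(path))[0]

storage_prefix = 'my-bucket/storage/'
all_file_paths = [
    'my-bucket/storage/tagA.csv',
    'my-bucket/storage/tagB.csv',
    'my-bucket/ingest/subdir/tagC_2.csv',
]

storage_tags = {file_tag(p) for p in all_file_paths if p.startswith(storage_prefix)}
ingest_tags = {file_tag(p) for p in all_file_paths if not p.startswith(storage_prefix)}

assert storage_tags == {'tagA', 'tagB'}
assert ingest_tags == {'tagC_2'}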
Example #3
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    'Threw error while copying split files from temp bucket - attempting to clean up before rethrowing.'
                    ' [%s]', e)
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register the splits with the metadata manager until all of
        # them have been successfully copied, to avoid leaving the metadata
        # manager in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')

            logging.info(
                'Registering [%s] split files with the metadata manager.',
                len(upload_paths))

            for upload_path in upload_paths:
                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
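This variant moves the split files one by one and, if any move fails, deletes the splits that were already copied before re-raising, so the ingest bucket is not left half-populated. A generic sketch of that all-or-nothing move pattern follows, with a hypothetical filesystem object standing in for self.fs.

def move_all_or_nothing(fs, src_to_dst_pairs):
    """Moves each (src, dst) pair; on failure, removes already-moved files and re-raises."""
    completed = []
    for src, dst in src_to_dst_pairs:
        try:
            fs.mv(src, dst)
        except Exception:
            # Roll back partial progress so no stray splits are left behind.
            for moved_dst in completed:
                fs.delete(moved_dst)
            raise
        completed.append(dst)

Registering the splits with the metadata manager only after this loop succeeds, as the example above does, keeps the metadata in step with what actually landed in the bucket.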
Example #4
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """
        parts = filename_parts_from_path(path)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
                parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(f'Should not be attempting to split files other than ingest view files, found path with '
                             f'file type: {parts.file_type}')

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info("File tag [%s] for path [%s] not in rank list - "
                         "not splitting.",
                         parts.file_tag,
                         path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        split_contents_handles = self._split_file(path, file_contents_handle)

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)
        for i, split_contents_handle in enumerate(split_contents_handles):
            upload_path = self._create_split_file_path(path, output_dir, split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
                    raise ValueError('Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(original_metadata,
                                                                                             upload_path)
            logging.info("Writing file split [%s] to Cloud Storage.", upload_path.abs_path())
            self.fs.upload_from_contents_handle(upload_path, split_contents_handle, self._contents_type())

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]')

                self.file_metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info("Done splitting file [%s] into [%s] paths, moving it to storage.",
                     path.abs_path(), len(split_contents_handles))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
Example #5
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata,
                                  DirectIngestIngestFileMetadata):
                    raise ValueError(
                        'Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i, upload_path.abs_path())
            self.fs.mv(split_contents_path, upload_path)

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(
                        f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]'
                    )

                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(), len(split_contents_paths))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True
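The later variants delegate the splitting decision to a _should_split_file helper rather than inlining the checks. Based on the inline checks shown in Example #4 (file tag must be in the rank list, and the file must not already be split into sufficiently small pieces), a hedged sketch of what such a helper might bundle is shown below; the attribute names mirror the examples above, but the body is illustrative rather than the project's actual implementation.

def should_split_file(parts, file_tag_rank_list, file_split_line_limit) -> bool:
    """Illustrative decision helper mirroring the inline checks in Example #4."""
    if parts.file_tag not in file_tag_rank_list:
        # Only files whose tag is configured for ingest are candidates for splitting.
        return False
    if parts.is_file_split and parts.file_split_size \
            and parts.file_split_size <= file_split_line_limit:
        # Already a split file that fits within the line limit.
        return False
    return True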