def _compute_split_statistics_beam(
    *,
    split_files: _SplitFilesDict,
    out_dir: epath.PathLike,
    filename_template: naming.ShardedFileTemplate,
) -> List[split_lib.SplitInfo]:
    """Process every split in a single Beam pipeline, then load the results.

    Args:
      split_files: Mapping-like object whose `.items()` yields
        `(split_name, file_infos)` pairs; one transform is added per pair.
      out_dir: Directory that must already exist. It is forwarded to
        `_process_split` and used as `data_dir` when reading results back.
      filename_template: Template forwarded to `_process_split` and
        specialised per split (via `.replace`) to locate the results.

    Returns:
      One entry per key of `split_files`, in iteration order, each produced
      by `_split_info_from_path`.

    Raises:
      AssertionError: If `out_dir` does not exist.
    """
    out_dir = epath.Path(out_dir)
    assert out_dir.exists(), f'{out_dir} does not exists'

    beam = lazy_imports_lib.lazy_imports.apache_beam
    options_lib = beam.options.pipeline_options

    # One global pipeline object is shared by all splits.
    # Type hints are disabled as they don't work with typing.Protocol.
    beam_options = options_lib.PipelineOptions()
    type_options = beam_options.view_as(options_lib.TypeOptions)
    type_options.pipeline_type_check = False

    # No explicit runner is selected (runner=None). Leaving the `with` block
    # is what launches the pipeline computation.
    with beam.Pipeline(runner=None, options=beam_options) as pipeline:
        for split_name, file_infos in split_files.items():
            # The split name is attached to the transform as its label.
            labeled_transform = split_name >> _process_split(  # pylint: disable=no-value-for-parameter
                filename_template=filename_template,
                out_dir=out_dir,
                file_infos=file_infos,  # pytype: disable=missing-parameter
            )
            _ = pipeline | labeled_transform

    # After the files have been computed, build one result per split.
    split_infos = []
    for split in split_files:
        split_template = filename_template.replace(
            data_dir=out_dir, split=split)
        split_infos.append(_split_info_from_path(split_template))
    return split_infos
def copy(
    self,
    dst: epath.PathLike,
    overwrite: bool = False,
) -> epath.Path:
    """Write this file's bytes to `dst` and return the destination path.

    The whole content is read with `self.read_bytes()` and written in a
    single `write_bytes` call.

    Args:
      dst: Target file. It can be any PathLike compatible path
        (e.g. `gs://...`).
      overwrite: When false, an existing destination is an error. When true,
        the destination is written without checking whether it exists.

    Returns:
      The destination as an `epath.Path`.

    Raises:
      FileExistsError: If `overwrite` is false and destination exists.
    """
    target = epath.Path(dst)

    # Only probe the destination when overwriting is not allowed.
    if not overwrite:
        if target.exists():
            raise FileExistsError(
                f'Cannot copy {self}. Destination {target} exists.')

    payload = self.read_bytes()
    target.write_bytes(payload)
    return target
def mock_cwd(path: epath.PathLike) -> Iterator[None]:
    """Make `os.getcwd()` report `path` while the generator is suspended.

    Only `os.getcwd` is patched; the process working directory itself is not
    changed. The patch is removed once the generator resumes or is closed.

    NOTE(review): this is a bare generator in the visible code; presumably it
    is wrapped as a context manager by a decorator outside this view.

    Args:
      path: Existing directory to report as the current working directory.

    Yields:
      None, once, while the patch is active.

    Raises:
      AssertionError: If `path` does not exist or is not a directory.
    """
    cwd_dir = pathlib.Path(path)
    # Check given path is valid cwd dir
    assert cwd_dir.exists() and cwd_dir.is_dir()

    fake_cwd = os.fspath(cwd_dir)
    getcwd_patch = mock.patch('os.getcwd', return_value=fake_cwd)
    with getcwd_patch:
        yield