Esempio n. 1
0
def _compute_split_statistics_beam(
    *,
    split_files: _SplitFilesDict,
    out_dir: epath.PathLike,
    filename_template: naming.ShardedFileTemplate,
) -> List[split_lib.SplitInfo]:
    """Computes the statistics of every split inside one Beam pipeline.

    Args:
      split_files: Mapping whose items are `(split_name, file_infos)` pairs;
        one `_process_split` transform is applied per item.
      out_dir: Directory which must already exist. It is forwarded to
        `_process_split` and used as `data_dir` when resolving the per-split
        result path.
      filename_template: Template forwarded to `_process_split` and
        specialized per split (via `replace`) to locate that split's result.

    Returns:
      One `_split_info_from_path` result per key of `split_files`, in
      iteration order.
    """
    out_dir = epath.Path(out_dir)
    assert out_dir.exists(), f'{out_dir} does not exists'

    beam = lazy_imports_lib.lazy_imports.apache_beam
    options_lib = beam.options.pipeline_options

    # Pipeline type checking is switched off: type hints don't work with
    # typing.Protocol.
    beam_options = options_lib.PipelineOptions()
    type_options = beam_options.view_as(options_lib.TypeOptions)
    type_options.pipeline_type_check = False

    # One global pipeline object is shared by all splits; no explicit runner
    # is passed.
    with beam.Pipeline(runner=None, options=beam_options) as pipeline:
        for name, infos in split_files.items():
            split_transform = _process_split(  # pylint: disable=no-value-for-parameter
                filename_template=filename_template,
                out_dir=out_dir,
                file_infos=infos,  # pytype: disable=missing-parameter
            )
            # The split name is used as the label of the applied transform.
            _ = pipeline | name >> split_transform

    # After the files have been computed, load one result per split.
    split_infos = []
    for name in split_files:
        split_template = filename_template.replace(
            data_dir=out_dir, split=name)
        split_infos.append(_split_info_from_path(split_template))
    return split_infos
Esempio n. 2
0
    def copy(
        self,
        dst: epath.PathLike,
        overwrite: bool = False,
    ) -> epath.Path:
        """Copies the content of this file to `dst`.

        The whole content is read with `read_bytes()` and written to the
        destination with `write_bytes()`.

        Args:
          dst: Destination file. Any PathLike compatible path is accepted
            (e.g. `gs://...`).
          overwrite: If True, the destination is written without checking
            whether it already exists.

        Returns:
          The destination, as an `epath.Path`.

        Raises:
          FileExistsError: If `overwrite` is false and destination exists.
        """
        dst = epath.Path(dst)
        # The existence check is skipped entirely when overwriting is allowed.
        if overwrite or not dst.exists():
            payload = self.read_bytes()
            dst.write_bytes(payload)
            return dst
        raise FileExistsError(
            f'Cannot copy {self}. Destination {dst} exists.')
Esempio n. 3
0
def mock_cwd(path: epath.PathLike) -> Iterator[None]:
    """Patches `os.getcwd` so that it returns `path` until resumed.

    The patch is active while the generator is suspended at its `yield` and
    is removed when the generator is resumed or closed.
    NOTE(review): presumably wrapped by a context-manager decorator that is
    not visible in this snippet; confirm at the definition site.

    Args:
      path: Directory to report as the current working directory. It must
        exist and be a directory.

    Yields:
      None, once `os.getcwd` has been patched.
    """
    fake_dir = pathlib.Path(path)
    # Check given path is valid cwd dir
    assert fake_dir.exists() and fake_dir.is_dir()
    fake_cwd = os.fspath(fake_dir)
    getcwd_patcher = mock.patch('os.getcwd', return_value=fake_cwd)
    with getcwd_patcher:
        yield