def test_find():
    """Check ``find`` on a libarchive filesystem, with and without directories."""
    files_only = ["a", "b", "deeply/nested/path"]
    files_and_dirs = [
        "a",
        "b",
        "deeply/",
        "deeply/nested/",
        "deeply/nested/path",
    ]
    with temparchive(data) as archive_file:
        archive_fs = fsspec.filesystem("libarchive", fo=archive_file)

        assert archive_fs.find("") == files_only
        assert archive_fs.find("", withdirs=True) == files_and_dirs
        assert archive_fs.find("deeply") == ["deeply/nested/path"]
        # A trailing slash on the prefix must not change the listing.
        assert archive_fs.find("deeply/") == archive_fs.find("deeply")
def test_mapping_prefix(tmpdir):
    """Mappers built from a bare path and from a ``file://`` URL must be equal.

    Creates ``afile`` and ``afolder/anotherfile`` under ``tmpdir``, then checks
    key lookup through a mapper and equality between mappers obtained via
    ``fsspec.get_mapper`` and via the filesystem's own ``get_mapper``.
    """
    tmpdir = str(tmpdir)
    os.makedirs(os.path.join(tmpdir, "afolder"))

    # Context managers guarantee the data is flushed and the handles are closed
    # before the mapper reads the files back.
    with open(os.path.join(tmpdir, "afile"), "w") as f:
        f.write("test")
    with open(os.path.join(tmpdir, "afolder", "anotherfile"), "w") as f:
        f.write("test2")

    m = fsspec.get_mapper("file://" + tmpdir)
    assert "afile" in m
    assert m["afolder/anotherfile"] == b"test2"

    fs = fsspec.filesystem("file")
    m2 = fs.get_mapper(tmpdir)
    m3 = fs.get_mapper("file://" + tmpdir)

    assert m == m2 == m3
def load_from_disk(
    dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None
) -> Union[Dataset, DatasetDict]:
    """
    Load a dataset or dataset dict previously written with :meth:`Dataset.save_to_disk`.

    The directory may live on the local disk or on a remote filesystem accessed through
    :class:`datasets.filesystems.S3FileSystem` or any ``fsspec.spec.AbstractFileSystem`` implementation.

    Args:
        dataset_path (:obj:`str`): Path (e.g. `"dataset/train"`) or remote URI
            (e.g. `"s3://my-bucket/dataset/train"`) of the Dataset or DatasetDict directory to load.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of the remote filesystem used to download the files from.
            When it is not a remote filesystem, the local ``"file"`` filesystem is used.
        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
            dataset will not be copied in-memory unless explicitly enabled by setting
            `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the
            :ref:`load_dataset_enhancing_performance` section.

    Returns:
        :class:`Dataset` or :class:`DatasetDict`:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory: a ``datasets.DatasetDict`` with each split.

    Raises:
        FileNotFoundError: If the directory does not exist, or contains neither the dataset info file
            nor the dataset dict JSON file.
    """
    # Pick the filesystem (remote or local) and the path as that filesystem sees it.
    if is_remote_filesystem(fs):
        dest_dataset_path = extract_path_from_uri(dataset_path)
    else:
        fs = fsspec.filesystem("file")
        dest_dataset_path = dataset_path

    if not fs.exists(dest_dataset_path):
        raise FileNotFoundError("Directory {} not found".format(dataset_path))

    # A dataset info file marks a single Dataset directory.
    info_file = Path(dest_dataset_path, config.DATASET_INFO_FILENAME).as_posix()
    if fs.isfile(info_file):
        return Dataset.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)

    # Otherwise a dataset dict JSON file marks a DatasetDict directory.
    dict_file = Path(dest_dataset_path, config.DATASETDICT_JSON_FILENAME).as_posix()
    if fs.isfile(dict_file):
        return DatasetDict.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)

    raise FileNotFoundError(
        "Directory {} is neither a dataset directory nor a dataset dict directory."
        .format(dataset_path))
def load_from_disk(dataset_dict_path: str,
                   fs=None,
                   keep_in_memory: Optional[bool] = None) -> "DatasetDict":
    """
    Load a dataset dict that was previously saved using :meth:`save_to_disk` from a filesystem using either
    :class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``.

    Args:
        dataset_dict_path (:obj:`str`): Path (e.g. ``"dataset/train"``) or remote URI
            (e.g. ``"s3://my-bucket/dataset/train"``) of the dataset dict directory where the dataset dict
            will be loaded from.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of the remote filesystem used to download the files from.
        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
            dataset will not be copied in-memory unless explicitly enabled by setting
            `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the
            :ref:`load_dataset_enhancing_performance` section.

    Returns:
        :class:`DatasetDict`

    Raises:
        FileNotFoundError: If the directory holds a dataset info file but no dataset dict JSON file,
            i.e. it looks like a single ``Dataset`` rather than a ``DatasetDict``.
    """
    dataset_dict = DatasetDict()
    if is_remote_filesystem(fs):
        dest_dataset_dict_path = extract_path_from_uri(dataset_dict_path)
    else:
        fs = fsspec.filesystem("file")
        dest_dataset_dict_path = dataset_dict_path
    # Evaluated once on the resolved filesystem instead of once per split.
    remote = is_remote_filesystem(fs)

    dataset_dict_json_path = Path(
        dest_dataset_dict_path, config.DATASETDICT_JSON_FILENAME).as_posix()
    dataset_info_path = Path(dest_dataset_dict_path,
                             config.DATASET_INFO_FILENAME).as_posix()
    if fs.isfile(dataset_info_path) and not fs.isfile(dataset_dict_json_path):
        raise FileNotFoundError(
            f"No such file or directory: '{dataset_dict_json_path}'. Expected to load a DatasetDict object, but got a Dataset. Please use datasets.load_from_disk instead."
        )

    # Close the JSON handle deterministically; the original left it open.
    with fs.open(dataset_dict_json_path, "r", encoding="utf-8") as f:
        splits = json.load(f)["splits"]

    for k in splits:
        split_path = Path(dest_dataset_dict_path, k).as_posix()
        if remote:
            # Re-attach the original URI scheme so the split is resolved remotely.
            split_path = dataset_dict_path.split("://")[0] + "://" + split_path
        dataset_dict[k] = Dataset.load_from_disk(
            split_path, fs, keep_in_memory=keep_in_memory)
    return dataset_dict
def _determine_dims(self):
    """Open the reference filesystems and classify the dataset dimensions.

    Builds one "reference" filesystem per entry of ``self.path``, opens the
    first two as xarray datasets, concatenates them, and records on ``self``:

    - ``fs``: the ``.fs`` attribute of the first reference filesystem
    - ``extra_dims``: dims present in the concatenation but not in the first dataset
    - ``concat_dims``: shared dims whose size in the concatenation is twice the
      size in the first dataset
    - ``same_dims``: every remaining dim

    Returns:
        Tuple ``(ds, ds0, fss)``: the concatenated dataset, the first dataset,
        and the list of reference filesystems.
    """
    logger.debug("open mappers")

    import collections.abc

    if isinstance(self.path[0], collections.abc.Mapping):
        # References were given as in-memory dictionaries: hand them over directly.
        fo_list = self.path
    else:
        # References were given as files: read each one as a JSON document.
        with fsspec.open_files(self.path, **self.storage_options) as ofs:
            fo_list = [json.load(of) for of in ofs]

    fss = []
    for fo in fo_list:
        fss.append(
            fsspec.filesystem("reference",
                              fo=fo,
                              remote_protocol=self.remote_protocol,
                              remote_options=self.remote_options))
    self.fs = fss[0].fs
    mappers = [ref_fs.get_mapper("") for ref_fs in fss]

    logger.debug("open first two datasets")
    open_kwargs = self.xr_kwargs.copy()

    # Add consolidated=False to xr kwargs if not explicitly given by user;
    # needed to suppress zarr open warnings
    new_enough = version.parse(xr.__version__) >= version.parse("0.19.0")
    if new_enough and 'consolidated' not in open_kwargs:
        open_kwargs['consolidated'] = False

    dss = [
        xr.open_dataset(mapper, engine="zarr", chunks={}, **open_kwargs)
        for mapper in mappers[:2]
    ]

    if self.preprocess:
        logger.debug("preprocess")
        dss = [self.preprocess(d) for d in dss]

    logger.debug("concat")
    ds = xr.concat(dss, **self.concat_kwargs)
    ds0 = dss[0]

    all_dims = set(ds.dims)
    self.extra_dims = all_dims - set(ds0.dims)
    self.concat_dims = {
        name
        for name, size in ds.dims.items()
        if name in ds0.dims and size / ds0.dims[name] == 2
    }
    self.same_dims = all_dims - self.extra_dims - self.concat_dims
    return ds, ds0, fss
def get_nudging_assets(
    run_duration: timedelta,
    current_date: Sequence[int],
    nudge_path: str,
    nudge_filename_pattern: str = "%Y%m%d_%HZ_T85LR.nc",
    copy_method: str = "copy",
    nudge_interval: timedelta = timedelta(hours=6),
) -> List[Mapping]:
    """Build the asset list for the nudging files a run needs.

    File paths are derived purely from the arguments; nothing checks that
    the files actually exist.

    Args:
        run_duration: length of fv3gfs run
        current_date: start time of fv3gfs run as a sequence of 6 integers
        nudge_path: local or remote path to nudging files
        nudge_filename_pattern: template for nudging filenames, in the style
            of the datetime strptime/strftime 'format' argument. Defaults to
            '%Y%m%d_%HZ_T85LR.nc'.
        copy_method: copy_method for nudging file assets. Defaults to 'copy'.
        nudge_interval: time between nudging files. Must be multiple of 1 hour.
            Defaults to 6 hours.

    Returns:
        list of all assets required for nudging run

    Raises:
        ConfigError: if copy_method is "link" and a remote path is given for
            nudge_path
    """
    path_is_remote = get_fs(nudge_path) != fsspec.filesystem("file")
    if path_is_remote and copy_method == "link":
        raise ConfigError(
            "Cannot link nudging files if using remote path for nudge_path. "
            f"Got {nudge_path}.")

    nudge_times = _get_nudge_time_list(run_duration, current_date,
                                       nudge_interval)
    # Render every filename first, then wrap each one in an asset dict.
    filenames = [
        stamp.strftime(nudge_filename_pattern) for stamp in nudge_times
    ]
    assets = []
    for filename in filenames:
        assets.append(
            get_asset_dict(nudge_path,
                           filename,
                           target_location="INPUT",
                           copy_method=copy_method))
    return assets
def __init__(
    self,
    filepath: str,
    load_args: Dict[str, Any] = None,
    save_args: Dict[str, Any] = None,
    credentials: Dict[str, Any] = None,
    fs_args: Dict[str, Any] = None,
    layer: str = None,
) -> None:
    """
    Create a ``BioSequenceDataSet`` bound to a concrete filepath.

    Args:
        filepath: path to sequence file prefixed with a protocol like
            `s3://`. If prefix is not provided, `file` protocol (local
            filesystem) will be used. The prefix should be any protocol
            supported by ``fsspec``.
        load_args: Options for parsing sequence files by Biopython
            ``SeqIO.parse()``. Merged on top of ``DEFAULT_LOAD_ARGS``.
        save_args: file format supported by Biopython ``SeqIO.write()``,
            e.g. `{"format": "fasta"}`. Merged on top of
            ``DEFAULT_SAVE_ARGS``.
        credentials: Credentials required to get access to the underlying
            filesystem. E.g. for ``GCSFileSystem`` it should look like
            `{"token": None}`.
        fs_args: Extra arguments to pass into underlying filesystem class.
            E.g. for ``GCSFileSystem`` class: `{"project": "my-project", ...}`.
        layer: The data layer according to the data engineering convention:
            https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

    Note: Here you can find all supported file formats:
        https://biopython.org/wiki/SeqIO
    """
    # Work on copies so the caller's dictionaries are never shared.
    filesystem_kwargs = deepcopy(fs_args) or {}
    credential_kwargs = deepcopy(credentials) or {}

    protocol, path = get_protocol_and_path(filepath)

    self._layer = layer
    self._filepath = PurePosixPath(path)
    self._protocol = protocol
    self._fs = fsspec.filesystem(self._protocol, **credential_kwargs,
                                 **filesystem_kwargs)

    # Start from the class defaults, then layer the user overrides on top.
    self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
    self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
    for merged, overrides in ((self._load_args, load_args),
                              (self._save_args, save_args)):
        if overrides is not None:
            merged.update(overrides)
def test_directory_mem_nest():
    """Write two partitioned parquet files to the memory filesystem and read them as one dataset."""
    import fsspec

    mem_fs = fsspec.filesystem("memory")
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    frame.index.name = 'index'

    # Same frame written under two ``field=`` partition directories.
    for target in ('/dir/field=a/foo1.parquet', '/dir/field=b/foo2.parquet'):
        write(target, frame, open_with=mem_fs.open)

    pf = ParquetFile("/dir", fs=mem_fs)
    assert pf.info['rows'] == 8
    assert pf.to_pandas()['z'].tolist() == ['a', 'b', 'c', 'd'] * 2
    assert pf.to_pandas()['field'].tolist() == ['a'] * 4 + ['b'] * 4
def get_filesystem(self, silent=True):
    """Return the fsspec file system object, if supported.

    The filesystem is created on first use and cached on
    ``self._filesystem``; later calls return the cached object.

    Args:
        silent: when true (default), a missing ``gcsfs`` package yields
            ``None`` instead of an exception.

    Returns:
        The fsspec ``"gcs"`` filesystem, or ``None`` when ``gcsfs`` cannot
        be imported and ``silent`` is true.

    Raises:
        ImportError: when ``gcsfs`` cannot be imported and ``silent`` is false.
    """
    if self._filesystem:
        return self._filesystem
    try:
        import gcsfs  # noqa
    except ImportError as exc:
        if not silent:
            # Chain the original error so the real import failure stays in the traceback.
            raise ImportError(
                f"Google gcsfs not installed, run pip install gcsfs, {exc}"
            ) from exc
        return None
    self._filesystem = fsspec.filesystem("gcs", **self.get_storage_options())
    return self._filesystem
def test_detect_folders(tmpdir):
    """Only directories holding a ``diags.nc`` file should be detected."""
    local_fs = fsspec.filesystem("file")

    expected_names = ["rundir1", "rundir2"]
    for name in expected_names:
        run_dir = tmpdir.mkdir(name)
        run_dir.join("diags.nc").write("foobar")

    # A directory without diags.nc, which must be ignored.
    decoy_dir = tmpdir.mkdir("not_a_rundir")
    decoy_dir.join("useless_file.txt").write("useless!")

    detected = detect_folders(tmpdir, local_fs)

    assert len(detected) == 2
    for key in detected:
        assert key in expected_names
        assert isinstance(detected[key], DiagnosticFolder)
def local_filecache():
    """Create a small local file plus a ``filecache`` filesystem over the local protocol.

    Returns:
        Tuple ``(data, original_file, cache_location, fs)``: the bytes
        written, the path of the file holding them, the cache directory,
        and the caching filesystem.
    """
    import tempfile

    source_dir = tempfile.mkdtemp()
    cache_location = tempfile.mkdtemp()

    payload = b"test data"
    original_file = os.path.join(source_dir, "afile")
    with open(original_file, "wb") as handle:
        handle.write(payload)

    # we can access the file and read it
    caching_fs = fsspec.filesystem(
        "filecache",
        target_protocol="file",
        cache_storage=cache_location,
    )
    return (payload, original_file, cache_location, caching_fs)
def test_blocksize(ftp_writable):
    """Re-opening a block-cached file with a different block size must raise ``ValueError``."""
    host, port, user, pw = ftp_writable

    ftp = FTPFileSystem(host, port, user, pw)
    with ftp.open("/out_block", "wb") as f:
        f.write(b"test" * 4000)

    ftp_options = {
        "host": host,
        "port": port,
        "username": user,
        "password": pw,
    }
    fs = fsspec.filesystem(
        "blockcache",
        target_protocol="ftp",
        target_options=ftp_options,
    )

    # The first open fixes the block size at 20.
    with fs.open("/out_block", block_size=20) as f:
        assert f.read(1) == b"t"

    with pytest.raises(ValueError):
        fs.open("/out_block", block_size=30)
async def _():
    """Exercise the async HTTP filesystem before and after its session is created."""
    event_loop = asyncio.get_event_loop()
    http_fs = fsspec.filesystem("http", asynchronous=True, loop=event_loop)
    target = server + "/index/realfile"

    # fails because client creation has not yet been awaited
    early = await http_fs._cat([target])
    assert isinstance(early[0], RuntimeError)

    # The blocking API must raise from here as well.
    with pytest.raises(RuntimeError):
        http_fs.cat([target])

    await http_fs.set_session()  # creates client

    fetched = await http_fs._cat([target])
    del http_fs
    assert fetched == [data]
def run(  # type: ignore
    self,
    data: pd.DataFrame,
    output_dir: str,
    filesystem: Optional[str] = "file",
    mode: Optional[str] = "model",
):
    """Save the game data.

    Writes one pipe-separated file per game to
    ``output_dir/<subdir>/data_{GameID}.csv``. Games whose ID does not
    start with ``"002"`` are logged and skipped.

    Parameters
    ----------
    data : pd.DataFrame
        The clean data.
    output_dir : str
        The directory containing the data.
    filesystem : str, optional (default "file")
        The name of the ``fsspec`` filesystem to use.
    mode : str, optional (default "model")
        The type of clean data to save. If ``model``, save to the directory
        ``model-data``. If ``rating``, save to ``rating-data``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``mode`` is neither ``model`` nor ``rating``.
    """
    # Reject unknown modes before touching the filesystem
    if mode != "model" and mode != "rating":
        raise ValueError("Please supply a valid value for ``mode``")
    subdir = "model-data" if mode == "model" else "rating-data"

    # Get the filesystem
    fs = fsspec.filesystem(filesystem)
    fs.mkdirs(Path(output_dir, subdir), exist_ok=True)

    for game_id, game_rows in data.groupby("GAME_ID"):
        if not game_id.startswith("002"):
            self.logger.warning(
                f"{game_id} is not a regular season game. Skipping...")
            continue

        fpath = Path(output_dir, subdir, f"data_{game_id}.csv")
        self.logger.info(f"Writing data for game {game_id} to {str(fpath)}")
        with fs.open(fpath, "wb") as buf:
            game_rows.to_csv(buf, sep="|", mode="wb")
def test_add_file_to_cache_after_save(local_filecache):
    """A file cached after an initial ``save_cache`` must still be persisted by the next save."""
    _data, source_path, cache_dir, cached_fs = local_filecache

    cached_fs.save_cache()

    cached_fs.cat(source_path)
    assert len(cached_fs.cached_files[-1]) == 1

    cached_fs.save_cache()

    fresh_fs = fsspec.filesystem(
        "filecache",
        target_protocol="file",
        cache_storage=cache_dir,
        # cache is masking the issue
        do_not_use_cache_for_this_instance=True,
    )
    assert len(fresh_fs.cached_files[-1]) == 1
def test_raw_file_distributed_serializable(self):
    """A ``RawFile`` must survive a ``distributed`` serialize/deserialize round trip."""
    from distributed.protocol import deserialize, serialize

    # Arrange
    local_fs = fsspec.filesystem("file")
    bz2_path = TEST_DATA_DIR + "/betfair/1.166811431.bz2"
    raw = RawFile(open_file=local_fs.open(path=bz2_path, compression="bz2"))

    # Act
    restored: RawFile = deserialize(*serialize(raw))

    # Assert
    assert restored.open_file.fs == raw.open_file.fs
    assert restored.open_file.path == raw.open_file.path
    assert restored.block_size == raw.block_size
    assert restored.open_file.compression == "bz2"
def __init__(self, filepath: str, version: Version = None):
    """Create an ImageDataSet that loads / saves image data at ``filepath``.

    Args:
        filepath: The location of the image file to load / save data.
        version: The version of the dataset being saved and loaded.
    """
    protocol, path = get_protocol_and_path(filepath)
    filesystem = fsspec.filesystem(protocol)

    self._protocol = protocol
    self._fs = filesystem

    # The base class performs existence checks and globbing through the filesystem.
    super().__init__(
        filepath=PurePosixPath(path),
        version=version,
        exists_function=filesystem.exists,
        glob_function=filesystem.glob,
    )
def test_cached_open_close_read(ftp_writable):
    """Open and close a cached file without reading, then read it on a second open."""
    # Regression test for <https://github.com/fsspec/filesystem_spec/issues/799>
    host, port, user, pw = ftp_writable

    ftp = FTPFileSystem(host, port, user, pw)
    with ftp.open("/out_block", "wb") as f:
        f.write(b"test" * 4000)

    ftp_options = {
        "host": host,
        "port": port,
        "username": user,
        "password": pw,
    }
    fs = fsspec.filesystem(
        "cached",
        target_protocol="ftp",
        target_options=ftp_options,
    )

    # First open/close touches nothing.
    with fs.open("/out_block", block_size=1024) as f:
        pass

    with fs.open("/out_block", block_size=1024) as f:
        assert f.read(1) == b"t"

    # Regression test for <https://github.com/fsspec/filesystem_spec/issues/845>
    cache_entry = fs.cached_files[-1]["/out_block"]
    assert cache_entry["blocks"] == {0}
def __init__(self, path=None, fs_protocol=None):
    """
    Initialize a new instance of the ``DataCatalog`` class.

    Parameters
    ----------
    path : str
        The root path to the data. Falls back to the
        ``NAUTILUS_BACKTEST_DIR`` environment variable when not given.
    fs_protocol : str
        The file system protocol to use. Falls back to the
        ``NAUTILUS_BACKTEST_FS_PROTOCOL`` environment variable, then to
        ``"file"``.

    """
    protocol = fs_protocol or os.environ.get(
        "NAUTILUS_BACKTEST_FS_PROTOCOL", "file")
    self.fs = fsspec.filesystem(protocol)

    # Indexing os.environ raises KeyError when neither source provides a root.
    root_dir = path or os.environ["NAUTILUS_BACKTEST_DIR"]
    self.root = pathlib.Path(root_dir)
    self._processed_files_fn = f"{self.root}/.processed_raw_files.json"
def load_from_disk(
    dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None
) -> Union[Dataset, DatasetDict]:
    """
    Load a dataset or dataset dict previously written with ``dataset.save_to_disk(dataset_path)``.

    The directory may live on the local disk or on a remote filesystem accessed through
    :class:`datasets.filesystems.S3FileSystem` or any ``fsspec.spec.AbstractFileSystem`` implementation.

    Args:
        dataset_path (:obj:`str`): Path (e.g. ``"dataset/train"``) or remote URI
            (e.g. ``"s3://my-bucket/dataset/train"``) of the Dataset or DatasetDict directory to load.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of the remote filesystem used to download the files from.
            When it is not a remote filesystem, the local ``"file"`` filesystem is used.
        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
            dataset will be copied in-memory if its size is smaller than
            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
            disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case
            the dataset is not loaded in memory.

    Returns:
        ``datasets.Dataset`` or ``datasets.DatasetDict``:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory: a ``datasets.DatasetDict`` with each split.

    Raises:
        FileNotFoundError: If the directory does not exist, or contains neither ``dataset_info.json``
            nor ``dataset_dict.json``.
    """
    # Pick the filesystem (remote or local) and the path as that filesystem sees it.
    if is_remote_filesystem(fs):
        dest_dataset_path = extract_path_from_uri(dataset_path)
    else:
        fs = fsspec.filesystem("file")
        dest_dataset_path = dataset_path

    if not fs.exists(dest_dataset_path):
        raise FileNotFoundError("Directory {} not found".format(dataset_path))

    # dataset_info.json marks a single Dataset directory.
    info_file = Path(dest_dataset_path, "dataset_info.json").as_posix()
    if fs.isfile(info_file):
        return Dataset.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)

    # dataset_dict.json marks a DatasetDict directory.
    dict_file = Path(dest_dataset_path, "dataset_dict.json").as_posix()
    if fs.isfile(dict_file):
        return DatasetDict.load_from_disk(dataset_path, fs, keep_in_memory=keep_in_memory)

    raise FileNotFoundError(
        "Directory {} is neither a dataset directory nor a dataset dict directory."
        .format(dataset_path))
def pull_package(package_path, env, alias):
    """Pull a modular pipeline package, unpack it and install the files to
    corresponding locations.

    If ``package_path`` ends in ``.whl`` and exists on the filesystem implied
    by its protocol, the wheel is extracted directly. Otherwise it is fetched
    with ``pip download --no-deps`` and the single resulting wheel is
    extracted. Everything happens in a temporary directory that is removed
    on exit.

    Args:
        package_path: wheel path/URL, or a requirement that
            ``pip download`` can resolve.
        env: passed through to ``_install_files``.
        alias: passed through to ``_install_files``.

    Raises:
        KedroCliError: if ``pip download`` does not leave exactly one wheel,
            or the extracted content does not contain exactly one
            ``*.dist-info`` entry.
    """
    # pylint: disable=import-outside-toplevel
    import fsspec

    from kedro.io.core import get_protocol_and_path

    protocol, _ = get_protocol_and_path(package_path)
    filesystem = fsspec.filesystem(protocol)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir).resolve()
        if package_path.endswith(".whl") and filesystem.exists(package_path):
            # Close both the remote handle and the archive once extracted.
            with filesystem.open(package_path) as fs_file, ZipFile(fs_file) as wheel:
                wheel.extractall(temp_dir_path)
        else:
            python_call(
                "pip",
                [
                    "download", "--no-deps", "--dest",
                    str(temp_dir_path), package_path
                ],
            )
            wheel_file = list(temp_dir_path.glob("*.whl"))
            # `--no-deps` should fetch only one wheel file, and CLI should fail if that's
            # not the case.
            if len(wheel_file) != 1:
                file_names = [wf.name for wf in wheel_file]
                raise KedroCliError(
                    f"More than 1 or no wheel files found: {str(file_names)}. "
                    "There has to be exactly one distribution file.")
            with ZipFile(wheel_file[0]) as wheel:
                wheel.extractall(temp_dir_path)

        dist_info_file = list(temp_dir_path.glob("*.dist-info"))
        if len(dist_info_file) != 1:
            raise KedroCliError(
                f"More than 1 or no dist-info files found from {package_path}. "
                "There has to be exactly one dist-info directory.")
        # Extract package name, based on the naming convention for wheel files
        # https://www.python.org/dev/peps/pep-0427/#file-name-convention
        package_name = dist_info_file[0].stem.split("-")[0]

        _clean_pycache(temp_dir_path)
        _install_files(package_name, temp_dir_path, env, alias)
def temporary_directory(
    suffix: Optional[str] = None,
    prefix: Optional[str] = None,
    dir: Optional[PathType] = None,
    storage_options: Optional[Dict[str, str]] = None,
) -> Iterator[str]:
    """Create a temporary directory in a fsspec filesystem.

    Parameters
    ----------
    suffix : Optional[str], optional
        If not None, the name of the temporary directory will end with that suffix.
    prefix : Optional[str], optional
        If not None, the name of the temporary directory will start with that prefix.
    dir : Optional[PathType], optional
        If not None, the temporary directory will be created in that directory, otherwise
        the local filesystem directory returned by `tempfile.gettempdir()` will be used.
        The directory may be specified as any fsspec URL.
    storage_options : Optional[Dict[str, str]], optional
        Any additional parameters for the storage backend (see `fsspec.open`).

    Yields
    ------
    Generator[str, None, None]
        A context manager yielding the fsspec URL to the created directory.
    """
    # Fill in defaults
    suffix = suffix or ""
    prefix = prefix or ""
    dir = dir or tempfile.gettempdir()
    storage_options = storage_options or {}

    # Find the filesystem by looking at the URL scheme (protocol), empty means local filesystem
    protocol = urlparse(str(dir)).scheme
    fs = fsspec.filesystem(protocol, **storage_options)

    # Construct a random directory name
    tempdir = build_url(dir, prefix + str(uuid.uuid4()) + suffix)

    # Create the directory outside the try block: if creation fails there is
    # nothing to clean up, and removing a missing directory would mask the real error.
    fs.mkdir(tempdir)
    try:
        yield tempdir
    finally:
        # Remove the temporary directory on exiting the context manager
        fs.rm(tempdir, recursive=True)
def compute(self, url, filename="index.html"):
    """Compute reports_by_run index from all reports found at url.

    Args:
        url: path to directory containing report subdirectories.
        filename: name of report html files.

    Note:
        Reports are assumed to be located at {url}/*/{filename}.
    """
    loop = asyncio.get_event_loop()

    # gs:// URLs use an asynchronous GCS filesystem; everything else is local.
    fs = (gcsfs.GCSFileSystem(asynchronous=True)
          if url.startswith("gs://") else fsspec.filesystem("file"))

    reports = loop.run_until_complete(self._get_reports(fs, url, filename))
    self.reports_by_run = reports
    loop.run_until_complete(_close_session(fs))
def test_blocksize(ftp_writable):
    """After a file is cached, opening it with an explicit block size of 1 must raise ``ValueError``."""
    host, port, user, pw = ftp_writable

    ftp = FTPFileSystem(host, port, user, pw)
    with ftp.open('/out', 'wb') as f:
        f.write(b'test')

    ftp_options = {
        'host': host,
        'port': port,
        'username': user,
        'password': pw
    }
    fs = fsspec.filesystem('cached',
                           target_protocol='ftp',
                           target_options=ftp_options)

    # Reading through the cache first.
    assert fs.cat('/out') == b'test'

    with pytest.raises(ValueError):
        fs.open('/out', block_size=1)
def transform(date: dt.datetime):
    """Fetch the raw listing data for ``date``, compute the feature and upload it.

    Progress is reported through a ``Halo`` spinner, one step per stage.
    """
    azure_fs = fsspec.filesystem("az", **settings.auth.dict())
    progress = Halo()

    progress.start("Fetching raw data")
    raw_location = settings.listing_location(date)
    listings = transform_data.load_raw_data(raw_location, fs=azure_fs)
    progress.succeed("Fetched raw data!")

    progress.start("Calculating feature")
    listings = transform_data.calculate_beds_per_accommodates(listings)
    listings = transform_data.add_date(listings, date)
    progress.succeed("Feature calculated")

    progress.start("Uploading feature")
    transform_data.upload_feature("beds_per_accommodates", listings, fs=azure_fs)
    progress.succeed("Feature uploaded")

    click.secho("Data is transformed!")
def test_cats():
    """Check ``cat`` on single paths and path lists, with and without byte ranges."""
    first_path = ".test.fakedata.1.csv"
    second_path = ".test.fakedata.2.csv"
    first_bytes = b"a,b\n1,2\n"
    second_bytes = b"a,b\n3,4\n"

    with filetexts(csv_files, mode="b"):
        fs = fsspec.filesystem("file")

        # Whole-file reads.
        assert fs.cat(first_path) == first_bytes
        contents = set(fs.cat([first_path, second_path]).values())
        assert contents == {first_bytes, second_bytes}
        assert fs.cat(first_path, None, None) == first_bytes

        # Ranged reads follow Python slice semantics, negative offsets included.
        assert fs.cat(first_path, start=1, end=6) == first_bytes[1:6]
        assert fs.cat(first_path, start=-1) == first_bytes[-1:]
        assert fs.cat(first_path, start=1, end=-2) == first_bytes[1:-2]

        contents = set(
            fs.cat([first_path, second_path], start=1, end=-1).values()
        )
        assert contents == {first_bytes[1:-1], second_bytes[1:-1]}
def _get_fsspec_filesystem(location: str, fs_args: Optional[str]):
    """Return the fsspec filesystem for ``location``, or ``None`` if it cannot be created.

    Args:
        location: path or URL whose protocol selects the filesystem.
        fs_args: optional config file loaded with ``anyconfig`` and passed
            as keyword arguments to the filesystem.

    Returns:
        The filesystem instance, or ``None`` after printing the error and a
        fallback notice in red when instantiation fails.
    """
    # pylint: disable=import-outside-toplevel
    import anyconfig
    import fsspec

    from kedro.io.core import get_protocol_and_path

    protocol, _ = get_protocol_and_path(location)

    if fs_args:
        filesystem_kwargs = anyconfig.load(fs_args)
    else:
        filesystem_kwargs = {}

    try:
        return fsspec.filesystem(protocol, **filesystem_kwargs)
    except Exception as exc:  # pylint: disable=broad-except
        # Specified protocol is not supported by `fsspec`
        # or requires extra dependencies
        for message in (str(exc), "Trying to use 'pip download'..."):
            click.secho(message, fg="red")
        return None
def test_get_sync(tmpdir):
    """``get`` on a reference filesystem must materialise inline data, byte ranges and directories."""
    local = LocalFileSystem()

    backing = tmpdir / "file"
    backing.write_binary(b"0123456789")
    backing_path = str(backing)

    # "a" is inline data; "b" and "c/d" are (path, offset, size) references.
    references = {
        "a": b"data",
        "b": (backing_path, 0, 5),
        "c/d": (backing_path, 1, 6),
    }
    fs = fsspec.filesystem("reference", fo=references, fs=local)

    fs.get("a", str(tmpdir / "a"))
    assert (tmpdir / "a").read_binary() == b"data"

    fs.get("b", str(tmpdir / "b"))
    assert (tmpdir / "b").read_binary() == b"01234"

    fs.get("c/d", str(tmpdir / "d"))
    assert (tmpdir / "d").read_binary() == b"123456"

    fs.get("c", str(tmpdir / "c"), recursive=True)
    copied_dir = tmpdir / "c"
    assert copied_dir.isdir()
    assert (copied_dir / "d").read_binary() == b"123456"
def test_simple(jupyter):
    """Basic write / read / info / remove round trip through the jupyter filesystem."""
    url, served_dir = jupyter
    fs = fsspec.filesystem("jupyter", url=url)

    assert fs.ls("") == []

    # Write via pipe and confirm the file shows up in the served directory.
    fs.pipe("afile", b"data")
    assert fs.cat("afile") == b"data"
    assert "afile" in os.listdir(served_dir)

    # Write and read back through file-like objects.
    with fs.open("bfile", "wb") as writer:
        writer.write(b"more")
    with fs.open("bfile", "rb") as reader:
        assert reader.read() == b"more"

    assert fs.info("bfile")["size"] == 4

    fs.rm("afile")
    assert "afile" not in os.listdir(served_dir)
def test_compression_filesystems(compression_fs_class, gz_file, bz2_file,
                                 lz4_file, zstd_file, xz_file, text_file):
    """Each compression filesystem must expose one decompressed file matching the plain-text reference."""
    fixtures_by_protocol = {
        "gzip": gz_file,
        "xz": xz_file,
        "zstd": zstd_file,
        "bz2": bz2_file,
        "lz4": lz4_file
    }
    protocol = compression_fs_class.protocol
    input_path = str(fixtures_by_protocol[protocol])

    fs = fsspec.filesystem(protocol, fo=input_path)
    assert isinstance(fs, compression_fs_class)

    # The exposed member is the archive's basename without its final extension.
    archive_name = os.path.basename(input_path)
    last_dot = archive_name.rindex(".")
    expected_filename = archive_name[:last_dot]
    assert fs.ls("/") == [expected_filename]

    with fs.open(expected_filename, "r", encoding="utf-8") as decompressed:
        with open(text_file, encoding="utf-8") as reference:
            assert decompressed.read() == reference.read()