Example 1
def test_find():
    with temparchive(data) as archive_file:
        lhs = fsspec.filesystem("libarchive", fo=archive_file)

        assert lhs.find("") == ["a", "b", "deeply/nested/path"]
        assert lhs.find("", withdirs=True) == [
            "a",
            "b",
            "deeply/",
            "deeply/nested/",
            "deeply/nested/path",
        ]

        assert lhs.find("deeply") == ["deeply/nested/path"]
        assert lhs.find("deeply/") == lhs.find("deeply")
Example 2
def test_mapping_prefix(tmpdir):
    tmpdir = str(tmpdir)
    os.makedirs(os.path.join(tmpdir, "afolder"))
    open(os.path.join(tmpdir, "afile"), "w").write("test")
    open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")

    m = fsspec.get_mapper("file://" + tmpdir)
    assert "afile" in m
    assert m["afolder/anotherfile"] == b"test2"

    fs = fsspec.filesystem("file")
    m2 = fs.get_mapper(tmpdir)
    m3 = fs.get_mapper("file://" + tmpdir)

    assert m == m2 == m3
Example 3
def load_from_disk(
        dataset_path: str,
        fs=None,
        keep_in_memory: Optional[bool] = None) -> Union[Dataset, DatasetDict]:
    """
    Loads a dataset that was previously saved using :meth:`Dataset.save_to_disk` from a dataset directory, or
    from a filesystem using either :class:`datasets.filesystems.S3FileSystem` or any implementation of
    ``fsspec.spec.AbstractFileSystem``.

    Args:
        dataset_path (:obj:`str`): Path (e.g. `"dataset/train"`) or remote URI (e.g.
            `"s3://my-bucket/dataset/train"`) of the Dataset or DatasetDict directory where the dataset will be
            loaded from.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of the remote filesystem used to download the files from.
        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the dataset
            will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
            nonzero. See more details in the :ref:`load_dataset_enhancing_performance` section.

    Returns:
        :class:`Dataset` or :class:`DatasetDict`:
        - If `dataset_path` is a path of a dataset directory: the dataset requested.
        - If `dataset_path` is a path of a dataset dict directory: a ``datasets.DatasetDict`` with each split.
    """
    # gets filesystem from dataset, either s3:// or file:// and adjusted dataset_path
    if is_remote_filesystem(fs):
        dest_dataset_path = extract_path_from_uri(dataset_path)
    else:
        fs = fsspec.filesystem("file")
        dest_dataset_path = dataset_path

    if not fs.exists(dest_dataset_path):
        raise FileNotFoundError("Directory {} not found".format(dataset_path))
    if fs.isfile(
            Path(dest_dataset_path, config.DATASET_INFO_FILENAME).as_posix()):
        return Dataset.load_from_disk(dataset_path,
                                      fs,
                                      keep_in_memory=keep_in_memory)
    elif fs.isfile(
            Path(dest_dataset_path,
                 config.DATASETDICT_JSON_FILENAME).as_posix()):
        return DatasetDict.load_from_disk(dataset_path,
                                          fs,
                                          keep_in_memory=keep_in_memory)
    else:
        raise FileNotFoundError(
            "Directory {} is neither a dataset directory nor a dataset dict directory."
            .format(dataset_path))
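A minimal usage sketch for the loader above, assuming the same datasets version shown (the signature that still accepts an fs argument); the bucket name and paths are placeholders.

from datasets import load_from_disk
import s3fs  # only needed for the remote case

# Local directory previously written by Dataset.save_to_disk / DatasetDict.save_to_disk
ds = load_from_disk("dataset/train")

# Remote directory: pass an fsspec-compatible filesystem instance (placeholder bucket)
s3 = s3fs.S3FileSystem(anon=False)
ds_remote = load_from_disk("s3://my-bucket/dataset/train", fs=s3)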
Example 4
    def load_from_disk(dataset_dict_path: str,
                       fs=None,
                       keep_in_memory: Optional[bool] = None) -> "DatasetDict":
        """
        Load a dataset that was previously saved using :meth:`save_to_disk` from a filesystem using either
        :class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``.

        Args:
            dataset_dict_path (:obj:`str`): Path (e.g. ``"dataset/train"``) or remote URI (e.g.
                ``"s3//my-bucket/dataset/train"``) of the dataset dict directory where the dataset dict will be loaded
                from.
            fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
                Instance of the remote filesystem used to download the files from.
            keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
                dataset will not be copied in-memory unless explicitly enabled by setting
                `datasets.config.IN_MEMORY_MAX_SIZE` to nonzero. See more details in the
                :ref:`load_dataset_enhancing_performance` section.

        Returns:
            :class:`DatasetDict`
        """
        dataset_dict = DatasetDict()
        if is_remote_filesystem(fs):
            dest_dataset_dict_path = extract_path_from_uri(dataset_dict_path)
        else:
            fs = fsspec.filesystem("file")
            dest_dataset_dict_path = dataset_dict_path
        dataset_dict_json_path = Path(
            dest_dataset_dict_path,
            config.DATASETDICT_JSON_FILENAME).as_posix()
        dataset_info_path = Path(dest_dataset_dict_path,
                                 config.DATASET_INFO_FILENAME).as_posix()
        if fs.isfile(
                dataset_info_path) and not fs.isfile(dataset_dict_json_path):
            raise FileNotFoundError(
                f"No such file or directory: '{dataset_dict_json_path}'. Expected to load a DatasetDict object, but got a Dataset. Please use datasets.load_from_disk instead."
            )
        for k in json.load(
                fs.open(dataset_dict_json_path, "r",
                        encoding="utf-8"))["splits"]:
            dataset_dict_split_path = (
                dataset_dict_path.split("://")[0] + "://" +
                Path(dest_dataset_dict_path, k).as_posix()
                if is_remote_filesystem(fs) else Path(dest_dataset_dict_path,
                                                      k).as_posix())
            dataset_dict[k] = Dataset.load_from_disk(
                dataset_dict_split_path, fs, keep_in_memory=keep_in_memory)
        return dataset_dict
Example 5
    def _determine_dims(self):
        logger.debug("open mappers")

        # If self.path is a list of dictionaries, pass them directly to fsspec.filesystem
        import collections.abc
        if isinstance(self.path[0], collections.abc.Mapping):
            fo_list = self.path

        # If self.path is list of files, open the files and load the json as a dictionary
        else:
            with fsspec.open_files(self.path, **self.storage_options) as ofs:
                fo_list = [json.load(of) for of in ofs]

        fss = [
            fsspec.filesystem("reference",
                              fo=fo,
                              remote_protocol=self.remote_protocol,
                              remote_options=self.remote_options)
            for fo in fo_list
        ]
        self.fs = fss[0].fs
        mappers = [fs.get_mapper("") for fs in fss]

        logger.debug("open first two datasets")
        xr_kwargs_copy = self.xr_kwargs.copy()

        # Add consolidated=False to xr kwargs if not explicitly given by user
        # needed to suppress zarr open warnings
        if (version.parse(xr.__version__) >= version.parse("0.19.0")
                and 'consolidated' not in xr_kwargs_copy):
            xr_kwargs_copy['consolidated'] = False

        dss = [
            xr.open_dataset(m, engine="zarr", chunks={}, **xr_kwargs_copy)
            for m in mappers[:2]
        ]

        if self.preprocess:
            logger.debug("preprocess")
            dss = [self.preprocess(d) for d in dss]
        logger.debug("concat")
        ds = xr.concat(dss, **self.concat_kwargs)
        ds0 = dss[0]
        self.extra_dims = set(ds.dims) - set(ds0.dims)
        self.concat_dims = set(k for k, v in ds.dims.items()
                               if k in ds0.dims and v / ds0.dims[k] == 2)
        self.same_dims = set(ds.dims) - self.extra_dims - self.concat_dims
        return ds, ds0, fss
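For context, a minimal sketch of the "reference" filesystem that the mappers above are built on: fo may be a dict of references (or the path of a JSON file containing them); the inline data, file path, and byte range below are placeholders.

import fsspec

# Each reference maps a key to inline bytes or to a (url, offset, length) tuple
refs = {
    "a": b"inline data",
    "b": ("/tmp/some_local_file.bin", 0, 5),  # placeholder file and byte range
}
fs = fsspec.filesystem("reference", fo=refs, remote_protocol="file")
assert fs.cat("a") == b"inline data"
mapper = fs.get_mapper("")  # usable with xr.open_dataset(mapper, engine="zarr")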
Example 6
def get_nudging_assets(
        run_duration: timedelta,
        current_date: Sequence[int],
        nudge_path: str,
        nudge_filename_pattern: str = "%Y%m%d_%HZ_T85LR.nc",
        copy_method: str = "copy",
        nudge_interval: timedelta = timedelta(hours=6),
) -> List[Mapping]:
    """Return list of assets of nudging files required for given run duration and
    start time.
    
    This method defines file paths directly from its arguments, without
    determining whether the files themselves are present.
    
    Args:
        run_duration: length of fv3gfs run
        current_date: start time of fv3gfs run as a sequence of 6 integers
        nudge_path: local or remote path to nudging files
        nudge_filename_pattern: template for nudging filenames. Pattern should follow
            style of datetime strptime and strftime 'format' argument. Defaults to
            '%Y%m%d_%HZ_T85LR.nc'.
        copy_method: copy_method for nudging file assets. Defaults to 'copy'.
        nudge_interval: time between nudging files. Must be multiple of 1 hour.
            Defaults to 6 hours.

    Returns:
        list of all assets required for nudging run

    Raises:
        ConfigError: if copy_method is "link" and a remote path is given for nudge_path
    """
    if get_fs(nudge_path) != fsspec.filesystem(
            "file") and copy_method == "link":
        raise ConfigError(
            "Cannot link nudging files if using remote path for nudge_path. "
            f"Got {nudge_path}.")
    time_list = _get_nudge_time_list(run_duration, current_date,
                                     nudge_interval)
    filename_list = [
        time.strftime(nudge_filename_pattern) for time in time_list
    ]
    nudging_assets = [
        get_asset_dict(nudge_path,
                       file_,
                       target_location="INPUT",
                       copy_method=copy_method) for file_ in filename_list
    ]
    return nudging_assets
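A hypothetical call to the helper above; the dates and bucket path are placeholders. With a remote nudge_path the default copy_method="copy" must be kept, since "link" raises ConfigError.

from datetime import timedelta

assets = get_nudging_assets(
    run_duration=timedelta(hours=12),
    current_date=[2016, 8, 1, 0, 0, 0],        # year, month, day, hour, minute, second
    nudge_path="gs://my-bucket/nudging-data",  # placeholder remote path
    nudge_interval=timedelta(hours=6),
)
# -> asset dicts for the 6-hourly nudging files spanning the 12-hour run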
Example 7
    def __init__(
        self,
        filepath: str,
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
        credentials: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
        layer: str = None,
    ) -> None:
        """
        Creates a new instance of ``BioSequenceDataSet`` pointing
        to a concrete filepath.

        Args:
            filepath: path to sequence file prefixed with a protocol like `s3://`.
                If prefix is not provided, `file` protocol (local filesystem) will be used.
                The prefix should be any protocol supported by ``fsspec``.
            load_args: Options for parsing sequence files by Biopython ``SeqIO.parse()``.
            save_args: file format supported by Biopython ``SeqIO.write()``.
                E.g. `{"format": "fasta"}`.
            credentials: Credentials required to get access to the underlying filesystem.
                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
            fs_args: Extra arguments to pass into underlying filesystem class.
                E.g. for ``GCSFileSystem`` class: `{"project": "my-project", ...}`.
            layer: The data layer according to the data engineering convention:
                https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#what-is-data-engineering-convention

        Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO
        """

        _fs_args = deepcopy(fs_args) or {}
        _credentials = deepcopy(credentials) or {}

        protocol, path = get_protocol_and_path(filepath)

        self._layer = layer
        self._filepath = PurePosixPath(path)
        self._protocol = protocol
        self._fs = fsspec.filesystem(self._protocol, **_credentials,
                                     **_fs_args)

        # Handle default load and save arguments
        self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)
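Two hypothetical instantiations following the docstring above; the paths and credentials are placeholders.

# Local FASTA file: the "file" protocol is inferred, no credentials needed
local_ds = BioSequenceDataSet(
    filepath="data/01_raw/sequences.fasta",
    load_args={"format": "fasta"},
    save_args={"format": "fasta"},
)

# Remote file: credentials are forwarded to the underlying fsspec filesystem (here s3fs)
remote_ds = BioSequenceDataSet(
    filepath="s3://my-bucket/sequences.fasta",
    credentials={"key": "<aws-access-key>", "secret": "<aws-secret-key>"},
    load_args={"format": "fasta"},
)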
Example 8
def test_directory_mem_nest():
    import fsspec
    m = fsspec.filesystem("memory")
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'
    write('/dir/field=a/foo1.parquet', df, open_with=m.open)
    write('/dir/field=b/foo2.parquet', df, open_with=m.open)

    pf = ParquetFile("/dir", fs=m)
    assert pf.info['rows'] == 8
    assert pf.to_pandas()['z'].tolist() == ['a', 'b', 'c', 'd'] * 2
    assert pf.to_pandas()['field'].tolist() == ['a'] * 4 + ['b'] * 4
Example 9
    def get_filesystem(self, silent=True):
        """return fsspec file system object, if supported"""
        if self._filesystem:
            return self._filesystem

        try:
            import gcsfs  # noqa
        except ImportError as exc:
            if not silent:
                raise ImportError(
                    f"Google gcsfs not installed, run pip install gcsfs, {exc}"
                )
            return None

        self._filesystem = fsspec.filesystem("gcs", **self.get_storage_options())
        return self._filesystem
Example 10
def test_detect_folders(tmpdir):

    fs = fsspec.filesystem("file")

    rundirs = ["rundir1", "rundir2"]
    for rdir in rundirs:
        tmpdir.mkdir(rdir).join("diags.nc").write("foobar")

    tmpdir.mkdir("not_a_rundir").join("useless_file.txt").write("useless!")

    result = detect_folders(tmpdir, fs)

    assert len(result) == 2
    for found_dir in result:
        assert found_dir in rundirs
        assert isinstance(result[found_dir], DiagnosticFolder)
Example 11
def local_filecache():
    import tempfile

    original_location = tempfile.mkdtemp()
    cache_location = tempfile.mkdtemp()
    original_file = os.path.join(original_location, "afile")
    data = b"test data"
    with open(original_file, "wb") as f:
        f.write(data)

    # we can access the file and read it
    fs = fsspec.filesystem("filecache",
                           target_protocol="file",
                           cache_storage=cache_location)

    return (data, original_file, cache_location, fs)
Example 12
def test_blocksize(ftp_writable):
    host, port, user, pw = ftp_writable
    fs = FTPFileSystem(host, port, user, pw)
    with fs.open("/out_block", "wb") as f:
        f.write(b"test" * 4000)

    fs = fsspec.filesystem(
        "blockcache",
        target_protocol="ftp",
        target_options={"host": host, "port": port, "username": user, "password": pw},
    )

    with fs.open("/out_block", block_size=20) as f:
        assert f.read(1) == b"t"
    with pytest.raises(ValueError):
        fs.open("/out_block", block_size=30)
Example 13
    async def _():
        loop = asyncio.get_event_loop()
        fs = fsspec.filesystem("http", asynchronous=True, loop=loop)

        # fails because client creation has not yet been awaited
        assert isinstance(
            (await fs._cat([server + "/index/realfile"]))[0], RuntimeError
        )
        with pytest.raises(RuntimeError):
            fs.cat([server + "/index/realfile"])

        await fs.set_session()  # creates client

        out = await fs._cat([server + "/index/realfile"])
        del fs
        assert out == [data]
Example 14
    def run(  # type: ignore
        self,
        data: pd.DataFrame,
        output_dir: str,
        filesystem: Optional[str] = "file",
        mode: Optional[str] = "model",
    ):
        """Save the game data.

        Saves the data to ``output_dir/data_{GameID}.csv``

        Parameters
        ----------
        data : pd.DataFrame
            The clean data.
        output_dir : str
            The directory containing the data.
        filesystem : str, optional (default "file")
            The name of the ``fsspec`` filesystem to use.
        mode : str, optional (default "model")
            The type of clean data to save. If ``model``, save to the directory
            ``model-data``. If ``rating``, save to ``rating-data``.

        Returns
        -------
        None
        """
        # Define subdirectory
        if mode == "model":
            subdir = "model-data"
        elif mode == "rating":
            subdir = "rating-data"
        else:
            raise ValueError("Please supply a valid value for ``mode``")
        # Get the filesystem
        fs = fsspec.filesystem(filesystem)
        fs.mkdirs(Path(output_dir, subdir), exist_ok=True)
        grouped = data.groupby("GAME_ID")
        for name, group in grouped:
            if not name.startswith("002"):
                self.logger.warning(
                    f"{name} is not a regular season game. Skipping...")
                continue
            fpath = Path(output_dir, subdir, f"data_{name}.csv")
            self.logger.info(f"Writing data for game {name} to {str(fpath)}")
            with fs.open(fpath, "wb") as buf:
                group.to_csv(buf, sep="|", mode="wb")
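The fsspec pattern used by run() above (create the target directory, then stream the CSV through fs.open), reduced to a minimal local-only sketch with placeholder names.

import fsspec
import pandas as pd
from pathlib import Path

fs = fsspec.filesystem("file")
output_dir, subdir = "output", "model-data"   # placeholder locations
fs.mkdirs(Path(output_dir, subdir), exist_ok=True)

df = pd.DataFrame({"GAME_ID": ["0021900001"], "PTS": [101]})
fpath = Path(output_dir, subdir, "data_0021900001.csv")
with fs.open(fpath, "w") as buf:              # text mode keeps to_csv simple
    df.to_csv(buf, sep="|", index=False)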
Example 15
def test_add_file_to_cache_after_save(local_filecache):
    (data, original_file, cache_location, fs) = local_filecache

    fs.save_cache()

    fs.cat(original_file)
    assert len(fs.cached_files[-1]) == 1

    fs.save_cache()

    fs2 = fsspec.filesystem(
        "filecache",
        target_protocol="file",
        cache_storage=cache_location,
        skip_instance_cache=True,  # cache is masking the issue
    )
    assert len(fs2.cached_files[-1]) == 1
Example 16
    def test_raw_file_distributed_serializable(self):
        from distributed.protocol import deserialize
        from distributed.protocol import serialize

        # Arrange
        fs = fsspec.filesystem("file")
        path = TEST_DATA_DIR + "/betfair/1.166811431.bz2"
        r = RawFile(open_file=fs.open(path=path, compression="bz2"))

        # Act
        result1: RawFile = deserialize(*serialize(r))

        # Assert
        assert result1.open_file.fs == r.open_file.fs
        assert result1.open_file.path == r.open_file.path
        assert result1.block_size == r.block_size
        assert result1.open_file.compression == "bz2"
Example 17
    def __init__(self, filepath: str, version: Version = None):
        """Creates a new instance of ImageDataSet to load / save image data for given filepath.

        Args:
            filepath: The location of the image file to load / save data.
            version: The version of the dataset being saved and loaded.
        """
        protocol, path = get_protocol_and_path(filepath)
        self._protocol = protocol
        self._fs = fsspec.filesystem(self._protocol)

        super().__init__(
            filepath=PurePosixPath(path),
            version=version,
            exists_function=self._fs.exists,
            glob_function=self._fs.glob,
        )
Example 18
def test_cached_open_close_read(ftp_writable):
    # Regression test for <https://github.com/fsspec/filesystem_spec/issues/799>
    host, port, user, pw = ftp_writable
    fs = FTPFileSystem(host, port, user, pw)
    with fs.open("/out_block", "wb") as f:
        f.write(b"test" * 4000)
    fs = fsspec.filesystem(
        "cached",
        target_protocol="ftp",
        target_options={"host": host, "port": port, "username": user, "password": pw},
    )
    with fs.open("/out_block", block_size=1024) as f:
        pass
    with fs.open("/out_block", block_size=1024) as f:
        assert f.read(1) == b"t"
    # Regression test for <https://github.com/fsspec/filesystem_spec/issues/845>
    assert fs.cached_files[-1]["/out_block"]["blocks"] == {0}
Example 19
    def __init__(self, path=None, fs_protocol=None):
        """
        Initialize a new instance of the ``DataCatalog`` class.

        Parameters
        ----------
        path : str
            The root path to the data.
        fs_protocol : str
            The file system protocol to use.

        """
        self.fs = fsspec.filesystem(
            fs_protocol
            or os.environ.get("NAUTILUS_BACKTEST_FS_PROTOCOL", "file"))
        self.root = pathlib.Path(path or os.environ["NAUTILUS_BACKTEST_DIR"])
        self._processed_files_fn = f"{self.root}/.processed_raw_files.json"
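A hypothetical construction of the catalog above, passing the path and protocol explicitly instead of relying on the NAUTILUS_BACKTEST_DIR / NAUTILUS_BACKTEST_FS_PROTOCOL environment variables.

catalog = DataCatalog(path="/data/backtests", fs_protocol="file")
# catalog.fs is a local fsspec filesystem and catalog.root == pathlib.Path("/data/backtests");
# the processed-file index is tracked at /data/backtests/.processed_raw_files.json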
Example 20
def load_from_disk(
        dataset_path: str,
        fs=None,
        keep_in_memory: Optional[bool] = None) -> Union[Dataset, DatasetDict]:
    """
    Loads a dataset that was previously saved using ``dataset.save_to_disk(dataset_path)`` from a dataset directory, or from a filesystem using either :class:`datasets.filesystems.S3FileSystem` or any implementation of ``fsspec.spec.AbstractFileSystem``.

    Args:
        dataset_path (:obj:`str`): Path (e.g. ``"dataset/train"``) or remote URI (e.g.
            ``"s3://my-bucket/dataset/train"``) of the Dataset or DatasetDict directory where the dataset will be
            loaded from.
        fs (:class:`~filesystems.S3FileSystem` or ``fsspec.spec.AbstractFileSystem``, optional, default ``None``):
            Instance of the remote filesystem used to download the files from.
        keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
            dataset will be copied in-memory if its size is smaller than
            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled by
            setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case the dataset is
            not loaded in memory.

    Returns:
        ``datasets.Dataset`` or ``datasets.DatasetDict``
            if `dataset_path` is a path of a dataset directory: the dataset requested,
            if `dataset_path` is a path of a dataset dict directory: a ``datasets.DatasetDict`` with each split.
    """
    # gets filesystem from dataset, either s3:// or file:// and adjusted dataset_path
    if is_remote_filesystem(fs):
        dest_dataset_path = extract_path_from_uri(dataset_path)
    else:
        fs = fsspec.filesystem("file")
        dest_dataset_path = dataset_path

    if not fs.exists(dest_dataset_path):
        raise FileNotFoundError("Directory {} not found".format(dataset_path))
    if fs.isfile(Path(dest_dataset_path, "dataset_info.json").as_posix()):
        return Dataset.load_from_disk(dataset_path,
                                      fs,
                                      keep_in_memory=keep_in_memory)
    elif fs.isfile(Path(dest_dataset_path, "dataset_dict.json").as_posix()):
        return DatasetDict.load_from_disk(dataset_path,
                                          fs,
                                          keep_in_memory=keep_in_memory)
    else:
        raise FileNotFoundError(
            "Directory {} is neither a dataset directory nor a dataset dict directory."
            .format(dataset_path))
Example 21
def pull_package(package_path, env, alias):
    """Pull a modular pipeline package, unpack it and install the files to corresponding
    locations.
    """
    # pylint: disable=import-outside-toplevel
    import fsspec

    from kedro.io.core import get_protocol_and_path

    protocol, _ = get_protocol_and_path(package_path)
    filesystem = fsspec.filesystem(protocol)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir).resolve()
        if package_path.endswith(".whl") and filesystem.exists(package_path):
            with filesystem.open(package_path) as fs_file:
                ZipFile(fs_file).extractall(temp_dir_path)
        else:
            python_call(
                "pip",
                [
                    "download", "--no-deps", "--dest",
                    str(temp_dir_path), package_path
                ],
            )
            wheel_file = list(temp_dir_path.glob("*.whl"))
            # `--no-deps` should fetch only one wheel file, and CLI should fail if that's
            # not the case.
            if len(wheel_file) != 1:
                file_names = [wf.name for wf in wheel_file]
                raise KedroCliError(
                    f"More than 1 or no wheel files found: {str(file_names)}. "
                    "There has to be exactly one distribution file.")
            ZipFile(wheel_file[0]).extractall(temp_dir_path)

        dist_info_file = list(temp_dir_path.glob("*.dist-info"))
        if len(dist_info_file) != 1:
            raise KedroCliError(
                f"More than 1 or no dist-info files found from {package_path}. "
                "There has to be exactly one dist-info directory.")
        # Extract package name, based on the naming convention for wheel files
        # https://www.python.org/dev/peps/pep-0427/#file-name-convention
        package_name = dist_info_file[0].stem.split("-")[0]

        _clean_pycache(temp_dir_path)
        _install_files(package_name, temp_dir_path, env, alias)
Example 22
def temporary_directory(
    suffix: Optional[str] = None,
    prefix: Optional[str] = None,
    dir: Optional[PathType] = None,
    storage_options: Optional[Dict[str, str]] = None,
) -> Iterator[str]:
    """Create a temporary directory in a fsspec filesystem.

    Parameters
    ----------
    suffix : Optional[str], optional
        If not None, the name of the temporary directory will end with that suffix.
    prefix : Optional[str], optional
        If not None, the name of the temporary directory will start with that prefix.
    dir : Optional[PathType], optional
        If not None, the temporary directory will be created in that directory, otherwise
        the local filesystem directory returned by `tempfile.gettempdir()` will be used.
        The directory may be specified as any fsspec URL.
    storage_options : Optional[Dict[str, str]], optional
        Any additional parameters for the storage backend (see `fsspec.open`).

    Yields
    -------
    Generator[str, None, None]
        A context manager yielding the fsspec URL to the created directory.
    """

    # Fill in defaults
    suffix = suffix or ""
    prefix = prefix or ""
    dir = dir or tempfile.gettempdir()
    storage_options = storage_options or {}

    # Find the filesystem by looking at the URL scheme (protocol), empty means local filesystem
    protocol = urlparse(str(dir)).scheme
    fs = fsspec.filesystem(protocol, **storage_options)

    # Construct a random directory name
    tempdir = build_url(dir, prefix + str(uuid.uuid4()) + suffix)
    try:
        fs.mkdir(tempdir)
        yield tempdir
    finally:
        # Remove the temporary directory on exiting the context manager
        fs.rm(tempdir, recursive=True)
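Hypothetical usage, assuming the generator above is exposed as a context manager (e.g. wrapped with contextlib.contextmanager, as the Yields section suggests); the prefix and file name are placeholders.

import fsspec

with temporary_directory(prefix="scratch-") as tmp_url:
    # tmp_url is an fsspec URL; any fsspec-aware API can write into it
    with fsspec.open(f"{tmp_url}/hello.txt", "w") as f:
        f.write("hi")
# the directory and its contents are removed when the block exits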
Example 23
    def compute(self, url, filename="index.html"):
        """Compute reports_by_run index from all reports found at url.

        Args:
            url: path to directory containing report subdirectories.
            filename: name of report html files.

        Note:
            Reports are assumed to be located at {url}/*/{filename}.
        """
        loop = asyncio.get_event_loop()
        if url.startswith("gs://"):
            fs = gcsfs.GCSFileSystem(asynchronous=True)
        else:
            fs = fsspec.filesystem("file")
        self.reports_by_run = loop.run_until_complete(
            self._get_reports(fs, url, filename))
        loop.run_until_complete(_close_session(fs))
Example 24
def test_blocksize(ftp_writable):
    host, port, user, pw = ftp_writable
    fs = FTPFileSystem(host, port, user, pw)
    with fs.open('/out', 'wb') as f:
        f.write(b'test')

    fs = fsspec.filesystem('cached',
                           target_protocol='ftp',
                           target_options={
                               'host': host,
                               'port': port,
                               'username': user,
                               'password': pw
                           })

    assert fs.cat('/out') == b'test'
    with pytest.raises(ValueError):
        fs.open('/out', block_size=1)
Example 25
def transform(date: dt.datetime):
    fs = fsspec.filesystem("az", **settings.auth.dict())
    spinner = Halo()

    spinner.start("Fetching raw data")
    df = transform_data.load_raw_data(settings.listing_location(date), fs=fs)
    spinner.succeed("Fetched raw data!")

    spinner.start("Calculating feature")
    df = transform_data.calculate_beds_per_accommodates(df)
    df = transform_data.add_date(df, date)
    spinner.succeed("Feature calculated")

    spinner.start("Uploading feature")
    transform_data.upload_feature("beds_per_accommodates", df, fs=fs)
    spinner.succeed("Feature uploaded")

    click.secho("Data is transformed!")
Example 26
def test_cats():
    with filetexts(csv_files, mode="b"):
        fs = fsspec.filesystem("file")
        assert fs.cat(".test.fakedata.1.csv") == b"a,b\n" b"1,2\n"
        out = set(fs.cat([".test.fakedata.1.csv", ".test.fakedata.2.csv"]).values())
        assert out == {b"a,b\n" b"1,2\n", b"a,b\n" b"3,4\n"}
        assert fs.cat(".test.fakedata.1.csv", None, None) == b"a,b\n" b"1,2\n"
        assert fs.cat(".test.fakedata.1.csv", start=1, end=6) == b"a,b\n" b"1,2\n"[1:6]
        assert fs.cat(".test.fakedata.1.csv", start=-1) == b"a,b\n" b"1,2\n"[-1:]
        assert (
            fs.cat(".test.fakedata.1.csv", start=1, end=-2) == b"a,b\n" b"1,2\n"[1:-2]
        )
        out = set(
            fs.cat(
                [".test.fakedata.1.csv", ".test.fakedata.2.csv"], start=1, end=-1
            ).values()
        )
        assert out == {b"a,b\n" b"1,2\n"[1:-1], b"a,b\n" b"3,4\n"[1:-1]}
Example 27
def _get_fsspec_filesystem(location: str, fs_args: Optional[str]):
    # pylint: disable=import-outside-toplevel
    import anyconfig
    import fsspec

    from kedro.io.core import get_protocol_and_path

    protocol, _ = get_protocol_and_path(location)
    fs_args_config = anyconfig.load(fs_args) if fs_args else {}

    try:
        return fsspec.filesystem(protocol, **fs_args_config)
    except Exception as exc:  # pylint: disable=broad-except
        # Specified protocol is not supported by `fsspec`
        # or requires extra dependencies
        click.secho(str(exc), fg="red")
        click.secho("Trying to use 'pip download'...", fg="red")
        return None
Example 28
def test_get_sync(tmpdir):
    localfs = LocalFileSystem()

    real = tmpdir / "file"
    real.write_binary(b"0123456789")

    refs = {"a": b"data", "b": (str(real), 0, 5), "c/d": (str(real), 1, 6)}
    fs = fsspec.filesystem("reference", fo=refs, fs=localfs)

    fs.get("a", str(tmpdir / "a"))
    assert (tmpdir / "a").read_binary() == b"data"
    fs.get("b", str(tmpdir / "b"))
    assert (tmpdir / "b").read_binary() == b"01234"
    fs.get("c/d", str(tmpdir / "d"))
    assert (tmpdir / "d").read_binary() == b"123456"
    fs.get("c", str(tmpdir / "c"), recursive=True)
    assert (tmpdir / "c").isdir()
    assert (tmpdir / "c" / "d").read_binary() == b"123456"
Example 29
def test_simple(jupyter):
    url, d = jupyter
    fs = fsspec.filesystem("jupyter", url=url)
    assert fs.ls("") == []

    fs.pipe("afile", b"data")
    assert fs.cat("afile") == b"data"
    assert "afile" in os.listdir(d)

    with fs.open("bfile", "wb") as f:
        f.write(b"more")
    with fs.open("bfile", "rb") as f:
        assert f.read() == b"more"

    assert fs.info("bfile")["size"] == 4
    fs.rm("afile")

    assert "afile" not in os.listdir(d)
Example 30
def test_compression_filesystems(compression_fs_class, gz_file, bz2_file,
                                 lz4_file, zstd_file, xz_file, text_file):
    input_paths = {
        "gzip": gz_file,
        "xz": xz_file,
        "zstd": zstd_file,
        "bz2": bz2_file,
        "lz4": lz4_file
    }
    input_path = str(input_paths[compression_fs_class.protocol])
    fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path)
    assert isinstance(fs, compression_fs_class)
    expected_filename = os.path.basename(input_path)
    expected_filename = expected_filename[:expected_filename.rindex(".")]
    assert fs.ls("/") == [expected_filename]
    with fs.open(expected_filename, "r", encoding="utf-8") as f, open(
            text_file, encoding="utf-8") as expected_file:
        assert f.read() == expected_file.read()