def test_cached_write(protocol):
    d = tempfile.mkdtemp()
    with fsspec.open_files(
        f"{protocol}::file://{d}/*.out", mode="wb", num=2
    ) as files:
        for f in files:
            f.write(b"data")

    assert sorted(os.listdir(d)) == ["0.out", "1.out"]

def _load(self, reload=False):
    """Load text of catalog file and pass to parse

    Will do nothing if auto-reload is off and reload is not explicitly
    requested
    """
    if self.access is False:
        # skip first load, if cat has given name (i.e., is subcat)
        self.updated = 0
        self.access = True
        return
    if self.autoreload or reload:
        # First, we load from YAML, failing if syntax errors are found
        options = self.storage_options or {}
        if hasattr(self.path, 'path') or hasattr(self.path, 'read'):
            file_open = self.path
            self.path = make_path_posix(
                getattr(self.path, 'path', getattr(self.path, 'name', 'file')))
        elif self.filesystem is None:
            file_open = open_files(self.path, mode='rb', **options)
            assert len(file_open) == 1
            file_open = file_open[0]
            self.filesystem = file_open.fs
        else:
            file_open = self.filesystem.open(self.path, mode='rb')
        self._dir = get_dir(self.path)

        with file_open as f:
            text = f.read().decode()
        if "!template " in text:
            logger.warning("Use of '!template' deprecated - fixing")
            text = text.replace('!template ', '')
        self.parse(text)

def jsonl_file(request, tmp_path) -> str:
    data = [{"hello": "world"}, [1, 2, 3]]
    file_path = str(tmp_path / "1.jsonl")
    file_path += EXTENSIONS.get(request.param, "")
    with open_files([file_path], mode="wt", compression=request.param)[0] as f:
        f.write("\n".join(json.dumps(row) for row in data))
    return file_path

def test_multizarr(generate_mzz):
    """Test creating a combined reference file with MultiZarrToZarr"""
    mzz = generate_mzz
    test_dict = mzz.translate()

    m = fsspec.get_mapper(
        "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
    )
    ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))

    with fsspec.open_files(urls, **so) as fs:
        expts = [xr.open_dataset(f, engine="h5netcdf") for f in fs]
        expected = xr.concat(expts, dim="time").drop_vars("crs")

        assert set(ds) == set(expected)
        for name in ds:
            exp = {
                k: (v.tolist() if isinstance(v, np.ndarray) else v)
                for k, v in expected[name].attrs.items()
            }
            assert dict(ds[name].attrs) == exp
        for coo in ds.coords:
            if ds[coo].dtype.kind == "M":
                assert (
                    ds[coo].values - expected[coo].values
                    < np.array([1], dtype="<m8[ms]")
                ).all()
            else:
                assert np.allclose(ds[coo].values, expected[coo].values)

def _get_schema(self):
    from fsspec import open_files
    import dask.array as da
    if self._arr is None:
        path = self._get_cache(self.path)[0]

        files = open_files(path, 'rb', compression=None, **self.storage)
        if self.shape is None:
            arr = NumpyAccess(files[0])
            self.shape = arr.shape
            self.dtype = arr.dtype
            arrs = [arr] + [
                NumpyAccess(f, self.shape, self.dtype, offset=arr.offset)
                for f in files[1:]
            ]
        else:
            arrs = [NumpyAccess(f, self.shape, self.dtype) for f in files]
        self.chunks = (self._chunks, ) + (-1, ) * (len(self.shape) - 1)
        self._arrs = [da.from_array(arr, self.chunks) for arr in arrs]

        if len(self._arrs) > 1:
            self._arr = da.stack(self._arrs)
        else:
            self._arr = self._arrs[0]
        self.chunks = self._arr.chunks
    return Schema(dtype=str(self.dtype),
                  shape=self.shape,
                  extra_metadata=self.metadata,
                  npartitions=self._arr.npartitions,
                  chunks=self.chunks)

def json_file(request, tmp_path) -> str:
    data = {"hello": "world"}
    file_path = str(tmp_path / "1.json")
    file_path += EXTENSIONS.get(request.param, "")
    with open_files([file_path], mode="wt", compression=request.param)[0] as f:
        f.write(json.dumps(data))
    return file_path

def _load_object_detection_api(self, model_spec: ObjectDetectionAPI_ModelSpec):
    import tensorflow as tf
    from object_detection.utils import config_util
    from object_detection.builders import model_builder

    temp_dir = tempfile.TemporaryDirectory()
    temp_dir_path = Path(temp_dir.name)
    model_config_path = temp_dir_path / Pathy(model_spec.config_path).name
    with open(model_config_path, 'wb') as out:
        with fsspec.open(model_spec.config_path, 'rb') as src:
            out.write(src.read())
    src_checkpoint_path = Pathy(model_spec.checkpoint_path)
    checkpoint_path = temp_dir_path / src_checkpoint_path.name
    for src_file in fsspec.open_files(f"{src_checkpoint_path}*", 'rb'):
        out_file = temp_dir_path / Pathy(src_file.path).name
        with open(out_file, 'wb') as out:
            with src_file as src:
                out.write(src.read())

    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path=str(model_config_path)
    )
    model_config = configs['model']
    self.model = model_builder.build(
        model_config=model_config, is_training=False
    )
    ckpt = tf.compat.v2.train.Checkpoint(model=self.model)
    ckpt.restore(str(checkpoint_path)).expect_partial()
    self.input_dtype = np.float32

    # Run model through a dummy image so that variables are created
    zeros = np.zeros([640, 640, 3])
    self._raw_predict_single_image_default(zeros)

    temp_dir.cleanup()

def __init__(self, fo="", mode="r", **storage_options):
    """
    Parameters
    ----------
    fo: str or file-like
        Contains ZIP, and must exist. If a str, will fetch file using
        `open_files()`, which must return one file exactly.
    mode: str
        Currently, only 'r' accepted
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters for requests
    """
    if self._cached:
        return
    AbstractFileSystem.__init__(self)
    if mode != "r":
        raise ValueError("Only read from zip files accepted")
    self.in_fo = fo
    if isinstance(fo, str):
        files = open_files(fo)
        if len(files) != 1:
            raise ValueError('Path "{}" did not resolve to exactly '
                             'one file: "{}"'.format(fo, files))
        fo = files[0]
    self.fo = fo.__enter__()  # the whole instance is a context
    self.zip = zipfile.ZipFile(self.fo)
    self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE)
    self.dir_cache = None

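# A minimal usage sketch of the zip filesystem documented above (assumption: a local
# "archive.zip" exists); the chained-URL form mirrors the test_chained_fo example
# later in this collection, with the string `fo` resolved through open_files().
import fsspec

with fsspec.open_files("zip://*::file://archive.zip") as files:
    for f in files:
        print(f.read())
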
def _load_metadata(self):
    import dask.dataframe as dd
    import dask.delayed
    from fsspec import open_files
    self.files = open_files(self.url, **self.storage_options)

    def read_a_file(open_file, reader, kwargs):
        with open_file as of:
            df = reader(of, **kwargs)
            df['path'] = open_file.path
            return df

    if self.dataframe is None:
        self.parts = [
            dask.delayed(read_a_file)(open_file, self.reader, self.kwargs)
            for open_file in self.files
        ]
        self.dataframe = dd.from_delayed(self.parts)
        self.npartitions = self.dataframe.npartitions
        self.shape = (None, len(self.dataframe.columns))
        self.dtype = self.dataframe.dtypes.to_dict()
        self._schema = Schema(npartitions=self.npartitions,
                              extra_metadata=self.metadata,
                              dtype=self.dtype,
                              shape=self.shape,
                              datashape=None)
    return self._schema

def _fsfiles_for_s3(input_filenames):
    """Convert S3 URLs to something Satpy can understand and use.

    Examples:
        Example S3 URLs (no caching):

        .. code-block:: bash

            polar2grid.sh ... -f s3://noaa-goes16/ABI-L1b-RadC/2019/001/17/*_G16_s20190011702186*

        Example S3 URLs using fsspec caching:

        .. code-block:: bash

            polar2grid.sh ... -f simplecache::s3://noaa-goes16/ABI-L1b-RadC/2019/001/17/*_G16_s20190011702186*

    """
    import fsspec
    from satpy.readers import FSFile

    kwargs = {"anon": True}
    if "simplecache::" in input_filenames[0]:
        kwargs = {"s3": kwargs}
    for open_file in fsspec.open_files(input_filenames, **kwargs):
        yield FSFile(open_file)

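# Hedged sketch of why the kwargs are re-nested above (requires s3fs and network
# access, so it is not executed here): for a plain s3:// URL the options go straight
# to the S3 filesystem, but for a chained "simplecache::s3://" URL each protocol's
# options must live under its own key.
import fsspec

plain = fsspec.open_files(
    "s3://noaa-goes16/ABI-L1b-RadC/2019/001/17/*", anon=True
)
cached = fsspec.open_files(
    "simplecache::s3://noaa-goes16/ABI-L1b-RadC/2019/001/17/*",
    s3={"anon": True},
)
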
def test_multi_cache_chain(protocol):
    import zipfile

    d = tempfile.mkdtemp()
    fn = os.path.join(d, "test.zip")
    zipfile.ZipFile(fn, mode="w").open("test", "w").write(b"hello")

    with fsspec.open_files(f"zip://test::{protocol}::file://{fn}") as files:
        assert d not in files[0]._fileobj._file.name
        assert files[0].read() == b"hello"

    # special test contains "file:" string
    fn = os.path.join(d, "file.zip")
    zipfile.ZipFile(fn, mode="w").open("file", "w").write(b"hello")
    with fsspec.open_files(f"zip://file::{protocol}::file://{fn}") as files:
        assert d not in files[0]._fileobj._file.name
        assert files[0].read() == b"hello"

def _filenames_to_fsfile(filenames, storage_options):
    import fsspec
    from satpy.readers import FSFile

    if filenames:
        fsspec_files = fsspec.open_files(filenames, **storage_options)
        return [FSFile(f) for f in fsspec_files]
    return []

def _make_files(self, urlpath, **kwargs):
    import tempfile
    d = tempfile.mkdtemp()
    from fsspec import open_files
    self._ensure_cache_dir()
    self._urlpath = urlpath
    files_in = open_files(urlpath, 'rb', **self._storage_options)
    files_out = [
        open_files(
            [make_path_posix(os.path.join(d, os.path.basename(f.path)))],
            'wb')[0]
        for f in files_in
    ]
    super(CompressedCache, self)._load(files_in, files_out, urlpath,
                                       meta=False)
    return files_in, files_out

def get_label_to_base_label_image(
    base_labels_images: Union[str, Path],
    label_to_description: Union[str, Path, Dict[str, str]] = None,
    add_label_to_image: bool = False,
    make_labels_for_these_class_names_too: List[str] = []  # add known description to classes without base images
) -> Dict[str, np.ndarray]:
    if base_labels_images is None:
        return None
    base_labels_images_files = fsspec.open_files(str(base_labels_images))
    ann_class_names_files = [
        Pathy(base_label_image_file.path).stem
        for base_label_image_file in base_labels_images_files
    ]
    unique_ann_class_names = set(ann_class_names_files)
    if 'unknown' not in unique_ann_class_names:
        raise ValueError(
            f'"{base_labels_images}" must have image with name "unknown.*"')
    unknown_image_path = base_labels_images_files[
        ann_class_names_files.index('unknown')]
    label_to_base_label_image = defaultdict(lambda: unknown_image_path)
    label_to_base_label_image['unknown'] = unknown_image_path
    logger.info(f"Loading base labels images from {base_labels_images}...")
    for label in tqdm(
            list(unique_ann_class_names) +
            list(set(make_labels_for_these_class_names_too))):
        if label in unique_ann_class_names:
            base_label_image = base_labels_images_files[
                ann_class_names_files.index(label)]
        else:
            base_label_image = label_to_base_label_image['unknown']
        label_to_base_label_image[label] = base_label_image

    def label_to_base_label_image_func(
            label: str,
            label_to_description: Union[str, Path,
                                        Dict[str, str]] = label_to_description,
            add_label_to_image: bool = add_label_to_image):
        base_label_image = open_image(label_to_base_label_image[label])
        if label_to_description is not None:
            if isinstance(label_to_description, str) or isinstance(
                    label_to_description, Path):
                label_to_description = get_label_to_description(
                    label_to_description_dict=label_to_description)
            base_label_image = get_base_label_image_with_description(
                base_label_image=base_label_image,
                label=label,
                description=label_to_description[label])
        elif add_label_to_image:
            base_label_image = get_base_label_image_with_description(
                base_label_image=base_label_image,
                label=label,
                description='')
        return base_label_image

    return label_to_base_label_image_func

def test_complex(ftp_writable):
    host, port, user, pw = ftp_writable
    files = open_files('ftp:///ou*', host=host, port=port, username=user,
                       password=pw, block_size=10000)
    assert len(files) == 1
    with files[0] as fo:
        assert fo.read(10) == b'hellohello'
        assert len(fo.cache) == 10010
        assert fo.read(2) == b'he'
        assert fo.tell() == 12

def test_complex(ftp_writable, cache_type):
    from fsspec.core import BytesCache
    host, port, user, pw = ftp_writable
    files = open_files('ftp:///ou*', host=host, port=port, username=user,
                       password=pw, block_size=10000, cache_type=cache_type)
    assert len(files) == 1
    with files[0] as fo:
        assert fo.read(10) == b'hellohello'
        if isinstance(fo.cache, BytesCache):
            assert len(fo.cache.cache) == 10010
        assert fo.read(2) == b'he'
        assert fo.tell() == 12

def ftp_server(ftpserver: ProcessFTPServer) -> List[URL]:
    faker = Faker()
    files = ["file_1", "file_2", "file_3"]
    ftp_server_base_url = ftpserver.get_login_data(style="url")
    list_of_file_urls = [
        f"{ftp_server_base_url}/{filename}.txt" for filename in files
    ]
    with fsspec.open_files(list_of_file_urls, "wt") as open_files:
        for index, fp in enumerate(open_files):
            fp.write(f"This is the file contents of '{files[index]}'\n")
            for s in faker.sentences():
                fp.write(f"{s}\n")

    return [URL(f) for f in list_of_file_urls]

def test_chained_fs_multi():
    d1 = tempfile.mkdtemp()
    d2 = tempfile.mkdtemp()
    f1 = os.path.join(d1, "f1")
    f2 = os.path.join(d1, "f2")
    with open(f1, "wb") as f:
        f.write(b"test1")
    with open(f2, "wb") as f:
        f.write(b"test2")

    of = fsspec.open_files(
        f"simplecache::file://{d1}/*",
        simplecache={"cache_storage": d2, "same_names": True},
    )
    with of[0] as f:
        assert f.read() == b"test1"
    with of[1] as f:
        assert f.read() == b"test2"

    assert sorted(os.listdir(d2)) == ["f1", "f2"]

    d2 = tempfile.mkdtemp()

    of = fsspec.open_files(
        [f"simplecache::file://{f1}", f"simplecache::file://{f2}"],
        simplecache={"cache_storage": d2, "same_names": True},
    )
    with of[0] as f:
        assert f.read() == b"test1"
    with of[1] as f:
        assert f.read() == b"test2"

    assert sorted(os.listdir(d2)) == ["f1", "f2"]

def test_multilevel_chained_fs():
    """This test reproduces fsspec/filesystem_spec#334"""
    import zipfile

    d1 = tempfile.mkdtemp()
    f1 = os.path.join(d1, "f1.zip")
    with zipfile.ZipFile(f1, mode="w") as z:
        # filename, content
        z.writestr("foo.txt", "foo.txt")
        z.writestr("bar.txt", "bar.txt")

    # We expected this to be the correct syntax
    with pytest.raises(IsADirectoryError):
        of = fsspec.open_files(f"zip://*.txt::simplecache::file://{f1}")
        assert len(of) == 2

    # But this is what is actually valid...
    of = fsspec.open_files(f"zip://*.txt::simplecache://{f1}::file://")

    assert len(of) == 2
    for open_file in of:
        with open_file as f:
            assert f.read().decode("utf-8") == f.name

def last_checkpoint(
        checkpoints_dir: str,
        mode: str = 'rb') -> Generator[CheckpointSpecType, None, None]:
    """Yield the last checkpoint and its round number, or (None, None)."""
    checkpoint_glob = os.path.join(checkpoints_dir, '*.pt')
    files = fsspec.open_files(checkpoint_glob, 'rb')
    if files:
        round_id = chain.from_iterable(
            ROUND_EXPRESSION.findall(f.path) for f in files)
        round_number, i = max((int(r), i) for i, r in enumerate(round_id))
        with files[i] as checkpoint_file:
            yield checkpoint_file, round_number
    else:
        yield None, None

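# Hypothetical usage sketch (the directory is illustrative, and it assumes checkpoint
# filenames carry a round number that ROUND_EXPRESSION can extract): advancing the
# generator once keeps the chosen file open while the caller reads from it.
checkpoint_file, round_number = next(last_checkpoint("checkpoints/"))
if checkpoint_file is not None:
    state_bytes = checkpoint_file.read()
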
def _open_dataset(self):
    """
    Open dataset using geopandas.
    """
    if self._use_fsspec:
        with fsspec.open_files(self.urlpath, **self.storage_options) as f:
            f = self._resolve_single_file(f) if len(f) > 1 else f[0]
            self._dataframe = geopandas.read_parquet(
                f,
                **self._geopandas_kwargs,
            )
    else:
        self._dataframe = geopandas.read_parquet(
            self.urlpath, **self._geopandas_kwargs)

def save(self, url, storage_options=None):
    """
    Output this catalog to a file as YAML

    Parameters
    ----------
    url : str
        Location to save to, perhaps remote
    storage_options : dict
        Extra arguments for the file-system
    """
    from fsspec import open_files
    with open_files([url], **(storage_options or {}), mode='wt')[0] as f:
        f.write(self.serialize())

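# A minimal usage sketch, assuming `cat` is an intake Catalog instance and the target
# bucket is illustrative: because the write goes through open_files, the URL may be
# local or remote, with storage_options forwarded to the target file-system.
cat.save("cat_copy.yaml")
cat.save("s3://my-bucket/cat_copy.yaml", storage_options={"anon": False})
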
def test_multi_cache(protocol):
    with fsspec.open_files("memory://file*", "wb", num=2) as files:
        for f in files:
            f.write(b"hello")

    d2 = tempfile.mkdtemp()
    lurl = fsspec.open_local(
        f"{protocol}::memory://file*",
        mode="rb",
        **{protocol: {"cache_storage": d2, "same_names": True}},
    )
    assert all(d2 in u for u in lurl)
    assert all(os.path.basename(f) in ["file0", "file1"] for f in lurl)
    assert all(open(u, "rb").read() == b"hello" for u in lurl)

    d2 = tempfile.mkdtemp()
    lurl = fsspec.open_files(
        f"{protocol}::memory://file*",
        mode="rb",
        **{protocol: {"cache_storage": d2, "same_names": True}},
    )
    with lurl as files:
        for f in files:
            assert os.path.basename(f.name) in ["file0", "file1"]
            assert f.read() == b"hello"

    fs = fsspec.filesystem("memory")
    fs.store.clear()
    with lurl as files:
        for f in files:
            assert os.path.basename(f.name) in ["file0", "file1"]
            assert f.read() == b"hello"

def test_chained_fo():
    import zipfile

    d1 = tempfile.mkdtemp()
    f1 = os.path.join(d1, "temp.zip")
    d3 = tempfile.mkdtemp()
    with zipfile.ZipFile(f1, mode="w") as z:
        z.writestr("afile", b"test")

    of = fsspec.open(f"zip://afile::file://{f1}")
    with of as f:
        assert f.read() == b"test"

    of = fsspec.open_files(f"zip://*::file://{f1}")
    with of[0] as f:
        assert f.read() == b"test"

    of = fsspec.open_files(
        f"simplecache::zip://*::file://{f1}",
        simplecache={"cache_storage": d3, "same_names": True},
    )
    with of[0] as f:
        assert f.read() == b"test"
    assert "afile" in os.listdir(d3)

def _determine_dims(self):
    logger.debug("open mappers")
    # If self.path is a list of dictionaries, pass them directly to
    # fsspec.filesystem
    import collections.abc
    if isinstance(self.path[0], collections.abc.Mapping):
        fo_list = self.path
    # If self.path is a list of files, open the files and load the json as
    # a dictionary
    else:
        with fsspec.open_files(self.path, **self.storage_options) as ofs:
            fo_list = [json.load(of) for of in ofs]

    fss = [
        fsspec.filesystem("reference", fo=fo,
                          remote_protocol=self.remote_protocol,
                          remote_options=self.remote_options)
        for fo in fo_list
    ]
    self.fs = fss[0].fs
    mappers = [fs.get_mapper("") for fs in fss]
    logger.debug("open first two datasets")

    xr_kwargs_copy = self.xr_kwargs.copy()
    # Add consolidated=False to xr kwargs if not explicitly given by user;
    # needed to suppress zarr open warnings
    if (version.parse(xr.__version__) >= version.parse("0.19.0")
            and 'consolidated' not in xr_kwargs_copy):
        xr_kwargs_copy['consolidated'] = False

    dss = [
        xr.open_dataset(m, engine="zarr", chunks={}, **xr_kwargs_copy)
        for m in mappers[:2]
    ]
    if self.preprocess:
        logger.debug("preprocess")
        dss = [self.preprocess(d) for d in dss]
    logger.debug("concat")
    ds = xr.concat(dss, **self.concat_kwargs)
    ds0 = dss[0]
    self.extra_dims = set(ds.dims) - set(ds0.dims)
    self.concat_dims = set(
        k for k, v in ds.dims.items()
        if k in ds0.dims and v / ds0.dims[k] == 2
    )
    self.same_dims = set(ds.dims) - self.extra_dims - self.concat_dims
    return ds, ds0, fss

def _get_schema(self):
    from fsspec import open_files
    if self._files is None:
        urlpath = self._get_cache(self._urlpath)[0]

        self._files = open_files(
            urlpath, mode=self.mode, encoding=self.encoding,
            compression=self.compression, **self._storage_options)
        self.npartitions = len(self._files)
    return base.Schema(dtype=None,
                       shape=(None, ),
                       npartitions=self.npartitions,
                       extra_metadata=self.metadata)

def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO, List[str], List[BinaryIO]]]:
    """
    Utility for read_[csv, parquet]. (not to be used by scan_[csv, parquet]).

    Returned value is always usable as a context.

    A `StringIO`, `BytesIO` file is returned as a `BytesIO`.
    A local path is returned as a string.
    An http URL is read into a buffer and returned as a `BytesIO`.

    When fsspec is installed, remote file(s) is (are) opened with
    `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.
    """

    # Small helper to use a variable as context
    @contextmanager
    def managed_file(file: Any) -> Iterator[Any]:
        try:
            yield file
        finally:
            pass

    if isinstance(file, StringIO):
        return BytesIO(file.read().encode("utf8"))
    if isinstance(file, BytesIO):
        return managed_file(file)
    if isinstance(file, Path):
        return managed_file(format_path(file))
    if isinstance(file, str):
        if _WITH_FSSPEC:
            if infer_storage_options(file)["protocol"] == "file":
                return managed_file(format_path(file))
            return fsspec.open(file, **kwargs)
        if file.startswith("http"):
            return _process_http_file(file)
    if isinstance(file, list) and bool(file) and all(
            isinstance(f, str) for f in file):
        if _WITH_FSSPEC:
            if all(
                    infer_storage_options(f)["protocol"] == "file"
                    for f in file):
                return managed_file([format_path(f) for f in file])
            return fsspec.open_files(file, **kwargs)
    if isinstance(file, str):
        file = format_path(file)
    return managed_file(file)

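# Hedged usage sketch (the input below is illustrative, not from the original code):
# whatever the input type, the returned value is entered as a context manager before
# reading, and a StringIO input comes back as a BytesIO.
from io import StringIO

with _prepare_file_arg(StringIO("a,b\n1,2\n")) as data:
    print(data.read())
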
def _load(self, _, __, urlpath, meta=True):
    import subprocess
    from fsspec import open_files
    path = os.path.join(self._cache_dir, self._hash(urlpath))
    dat, part = os.path.split(urlpath)
    cmd = ['dat', 'clone', dat, path, '--no-watch']
    try:
        subprocess.call(cmd, stdout=subprocess.PIPE)
    except (IOError, OSError):  # pragma: no cover
        logger.info('Calling DAT failed')
        raise
    newpath = os.path.join(path, part)

    if meta:
        for of in open_files(newpath):
            self._log_metadata(urlpath, urlpath, of.path)

def _data_to_source(cat, path, **kwargs):
    from intake.catalog.local import YAMLFileCatalog
    from fsspec import open_files
    import yaml
    if not isinstance(cat, Catalog):
        raise NotImplementedError
    out = {}
    for name in cat:
        entry = cat[name]
        out[name] = entry.__getstate__()
        out[name]['parameters'] = [up._captured_init_kwargs for up
                                   in entry._user_parameters]
        out[name]['kwargs'].pop('parameters')
    fn = posixpath.join(path, 'cat.yaml')
    with open_files([fn], 'wt')[0] as f:
        yaml.dump({'sources': out}, f)
    return YAMLFileCatalog(fn)

def _data_to_source(cat, path, **kwargs):
    from intake.catalog.local import YAMLFileCatalog
    from fsspec import open_files
    import yaml
    if not isinstance(cat, Catalog):
        raise NotImplementedError
    out = {}
    # reach down into the private state because we apparently need the
    # Entry here rather than the public-facing DataSource objects.
    for name, entry in cat._entries.items():
        out[name] = entry.__getstate__()
        out[name]['parameters'] = [up._captured_init_kwargs for up
                                   in entry._user_parameters]
        out[name]['kwargs'].pop('parameters')
    fn = posixpath.join(path, 'cat.yaml')
    with open_files([fn], 'wt')[0] as f:
        yaml.dump({'sources': out}, f)
    return YAMLFileCatalog(fn)