def __init__(self, local_ftp: str = "", ds: str = "", cache: bool = False, cachedir: str = "", dimension: str = 'point', **kwargs): """ Init fetcher Parameters ---------- local_ftp : str Path to the local directory where the 'dac' folder is located. ds : str Name of the dataset to load. Use the global OPTIONS['dataset'] by default. cache : bool Determine if retrieved data should be cached locally or not, False by default. cachedir : str Absolute path to the cache directory dimension : str Main dimension of the output dataset. This can be "profile" to retrieve a collection of profiles, or "point" (default) to have data as a collection of measurements. This can be used to optimise performances """ self.cache = cache self.cachedir = cachedir self.fs = filestore(cache=self.cache, cachedir=self.cachedir) self.definition = 'Local ftp Argo data fetcher' self.dataset_id = OPTIONS['dataset'] if ds == '' else ds self.local_ftp = OPTIONS['local_ftp'] if local_ftp == '' else local_ftp self.init(**kwargs)
def test_cachefile(self):
    try:
        fs = filestore(cache=True, cachedir=self.testcachedir)
        fs.open_dataframe(self.csvfile, skiprows=8, header=0)
        assert isinstance(fs.cachepath(self.csvfile), str)
    finally:
        # Always remove the temporary cache directory, on success or failure:
        shutil.rmtree(self.testcachedir)
def test_clear_cache(self):
    # Create dummy data to read and cache:
    uri = os.path.abspath("dummy_fileA.txt")
    with open(uri, "w") as fp:
        fp.write('Hello world!')
    # Create store:
    fs = filestore(cache=True, cachedir=self.testcachedir)
    # Then read some dummy data from the dummy file to trigger caching:
    with fs.open(uri, "r") as fp:
        fp.read()
    assert isinstance(fs.cachepath(uri), str)
    # Now, we can clear the cache:
    fs.clear_cache()
    # And verify it does not exist anymore:
    with pytest.raises(CacheFileNotFound):
        fs.cachepath(uri)
    os.remove(uri)
def test_open_mfdataset(self):
    fs = filestore()
    ncfiles = fs.glob(
        os.path.sep.join([self.ftproot, "dac/aoml/5900446/profiles/*_1*.nc"]))[0:2]
    for method in ["seq", "thread", "process"]:
        for progress in [True, False]:
            assert isinstance(
                fs.open_mfdataset(ncfiles, method=method, progress=progress),
                xr.Dataset,
            )
            assert is_list_of_datasets(
                fs.open_mfdataset(ncfiles, method=method, progress=progress, concat=False))
def test_clear_cache(self):
    with tempfile.TemporaryDirectory() as cachedir:
        # Create dummy data to read and cache:
        uri = os.path.abspath("dummy_fileA.txt")
        with open(uri, "w") as fp:
            fp.write("Hello world!")
        # Create store:
        fs = filestore(cache=True, cachedir=cachedir)
        # Then read some dummy data from the dummy file to trigger caching:
        with fs.open(uri, "r") as fp:
            fp.read()
        assert isinstance(fs.cachepath(uri), str)
        assert os.path.isfile(fs.cachepath(uri))
        # Now, we can clear the cache:
        fs.clear_cache()
        # And verify it does not exist anymore:
        with pytest.raises(CacheFileNotFound):
            fs.cachepath(uri)
        os.remove(uri)  # Delete dummy file
def __init__(self, local_ftp: str = "", ds: str = "", cache: bool = False, cachedir: str = "", dimension: str = 'point', errors: str = 'raise', parallel: bool = False, parallel_method: str = 'thread', progress: bool = False, chunks: str = 'auto', chunks_maxsize: dict = {}, **kwargs): """ Init fetcher Parameters ---------- local_ftp: str (optional) Path to the local directory where the 'dac' folder is located. ds: str (optional) Dataset to load: 'phy' or 'ref' or 'bgc' errors: str (optional) If set to 'raise' (default), will raise a NetCDF4FileNotFoundError error if any of the requested files cannot be found. If set to 'ignore', the file not found is skipped when fetching data. cache: bool (optional) Cache data or not (default: False) cachedir: str (optional) Path to cache folder dimension: str Main dimension of the output dataset. This can be "profile" to retrieve a collection of profiles, or "point" (default) to have data as a collection of measurements. This can be used to optimise performances. parallel: bool (optional) Chunk request to use parallel fetching (default: False) parallel_method: str (optional) Define the parallelization method: ``thread``, ``process`` or a :class:`dask.distributed.client.Client`. progress: bool (optional) Show a progress bar or not when fetching data. chunks: 'auto' or dict of integers (optional) Dictionary with request access point as keys and number of chunks to create as values. Eg: - ``{'wmo': 10}`` will create a maximum of 10 chunks along WMOs when used with ``Fetch_wmo``. - ``{'lon': 2}`` will create a maximum of 2 chunks along longitude when used with ``Fetch_box``. chunks_maxsize: dict (optional) Dictionary with request access point as keys and chunk size as values (used as maximum values in 'auto' chunking). Eg: ``{'wmo': 5}`` will create chunks with as many as 5 WMOs each. """ self.cache = cache self.cachedir = cachedir self.fs = filestore(cache=self.cache, cachedir=self.cachedir) self.errors = errors if not isinstance(parallel, bool): # The parallelization method is passed through the argument 'parallel': parallel_method = parallel if parallel in ['thread', 'process']: parallel = True if parallel_method not in ["thread", "process"]: raise ValueError("localftp only support multi-threading and processing ('%s' unknown)" % parallel_method) self.parallel = parallel self.parallel_method = parallel_method self.progress = progress self.chunks = chunks self.chunks_maxsize = chunks_maxsize self.definition = 'Local ftp Argo data fetcher' self.dataset_id = OPTIONS['dataset'] if ds == '' else ds self.local_ftp = OPTIONS['local_ftp'] if local_ftp == '' else local_ftp check_localftp(self.local_ftp, errors='raise') # Validate local_ftp self.init(**kwargs)
def test_open_dataframe(self):
    fs = filestore()
    assert isinstance(
        fs.open_dataframe(self.csvfile, skiprows=8, header=0),
        pd.core.frame.DataFrame)
def test_open_dataset(self):
    ncfile = os.path.sep.join([self.ftproot, "dac/aoml/5900446/5900446_prof.nc"])
    fs = filestore()
    assert isinstance(fs.open_dataset(ncfile), xr.Dataset)
def test_nocachefile(self):
    fs = filestore(cache=True)
    with pytest.raises(CacheFileNotFound):
        fs.cachepath("dummy_uri")
def test_cache(self):
    fs = filestore(cache=True)
    assert isinstance(
        fs.fs, fsspec.implementations.cached.WholeFileCacheFileSystem)
def test_nocache(self):
    fs = filestore(cache=False)
    with pytest.raises(FileSystemHasNoCache):
        fs.cachepath("dummy_uri")
def test_creation(self):
    fs = filestore(cache=False)
    assert isinstance(fs.fs, fsspec.implementations.local.LocalFileSystem)
def test_cachefile(self):
    with tempfile.TemporaryDirectory() as cachedir:
        fs = filestore(cache=True, cachedir=cachedir)
        fs.read_csv(self.csvfile, skiprows=8, header=0)
        assert isinstance(fs.cachepath(self.csvfile), str)
def test_read_csv(self):
    fs = filestore()
    assert isinstance(
        fs.read_csv(self.csvfile, skiprows=8, header=0),
        pd.core.frame.DataFrame)
def test_glob(self):
    fs = filestore()
    assert isinstance(fs.glob(os.path.sep.join([self.ftproot, "dac/*"])), list)