def write_hdf5(
    self,
    data: Any = None,
    set_name: str = None,
    use_pandas: bool = False,
    **kwargs,
) -> Union[None, "h5py.File"]:
    if not hdf5_ok:
        raise TPImportError(errormessage_hdf5)
    if use_pandas:
        raise NotImplementedError(
            "TransparentPath does not support storing Dask objects in pandas's HDFStore yet."
        )
    if self.suffix != ".hdf5" and self.suffix != ".h5":
        warnings.warn(
            f"path {self} does not have '.h(df)5' as suffix while using to_hdf5. The path will be "
            f"changed to a path with '.hdf5' as suffix"
        )
        self.change_suffix(".hdf5")
    if not self.nocheck:
        self._check_multiplicity()
    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)
    check_kwargs(dd.to_hdf, kwargs)

    mode = "w"
    if "mode" in kwargs:
        mode = kwargs["mode"]
        del kwargs["mode"]

    # A dict maps dataset names to objects; a single object is stored under
    # 'set_name' (default "data").
    if isinstance(data, dict):
        sets = data
    else:
        if set_name is None:
            set_name = "data"
        sets = {set_name: data}

    if self.fs_kind == "local":
        for aset in sets:
            dd.to_hdf(sets[aset], self, aset, mode=mode, **kwargs)
    else:
        # Remote file system: write to a local temporary file, then upload it.
        with tempfile.NamedTemporaryFile() as f:
            futures = self.__class__.cli.map(
                dd.to_hdf, list(sets.values()), [f.name] * len(sets), list(sets.keys()), mode=mode, **kwargs
            )
            self.__class__.cli.gather(futures)
            TransparentPath(path=f.name, fs="local", bucket=self.bucket).put(self.path)
    return
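
# Usage sketch (illustrative only, not part of the module): assuming these
# functions are attached to TransparentPath as methods, as the `self`
# parameter suggests, writing two dask DataFrames into one HDF5 file might
# look like this. `ddf1`, `ddf2`, the bucket name and the fs value "gcs" are
# hypothetical.
#
#   p = TransparentPath("data.hdf5", fs="gcs", bucket="some_bucket")
#   p.write_hdf5({"train": ddf1, "test": ddf2})  # dict keys become dataset names
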
def write_excel(
    self,
    data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
    overwrite: bool = True,
    present: str = "ignore",
    **kwargs,
) -> None:
    if not excel_ok:
        raise TPImportError(errormessage_excel)
    if self.suffix != ".xlsx" and self.suffix != ".xls" and self.suffix != ".xlsm":
        warnings.warn(
            f"path {self} does not have '.xls(x,m)' as suffix while using to_excel. The path will be "
            f"changed to a path with '.xlsx' as suffix"
        )
        self.change_suffix(".xlsx")
    if not self.nocheck:
        self._check_multiplicity()
    if not overwrite and self.is_file() and present != "ignore":
        raise TPFileExistsError()

    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)
    check_kwargs(pd.DataFrame.to_excel, kwargs)

    if self.fs_kind == "local":
        parts = delayed(pd.DataFrame.to_excel)(data, self.__fspath__(), **kwargs)
        parts.compute()
    else:
        # Remote file system: write to a local temporary file, then upload it.
        with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as f:
            parts = delayed(pd.DataFrame.to_excel)(data, f.name, **kwargs)
            parts.compute()
            TransparentPath(path=f.name, fs="local", bucket=self.bucket).put(self.path)
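
# Usage sketch (hypothetical names): kwargs are validated against and
# forwarded to pd.DataFrame.to_excel, so e.g. index=False is accepted.
#
#   p = TransparentPath("report.xlsx", fs="local")
#   p.write_excel(df, index=False)
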
def check_dask(self, which: str = "read"):
    if which != "read":
        return
    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)
    if self.suffix != ".csv" and self.suffix != ".parquet":
        # Single-file formats must point to an existing file.
        if not self.is_file():
            raise TPFileNotFoundError(f"Could not find file {self}")
    # CSVs and parquets may also be directories of part-files or glob patterns.
    elif (
        not self.is_file()
        and not self.is_dir(exist=True)
        and not self.with_suffix("").is_dir(exist=True)
        and "*" not in str(self)
    ):
        raise TPFileNotFoundError(f"Could not find file nor directory {self} nor {self.with_suffix('')}")
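
# Usage sketch (hypothetical names): check_dask is a pre-read validation. For
# '.csv' and '.parquet' it tolerates part-file directories and glob patterns;
# any other suffix must be an existing file.
#
#   p = TransparentPath("data_*.csv", fs="local")
#   p.check_dask(which="read")  # no error: the glob pattern is accepted as-is
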
def write_parquet(
    self,
    data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
    overwrite: bool = True,
    present: str = "ignore",
    columns_to_string: bool = True,
    **kwargs,
) -> None:
    if not parquet_ok:
        raise TPImportError(errormessage_parquet)
    if self.suffix != ".parquet":
        warnings.warn(
            f"path {self} does not have '.parquet' as suffix while using to_parquet. The path will be "
            f"changed to a path with '.parquet' as suffix"
        )
        self.change_suffix(".parquet")
    # Only snappy compression is supported. Pop the value from kwargs so it
    # does not clash with the explicit compression="snappy" passed below.
    compression = kwargs.pop("compression", None)
    if compression is not None and compression != "snappy":
        warnings.warn(
            "TransparentPath can not write parquet files with a compression that is not snappy. You "
            f"specified '{compression}', it will be replaced by 'snappy'."
        )
    if not self.nocheck:
        self._check_multiplicity()
    if not overwrite and self.is_file() and present != "ignore":
        raise TPFileExistsError()
    # Parquet requires string column names.
    if columns_to_string and not isinstance(data.columns[0], str):
        data.columns = data.columns.astype(str)
    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)
    check_kwargs(dd.to_parquet, kwargs)
    dd.to_parquet(data, self.with_suffix("").__fspath__(), engine="pyarrow", compression="snappy", **kwargs)
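
# Usage sketch (hypothetical names): note that the data lands under the path
# stripped of its suffix (self.with_suffix("")), i.e. a directory of
# part-files, and that a non-snappy compression is replaced after a warning.
#
#   p = TransparentPath("table.parquet", fs="local")
#   p.write_parquet(ddf, compression="gzip")  # warns, then writes with snappy
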
def write_csv(
    self,
    data: dd.DataFrame,
    overwrite: bool = True,
    present: str = "ignore",
    **kwargs,
) -> Union[None, List[TransparentPath]]:
    if self.suffix != ".csv":
        warnings.warn(
            f"path {self} does not have '.csv' as suffix while using to_csv. The path will be "
            f"changed to a path with '.csv' as suffix"
        )
        self.change_suffix(".csv")
    if not self.nocheck:
        self._check_multiplicity()
    if not overwrite and self.is_file() and present != "ignore":
        raise TPFileExistsError()
    if self.__class__.cli is None:
        self.__class__.cli = client.Client(processes=False)
    check_kwargs(dd.to_csv, kwargs)

    # Dask writes one file per partition: make sure the target contains a '*'
    # wildcard so each partition gets a distinct name.
    path_to_save = self
    if not path_to_save.stem.endswith("*"):
        path_to_save = path_to_save.parent / (path_to_save.stem + "_*.csv")
    futures = self.__class__.cli.submit(dd.to_csv, data, path_to_save.__fspath__(), **kwargs)
    outfiles = [TransparentPath(f, fs=self.fs_kind, bucket=self.bucket) for f in futures.result()]
    # A single output file is renamed to the requested path; several files
    # keep their wildcard-derived names and are returned to the caller.
    if len(outfiles) == 1:
        outfiles[0].mv(self)
        return None
    return outfiles
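
# Usage sketch (hypothetical names): a single-partition frame ends up at the
# requested path itself, while a multi-partition frame yields one
# TransparentPath per part-file, named with the '_*' pattern.
#
#   p = TransparentPath("out.csv", fs="local")
#   parts = p.write_csv(ddf)  # None if one file was written, else a list of paths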