Exemple #1
0
    def write_hdf5(
        self,
        data: Any = None,
        set_name: str = None,
        use_pandas: bool = False,
        **kwargs,
    ) -> Union[None, "h5py.File"]:
        """Write ``data`` to this path as an HDF5 file using Dask.

        Parameters
        ----------
        data: Any
            Either a dict mapping set names to Dask objects, or a single Dask
            object stored under ``set_name``.
        set_name: str
            Dataset name used when ``data`` is not a dict. Defaults to "data".
        use_pandas: bool
            Not supported for Dask objects; raises if True.
        kwargs
            Forwarded to ``dask.dataframe.to_hdf``. A ``mode`` entry overrides
            the default write mode "w".

        Raises
        ------
        TPImportError
            If HDF5 support is not available.
        NotImplementedError
            If ``use_pandas`` is True.
        """
        if not hdf5_ok:
            raise TPImportError(errormessage_hdf5)

        if use_pandas:
            raise NotImplementedError(
                "TransparentPath does not support storing Dask objects in pandas's HDFStore yet."
            )

        # BUG FIX: the original compared against "h5" without the leading dot;
        # Path-style suffixes always include the dot, so valid '.h5' paths were
        # wrongly renamed to '.hdf5'.
        if self.suffix not in (".hdf5", ".h5"):
            warnings.warn(
                f"path {self} does not have '.h(df)5' as suffix while using to_hdf5. The path will be "
                f"changed to a path with '.hdf5' as suffix")
            self.change_suffix(".hdf5")

        if not self.nocheck:
            self._check_multiplicity()

        # Lazily create the shared Dask client on first use.
        if self.__class__.cli is None:
            self.__class__.cli = client.Client(processes=False)
        check_kwargs(dd.to_hdf, kwargs)

        # Pop "mode" so it is not passed twice to dd.to_hdf below.
        mode = kwargs.pop("mode", "w")

        # Normalize input to a {set_name: data} mapping.
        if isinstance(data, dict):
            sets = data
        else:
            sets = {set_name if set_name is not None else "data": data}

        if self.fs_kind == "local":
            for aset in sets:
                dd.to_hdf(sets[aset], self, aset, mode=mode, **kwargs)
        else:
            # Remote filesystem: write to a local temporary file first, then
            # upload the result to the remote path.
            with tempfile.NamedTemporaryFile() as f:
                futures = self.__class__.cli.map(dd.to_hdf,
                                                 list(sets.values()),
                                                 [f.name] * len(sets),
                                                 list(sets.keys()),
                                                 mode=mode,
                                                 **kwargs)
                self.__class__.cli.gather(futures)
                TransparentPath(path=f.name, fs="local",
                                bucket=self.bucket).put(self.path)
        return
Exemple #2
0
    def write_excel(
        self,
        data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
        overwrite: bool = True,
        present: str = "ignore",
        **kwargs,
    ) -> None:
        """Write ``data`` to this path as an Excel file via a Dask delayed task.

        Parameters
        ----------
        data: Union[pd.DataFrame, pd.Series, dd.DataFrame]
            The object to write; handed to ``pd.DataFrame.to_excel``.
        overwrite: bool
            If False and the file exists (and ``present`` is not "ignore"),
            raise instead of overwriting.
        present: str
            Controls behavior when the file already exists; see ``overwrite``.
        kwargs
            Forwarded to ``pd.DataFrame.to_excel``.

        Raises
        ------
        TPImportError
            If Excel support is not available.
        TPFileExistsError
            If the file exists and overwriting is disallowed.
        """
        if not excel_ok:
            raise TPImportError(errormessage_excel)

        if self.suffix not in (".xlsx", ".xls", ".xlsm"):
            warnings.warn(
                f"path {self} does not have '.xls(x,m)' as suffix while using to_excel. The path will be "
                f"changed to a path with '.xlsx' as suffix")
            self.change_suffix(".xlsx")

        if not self.nocheck:
            self._check_multiplicity()

        if not overwrite and self.is_file() and present != "ignore":
            raise TPFileExistsError()

        # CONSISTENCY FIX: client init and kwargs check were duplicated in both
        # branches (one using self.__class__.cli, the other TransparentPath.cli);
        # hoist them here once, matching the other write_* methods.
        if self.__class__.cli is None:
            self.__class__.cli = client.Client(processes=False)
        check_kwargs(pd.DataFrame.to_excel, kwargs)

        if self.fs_kind == "local":
            parts = delayed(pd.DataFrame.to_excel)(data, self.__fspath__(),
                                                   **kwargs)
            parts.compute()
            return

        # Remote filesystem: write to a local temporary file, then upload it.
        with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as f:
            parts = delayed(pd.DataFrame.to_excel)(data, f.name, **kwargs)
            parts.compute()
            TransparentPath(path=f.name, fs="local",
                            bucket=self.bucket).put(self.path)
Exemple #3
0
 def check_dask(self, which: str = "read"):
     """Validate that this path can be consumed as a Dask read target.

     Only ``which == "read"`` triggers any validation; other values return
     immediately. Ensures the shared Dask client exists, then verifies the
     path plausibly points at readable data.

     Raises
     ------
     TPFileNotFoundError
         If neither the file, a matching directory, nor a glob pattern exists.
     """
     if which != "read":
         return

     # Lazily create the shared Dask client on first use.
     if self.__class__.cli is None:
         self.__class__.cli = client.Client(processes=False)

     dask_suffix = self.suffix in (".csv", ".parquet")
     if not dask_suffix and not self.is_file():
         raise TPFileNotFoundError(f"Could not find file {self}")

     # csv/parquet targets may also be a directory of part-files or a glob.
     reachable = (
         self.is_file()
         or self.is_dir(exist=True)
         or self.with_suffix("").is_dir(exist=True)
         or "*" in str(self)
     )
     if not reachable:
         raise TPFileNotFoundError(
             f"Could not find file nor directory {self} nor {self.with_suffix('')}"
         )
Exemple #4
0
    def write_parquet(
        self,
        data: Union[pd.DataFrame, pd.Series, dd.DataFrame],
        overwrite: bool = True,
        present: str = "ignore",
        columns_to_string: bool = True,
        **kwargs,
    ) -> None:
        """Write ``data`` to this path as snappy-compressed parquet via Dask.

        Parameters
        ----------
        data: Union[pd.DataFrame, pd.Series, dd.DataFrame]
            The object to write.
        overwrite: bool
            If False and the file exists (and ``present`` is not "ignore"),
            raise instead of overwriting.
        present: str
            Controls behavior when the file already exists; see ``overwrite``.
        columns_to_string: bool
            If True, cast non-string column labels to ``str`` (parquet
            requires string column names).
        kwargs
            Forwarded to ``dask.dataframe.to_parquet``. Any ``compression``
            entry is ignored in favor of "snappy" (with a warning).

        Raises
        ------
        TPImportError
            If parquet support is not available.
        TPFileExistsError
            If the file exists and overwriting is disallowed.
        """
        if not parquet_ok:
            # BUG FIX: was raising with errormessage_hdf5 (copy-paste error);
            # use the parquet-specific message like the other write_* guards.
            raise TPImportError(errormessage_parquet)

        if self.suffix != ".parquet":
            warnings.warn(
                f"path {self} does not have '.parquet' as suffix while using to_parquet. The path will be "
                f"changed to a path with '.parquet' as suffix")
            self.change_suffix(".parquet")

        # BUG FIX: pop (not get) "compression" — leaving it in kwargs made the
        # dd.to_parquet call below pass compression twice, raising TypeError.
        compression = kwargs.pop("compression", None)

        if compression is not None and compression != "snappy":
            warnings.warn(
                "TransparentPath can not write parquet files with a compression that is not snappy. You "
                f"specified '{compression}', it will be replaced by 'snappy'.")

        if not self.nocheck:
            self._check_multiplicity()

        if not overwrite and self.is_file() and present != "ignore":
            raise TPFileExistsError()

        # Parquet requires string column names; guard against empty columns
        # before peeking at the first label (original raised IndexError).
        if columns_to_string and len(data.columns) > 0 and not isinstance(data.columns[0], str):
            data.columns = data.columns.astype(str)

        # Lazily create the shared Dask client on first use.
        if self.__class__.cli is None:
            self.__class__.cli = client.Client(processes=False)
        check_kwargs(dd.to_parquet, kwargs)
        dd.to_parquet(data,
                      self.with_suffix("").__fspath__(),
                      engine="pyarrow",
                      compression="snappy",
                      **kwargs)
Exemple #5
0
    def write_csv(
        self,
        data: dd.DataFrame,
        overwrite: bool = True,
        present: str = "ignore",
        **kwargs,
    ) -> Union[None, List[TransparentPath]]:
        """Write a Dask DataFrame to CSV at this path.

        Dask writes one CSV per partition. If a single file results, it is
        moved to this exact path and None is returned; otherwise the list of
        written ``TransparentPath`` objects is returned.

        Parameters
        ----------
        data: dd.DataFrame
            The Dask DataFrame to write.
        overwrite: bool
            If False and the file exists (and ``present`` is not "ignore"),
            raise instead of overwriting.
        present: str
            Controls behavior when the file already exists; see ``overwrite``.
        kwargs
            Forwarded to ``dask.dataframe.to_csv``.

        Raises
        ------
        TPFileExistsError
            If the file exists and overwriting is disallowed.
        """
        if self.suffix != ".csv":
            warnings.warn(
                f"path {self} does not have '.csv' as suffix while using to_csv. The path will be "
                f"changed to a path with '.csv' as suffix")
            self.change_suffix(".csv")

        if not self.nocheck:
            self._check_multiplicity()

        if not overwrite and self.is_file() and present != "ignore":
            raise TPFileExistsError()

        # Lazily create the shared Dask client on first use.
        if self.__class__.cli is None:
            self.__class__.cli = client.Client(processes=False)
        check_kwargs(dd.to_csv, kwargs)

        # Ensure the target contains a glob so each partition gets its own
        # numbered output file.
        target = self
        if not target.stem.endswith("*"):
            target = target.parent / (target.stem + "_*.csv")

        future = self.__class__.cli.submit(dd.to_csv, data,
                                           target.__fspath__(),
                                           **kwargs)
        written = future.result()
        outfiles = [
            TransparentPath(p, fs=self.fs_kind, bucket=self.bucket)
            for p in written
        ]

        if len(outfiles) == 1:
            # Single partition: collapse to the exact requested path.
            outfiles[0].mv(self)
            return None
        return outfiles