Example #1
    def test_file_hash(self):
        t = TypedDfBuilder("a").reserve("x", "y").build()
        df = t.convert(pd.DataFrame([pd.Series(dict(x="cat", y="dog"))]))
        # unfortunately, the file that gets output is os-dependent
        # \n vs \r\n is an issue, so we can't check the exact hash
        with tmpfile(".csv") as path:
            df.write_file(path, file_hash=True)
            hash_file = Checksums().get_filesum_of_file(path)
            assert hash_file.exists()
            got = Checksums().load_filesum_of_file(path)
            assert got.file_path == path
            hit = got.hash_value
            assert len(hit) == 64
            t.read_file(path, file_hash=True)
            t.read_file(path, hex_hash=hit)
Example #2
    def delete_file(
        cls,
        path: PathLike,
        *,
        missing_ok: bool = False,
        alg: str = _DEFAULT_HASH_ALG,
        attrs_suffix: str = _DEFAULT_ATTRS_SUFFIX,
        rm_if_empty: bool = True,
    ) -> None:
        """
        Deletes a file, plus the checksum file and/or directory entry, and ``.attrs.json``.

        Args:
            path: The path to delete
            missing_ok: Do not raise an error if the path does not exist
                        (any associated paths are still deleted)
            alg: The checksum algorithm
            attrs_suffix: The suffix for the attrs file (normally ``.attrs.json``)
            rm_if_empty: Remove the dir checksum file if it contains no additional paths

        Raises:
            :class:`typeddfs.df_errors.PathNotRelativeError`: To avoid, try calling ``resolve`` first
        """
        path = Path(path)
        # delete the file first so we can get an error on missing_ok=False
        path.unlink(missing_ok=missing_ok)
        Checksums(alg=alg).delete_any(path, rm_if_empty=rm_if_empty)
        attrs_path = path.parent / (path.name + attrs_suffix)
        attrs_path.unlink(missing_ok=True)
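
A minimal usage sketch for the classmethod above; ``Owner`` stands in for whichever class actually defines it (that class is not shown in this snippet), and the path is made up:

    # Removes "data/table.csv", its sibling checksum file (suffix depends on the
    # default algorithm, e.g. ".sha256"), its entry in the per-directory checksum
    # file, and "data/table.csv.attrs.json".
    Owner.delete_file("data/table.csv", missing_ok=True)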
Example #3
    def hash(self,
             *,
             alg: str = "sha256",
             file: bool = True,
             directory: bool = False) -> __qualname__:
        """
        Write a hash file (e.g. .sha256) alongside files.
        Performed when calling :meth:`typeddfs.abs_dfs.AbsDf.write_file`.
        The hash files will be in the `sha1sum <https://en.wikipedia.org/wiki/Sha1sum>`_ format:
        the hex-encoded hash, followed by ``" *"``, followed by the filename.

        Note that this affects the default behavior of :meth:`typeddfs.abs_dfs.AbsDf.write_file`,
        which can be called with ``file_hash=False`` and/or ``dir_hash=False``.

        Args:
            alg: The name of the algorithm in ``hashlib``;
                 hyphens are dropped and the name is lowercased,
                 and the hash file suffix will be ``"." + alg``.
            file: Alongside a file ``"my_file.csv.gz"``,
                  write a sibling file ``"my_file.csv.gz."+alg``.
            directory: Alongside a file ``"my_file.csv.gz"`` in ``"my_dir"``,
                       append to a file ``"my_dir/my_dir."+alg``,
                       which lists hashes for files in that directory.

        Returns:
            This builder for chaining
        """
        self._hash_alg = Checksums.resolve_algorithm(alg)
        self._hash_file = file
        self._hash_dir = directory
        return self
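
Because the method returns the builder, it chains with the other builder calls. A small sketch, assuming this is the same builder class used in the tests above (``TypedDfBuilder``):

    # Build a typed-df class whose write_file() also writes "<file>.sha256" by default;
    # "sha-256" is normalized to "sha256" by Checksums.resolve_algorithm
    MyDf = TypedDfBuilder("a").reserve("x", "y").hash(alg="sha-256", file=True).build()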
Example #4
    def test_dir_hash(self):
        t = TypedDfBuilder("a").reserve("x", "y").build()
        df = t.convert(pd.DataFrame([pd.Series(dict(x="cat", y="kitten"))]))
        with tmpfile(".csv") as path:
            hash_dir = Checksums().get_dirsum_of_file(path)
            hash_dir.unlink(missing_ok=True)
            df.write_file(path, dir_hash=True)
            assert hash_dir.exists()
            got = Checksums().load_dirsum_exact(hash_dir)
            assert list(got.keys()) == [path]
            hit = got[path]
            assert len(hit) == 64
            t.read_file(path, dir_hash=True)
            t.read_file(path, hex_hash=hit)
Example #5
    def test_get_algorithm(self):
        assert Checksums.resolve_algorithm("sha-256") == "sha256"
Example #6
    def test_guess_algorithm(self):
        assert Checksums.guess_algorithm("my_file.sha256") == "sha256"
        assert Checksums.guess_algorithm("my_file.sha1") == "sha1"
Example #7
    def read_file(
        cls,
        path: Union[Path, str],
        *,
        file_hash: Optional[bool] = None,
        dir_hash: Optional[bool] = None,
        hex_hash: Optional[str] = None,
        attrs: Optional[bool] = None,
    ) -> __qualname__:
        """
        Reads from a file, guessing the format from the filename extension.
        Delegates to the ``read_*`` functions of this class.

        You can always write and then read back to get the same dataframe.

        .. code-block::

            # df is any DataFrame from typeddfs
            # path can use any suffix
            df.write_file(path)
            df.read_file(path)

        Text formats always allow compression with .gz, .zip, .bz2, or .xz.

        Supports:
            - .csv, .tsv, or .tab
            - .json
            - .xml
            - .feather
            - .parquet or .snappy
            - .h5 or .hdf
            - .xlsx, .xls, .odf, etc.
            - .toml
            - .properties
            - .ini
            - .fwf (fixed-width)
            - .flexwf (fixed-but-unspecified-width with an optional delimiter)
            - .txt, .lines, or .list

        See Also:
            :meth:`read_url`
            :meth:`write_file`


        Args:
            path: Only path-like strings or pathlib objects are supported, not buffers
                  (because we need a filename).
            file_hash: Check against a hash file specific to this file (e.g. <path>.sha1)
            dir_hash: Check against a per-directory hash file
            hex_hash: Check against this hex-encoded hash
            attrs: Set dataset attributes/metadata (``pd.DataFrame.attrs``) from a JSON file
                   named with :attr:`typeddfs.df_typing.DfTyping.attrs_suffix`.
                   If None, chooses according to ``cls.get_typing().io.use_attrs``;
                   if False, does not set.

        Returns:
            An instance of this class
        """
        if any((str(path).startswith(x + "://")
                for x in ["http", "https", "ftp"])):
            # just save some pain -- better than a weird error in .resolve()
            raise ValueError(
                f"Cannot read from URL {path}; use read_url instead")
        path = Path(path).resolve()
        t: DfTyping = cls.get_typing()
        if attrs is None:
            attrs = t.io.use_attrs
        cs = Checksums(alg=t.io.hash_algorithm)
        cs.verify_any(path,
                      file_hash=file_hash,
                      dir_hash=dir_hash,
                      computed=hex_hash)
        df = cls._call_read(cls, path)
        if attrs:
            attrs_path = path.parent / (path.name + t.io.attrs_suffix)
            json_data = Utils.json_decoder().from_str(
                attrs_path.read_text(encoding="utf-8"))
            df.attrs.update(json_data)
        return cls._convert_typed(df)
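
A hedged sketch of reading with verification, reusing the typed class ``t`` built in the tests above; the path is made up and ``expected_hex`` is a hypothetical, previously recorded hex digest:

    df = t.read_file("data/table.csv", file_hash=True)         # check the sibling hash file
    df = t.read_file("data/table.csv", hex_hash=expected_hex)  # check a known digest directly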
Example #8
    def write_file(
        self,
        path: Union[Path, str],
        *,
        overwrite: bool = True,
        mkdirs: bool = False,
        file_hash: Optional[bool] = None,
        dir_hash: Optional[bool] = None,
        attrs: Optional[bool] = None,
    ) -> Optional[str]:
        """
        Writes to a file, guessing the format from the filename extension.
        Delegates to the ``to_*`` functions of this class (e.g. ``to_csv``).
        Only includes file formats that can be read back in with corresponding ``to`` methods.

        Supports, where text formats permit optional .gz, .zip, .bz2, or .xz:
            - .csv, .tsv, or .tab
            - .json
            - .feather
            - .fwf (fixed-width)
            - .flexwf (columns aligned but using a delimiter)
            - .parquet or .snappy
            - .h5, .hdf, or .hdf5
            - .xlsx, .xls, and other variants for Excel
            - .odt and .ods (OpenOffice)
            - .xml
            - .toml
            - .ini
            - .properties
            - .pkl and .pickle
            - .txt, .lines, or .list; see :meth:`to_lines` and :meth:`read_lines`

        See Also:
            :meth:`read_file`

        Args:
            path: Only path-like strings or pathlib objects are supported, not buffers
                  (because we need a filename).
            overwrite: If False, complain if the file already exists
            mkdirs: Make the directory and parents if they do not exist
            file_hash: Write a hash for this file.
                       The filename will be path+"."+algorithm.
                       If None, chooses according to ``self.get_typing().io.hash_file``.
            dir_hash: Append a hash for this file into a list.
                      The filename will be the directory name suffixed by the algorithm;
                      (i.e. path.parent/(path.parent.name+"."+algorithm) ).
                      If None, chooses according to ``self.get_typing().io.hash_dir``.
            attrs: Write dataset attributes/metadata (``pd.DataFrame.attrs``) to a JSON file
                   named with :attr:`typeddfs.df_typing.DfTyping.attrs_suffix`.
                   If None, chooses according to ``self.get_typing().io.use_attrs``.

        Returns:
            Whatever the corresponding method on ``pd.to_*`` returns.
            This is usually either str or None

        Raises:
            InvalidDfError: If the DataFrame is not valid for this type
            ValueError: If the type of a column or index name is non-str
        """
        if any((str(path).startswith(x + "://")
                for x in ["http", "https", "ftp"])):
            # just save some pain -- better than a weird error in .resolve()
            raise ValueError(f"Cannot write to URL {path}")
        path = Path(path).resolve()
        t = self.__class__.get_typing()
        file_hash = file_hash is True or (file_hash is None and t.io.file_hash)
        dir_hash = dir_hash is True or (dir_hash is None and t.io.dir_hash)
        attrs = attrs is True or (attrs is None and t.io.use_attrs)
        attrs_path = path.parent / (path.name + t.io.attrs_suffix)
        attrs_data = Utils.json_encoder().as_str(self.attrs)
        cs = Checksums(alg=t.io.hash_algorithm)
        file_hash_path = cs.get_filesum_of_file(path)
        dir_hash_path = cs.get_dirsum_of_file(path)
        # check for overwrite errors now to preserve atomicity
        if not overwrite:
            if path.exists():
                raise FileExistsError(f"File {path} already exists")
            if file_hash and file_hash_path.exists():
                raise HashFileExistsError(f"{file_hash_path} already exists")
            if dir_hash_path.exists():
                dir_sums = Checksums(
                    alg=t.io.hash_algorithm).load_dirsum_exact(dir_hash_path)
                if path in dir_sums:
                    raise HashEntryExistsError(
                        f"Path {path} listed in {dir_hash_path}")
            if attrs and attrs_path.exists():
                raise FileExistsError(f"{attrs_path} already exists")
        self._check(self)
        types = set(self.column_names()).union(self.index_names())
        if any((not isinstance(c, str) for c in types)):
            raise NonStrColumnError(
                f"Columns must be of str type to serialize, not {types}")
        # now we're ready to write
        if mkdirs:
            path.parent.mkdir(exist_ok=True, parents=True)
        # to get a FileNotFoundError instead of a WritePermissionsError:
        if not mkdirs and not path.parent.exists():
            raise FileNotFoundError(f"Directory {path.parent} not found")
        # check for lack of write-ability to any of the files
        # we had to do this after creating the dirs unfortunately
        _all_files = [(attrs, attrs_path), (file_hash, file_hash_path),
                      (dir_hash, dir_hash_path)]
        all_files = [f for a, f in _all_files if a]
        all_dirs = [f.parent for (a, f) in _all_files]
        # we need to check both the dirs and the files
        Utils.verify_can_write_dirs(*all_dirs, missing_ok=False)
        Utils.verify_can_write_files(*all_files, missing_ok=True)
        # we verified as much as we can -- finally we can write!!
        # this writes the main file:
        z = self._call_write(path)
        # write the hashes
        # this shouldn't fail
        cs = Checksums(alg=t.io.hash_algorithm)
        cs.write_any(
            path,
            to_file=file_hash,
            to_dir=dir_hash,
            overwrite=overwrite,
        )
        # write dataset attributes
        # this also shouldn't fail
        if attrs:
            attrs_path.write_text(attrs_data, encoding="utf8")
        return z
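
Finally, a short sketch of the attrs round trip described in the two docstrings above, reusing ``t``, ``df``, and ``tmpfile`` from the earlier test snippets; the attribute value is made up and ``.attrs.json`` is assumed to be the default suffix:

    with tmpfile(".csv") as path:
        df.attrs["source"] = "example"        # pandas metadata dict
        df.write_file(path, attrs=True)       # also writes "<path>.attrs.json"
        back = t.read_file(path, attrs=True)  # restores the attrs from that file
        assert back.attrs["source"] == "example"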