Exemple #1
0
def test_does_nothing_when_given_zero_files(
    capsys: CaptureFixture,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    incremental: bool,
    overwrite: bool,
    post_hash: bool,
    verbose: bool,
):
    """Build with no input files returns a falsy result and keeps the cache.

    The cache file is seeded with random content, a build is requested
    without any data files, and the cache's hash is compared before and
    after the call.
    """
    cache_file.write(rand_string())
    hash_before: str = syphon.hash.HashEntry(cache_file).hash

    build_options = {
        "hash_filepath": hash_file,
        "incremental": incremental,
        "overwrite": overwrite,
        "post_hash": post_hash,
        "verbose": verbose,
    }
    # No positional data files are supplied: this is the "zero files" case.
    built = syphon.build(cache_file, **build_options)
    assert not built

    assert_post_hash(False, cache_file, hash_filepath=hash_file)
    assert_captured_outerr(capsys.readouterr(), verbose, False)

    # The cache content must be exactly what was written before the build.
    hash_after: str = syphon.hash.HashEntry(cache_file).hash
    assert hash_before == hash_after
Exemple #2
0
    def test_only_update_hash_file_when_post_hash_true(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        """A full build refreshes the hash entry only when ``post_hash`` is true.

        A hash entry for a random cache file is recorded first. Rebuilding
        with ``post_hash=False`` must make ``syphon.check`` fail against that
        entry; rebuilding with ``post_hash=True`` must make it pass again.
        """
        iris_csv: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(archive_dir, [iris_csv])
        lock_path = os.path.join(get_data_path(), "#lock")
        assert not os.path.exists(lock_path)

        cache_file.write(rand_string())

        # Fall back to the default hash file next to the cache when no
        # explicit hash file is supplied.
        if hash_file is None:
            resolved_hashfile = cache_file.dirpath(
                syphon.core.check.DEFAULT_FILE)
        else:
            resolved_hashfile = hash_file
        pathlib.Path(resolved_hashfile).touch()
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            hashfile.update(syphon.hash.HashEntry(cache_file))

        def rebuild(post_hash: bool):
            # Full, overwriting build; only ``post_hash`` differs per call.
            return syphon.build(
                cache_file,
                *get_data_files(archive_dir),
                hash_filepath=hash_file,
                incremental=False,
                overwrite=True,
                post_hash=post_hash,
                verbose=verbose,
            )

        # The recorded entry matches the random cache content.
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)

        assert rebuild(post_hash=False)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        # The cache changed but the hash entry did not.
        assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)

        assert rebuild(post_hash=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
Exemple #3
0
    def test_raises_fileexistserror_when_cache_exists(
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        incremental: bool,
    ):
        """Build raises ``FileExistsError`` when the cache exists and
        ``overwrite`` is false.

        The iris data file is archived, the cache file is pre-populated with
        random content, and a non-overwriting build is expected to raise.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")

        assert syphon.archive(archive_dir, [datafile], overwrite=True)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Make sure the cache file already exists before building.
        cache_file.write(rand_string())

        with pytest.raises(FileExistsError) as errinfo:
            syphon.build(
                cache_file,
                *get_data_files(archive_dir),
                hash_filepath=hash_file,
                incremental=incremental,
                overwrite=False,
                post_hash=False,
            )
            # NOTE(review): this assert follows the call that is expected to
            # raise, so it is never executed when the test passes. Inspecting
            # ``errinfo`` belongs after the ``with`` block. Also, ``datafile``
            # itself is never passed to ``build`` (the archived copies are),
            # so confirm what the message should contain before moving it.
            assert datafile in str(errinfo.value)
        assert_post_hash(False, cache_file, hash_filepath=hash_file)
Exemple #4
0
    def test_incremental_fails_when_check_fails(
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        """An incremental build returns a falsy result when ``check`` fails.

        The cache is a plain copy of the iris data file for which
        ``syphon.check`` does not pass. The incremental build must then
        report failure and leave the cache content untouched.

        ``schema`` selects whether the archive is created with a schema file.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        # Keep the column mapping under its own name. Rebinding ``schema``
        # to this (truthy) mapping would force the schema branch on every
        # run and ignore the boolean test parameter.
        schema_columns = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(schema_columns, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        # Use an unmodified copy of the data file as the existing cache.
        LocalPath(datafile).copy(cache_file)
        assert os.path.exists(cache_file)

        # "check" ought to fail when the hash file does not exist.
        assert not syphon.check(cache_file, hash_filepath=hash_file)
        # If "check" fails, then the incremental build fails.
        assert not syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(False, cache_file, hash_filepath=hash_file)

        # The failed build must not have altered the cache content.
        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemple #5
0
    def test_full_build_with_schema_maintains_data_fidelity(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        overwrite: bool,
        post_hash: bool,
        verbose: bool,
    ):
        """A full build from a schema-based archive reproduces the source data.

        The iris data file is archived under a one-level schema and then built
        into the cache. The cache, read back as strings and sorted by index,
        must equal the original data file exactly.
        """
        iris_csv: str = os.path.join(get_data_path(), "iris.csv")
        schema_path = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
        columns = SortedDict({"0": "Name"})

        syphon.init(columns, schema_path, overwrite=overwrite)
        archived = syphon.archive(
            archive_dir,
            [iris_csv],
            schema_filepath=schema_path,
            overwrite=overwrite,
        )
        assert archived
        lock_path = os.path.join(get_data_path(), "#lock")
        assert not os.path.exists(lock_path)

        expected_frame = DataFrame(
            read_csv(iris_csv, dtype=str, index_col="Index")).sort_index()

        # Only the overwrite case starts from a pre-existing cache file.
        if overwrite:
            cache_file.write(rand_string())

        built = syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=overwrite,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert built
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index")).sort_index()

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemple #6
0
    def test_build_uses_unmodified_output_path_in_hash_entry(
            self, fs: "TestBuildHashEntryPath.FS", path_type: PathType):
        """The hash entry written by build contains the cache path as given.

        The first line of the hash file must contain the string form of the
        cache path that was passed to ``syphon.build``.
        """
        # NOTE: Current working directory is changed if PathType.NONE!
        target: Union[str, LocalPath] = fs.cache(path_type)

        iris_csv: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(fs.archive, [iris_csv])
        lock_path = os.path.join(get_data_path(), "#lock")
        assert not os.path.exists(lock_path)

        archived_files = get_data_files(fs.archive)
        built = syphon.build(
            target,
            *archived_files,
            hash_filepath=fs.hashfile,
            incremental=False,
            post_hash=True,
        )
        assert built

        # Only the first line of the hash file is inspected.
        with fs.hashfile.open(mode="r") as hf:
            first_entry = hf.readline()

        assert str(target) in first_entry
0
    def test_incremental_becomes_full_build_when_cache_does_not_exist(
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        """An incremental build succeeds when no cache file exists yet.

        The iris data file is archived and then built incrementally without
        a cache file being written first. The resulting cache, read back as
        strings and sorted by index, must equal the original data exactly.

        ``schema`` selects whether the archive is created with a schema file.
        """
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        # Keep the column mapping under its own name. Rebinding ``schema``
        # to this (truthy) mapping would force the schema branch on every
        # run and ignore the boolean test parameter.
        schema_columns = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(schema_columns, schemafile)
        assert syphon.archive(archive_dir, [datafile],
                              schema_filepath=schemafile if schema else None)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        # Raises a FileExistsError unless a full build is performed.
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Exemple #8
0
    def test_incremental_maintains_data_fidelity_when_new_data_new_and_missing_columns(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        import_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        """Incremental build maintains data fidelity when new data

        * has columns not present in the existing data cache.

        * is missing columns found in the existing data cache.

        The test also requires that both the cache file and the hash file
        change, and that the hash file holds an entry for the cache file
        whose hash matches the rebuilt cache.
        """

        pre_datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-1-of-2-no-species.csv")
        ]
        datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-2-of-2-no-petalcolor.csv")
        ]

        # Fall back to the default hash file next to the cache when no
        # explicit hash file is supplied.
        resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                             if hash_file is None else hash_file)

        assert syphon.archive(archive_dir, pre_datafiles)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Pre-build
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=False,
            post_hash=True,
            verbose=False,
        )
        # Get the hash of the cache file before our main build.
        pre_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        # Get the hash of the hash file for easy file change checking.
        pre_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        # Main build
        assert syphon.build(
            cache_file,
            *datafiles,
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        post_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        post_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        expected_frame = DataFrame(
            read_csv(
                os.path.join(
                    get_data_path(),
                    "iris_plus_partial-new-data-new-and-missing-columns.csv",
                ),
                dtype=str,
                index_col="Index",
            ))
        expected_frame.sort_index(inplace=True)

        assert pre_cache_hash != post_cache_hash
        assert pre_hash_hash != post_hash_hash

        # Track whether any entry referred to the cache file. Without this
        # the loop would pass vacuously if the entry were missing.
        cache_entry_found = False
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            for entry in hashfile:
                if os.path.samefile(entry.filepath, str(cache_file)):
                    cache_entry_found = True
                    assert post_cache_hash == entry.hash
        assert cache_entry_found, "hash file has no entry for the cache file"

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)