Example #1
    def test_build_no_hash(archive_dir: LocalPath, cache_file: LocalPath):
        assert not os.path.exists(cache_file)
        assert syphon.__main__.main(_init_args(archive_dir)) == 0
        assert syphon.__main__.main(_archive_args(archive_dir)) == 0

        arguments = _build_args(archive_dir, cache_file)
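        # The --no-hash flag should leave the default hash file unwritten.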
        arguments.append("--no-hash")

        assert syphon.__main__.main(arguments) == 0
        assert os.path.exists(cache_file)
        assert not os.path.exists(
            cache_file.dirpath(syphon.core.check.DEFAULT_FILE))
        assert cache_file.size() > 0
Example #2
def assert_post_hash(
    post_hash: bool,
    cache_file: LocalPath,
    hash_filepath: Optional[LocalPath],
    verbose: bool = False,
):
    import syphon.core.check

    resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                         if hash_filepath is None else hash_filepath)
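    # A post-build hash means the cache must verify against the hash file;
    # otherwise no hash file should have been written at all.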
    if post_hash:
        assert syphon.core.check.check(cache_file,
                                       hash_filepath=resolved_hashfile,
                                       verbose=verbose)
    else:
        assert not os.path.exists(resolved_hashfile)
Example #3
    def test_only_update_hash_file_when_post_hash_true(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(archive_dir, [datafile])
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        cache_file.write(rand_string())

        resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                             if hash_file is None else hash_file)
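        # Record the cache's current hash so the initial check passes.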
        pathlib.Path(resolved_hashfile).touch()
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            hashfile.update(syphon.hash.HashEntry(cache_file))

        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=False,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
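        # post_hash=False left the hash file untouched, so verification now fails.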
        assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
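        # post_hash=True refreshed the hash file, so verification passes again.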
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
Example #4
    def test_build(
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: LocalPath,
        specify_hashfile: bool,
    ):
        assert not os.path.exists(cache_file)
        assert syphon.__main__.main(_init_args(archive_dir)) == 0
        assert syphon.__main__.main(_archive_args(archive_dir)) == 0

        arguments = _build_args(archive_dir, cache_file)
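        # Optionally append an explicit hash file path to the build arguments.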
        if specify_hashfile:
            arguments.append(str(hash_file))

        assert syphon.__main__.main(arguments) == 0
        assert os.path.exists(cache_file)
        assert os.path.exists(
            hash_file if specify_hashfile
            else cache_file.dirpath(syphon.core.check.DEFAULT_FILE))
        # If we're using our own hash file, then the default should not be created.
        if specify_hashfile:
            assert not os.path.exists(
                cache_file.dirpath(syphon.core.check.DEFAULT_FILE))
        assert cache_file.size() > 0
Example #5
def schema_file(request: FixtureRequest, archive_dir: LocalPath) -> Optional[LocalPath]:
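    """Return None or the path of an explicit schema file, depending on the fixture parameter."""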
    return None if request.param else archive_dir.dirpath("schemafile")
Example #6
    def test_incremental_maintains_data_fidelity_when_new_data_new_and_missing_columns(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        import_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        """Incremental build maintains data fidelity when new data

        * has columns not present in the existing data cache.

        * is missing columns found in the existing data cache.
        """

        pre_datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-1-of-2-no-species.csv")
        ]
        datafiles: List[str] = [
            os.path.join(get_data_path(),
                         "iris_plus_partial-2-of-2-no-petalcolor.csv")
        ]

        resolved_hashfile = (cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
                             if hash_file is None else hash_file)

        assert syphon.archive(archive_dir, pre_datafiles)
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        # Pre-build
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=False,
            post_hash=True,
            verbose=False,
        )
        # Get the hash of the cache file before our main build.
        pre_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        # Get the hash of the hash file for easy file change checking.
        pre_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        # Main build
        assert syphon.build(
            cache_file,
            *datafiles,
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        post_cache_hash: str = syphon.hash.HashEntry(cache_file).hash
        post_hash_hash: str = syphon.hash.HashEntry(resolved_hashfile).hash

        expected_frame = DataFrame(
            read_csv(
                os.path.join(
                    get_data_path(),
                    "iris_plus_partial-new-data-new-and-missing-columns.csv",
                ),
                dtype=str,
                index_col="Index",
            ))
        expected_frame.sort_index(inplace=True)

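        # Both the cache and the hash file must have changed during the incremental build.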
        assert pre_cache_hash != post_cache_hash
        assert pre_hash_hash != post_hash_hash

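        # The hash file's entry for the cache must match its post-build hash.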
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            for entry in hashfile:
                if os.path.samefile(entry.filepath, str(cache_file)):
                    assert post_cache_hash == entry.hash

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)