Example #1
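This test archives the six iris-part-*.csv files one at a time, without metadata or a schema, and after each pass asserts that the cache matches the corresponding pre-combined frame and that the cache still verifies against the generated hash file.
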
    def test_increment_without_metadata_without_schema(
        self,
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
        schema_file: Optional[LocalPath],
        verbose: bool,
    ):
        # List of (expected frame filename, data filename) tuples
        targets: List[Tuple[str, str]] = [
            ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6.csv"),
            ("iris-part-1-2.csv", "iris-part-2-of-6.csv"),
            ("iris-part-1-2-3.csv", "iris-part-3-of-6.csv"),
            ("iris-part-1-2-3-4.csv", "iris-part-4-of-6.csv"),
            ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6.csv"),
            ("iris_plus.csv", "iris-part-6-of-6.csv"),
        ]

        expected_hashfile = (
            LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE)
            if archive_fixture.hash_file is None
            else archive_fixture.hash_file
        )
        assert not os.path.exists(expected_hashfile)
        assert not os.path.exists(archive_fixture.cache_file)
        assert len(archive_dir.listdir()) == 0

        for expected_frame_filename, data_filename in targets:
            assert archive_fixture(
                archive_dir,
                [os.path.join(get_data_path(), data_filename)],
                cache_filepath=archive_fixture.cache_file,
                hash_filepath=archive_fixture.hash_file,
                verbose=verbose,
            )
            assert_captured_outerr(capsys.readouterr(), verbose, False)

            expected_frame = DataFrame(
                read_csv(
                    os.path.join(get_data_path(), expected_frame_filename),
                    dtype=str,
                    index_col="Index",
                ))
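        # The pre-combined expected files carry the metadata-derived columns;
        # drop them because this variant archives without metadata.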
            del expected_frame["Species"]
            del expected_frame["PetalColor"]
            expected_frame.sort_index(inplace=True)
            actual_frame = DataFrame(
                read_csv(str(archive_fixture.cache_file),
                         dtype=str,
                         index_col="Index"))
            actual_frame.sort_index(inplace=True)
            assert_captured_outerr(capsys.readouterr(), False, False)

            assert_frame_equal(expected_frame, actual_frame)
            assert os.path.exists(expected_hashfile)
            assert syphon.check(
                archive_fixture.cache_file,
                hash_filepath=expected_hashfile,
                verbose=verbose,
            )
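
For orientation, the loop above reduces to the following sketch once the pytest fixtures are stripped away. The paths are hypothetical, and the comments flag where the fixture's behavior is assumed rather than shown:

    import os

    import syphon

    # Hypothetical paths standing in for the pytest fixtures.
    archive_dir = "archive"
    cache_file = "cache.csv"

    for part in ("iris-part-1-of-6.csv", "iris-part-2-of-6.csv"):
        # Assumption: the "CacheAndHashPassthruChecker" fixture forwards
        # cache_filepath/hash_filepath straight through to syphon.archive.
        assert syphon.archive(
            archive_dir, [os.path.join("data", part)], cache_filepath=cache_file
        )
        # Each pass records a hash for the cache, so it should verify
        # against the default hash file next to the cache.
        assert syphon.check(cache_file)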
Example #2
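This test seeds the cache with random content and records a matching hash entry, then shows that syphon.build leaves the hash file untouched when post_hash=False (so syphon.check fails against the rebuilt cache) and refreshes it when post_hash=True (so the check passes again).
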
    def test_only_update_hash_file_when_post_hash_true(
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        assert syphon.archive(archive_dir, [datafile])
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        cache_file.write(rand_string())

        resolved_hashfile = (
            cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
            if hash_file is None
            else hash_file
        )
        pathlib.Path(resolved_hashfile).touch()
        with syphon.hash.HashFile(resolved_hashfile) as hashfile:
            hashfile.update(syphon.hash.HashEntry(cache_file))

        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=False,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)
        assert syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=False,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)
        assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
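
Distilled from the assertions above, the post_hash contract looks like this; the paths are hypothetical stand-ins for the fixtures, and only keyword arguments that the test itself passes are used:

    import syphon

    cache_file = "cache.csv"  # hypothetical path
    data_files = ["archive/iris.csv"]  # hypothetical archived data

    # Rebuild without refreshing the hash file: verification then fails,
    # since no current hash entry exists for the new cache contents.
    assert syphon.build(
        cache_file, *data_files, incremental=False, overwrite=True, post_hash=False
    )
    assert not syphon.check(cache_file)

    # Rebuild with post_hash=True: the fresh cache is re-hashed and verifies.
    assert syphon.build(
        cache_file, *data_files, incremental=False, overwrite=True, post_hash=True
    )
    assert syphon.check(cache_file)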
Example #3
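This test forces an incremental build to run without a valid hash file: syphon.check fails, so syphon.build with incremental=True must also fail and leave the pre-seeded cache contents unchanged.
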
    def test_incremental_fails_when_check_fails(
        capsys: CaptureFixture,
        schema: bool,
        archive_dir: LocalPath,
        cache_file: LocalPath,
        hash_file: Optional[LocalPath],
        post_hash: bool,
        verbose: bool,
    ):
        datafile: str = os.path.join(get_data_path(), "iris.csv")
        # Renamed from "schema" so the SortedDict does not shadow the boolean
        # "schema" parameter (which would make the checks below always truthy).
        schema_dict = SortedDict({"0": "Name"})
        schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)

        if schema:
            syphon.init(schema_dict, schemafile)
        assert syphon.archive(
            archive_dir, [datafile], schema_filepath=schemafile if schema else None
        )
        assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

        expected_frame = DataFrame(
            read_csv(datafile, dtype=str, index_col="Index"))
        expected_frame.sort_index(inplace=True)

        LocalPath(datafile).copy(cache_file)
        assert os.path.exists(cache_file)

        # "check" ought to fail when the hash file does not exist.
        assert not syphon.check(cache_file, hash_filepath=hash_file)
        # If "check" fails, then the incremental build fails.
        assert not syphon.build(
            cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_file,
            incremental=True,
            overwrite=True,
            post_hash=post_hash,
            verbose=verbose,
        )
        assert_post_hash(False, cache_file, hash_filepath=hash_file)

        actual_frame = DataFrame(
            read_csv(cache_file, dtype=str, index_col="Index"))
        actual_frame.sort_index(inplace=True)

        assert_frame_equal(expected_frame, actual_frame, check_exact=True)
        assert_captured_outerr(capsys.readouterr(), verbose, False)
Example #4
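This test repeats the incremental flow of Example #1, but archives each data file together with two metadata files (a one-to-many mapping) against a two-column schema created via syphon.init.
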
    def test_increment_one_to_many_with_metadata_with_schema(
        self,
        capsys: CaptureFixture,
        archive_dir: LocalPath,
        archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
        schema_file: Optional[LocalPath],
        verbose: bool,
    ):
        # List of (expected frame filename, data filename, metadata filename) tuples
        targets: List[Tuple[str, str, List[str]]] = [
            (
                "iris-part-1-of-6-combined.csv",
                "iris-part-1-of-6.csv",
                [
                    "iris-part-1-of-6-meta-part-1-of-2.meta",
                    "iris-part-1-of-6-meta-part-2-of-2.meta",
                ],
            ),
            (
                "iris-part-1-2.csv",
                "iris-part-2-of-6.csv",
                [
                    "iris-part-2-of-6-meta-part-1-of-2.meta",
                    "iris-part-2-of-6-meta-part-2-of-2.meta",
                ],
            ),
            (
                "iris-part-1-2-3.csv",
                "iris-part-3-of-6.csv",
                [
                    "iris-part-3-of-6-meta-part-1-of-2.meta",
                    "iris-part-3-of-6-meta-part-2-of-2.meta",
                ],
            ),
            (
                "iris-part-1-2-3-4.csv",
                "iris-part-4-of-6.csv",
                [
                    "iris-part-4-of-6-meta-part-1-of-2.meta",
                    "iris-part-4-of-6-meta-part-2-of-2.meta",
                ],
            ),
            (
                "iris-part-1-2-3-4-5.csv",
                "iris-part-5-of-6.csv",
                [
                    "iris-part-5-of-6-meta-part-1-of-2.meta",
                    "iris-part-5-of-6-meta-part-2-of-2.meta",
                ],
            ),
            (
                "iris_plus.csv",
                "iris-part-6-of-6.csv",
                [
                    "iris-part-6-of-6-meta-part-1-of-2.meta",
                    "iris-part-6-of-6-meta-part-2-of-2.meta",
                ],
            ),
        ]

        expected_hashfile = (
            LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE)
            if archive_fixture.hash_file is None
            else archive_fixture.hash_file
        )
        assert not os.path.exists(expected_hashfile)
        assert not os.path.exists(archive_fixture.cache_file)
        assert len(archive_dir.listdir()) == 0

        expected_schemafile = (
            archive_dir.join(syphon.schema.DEFAULT_FILE)
            if schema_file is None
            else schema_file
        )
        assert not os.path.exists(expected_schemafile)
        syphon.init(SortedDict({
            "0": "PetalColor",
            "1": "Species"
        }), expected_schemafile)
        assert os.path.exists(expected_schemafile)

        for expected_frame_filename, data_filename, metadata_filenames in targets:
            assert archive_fixture(
                archive_dir,
                [os.path.join(get_data_path(), data_filename)],
                meta_files=[
                    os.path.join(get_data_path(), m)
                    for m in metadata_filenames
                ],
                filemap_behavior=MappingBehavior.ONE_TO_MANY,
                schema_filepath=schema_file,
                cache_filepath=archive_fixture.cache_file,
                hash_filepath=archive_fixture.hash_file,
                verbose=verbose,
            )
            assert_captured_outerr(capsys.readouterr(), verbose, False)

            expected_frame = DataFrame(
                read_csv(
                    os.path.join(get_data_path(), expected_frame_filename),
                    dtype=str,
                    index_col="Index",
                ))
            expected_frame.sort_index(inplace=True)
            actual_frame = DataFrame(
                read_csv(str(archive_fixture.cache_file),
                         dtype=str,
                         index_col="Index"))
            actual_frame = actual_frame.reindex(columns=expected_frame.columns)
            actual_frame.sort_index(inplace=True)
            assert_captured_outerr(capsys.readouterr(), False, False)

            assert_frame_equal(expected_frame, actual_frame)
            assert os.path.exists(expected_hashfile)
            assert syphon.check(
                archive_fixture.cache_file,
                hash_filepath=expected_hashfile,
                verbose=verbose,
            )