def test_increment_without_metadata_without_schema(
    self,
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
    schema_file: Optional[LocalPath],
    verbose: bool,
):
    """Archive the iris parts one at a time and verify the growing cache.

    No metadata files and no schema are involved, so the metadata columns
    present in the expected fixtures are dropped before comparison.
    """
    # Pairs of (expected combined frame, data file archived this round).
    cases: List[Tuple[str, str]] = [
        ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6.csv"),
        ("iris-part-1-2.csv", "iris-part-2-of-6.csv"),
        ("iris-part-1-2-3.csv", "iris-part-3-of-6.csv"),
        ("iris-part-1-2-3-4.csv", "iris-part-4-of-6.csv"),
        ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6.csv"),
        ("iris_plus.csv", "iris-part-6-of-6.csv"),
    ]

    # The hash file defaults to living next to the cache file.
    if archive_fixture.hash_file is None:
        expected_hashfile = LocalPath(archive_fixture.cache_file).dirpath(
            DEFAULT_HASH_FILE
        )
    else:
        expected_hashfile = archive_fixture.hash_file

    # Nothing may exist before the first increment.
    assert not os.path.exists(expected_hashfile)
    assert not os.path.exists(archive_fixture.cache_file)
    assert len(archive_dir.listdir()) == 0

    for frame_name, data_name in cases:
        assert archive_fixture(
            archive_dir,
            [os.path.join(get_data_path(), data_name)],
            cache_filepath=archive_fixture.cache_file,
            hash_filepath=archive_fixture.hash_file,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        expected_frame = DataFrame(
            read_csv(
                os.path.join(get_data_path(), frame_name),
                dtype=str,
                index_col="Index",
            )
        )
        # Metadata columns cannot appear in an archive built without metadata.
        del expected_frame["Species"]
        del expected_frame["PetalColor"]
        expected_frame.sort_index(inplace=True)

        actual_frame = DataFrame(
            read_csv(str(archive_fixture.cache_file), dtype=str, index_col="Index")
        )
        actual_frame.sort_index(inplace=True)

        # No additional output is expected between archive calls.
        assert_captured_outerr(capsys.readouterr(), False, False)
        assert_frame_equal(expected_frame, actual_frame)

    # The final cache must hash-verify against the generated hash file.
    assert os.path.exists(expected_hashfile)
    assert syphon.check(
        archive_fixture.cache_file,
        hash_filepath=expected_hashfile,
        verbose=verbose,
    )
def test_only_update_hash_file_when_post_hash_true(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    verbose: bool,
):
    """build() must refresh the hash file only when post_hash is True."""
    datafile: str = os.path.join(get_data_path(), "iris.csv")

    assert syphon.archive(archive_dir, [datafile])
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    # Fill the cache with junk, then record its hash so check() passes.
    cache_file.write(rand_string())
    if hash_file is not None:
        resolved_hashfile = hash_file
    else:
        resolved_hashfile = cache_file.dirpath(syphon.core.check.DEFAULT_FILE)
    pathlib.Path(resolved_hashfile).touch()
    with syphon.hash.HashFile(resolved_hashfile) as hashfile:
        hashfile.update(syphon.hash.HashEntry(cache_file))
    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # post_hash=False: the cache is rebuilt but the stale hash entry remains,
    # so verification must now fail.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=False,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert not syphon.check(cache_file, hash_filepath=resolved_hashfile)

    # post_hash=True: the hash file is updated to match the rebuilt cache.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=True,
        post_hash=True,
        verbose=verbose,
    )
    assert_captured_outerr(capsys.readouterr(), verbose, False)
    assert syphon.check(cache_file, hash_filepath=resolved_hashfile)
def test_incremental_fails_when_check_fails(
    capsys: CaptureFixture,
    schema: bool,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    post_hash: bool,
    verbose: bool,
):
    """An incremental build must fail, leaving the cache intact, when check() fails.

    Bug fix: the local schema mapping previously reused the name of the
    ``schema`` bool parameter (``schema = SortedDict(...)``), which made
    ``if schema:`` always truthy — the parametrized schema=False case could
    never actually run without a schema. The mapping is now ``schema_dict``.
    """
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    schema_dict = SortedDict({"0": "Name"})
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    if schema:
        syphon.init(schema_dict, schemafile)
    assert syphon.archive(
        archive_dir, [datafile], schema_filepath=schemafile if schema else None
    )
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    expected_frame = DataFrame(read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    # Seed the cache with known content so we can prove it is untouched.
    LocalPath(datafile).copy(cache_file)
    assert os.path.exists(cache_file)

    # "check" ought to fail when the hash file does not exist.
    assert not syphon.check(cache_file, hash_filepath=hash_file)
    # If "check" fails, then the incremental build fails.
    assert not syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=True,
        overwrite=True,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(False, cache_file, hash_filepath=hash_file)

    # The failed build must not have modified the pre-existing cache file.
    actual_frame = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)
    assert_frame_equal(expected_frame, actual_frame, check_exact=True)

    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_increment_one_to_many_with_metadata_with_schema(
    self,
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
    schema_file: Optional[LocalPath],
    verbose: bool,
):
    """Archive the iris parts with one-to-many metadata under a two-level schema.

    Each data part is paired with two metadata parts; after every increment
    the cache must equal the corresponding pre-combined fixture frame.
    """
    # Expected combined frames, in archive order. Data and metadata file
    # names follow a fixed pattern, so the case table is generated rather
    # than spelled out: part N pairs with iris-part-N-of-6.csv and its two
    # .meta parts.
    frame_names = [
        "iris-part-1-of-6-combined.csv",
        "iris-part-1-2.csv",
        "iris-part-1-2-3.csv",
        "iris-part-1-2-3-4.csv",
        "iris-part-1-2-3-4-5.csv",
        "iris_plus.csv",
    ]
    cases: List[Tuple[str, str, List[str]]] = [
        (
            frame_name,
            "iris-part-%d-of-6.csv" % part,
            [
                "iris-part-%d-of-6-meta-part-%d-of-2.meta" % (part, meta)
                for meta in (1, 2)
            ],
        )
        for part, frame_name in enumerate(frame_names, start=1)
    ]

    # The hash file defaults to living next to the cache file.
    if archive_fixture.hash_file is None:
        expected_hashfile = LocalPath(archive_fixture.cache_file).dirpath(
            DEFAULT_HASH_FILE
        )
    else:
        expected_hashfile = archive_fixture.hash_file

    # Nothing may exist before the first increment.
    assert not os.path.exists(expected_hashfile)
    assert not os.path.exists(archive_fixture.cache_file)
    assert len(archive_dir.listdir()) == 0

    # The schema file defaults to living inside the archive directory.
    if schema_file is None:
        expected_schemafile = archive_dir.join(syphon.schema.DEFAULT_FILE)
    else:
        expected_schemafile = schema_file
    assert not os.path.exists(expected_schemafile)
    syphon.init(
        SortedDict({"0": "PetalColor", "1": "Species"}), expected_schemafile
    )
    assert os.path.exists(expected_schemafile)

    for frame_name, data_name, meta_names in cases:
        assert archive_fixture(
            archive_dir,
            [os.path.join(get_data_path(), data_name)],
            meta_files=[os.path.join(get_data_path(), m) for m in meta_names],
            filemap_behavior=MappingBehavior.ONE_TO_MANY,
            schema_filepath=schema_file,
            cache_filepath=archive_fixture.cache_file,
            hash_filepath=archive_fixture.hash_file,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        expected_frame = DataFrame(
            read_csv(
                os.path.join(get_data_path(), frame_name),
                dtype=str,
                index_col="Index",
            )
        )
        expected_frame.sort_index(inplace=True)

        actual_frame = DataFrame(
            read_csv(str(archive_fixture.cache_file), dtype=str, index_col="Index")
        )
        # The cache's column order is an implementation detail; align it
        # with the fixture before comparing.
        actual_frame = actual_frame.reindex(columns=expected_frame.columns)
        actual_frame.sort_index(inplace=True)

        assert_captured_outerr(capsys.readouterr(), False, False)
        assert_frame_equal(expected_frame, actual_frame)

    # The final cache must hash-verify against the generated hash file.
    assert os.path.exists(expected_hashfile)
    assert syphon.check(
        archive_fixture.cache_file,
        hash_filepath=expected_hashfile,
        verbose=verbose,
    )