def test_raises_indexerror_when_a_schema_column_does_not_exist(
    self, archive_meta_params: Tuple[str, str, SortedDict], archive_dir: LocalPath
):
    bad_column = "non_existent_column"

    filename: str
    expectedfilename: str
    schema: SortedDict
    filename, expectedfilename, schema = archive_meta_params

    # Add a bad column.
    local_schema = schema.copy()
    local_schema["%d" % len(local_schema)] = bad_column

    datafile = os.path.join(get_data_path(), filename + ".csv")
    metafile = os.path.join(get_data_path(), filename + ".meta")
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(local_schema, schemafile)

    with pytest.raises(IndexError, match=bad_column):
        syphon.archive(
            archive_dir,
            [datafile],
            meta_files=[metafile],
            schema_filepath=schemafile,
            overwrite=True,
        )
    assert not os.path.exists(os.path.join(os.path.dirname(datafile), "#lock"))
def test_raises_fileexistserror_on_existing_archive_file(
    self, archive_params: Tuple[str, SortedDict], archive_dir: LocalPath
):
    filename: str
    schema: SortedDict
    filename, schema = archive_params

    datafile = os.path.join(get_data_path(), filename)
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)

    expected_df = DataFrame(read_csv(datafile, dtype=str))
    expected_paths: SortedList = _get_expected_paths(
        archive_dir, schema, expected_df, filename
    )
    for e in expected_paths:
        os.makedirs(os.path.dirname(e), exist_ok=True)
        with open(e, mode="w") as f:
            f.write(rand_string())

    with pytest.raises(FileExistsError, match=os.path.basename(datafile)):
        syphon.archive(
            archive_dir, [datafile], schema_filepath=schemafile, overwrite=False
        )
    assert not os.path.exists(os.path.join(os.path.dirname(datafile), "#lock"))
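
# The tests above lean on a _get_expected_paths helper defined elsewhere in
# this suite. A minimal sketch of its assumed behavior follows (relying on the
# module-level imports used throughout this file): one archive directory level
# per schema column, one branch per combination of column values actually
# present in the frame, with the data filename appended. The name suffix and
# body are illustrative assumptions, not the real implementation.
def _get_expected_paths_sketch(
    archive_dir: str, schema: SortedDict, frame: DataFrame, filename: str
) -> SortedList:
    result = SortedList()
    # Group by every schema column at once so only value combinations that
    # co-occur in the data produce a path.
    columns = list(schema.values())
    for values, _ in frame.groupby(columns):
        if not isinstance(values, tuple):
            values = (values,)  # single-column groupby may yield scalar keys
        result.add(os.path.join(str(archive_dir), *values, filename))
    return result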
def test_without_metadata_with_schema(
    self,
    capsys: CaptureFixture,
    archive_params: Tuple[str, SortedDict],
    archive_dir: LocalPath,
    overwrite: bool,
    verbose: bool,
):
    filename: str
    schema: SortedDict
    filename, schema = archive_params

    datafile = os.path.join(get_data_path(), filename)
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)

    expected_df = DataFrame(read_csv(datafile, dtype=str))
    expected_df.sort_values(list(expected_df.columns), inplace=True)
    expected_df.reset_index(drop=True, inplace=True)

    expected_paths: SortedList = _get_expected_paths(
        archive_dir, schema, expected_df, filename
    )
    if overwrite:
        for e in expected_paths:
            os.makedirs(os.path.dirname(e), exist_ok=True)
            with open(e, mode="w") as fd:
                fd.write(rand_string())

    assert syphon.archive(
        archive_dir,
        [datafile],
        schema_filepath=schemafile,
        overwrite=overwrite,
        verbose=verbose,
    )
    assert not os.path.exists(os.path.join(os.path.dirname(datafile), "#lock"))

    actual_frame = DataFrame()
    actual_paths = SortedList()
    for root, _, files in os.walk(archive_dir):
        for f in files:
            if ".csv" in f:
                filepath: str = os.path.join(root, f)
                actual_paths.add(filepath)
                actual_frame = concat(
                    [actual_frame, DataFrame(read_csv(filepath, dtype=str))]
                )
    actual_frame.sort_values(list(actual_frame.columns), inplace=True)
    actual_frame.reset_index(drop=True, inplace=True)

    assert expected_paths == actual_paths
    assert_frame_equal(expected_df, actual_frame)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_init_fileexistserror(archive_dir: LocalPath, init_schema_fixture: SortedDict):
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    with open(schemafile, mode="w") as f:
        f.write("content")

    with pytest.raises(FileExistsError):
        syphon.init(init_schema_fixture, schemafile, overwrite=False)
def test_init(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    init_schema_fixture: SortedDict,
    overwrite: bool,
    verbose: bool,
):
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(init_schema_fixture, schemafile, overwrite, verbose)

    with open(schemafile, "r") as f:
        actual = SortedDict(loads(f.read()))

    assert actual == init_schema_fixture
    assert_captured_outerr(capsys.readouterr(), verbose, False)
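
# assert_captured_outerr is another shared helper used throughout these tests.
# A plausible sketch, assuming it simply checks that stdout and stderr received
# output when (and only when) the corresponding flag says they should have;
# the exact checks are a guess based on how the call sites pair it with the
# "verbose" parameter.
def assert_captured_outerr_sketch(captured, has_stdout: bool, has_stderr: bool) -> None:
    # "captured" is the result of capsys.readouterr().
    assert (len(captured.out) > 0) == has_stdout
    assert (len(captured.err) > 0) == has_stderr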
def test_incremental_fails_when_check_fails(
    capsys: CaptureFixture,
    schema: bool,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    post_hash: bool,
    verbose: bool,
):
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    # Only initialize a schema when the fixture asks for one. Assigning the
    # SortedDict to a local named "schema" would shadow the boolean parameter
    # and make the conditionals below always truthy.
    if schema:
        syphon.init(SortedDict({"0": "Name"}), schemafile)

    assert syphon.archive(
        archive_dir, [datafile], schema_filepath=schemafile if schema else None
    )
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    expected_frame = DataFrame(read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    LocalPath(datafile).copy(cache_file)
    assert os.path.exists(cache_file)

    # "check" ought to fail when the hash file does not exist.
    assert not syphon.check(cache_file, hash_filepath=hash_file)

    # If "check" fails, then the incremental build fails.
    assert not syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=True,
        overwrite=True,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(False, cache_file, hash_filepath=hash_file)

    actual_frame = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)

    assert_frame_equal(expected_frame, actual_frame, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
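
# get_data_files feeds syphon.build with every CSV under the archive. A short
# sketch of the assumed behavior; the name and signature come from the call
# sites above, but the body is a guess.
def get_data_files_sketch(archive_dir: LocalPath) -> List[str]:
    result: List[str] = []
    for root, _, files in os.walk(str(archive_dir)):
        for f in files:
            if f.endswith(".csv"):
                result.append(os.path.join(root, f))
    return result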
def test_full_build_with_schema_maintains_data_fidelity(
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    overwrite: bool,
    post_hash: bool,
    verbose: bool,
):
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    schema = SortedDict({"0": "Name"})
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile, overwrite=overwrite)

    assert syphon.archive(
        archive_dir, [datafile], schema_filepath=schemafile, overwrite=overwrite
    )
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    expected_frame = DataFrame(read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    if overwrite:
        cache_file.write(rand_string())

    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=False,
        overwrite=overwrite,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

    actual_frame = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)

    assert_frame_equal(expected_frame, actual_frame, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
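
# assert_post_hash verifies the hash-file side effect of syphon.build. A sketch
# under stated assumptions: the hash file defaults to DEFAULT_HASH_FILE beside
# the cache, and a successful post-hash records a digest of the cache contents.
# The SHA-256 choice and the hash-file format are assumptions for illustration.
def assert_post_hash_sketch(
    post_hash: bool, cache_file: LocalPath, hash_filepath: Optional[LocalPath] = None
) -> None:
    import hashlib

    hashfile = (
        str(hash_filepath)
        if hash_filepath is not None
        else os.path.join(os.path.dirname(str(cache_file)), DEFAULT_HASH_FILE)
    )
    if post_hash:
        # Expect an up-to-date digest entry for the cache file.
        with open(str(cache_file), "rb") as fd:
            digest = hashlib.sha256(fd.read()).hexdigest()
        with open(hashfile, "r") as fd:
            assert digest in fd.read()
    else:
        # No successful post-hash means no hash entry should have been written.
        assert not os.path.exists(hashfile)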
def test_incremental_becomes_full_build_when_cache_does_not_exist(
    capsys: CaptureFixture,
    schema: bool,
    archive_dir: LocalPath,
    cache_file: LocalPath,
    hash_file: Optional[LocalPath],
    post_hash: bool,
    verbose: bool,
):
    datafile: str = os.path.join(get_data_path(), "iris.csv")
    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    # As above, avoid shadowing the boolean "schema" parameter.
    if schema:
        syphon.init(SortedDict({"0": "Name"}), schemafile)

    assert syphon.archive(
        archive_dir, [datafile], schema_filepath=schemafile if schema else None
    )
    assert not os.path.exists(os.path.join(get_data_path(), "#lock"))

    expected_frame = DataFrame(read_csv(datafile, dtype=str, index_col="Index"))
    expected_frame.sort_index(inplace=True)

    # Raises a FileExistsError unless a full build is performed.
    assert syphon.build(
        cache_file,
        *get_data_files(archive_dir),
        hash_filepath=hash_file,
        incremental=True,
        post_hash=post_hash,
        verbose=verbose,
    )
    assert_post_hash(post_hash, cache_file, hash_filepath=hash_file)

    actual_frame = DataFrame(read_csv(cache_file, dtype=str, index_col="Index"))
    actual_frame.sort_index(inplace=True)

    assert_frame_equal(expected_frame, actual_frame, check_exact=True)
    assert_captured_outerr(capsys.readouterr(), verbose, False)
def test_raises_valueerror_when_metadata_is_inconsistent(
    self,
    archive_meta_params: Tuple[str, str, SortedDict],
    archive_dir: LocalPath,
    import_dir: LocalPath,
):
    filename: str
    expectedfilename: str
    schema: SortedDict
    filename, expectedfilename, schema = archive_meta_params

    datafile = os.path.join(get_data_path(), filename + ".csv")
    bad_metafile = LocalPath(
        os.path.join(get_data_path(), filename + "-inconsistent.meta")
    )
    metafile = import_dir.join(filename + ".meta")
    bad_metafile.copy(metafile)

    schemafile = os.path.join(archive_dir, syphon.schema.DEFAULT_FILE)
    syphon.init(schema, schemafile)

    # Find the column that will appear in the error message: the first one
    # whose metadata values are inconsistent (i.e., not all identical). Assign
    # inside the loop so "column" stays None when no such column exists.
    metaframe = DataFrame(read_csv(metafile, dtype=str))
    column: Optional[str] = None
    for candidate in metaframe.columns:
        if len(metaframe[candidate].drop_duplicates().values) > 1:
            column = candidate
            break
    del metaframe
    assert column is not None

    with pytest.raises(ValueError, match=column):
        syphon.archive(
            archive_dir,
            [datafile],
            meta_files=[metafile],
            schema_filepath=schemafile,
            overwrite=True,
        )
    assert not os.path.exists(os.path.join(os.path.dirname(datafile), "#lock"))
def test_increment_without_metadata_with_schema(
    self,
    capsys: CaptureFixture,
    archive_dir: LocalPath,
    archive_fixture: "TestArchive.ArchiveCacheAndHashPassthruChecker",
    schema_file: Optional[LocalPath],
    verbose: bool,
):
    # List of (expected frame filename, data filename) tuples.
    targets: List[Tuple[str, str]] = [
        ("iris-part-1-of-6-combined.csv", "iris-part-1-of-6-combined.csv"),
        ("iris-part-1-2.csv", "iris-part-2-of-6-combined.csv"),
        ("iris-part-1-2-3.csv", "iris-part-3-of-6-combined.csv"),
        ("iris-part-1-2-3-4.csv", "iris-part-4-of-6-combined.csv"),
        ("iris-part-1-2-3-4-5.csv", "iris-part-5-of-6-combined.csv"),
        ("iris_plus.csv", "iris-part-6-of-6-combined.csv"),
    ]

    expected_hashfile = (
        LocalPath(archive_fixture.cache_file).dirpath(DEFAULT_HASH_FILE)
        if archive_fixture.hash_file is None
        else archive_fixture.hash_file
    )
    assert not os.path.exists(expected_hashfile)
    assert not os.path.exists(archive_fixture.cache_file)
    assert len(archive_dir.listdir()) == 0

    expected_schemafile = (
        archive_dir.join(syphon.schema.DEFAULT_FILE)
        if schema_file is None
        else schema_file
    )
    assert not os.path.exists(expected_schemafile)
    syphon.init(
        SortedDict({"0": "PetalColor", "1": "Species"}), expected_schemafile
    )
    assert os.path.exists(expected_schemafile)

    for expected_frame_filename, data_filename in targets:
        assert archive_fixture(
            archive_dir,
            [os.path.join(get_data_path(), data_filename)],
            schema_filepath=schema_file,
            cache_filepath=archive_fixture.cache_file,
            hash_filepath=archive_fixture.hash_file,
            verbose=verbose,
        )
        assert_captured_outerr(capsys.readouterr(), verbose, False)

        expected_frame = DataFrame(
            read_csv(
                os.path.join(get_data_path(), expected_frame_filename),
                dtype=str,
                index_col="Index",
            )
        )
        expected_frame.sort_index(inplace=True)

        actual_frame = DataFrame(
            read_csv(str(archive_fixture.cache_file), dtype=str, index_col="Index")
        )
        actual_frame.sort_index(inplace=True)

        assert_captured_outerr(capsys.readouterr(), False, False)
        assert_frame_equal(expected_frame, actual_frame)

    assert os.path.exists(expected_hashfile)
    assert syphon.check(
        archive_fixture.cache_file,
        hash_filepath=expected_hashfile,
        verbose=verbose,
    )
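
# The archive_fixture above is a TestArchive.ArchiveCacheAndHashPassthruChecker.
# Judging only from its call sites, it appears to wrap syphon.archive with an
# incremental syphon.build into a fixed cache file, passing hash paths through.
# A hedged sketch of that assumed shape; the class body is illustrative, not
# the fixture's actual implementation.
class ArchiveCacheAndHashPassthruCheckerSketch:
    def __init__(self, cache_file: LocalPath, hash_file: Optional[LocalPath]):
        self.cache_file = cache_file
        self.hash_file = hash_file

    def __call__(
        self,
        archive_dir,
        files,
        schema_filepath=None,
        cache_filepath=None,
        hash_filepath=None,
        verbose=False,
    ) -> bool:
        # Archive the new files, then fold the whole archive into the cache.
        if not syphon.archive(
            archive_dir, files, schema_filepath=schema_filepath, verbose=verbose
        ):
            return False
        return syphon.build(
            cache_filepath if cache_filepath is not None else self.cache_file,
            *get_data_files(archive_dir),
            hash_filepath=hash_filepath if hash_filepath is not None else self.hash_file,
            incremental=True,
            overwrite=True,
            post_hash=True,
            verbose=verbose,
        )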