def test_trivial_version_table(self):
    """Test versioning with a table that simply copies the original file.

    A ``sets.FilePropertiesTable`` rooted at ``options.project_root`` (with
    ``dataset_files_extension`` set to ``"py"``) and a ``TrivialVersionTable``
    writing into a temporary directory are both evaluated. Their dataframes
    are joined on ``original_file_path`` and every non-index column that does
    not start with ``corpus`` must be identical in both, except for the
    ``row_created*`` / ``row_update*`` columns, which are allowed to differ.
    """
    # options is shared global state: remember the current value so that it
    # does not keep pointing at a deleted directory after this test.
    previous_persistence_dir = options.persistence_dir
    with tempfile.TemporaryDirectory() as tmp_dir:
        options.persistence_dir = tmp_dir
        try:
            fpt = sets.FilePropertiesTable(base_dir=options.project_root)
            fpt.dataset_files_extension = "py"
            fpt_df = fpt.get_df()
            # The join key must exist in both tables.
            fpt_df["original_file_path"] = fpt_df["file_path"]

            tvt = TrivialVersionTable(
                version_base_dir=tmp_dir,
                version_name="trivial",
                original_base_dir=options.project_root)
            tvt_df = tvt.get_df()

            lsuffix = "_original"
            rsuffix = f"_{tvt.version_name}"
            joint_df = fpt_df.set_index("original_file_path").join(
                tvt_df.set_index("original_file_path"),
                lsuffix=lsuffix, rsuffix=rsuffix)
            assert not np.any(joint_df.applymap(lambda x: x is None))

            original_columns = [
                c for c in joint_df.columns.values if lsuffix in c]
            for column in original_columns:
                if column.replace(lsuffix, "") in fpt.indices:
                    continue
                if column.startswith("corpus"):
                    continue
                version_column = column.replace(lsuffix, rsuffix)
                # The message is only evaluated when the assertion fails,
                # i.e., when at least one differing row exists.
                assert np.all(joint_df[column] == joint_df[version_column]) \
                    or column.startswith(("row_created", "row_update")), \
                    f"Columns {column} and {version_column} differ: " \
                    f"{joint_df[joint_df[column] != joint_df[version_column]][[column, version_column]].iloc[0]}"
        finally:
            options.persistence_dir = previous_persistence_dir
def test_trivial_version_table(self):
    """Test versioning with a table that simply copies the original file.

    A local ``TrivialVersionTable`` is defined whose ``version`` method copies
    the input file to the output path. The file properties table and the
    version table are evaluated for ``target_indices``, joined on
    ``original_file_path``, and every non-index column that does not start
    with ``corpus`` must be identical in both.
    """
    # options is shared global state: remember the current value so that it
    # does not keep pointing at a deleted directory after this test.
    previous_persistence_dir = options.persistence_dir
    with tempfile.TemporaryDirectory() as tmp_dir:
        options.persistence_dir = tmp_dir
        try:
            class TrivialVersionTable(sets.FileVersionTable,
                                      sets.FilePropertiesTable):
                """Trivial FileVersionTable that makes an identical copy of the original
                """
                version_name = "TrivialCopy"

                def __init__(self, original_properties_table):
                    super().__init__(
                        version_name=self.version_name,
                        original_base_dir=os.path.dirname(
                            os.path.abspath(__file__)),
                        original_properties_table=original_properties_table,
                        version_base_dir=tmp_dir)

                def version(self, input_path, output_path, row):
                    shutil.copy(input_path, output_path)
                    assert os.path.getsize(input_path) == os.path.getsize(
                        output_path)

            fpt = sets.FilePropertiesTable()
            fpt_df = fpt.get_df(target_indices=target_indices)
            # The join key must exist in both tables.
            fpt_df["original_file_path"] = fpt_df["file_path"]

            tvt = TrivialVersionTable(original_properties_table=fpt)
            tvt_df = tvt.get_df(target_indices=target_indices)

            lsuffix = "_original"
            rsuffix = f"_{tvt.version_name}"
            joint_df = fpt_df.set_index("original_file_path").join(
                tvt_df.set_index("original_file_path"),
                lsuffix=lsuffix, rsuffix=rsuffix)
            assert not np.any(joint_df.applymap(lambda x: x is None))

            original_columns = [
                c for c in joint_df.columns.values if lsuffix in c]
            for column in original_columns:
                if column.replace(lsuffix, "") in fpt.indices:
                    continue
                if column.startswith("corpus"):
                    continue
                version_column = column.replace(lsuffix, rsuffix)
                # The message is only evaluated when the assertion fails,
                # i.e., when at least one differing row exists.
                assert np.all(joint_df[column] == joint_df[version_column]), \
                    f"Columns {column} and {version_column} differ: " \
                    f"{joint_df[joint_df[column] != joint_df[version_column]][[column, version_column]].iloc[0]}"
        finally:
            # tmp_dir is deliberately not removed by hand here: the
            # TemporaryDirectory context manager deletes it on exit, and
            # removing it first can make that cleanup fail.
            options.persistence_dir = previous_persistence_dir
def test_file_properties(self):
    """Test that file properties are correctly obtained and retrieved.

    The ``*.py`` files found in ``enb.calling_script_dir`` are used as target
    indices of a ``sets.FilePropertiesTable`` backed by a temporary CSV file.
    The table is queried three times: without filling (expected to be empty),
    normally, and again with ``fill=False, overwrite=False``. The last two
    results must have the same columns and equal contents, up to a ``1e-12``
    tolerance for numeric columns; columns made only of dictionaries are not
    compared.

    :raises RuntimeError: if comparing a column of the two dataframes raises
      ``ValueError``.
    """
    target_indices = [
        p for p in glob.glob(os.path.join(enb.calling_script_dir, "*.py"))
        if os.path.isfile(p)
    ]

    with tempfile.NamedTemporaryFile(suffix=".csv") as tmp_file:
        dataset_properties_table = sets.FilePropertiesTable(
            csv_support_path=tmp_file.name)

        # Attempt loading from an empty file, verify it is empty because fill=False
        try:
            empty_property_table = dataset_properties_table.get_df(
                fill=False, target_indices=target_indices)
        except ValueError:
            # Only acceptable when there was nothing to load in the first place.
            assert len(target_indices) == 0
        else:
            # These checks need the returned dataframe, which is only bound
            # when get_df did not raise.
            assert len(empty_property_table) == 0, empty_property_table
            assert empty_property_table.isnull().all().all()

        # Run the actual loading sequence
        dataset_properties_df = dataset_properties_table.get_df(
            target_indices=target_indices)
        assert len(dataset_properties_df) == len(target_indices)

        # Obtain again, forcing load from the temporary file without any additional computations
        new_df = dataset_properties_table.get_df(
            target_indices=target_indices, fill=False, overwrite=False)
        assert (dataset_properties_df.columns == new_df.columns).all()

        for c in dataset_properties_df.columns:
            try:
                if not (dataset_properties_df[c] == new_df[c]).all():
                    # Floating point values might be unstable
                    try:
                        assert np.abs(dataset_properties_df[c]
                                      - new_df[c]).max() < 1e-12
                    except TypeError:
                        # Stability within dictionaries is not verified,
                        # but only dictionaries can raise this error
                        assert (dataset_properties_df[c].apply(
                            lambda value: isinstance(value, dict))).all()
            except ValueError as ex:
                raise RuntimeError(
                    "The original and loaded datasets differ") from ex
def test_file_properties(self):
    """Test that file properties are correctly obtained and retrieved.

    Runs the same sequence once with ``parallel_row_processing=True`` and once
    with ``False``: query a CSV-backed ``sets.FilePropertiesTable`` without
    filling, query it normally, and query it again with ``fill=False,
    overwrite=False``; the last two results must match.
    """
    for use_parallel in (True, False):
        with tempfile.NamedTemporaryFile(suffix=".csv") as csv_file:
            table = sets.FilePropertiesTable(csv_support_path=csv_file.name)

            # With fill=False there is one row per index and every
            # non-index cell is expected to be None.
            unfilled_df = table.get_df(
                target_indices=target_indices,
                fill=False,
                parallel_row_processing=use_parallel)
            assert len(unfilled_df) == len(target_indices)
            non_index_columns = [
                name for name in unfilled_df.columns
                if name not in table.indices]
            assert np.all(
                unfilled_df[non_index_columns].applymap(
                    lambda x: x is None)), \
                unfilled_df

            # Regular query.
            computed_df = table.get_df(
                target_indices=target_indices,
                parallel_row_processing=use_parallel)

            # Same query again, this time with fill=False and overwrite=False.
            reloaded_df = table.get_df(
                target_indices=target_indices,
                fill=False,
                overwrite=False,
                parallel_row_processing=use_parallel)

            assert (computed_df.columns == reloaded_df.columns).all()
            for column in computed_df.columns:
                computed = computed_df[column]
                reloaded = reloaded_df[column]
                if (computed == reloaded).all():
                    continue
                try:
                    # Small numeric differences are tolerated.
                    assert np.abs(computed - reloaded).max() < 1e-12
                except TypeError:
                    # Dictionary contents are not compared; a TypeError is
                    # only acceptable for columns made of dictionaries.
                    assert computed.apply(
                        lambda value: isinstance(value, dict)).all()
def __init__(self, original_base_dir, version_base_dir):
    """Initialize by delegating to the parent constructor.

    A new ``sets.FilePropertiesTable`` is passed as the original properties
    table, the instance's ``version_name`` as the version name, and
    ``check_generated_files`` is passed as ``False``.

    :param original_base_dir: forwarded as ``original_base_dir``.
    :param version_base_dir: forwarded as ``version_base_dir``.
    """
    parent_kwargs = dict(
        original_base_dir=original_base_dir,
        version_base_dir=version_base_dir,
        original_properties_table=sets.FilePropertiesTable(),
        version_name=self.version_name,
        check_generated_files=False,
    )
    super().__init__(**parent_kwargs)
def __init__(self, original_base_dir, version_base_dir):
    """Initialize by delegating to the parent constructor.

    A new ``sets.FilePropertiesTable`` is passed as the original properties
    table and the instance's ``version_name`` as the version name.

    :param original_base_dir: forwarded as ``original_base_dir``.
    :param version_base_dir: forwarded as ``version_base_dir``.
    """
    parent_kwargs = dict(
        original_base_dir=original_base_dir,
        version_base_dir=version_base_dir,
        original_properties_table=sets.FilePropertiesTable(),
        version_name=self.version_name,
    )
    super().__init__(**parent_kwargs)