Ejemplo n.º 1
0
    def test_trivial_version_table(self):
        """Verify that a copy-only version table mirrors the original properties.

        The properties of the original files and of their trivially versioned
        copies are joined on "original_file_path". Every non-index column that
        does not start with "corpus" must then be identical on both sides,
        except for columns starting with "row_created" or "row_update".
        """
        original_suffix = "_original"

        with tempfile.TemporaryDirectory() as tmp_dir:
            options.persistence_dir = tmp_dir

            # Properties of the original files.
            original_table = sets.FilePropertiesTable(
                base_dir=options.project_root)
            original_table.dataset_files_extension = "py"
            original_df = original_table.get_df()
            original_df["original_file_path"] = original_df["file_path"]

            # Properties of the versioned files, stored under tmp_dir.
            version_table = TrivialVersionTable(
                version_base_dir=tmp_dir,
                version_name="trivial",
                original_base_dir=options.project_root)
            version_df = version_table.get_df()
            version_suffix = f"_{version_table.version_name}"

            joint_df = original_df.set_index("original_file_path").join(
                version_df.set_index("original_file_path"),
                lsuffix=original_suffix,
                rsuffix=version_suffix)

            # No cell of the joint table may be None.
            none_mask = joint_df.applymap(lambda value: value is None)
            assert not np.any(none_mask)

            for column in joint_df.columns.values:
                if original_suffix not in column:
                    continue
                if column.replace(original_suffix, "") in original_table.indices:
                    continue
                if column.startswith("corpus"):
                    continue

                version_column = column.replace(original_suffix, version_suffix)
                # The comparison is evaluated first on purpose, so that both
                # columns are always accessed, also for the row_* columns.
                assert np.all(joint_df[column] == joint_df[version_column]) \
                       or column.startswith(("row_created", "row_update")), \
                    f"Columns {column} and {version_column} differ: " \
                    f"{joint_df[joint_df[column] != joint_df[version_column]][[column, version_column]].iloc[0]}"
Ejemplo n.º 2
0
    def test_trivial_version_table(self):
        """Test versioning with a table that simply copies the original file.

        A FilePropertiesTable and a copy-only FileVersionTable are evaluated
        for the same target indices and joined on "original_file_path". Every
        non-index column that does not start with "corpus" must be identical
        on both sides of the join.

        NOTE(review): `target_indices` is not defined in this method; it is
        presumably a module-level list of file paths -- confirm.
        """
        # TemporaryDirectory already deletes tmp_dir when the block exits, so
        # no explicit shutil.rmtree is needed. Removing the directory by hand
        # inside the block made the exit-time cleanup operate on a missing
        # directory, which older Python releases report as FileNotFoundError.
        with tempfile.TemporaryDirectory() as tmp_dir:
            options.persistence_dir = tmp_dir

            class TrivialVersionTable(sets.FileVersionTable,
                                      sets.FilePropertiesTable):
                """Trivial FileVersionTable that makes an identical copy of the original.
                """
                version_name = "TrivialCopy"

                def __init__(self, original_properties_table):
                    # Versioned files are written under the enclosing test's
                    # temporary directory.
                    super().__init__(
                        version_name=self.version_name,
                        original_base_dir=os.path.dirname(
                            os.path.abspath(__file__)),
                        original_properties_table=original_properties_table,
                        version_base_dir=tmp_dir)

                def version(self, input_path, output_path, row):
                    """Copy input_path to output_path and check both sizes match."""
                    shutil.copy(input_path, output_path)
                    assert os.path.getsize(input_path) == os.path.getsize(
                        output_path)

            fpt = sets.FilePropertiesTable()
            fpt_df = fpt.get_df(target_indices=target_indices)
            fpt_df["original_file_path"] = fpt_df["file_path"]
            tvt = TrivialVersionTable(original_properties_table=fpt)
            tvt_df = tvt.get_df(target_indices=target_indices)

            lsuffix = "_original"
            rsuffix = f"_{tvt.version_name}"
            joint_df = fpt_df.set_index("original_file_path").join(
                tvt_df.set_index("original_file_path"),
                lsuffix=lsuffix,
                rsuffix=rsuffix)

            # No cell of the joint table may be None.
            assert not np.any(joint_df.applymap(lambda x: x is None))

            for column in [
                    c for c in joint_df.columns.values if lsuffix in c
            ]:
                if column.replace(lsuffix, "") in fpt.indices:
                    continue
                version_column = column.replace(lsuffix, rsuffix)
                if not column.startswith("corpus"):
                    assert np.all(joint_df[column] == joint_df[version_column]), \
                        f"Columns {column} and {version_column} differ: " \
                        f"{joint_df[joint_df[column] != joint_df[version_column]][[column, version_column]].iloc[0]}"
Ejemplo n.º 3
0
    def test_file_properties(self):
        """Test that file properties are correctly obtained and retrieved.

        The "*.py" files found in enb.calling_script_dir are used as target
        indices. Their properties are computed with a temporary CSV file as
        csv_support_path, requested again with fill=False and overwrite=False,
        and both results are compared column by column.

        Raises:
            RuntimeError: if comparing a column of both results raises
                ValueError.
        """
        target_indices = [
            p for p in glob.glob(os.path.join(enb.calling_script_dir, "*.py"))
            if os.path.isfile(p)
        ]

        with tempfile.NamedTemporaryFile(suffix=".csv") as tmp_file:
            dataset_properties_table = sets.FilePropertiesTable(
                csv_support_path=tmp_file.name)

            # Attempt loading from an empty file, verify it is empty because fill=False
            try:
                empty_property_table = dataset_properties_table.get_df(
                    fill=False, target_indices=target_indices)
            except ValueError:
                # A ValueError is tolerated only when there are no targets.
                # In that case empty_property_table was never bound, so the
                # checks below cannot run (they used to fail with
                # UnboundLocalError); there is nothing left to verify.
                assert len(target_indices) == 0
                return

            assert len(empty_property_table) == 0, empty_property_table
            assert empty_property_table.isnull().all().all()

            # Run the actual loading sequence
            dataset_properties_df = dataset_properties_table.get_df(
                target_indices=target_indices)
            assert len(dataset_properties_df) == len(target_indices)

            # Obtain again, forcing load from the temporary file without any additional computations
            new_df = dataset_properties_table.get_df(
                target_indices=target_indices, fill=False, overwrite=False)
            assert (dataset_properties_df.columns == new_df.columns).all()

            for c in dataset_properties_df.columns:
                try:
                    if not (dataset_properties_df[c] == new_df[c]).all():
                        # Floating point values might be unstable
                        try:
                            assert np.abs(dataset_properties_df[c] -
                                          new_df[c]).max() < 1e-12
                        except TypeError:
                            # Stability within dictionaries is not verified,
                            # but only dictionaries can raise this error
                            assert (dataset_properties_df[c].apply(
                                lambda value: isinstance(value, dict))).all()
                except ValueError as ex:
                    raise RuntimeError(
                        "The original and loaded datasets differ") from ex
Ejemplo n.º 4
0
    def test_file_properties(self):
        """Test that file properties are correctly obtained and retrieved.

        The whole sequence is run twice, with parallel_row_processing set to
        True and then to False. Each run uses its own persistence path and
        removes whatever was stored there once it finishes.

        NOTE(review): `target_indices` is not defined in this method; it is
        presumably a module-level list of file paths -- confirm.
        """
        # Function-scope import so that the cleanup below does not depend on
        # the module's import block.
        import os

        for parallel in [True, False]:
            # Reserve a unique path. The file itself is deleted as soon as the
            # with block exits, so the table starts without a persistence file.
            with tempfile.NamedTemporaryFile(suffix=".csv") as tmp_fid:
                tmp_path = tmp_fid.name

            try:
                dataset_properties_table = sets.FilePropertiesTable(
                    csv_support_path=tmp_path)

                # Attempt loading without a persistence file and with
                # fill=False: one row per target index is expected, with
                # every non-index column set to None.
                empty_property_table = dataset_properties_table.get_df(
                    target_indices=target_indices,
                    fill=False,
                    parallel_row_processing=parallel)
                assert len(empty_property_table) == len(target_indices)

                assert np.all(empty_property_table[[
                    c for c in empty_property_table.columns
                    if c not in dataset_properties_table.indices]].applymap(lambda x: x is None)), \
                    empty_property_table

                # Run the actual loading sequence
                dataset_properties_df = dataset_properties_table.get_df(
                    target_indices=target_indices,
                    parallel_row_processing=parallel)

                # Obtain again, forcing load from the temporary file
                new_df = dataset_properties_table.get_df(
                    target_indices=target_indices,
                    fill=False,
                    overwrite=False,
                    parallel_row_processing=parallel)
                assert (dataset_properties_df.columns == new_df.columns).all()
                for c in dataset_properties_df.columns:
                    if not (dataset_properties_df[c] == new_df[c]).all():
                        # Floating point values might be unstable
                        try:
                            assert np.abs(dataset_properties_df[c] -
                                          new_df[c]).max() < 1e-12
                        except TypeError:
                            # Stability within dictionaries is not verified,
                            # but only dictionaries can raise this error
                            assert (dataset_properties_df[c].apply(
                                lambda value: isinstance(value, dict))).all()
            finally:
                # Presumably get_df persisted its results at tmp_path (the
                # reload step above relies on that file); nothing else owns
                # the path any more, so remove it to avoid leaking one
                # temporary file per iteration.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
Ejemplo n.º 5
0
 def __init__(self, original_base_dir, version_base_dir):
     """Initialize the table through the parent initializer.

     Both directories are forwarded unchanged, together with a newly
     created sets.FilePropertiesTable as the original properties table,
     this class's version_name, and check_generated_files set to False.

     :param original_base_dir: forwarded as original_base_dir.
     :param version_base_dir: forwarded as version_base_dir.
     """
     properties_table = sets.FilePropertiesTable()
     super().__init__(
         original_base_dir=original_base_dir,
         version_base_dir=version_base_dir,
         original_properties_table=properties_table,
         version_name=self.version_name,
         check_generated_files=False)
Ejemplo n.º 6
0
 def __init__(self, original_base_dir, version_base_dir):
     """Initialize the table through the parent initializer.

     Both directories are forwarded unchanged, together with a newly
     created sets.FilePropertiesTable as the original properties table
     and this class's version_name.

     :param original_base_dir: forwarded as original_base_dir.
     :param version_base_dir: forwarded as version_base_dir.
     """
     properties_table = sets.FilePropertiesTable()
     super().__init__(original_base_dir=original_base_dir,
                      version_base_dir=version_base_dir,
                      original_properties_table=properties_table,
                      version_name=self.version_name)