Example #1
0
def test_sisso_regressor_omp(mocker):
    """Check a simple SISSO run set up through the OMP preset of SISSORegressor.

    The custodian run is mocked: instead of executing SISSO, a reference
    SISSO.out file is copied into the current working directory. The test then
    checks the regressor attributes set by the preset, the generated SISSO.in
    file (against a reference file) and the content parsed from SISSO.out.

    Args:
        mocker: Fixture used to patch ``Custodian.run``.
    """
    reference_dir = os.path.join(TEST_FILES_DIR, "runs", "OMP")

    # Mock the run of the custodian by just copying a reference SISSO.out file
    def copy_sisso_out():
        shutil.copy(
            os.path.join(reference_dir, "SISSO.out"),
            "SISSO.out",
        )

    mocker.patch.object(
        pysisso.sklearn.Custodian,
        "run",
        return_value=[],
        side_effect=copy_sisso_out,
    )
    with ScratchDir("."):
        sisso_reg = SISSORegressor.OMP(desc_dim=4)
        assert sisso_reg.rung == 0
        assert sisso_reg.subs_sis == 1
        assert sisso_reg.desc_dim == 4
        assert sisso_reg.method == "L0"
        assert sisso_reg.L1L0_size4L0 is None
        X = np.array([
            [8, 1, 3.01, 4],
            [6, 2, 3.02, 3],
            [2, 3, 3.01, 0],
            [10, 4, 3.02, -8],
            [4, 5, 3.01, 10],
        ])
        y = 0.9 * X[:, 1] + 0.1 * X[:, 3] - 1.0
        sisso_reg.fit(X, y)

        # The generated input file must be identical, line by line, to the
        # reference one. Both files are opened in a context manager so that
        # their handles are closed even when the assertion fails.
        actual_sin = "SISSO_dir/SISSO.in"
        ref_sin = os.path.join(reference_dir, "SISSO.in")
        with open(actual_sin) as actual_file, open(ref_sin) as ref_file:
            assert actual_file.readlines() == ref_file.readlines()

        # The parameters parsed from the output must match the regressor setup
        sisso_out = SISSOOut.from_file(filepath="SISSO_dir/SISSO.out")
        assert sisso_out.params.n_rungs == sisso_reg.rung
        assert sisso_out.params.SIS_subspaces_sizes == [sisso_reg.subs_sis]
        assert sisso_out.params.descriptor_dimension == sisso_reg.desc_dim
        assert sisso_out.params.sparsification_method == sisso_reg.method

        # y is built from the columns at positions 1 and 3 only, so these two
        # features are expected as the first two descriptors.
        sisso_model = sisso_out.model
        assert str(sisso_model.descriptors[0]) == "(feature_1)"
        assert str(sisso_model.descriptors[1]) == "(feature_3)"
Example #2
0
from pysisso.sklearn import SISSORegressor

# Training set: five samples (rows) described by four features (columns)
X = np.array(
    [
        [8, 1, 3.01, 4],
        [6, 2, 3.02, 3],
        [2, 3, 3.01, 0],
        [10, 4, 3.02, -8],
        [4, 5, 3.01, 10],
    ]
)
# The target is a linear function of the columns at positions 1 and 3 only
y = 0.9 * X[:, 1] + 0.1 * X[:, 3] - 1.0

# Build the OMP regressor and fit it, naming the features explicitly
feature_names = ["feature_0", "feature_1", "feature_2", "feature_3"]
sisso_reg = SISSORegressor.OMP(desc_dim=4)
sisso_reg.fit(X, y, columns=feature_names)

# Load the final model from the SISSO output file
sisso_out = SISSOOut.from_file(filepath="SISSO_dir/SISSO.out")
sisso_model = sisso_out.model

# String form of each descriptor of the final model
descriptors = list(map(str, sisso_model.descriptors))

# Print the order of the OMP features together with their coefficients.
# Should start with feature_1, then feature_3.
# feature_0 and feature_2 might be interchanged.
for rank, descriptor in enumerate(descriptors, start=1):
    coefficient = sisso_model.coefficients[0][rank - 1]
    print(f"#{rank}: {descriptor} ({coefficient})")
Example #3
0
def test_sisso_out():
    """Parse reference SISSO.out files and check every parsed section.

    The first part reads the output of the "cubic_function" reference run and
    checks the version, the parameters, the iterations, the models (errors,
    coefficients, intercepts, descriptors, predictions) and the cpu time.
    The second part reads an unfinished output file, which must be rejected
    unless ``allow_unfinished=True`` is passed.
    """
    cubic_out_path = os.path.join(
        TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out"
    )
    sisso_out = SISSOOut.from_file(filepath=cubic_out_path)

    # --- Version header ---
    version = sisso_out.version
    assert isinstance(version, SISSOVersion)
    assert version.version == (3, 0, 2)
    assert version.header_string == "Version SISSO.3.0.2, June, 2020."

    # --- Parameters ---
    params = sisso_out.params
    assert isinstance(params, SISSOParams)
    assert params.number_of_samples == [100]
    assert params.sparsification_method == "L0"
    expected_params_repr = """Parameters for SISSO :
 - property_type : 3
 - descriptor_dimension : 3
 - total_number_properties : 1
 - task_weighting : [1]
 - number_of_samples : [100]
 - n_scalar_features : 1
 - n_rungs : 1
 - max_feature_complexity : 10
 - n_dimension_types : 0
 - dimension_types : [[]]
 - lower_bound_maxabs_value : 0.001
 - upper_bound_maxabs_value : 100000.0
 - SIS_subspaces_sizes : [20]
 - operators : ['(+)(*)(^2)(^3)(^-1)(cos)(sin)']
 - sparsification_method : L0
 - n_topmodels : 100
 - fit_intercept : True
 - metric : RMSE"""
    assert str(params) == expected_params_repr

    # --- Iterations: one per descriptor dimension ---
    iterations = sisso_out.iterations
    assert isinstance(iterations, list)
    assert len(iterations) == params.descriptor_dimension
    first_iter, final_iter = iterations[0], iterations[-1]
    for iteration in (first_iter, final_iter):
        assert isinstance(iteration, SISSOIteration)
    assert len(first_iter.sisso_model.descriptors) == 1
    assert len(final_iter.sisso_model.descriptors) == params.descriptor_dimension
    assert first_iter.iteration_number == 1
    assert final_iter.iteration_number == 3
    assert first_iter.SIS_subspace_size == 6
    assert final_iter.SIS_subspace_size == 0

    # --- Models of the first and of the last iteration ---
    first_model = first_iter.sisso_model
    final_model = final_iter.sisso_model
    assert first_model.dimension == 1
    assert final_model.dimension == 3
    assert len(first_model.descriptors) == 1
    assert len(final_model.descriptors) == 3

    # Errors: a single value each
    for model in (first_model, final_model):
        assert len(model.rmse) == 1
        assert len(model.maxae) == 1
    assert first_model.rmse[0] == pytest.approx(0.7959386860e01)
    assert first_model.maxae[0] == pytest.approx(0.1858248525e02)
    assert final_model.rmse[0] == pytest.approx(0.1757799850e01)
    assert final_model.maxae[0] == pytest.approx(0.4267977958e01)

    # Coefficients: one list, holding one value per descriptor
    for model in (first_model, final_model):
        assert len(model.coefficients) == 1
    assert len(first_model.coefficients[0]) == 1
    assert len(final_model.coefficients[0]) == 3
    assert first_model.coefficients[0] == pytest.approx([0.2553319133e00])
    assert final_model.coefficients[0] == pytest.approx(
        [0.9856312325e00, -0.3842863966e01, -0.1417565675e01]
    )

    # Intercepts: a single value each
    for model in (first_model, final_model):
        assert len(model.intercept) == 1
    assert first_model.intercept[0] == pytest.approx(-0.5364436924e01)
    assert final_model.intercept[0] == pytest.approx(0.3890294191e01)

    # --- Descriptors ---
    first_descriptors = first_model.descriptors
    assert len(first_descriptors) == 1
    final_descriptors = final_model.descriptors
    assert len(final_descriptors) == 3

    # Single descriptor of the first model, evaluated on the "myx" column
    cubic_descriptor = first_descriptors[0]
    assert isinstance(cubic_descriptor, SISSODescriptor)
    assert cubic_descriptor.descriptor_id == 1
    assert cubic_descriptor.descriptor_string == "(myx)^3"
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["XX", "myx", "ZZ"])
    cubic_values = cubic_descriptor.evaluate(df)
    assert len(cubic_values) == 2
    assert cubic_values[0] == pytest.approx(8)
    assert cubic_values[1] == pytest.approx(125)

    # Three descriptors of the last model: ids, strings, then evaluations
    for position, descriptor in enumerate(final_descriptors, start=1):
        assert descriptor.descriptor_id == position
    expected_strings = ["(myx)^3", "(myx)^2", "(myx)"]
    for descriptor, expected_string in zip(final_descriptors, expected_strings):
        assert descriptor.descriptor_string == expected_string
    linear_descriptor = final_descriptors[2]
    assert str(linear_descriptor) == linear_descriptor.descriptor_string
    final_values = [descriptor.evaluate(df) for descriptor in final_descriptors]
    expected_values = [(8, 125), (4, 25), (2, 5)]
    for values, (at_row_0, at_row_1) in zip(final_values, expected_values):
        assert values[0] == pytest.approx(at_row_0)
        assert values[1] == pytest.approx(at_row_1)

    # --- Predictions ---
    first_prediction = first_model.predict(df)
    assert first_prediction[0] == pytest.approx(-3.3217816175999997)
    assert first_prediction[1] == pytest.approx(26.5520522385)
    final_prediction = final_model.predict(df)
    assert final_prediction[0] == pytest.approx(-6.431243163)
    assert final_prediction[1] == pytest.approx(23.9347707285)

    # --- Timing and list of models ---
    assert sisso_out.cpu_time == pytest.approx(0.64)
    models = sisso_out.models
    assert len(models) == 3
    for imodel in range(3):
        assert isinstance(models[imodel], SISSOModel)

    # Partial SISSO output
    partial_out_path = os.path.join(
        TEST_FILES_DIR, "outputs", "SISSO.3.0.2.out_not_finished"
    )
    # Without allow_unfinished, the missing total cpu time is an error
    with pytest.raises(
        ValueError,
        match=r"Should get exactly one total cpu time in the string, got 0.",
    ):
        SISSOOut.from_file(filepath=partial_out_path)
    unfinished_out = SISSOOut.from_file(
        filepath=partial_out_path, allow_unfinished=True
    )
    assert len(unfinished_out.iterations) == 2
    assert unfinished_out.cpu_time is None
    unfinished_models = unfinished_out.models
    assert len(unfinished_models) == 2
    for imodel in range(2):
        assert isinstance(unfinished_models[imodel], SISSOModel)
Example #4
0
from pysisso.outputs import (
    SISSODescriptor,
    SISSOIteration,
    SISSOModel,
    SISSOOut,
    SISSOParams,
    SISSOVersion,
    scd,
)

# Directory holding the reference files: "test_files", located next to the
# directory that contains the pysisso package.
TEST_FILES_DIR = os.path.abspath(
    os.path.join(os.path.dirname(pysisso.__file__), os.pardir, "test_files")
)

# Output of the "cubic_function" reference run, parsed when the module is loaded
_cubic_out_path = os.path.join(
    TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out"
)
sisso_out = SISSOOut.from_file(filepath=_cubic_out_path)


@pytest.mark.unit
def test_sisso_out():
    """Parse the "cubic_function" reference SISSO.out and check its header.

    Verifies the type and content of the parsed version as well as the type
    of the parsed parameters.
    """
    cubic_out_path = os.path.join(
        TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out"
    )
    parsed_out = SISSOOut.from_file(filepath=cubic_out_path)

    # Version header
    version = parsed_out.version
    assert isinstance(version, SISSOVersion)
    assert version.version == (3, 0, 2)
    assert version.header_string == "Version SISSO.3.0.2, June, 2020."

    # Parameters
    params = parsed_out.params
    assert isinstance(params, SISSOParams)
Example #5
0
    def fit(self, X, y, index=None, columns=None, tasks=None):
        """Fit a SISSO regression based on inputs X and output y.

        This method supports Multi-Task SISSO. For Single-Task SISSO, y must have a
        shape (n_samples) or (n_samples, 1).
        For Multi-Task SISSO, y must have a shape (n_samples, n_tasks). The arrays
        are stacked task after task to fit SISSO's input files.
        For example, with 10 samples and 3 properties, the output array (y) becomes
        a flat array of 30 values and the rows of the input array (X) are repeated
        once per property, giving a shape (30, n_features).
        It is also possible to provide samples without an output for some properties
        by setting that property to NaN. In that case, the corresponding values in the
        input (X) and output (y) arrays will be removed from the SISSO inputs.
        In the previous example, if 2 of the samples have NaN for the first property,
        1 sample has NaN for the second property and 4 samples have NaN for the third
        property, the final output array (y) will hold 30-2-1-4, i.e. 23 values,
        while the final input array (X) will have a shape (23, n_features).

        The "SISSO.in" and "train.dat" files are written in the run directory, the
        SISSO job is run there through custodian and the parsed "SISSO.out" is
        stored in ``self.sisso_out``. When ``self.run_dir`` is None, a new
        timestamped directory is created inside "SISSO_runs" and stored in
        ``self.run_dir``. The run directory is removed afterwards when
        ``self.clean_run_dir`` is set.

        Args:
            X: Feature vectors as an array-like of shape (n_samples, n_features).
                If X is a pandas DataFrame and columns is None, the names of its
                columns are used as feature names.
            y: Target values as an array-like of shape (n_samples,)
                or (n_samples, n_tasks).
            index: List of string identifiers for each sample. If None, "sampleN"
                with N=[1, ..., n_samples] will be used.
            columns: List of string names of the features. If None, "featN"
                with N=[1, ..., n_features] will be used.
            tasks: When Multi-Task SISSO is used, this is the list of string names
                that will be used for each task/property. If None, "taskN"
                with N=[1, ..., n_tasks] will be used.

        Raises:
            NotImplementedError: If ``self.use_custodian`` is not set.
            ValueError: If the number of columns does not match the second axis of
                X, if index, X and y do not have the same number of samples, or if
                y has an unsupported shape.
        """
        if not self.use_custodian:
            raise NotImplementedError

        self.sisso_in = SISSOIn.from_sisso_keywords(  # pylint: disable=W0201
            ptype=1,
            ntask=self.ntask,
            task_weighting=self.task_weighting,
            desc_dim=self.desc_dim,
            restart=self.restart,
            rung=self.rung,
            opset=self.opset,
            maxcomplexity=self.maxcomplexity,
            dimclass=self.dimclass,
            maxfval_lb=self.maxfval_lb,
            maxfval_ub=self.maxfval_ub,
            subs_sis=self.subs_sis,
            method=self.method,
            L1L0_size4L0=self.L1L0_size4L0,
            fit_intercept=self.fit_intercept,
            metric=self.metric,
            nm_output=self.nm_output,
            isconvex=self.isconvex,
            width=self.width,
            nvf=self.nvf,
            vfsize=self.vfsize,
            vf2sf=self.vf2sf,
            npf_must=self.npf_must,
            L1_max_iter=self.L1_max_iter,
            L1_tole=self.L1_tole,
            L1_dens=self.L1_dens,
            L1_nlambda=self.L1_nlambda,
            L1_minrmse=self.L1_minrmse,
            L1_warm_start=self.L1_warm_start,
            L1_weighted=self.L1_weighted,
        )

        # The DataFrame column names have to be read before X is converted to a
        # plain array, as the conversion drops them.
        if columns is None and isinstance(X, pd.DataFrame):
            columns = list(X.columns)

        # Set up data. The conversion is done before any use of ``X.shape`` so
        # that any array-like input (e.g. nested lists) is accepted.
        X = np.array(X)
        y = np.array(y)

        # Set up columns. These columns are used by the SISSO model wrapper afterwards
        # for the prediction.
        # ``None`` and emptiness are tested explicitly (instead of relying on
        # truthiness) so that numpy arrays or pandas Index objects can be passed.
        if columns is None or len(columns) == 0:
            columns = [f"feat{ifeat:d}" for ifeat in range(1, X.shape[1] + 1)]
        self.columns = list(columns)  # pylint: disable=W0201
        if len(self.columns) != X.shape[1]:
            raise ValueError(
                "Columns should be of the size of the second axis of X.")

        # Identifiers of the samples, as provided by the user or generated
        if index is None or len(index) == 0:
            samples_index = [
                f"sample{ii:d}" for ii in range(1, X.shape[0] + 1)
            ]
        else:
            samples_index = list(index)

        if y.ndim == 1 or (y.ndim == 2
                           and y.shape[1] == 1):  # Single-Task SISSO
            self.ntasks = 1  # pylint: disable=W0201
            index = samples_index
            if len(index) != len(y) or len(index) != len(X):
                raise ValueError("Index, X and y should have same size.")
            nsample = None
        elif y.ndim == 2 and y.shape[1] > 1:  # Multi-Task SISSO
            self.ntasks = y.shape[1]  # pylint: disable=W0201
            if (len(samples_index) != len(y)
                    or len(samples_index) != len(X)):
                raise ValueError("Index, X and y should have same size.")
            if tasks is None or len(tasks) == 0:
                tasks = [f"task{ii:d}" for ii in range(1, self.ntasks + 1)]
            # The empty float blocks keep the stacked arrays in (at least)
            # float precision, whatever the dtype of the inputs.
            X_blocks = [np.zeros((0, X.shape[1]))]
            y_blocks = [np.array([])]
            index = []
            nsample = []
            for itask in range(self.ntasks):
                yadd = y[:, itask]
                # Positions of the samples having a value for this task
                totake = np.flatnonzero(~np.isnan(yadd))
                y_blocks.append(yadd[totake])
                X_blocks.append(X[totake])
                nsample.append(len(totake))
                index.extend(
                    f"{samples_index[ii]}_{tasks[itask]}" for ii in totake
                )
            X = np.vstack(X_blocks)
            y = np.concatenate(y_blocks)
        else:
            raise ValueError("Wrong shapes.")
        data = pd.DataFrame(X, index=index, columns=self.columns)
        data.insert(0, "target", y)
        data.insert(0, "identifier", index)

        # Set up SISSODat and SISSOIn
        sisso_dat = SISSODat(data=data,
                             features_dimensions=self.features_dimensions,
                             nsample=nsample)
        self.sisso_in.set_keywords_for_SISSO_dat(sisso_dat=sisso_dat)

        # Run SISSO
        if self.run_dir is None:
            makedirs_p("SISSO_runs")
            timestamp = get_timestamp()
            self.run_dir = tempfile.mkdtemp(suffix=None,
                                            prefix=f"SISSO_dir_{timestamp}_",
                                            dir="SISSO_runs")
        else:
            makedirs_p(self.run_dir)
        with cd(self.run_dir):
            self.sisso_in.to_file(filename="SISSO.in")
            sisso_dat.to_file(filename="train.dat")
            job = SISSOJob()
            c = Custodian(jobs=[job], handlers=[], validators=[])
            c.run()
            self.sisso_out = SISSOOut.from_file(  # pylint: disable=W0201
                filepath="SISSO.out")

        # Clean run directory
        if (self.clean_run_dir
            ):  # TODO: add check here to not remove "." if the user passes . ?
            shutil.rmtree(self.run_dir)