Example #1
    def _make_moves(self, seeds_table: TabularData,
                    domain: Sequence[Tuple[float, float]],
                    num_dimensions: int) -> TabularData:
        """Produce a set of possible moves from given seed points.

        Parameters:
            seeds_table: the seed points as a tabular data source
            domain: the bounds of the domain, given as a sequence of bounds tuples (lower, upper)
            num_dimensions: the number of dimensions to explore along

        Returns:
            A tabular data source containing all of the possible next-step points
        """
        seeds: np.ndarray = seeds_table.samples()
        total_dimensions = seeds.shape[1]
        # Randomly select dimensions to vary
        dimension_indices = self.random.permutation(
            range(total_dimensions))[:num_dimensions]
        # For each seed and each dimension generate uniformly-spaced samples, then stack everything
        candidates_array = np.vstack([
            self._move_along_dimension(seed, domain, d_idx) for seed in seeds
            for d_idx in dimension_indices
        ])
        # remove duplicates
        return TabularData(candidates_array).subset(duplicates=False)
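
The helper `_move_along_dimension` is referenced above but not included in this snippet. A minimal sketch of one plausible implementation, following the comment about uniformly-spaced samples; the standalone form and the `num_steps` parameter are assumptions, not part of the original API:

import numpy as np

def move_along_dimension(seed, domain, d_idx, num_steps=10):
    """Hypothetical sketch: vary `seed` along dimension `d_idx` only."""
    lower, upper = domain[d_idx]                  # (lower, upper) bounds for this dimension
    moves = np.tile(seed, (num_steps, 1))         # num_steps copies of the seed point
    moves[:, d_idx] = np.linspace(lower, upper, num_steps)  # sweep the varied dimension
    return moves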
Example #2

def test_ExtremelyRandomizedTreesRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    rf = ExtremelyRandomizedTreesRegressionSklearn(rng=2,
                                                   uncertainties="naive")
    train_data = TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5],
                       [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    rf.fit(train_data)

    mean = rf.apply(TabularData(data=np.array([[-1], [0], [1]]))).mean
    assert np.allclose(mean, [-1, 0, 1], atol=0.2)

    stddev = rf.apply(TabularData(data=np.array([[-2], [0], [2]]))).stddev
    assert stddev[0] > stddev[1] < stddev[2]

    # without uncertainties
    rf = ExtremelyRandomizedTreesRegressionSklearn(
        rng=1)  # default for uncertainties is None
    rf.fit(train_data)

    preds = rf.apply(TabularData(data=np.array([[-1], [0], [1]])))
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)

    assert isinstance(preds, DeltaPredictiveDistribution)
Example #3

def test_ExtremelyRandomizedTreesRegressionSklearn_1():
    """Simple examples."""

    # constant function
    # MH: for constant labels, expected uncertainties are zero
    train_data = TabularData(
        data=np.array([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    valid_data = TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))
    rf = ExtremelyRandomizedTreesRegressionSklearn(n_estimators=10,
                                                   uncertainties="naive",
                                                   rng=0)
    preds = rf.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev

    assert np.allclose(mean, [1, 1, 1, 1, 1])
    assert np.allclose(stddev, [0, 0, 0, 0, 0])

    # delta distributions (zero standard deviation)
    rf = ExtremelyRandomizedTreesRegressionSklearn(n_estimators=10,
                                                   uncertainties=None,
                                                   rng=0)
    preds = rf.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev

    assert np.allclose(mean, [1, 1, 1, 1, 1])
    assert np.allclose(stddev, [0, 0, 0, 0, 0])

    assert isinstance(preds, DeltaPredictiveDistribution)
Example #4

def test_ExtremelyRandomizedTreesRegressionSklearn_5():
    """Non-trivial test case, including standard deviation."""

    n, m, xlen = 150, 600, 10
    train_inputs = np.reshape(np.linspace(-xlen / 2, +xlen / 2, n), (n, 1))
    train_labels = (train_inputs * 2 + 1).flatten()
    train_data = TabularData(data=train_inputs, labels=train_labels)
    train_data = LabelNoise(noise=NormalNoise(
        rng=0)).fit(train_data).apply(train_data)

    valid_inputs = np.reshape(np.linspace(-xlen / 2, +xlen / 2, m), (m, 1))
    valid_labels = (valid_inputs * 2 + 1).flatten()
    valid_data = TabularData(data=valid_inputs, labels=valid_labels)
    valid_data = LabelNoise(noise=NormalNoise(
        rng=1)).fit(valid_data).apply(valid_data)

    # 12 trees meet the minimal requirements for jackknife estimates
    rf = ExtremelyRandomizedTreesRegressionSklearn(rng=0,
                                                   uncertainties="naive")
    preds = rf.fit(train_data).apply(valid_data)
    mae = MeanAbsoluteError().evaluate(valid_data.labels(), preds)

    # for perfect predictions, the expected MAE is the mean absolute difference
    # between two independent unit-normal draws, 2/sqrt(pi) ~= 1.128
    assert np.allclose(mae, 1.13, atol=0.25)
    assert np.allclose(np.mean(preds.stddev), 1, atol=0.3)
Example #5

def test_ExtremelyRandomizedTreesRegressionSklearn_3():
    """Ensure predictions are identical independent of uncertainties method used."""

    rf1 = ExtremelyRandomizedTreesRegressionSklearn(rng=1, uncertainties=None)
    rf2 = ExtremelyRandomizedTreesRegressionSklearn(rng=1,
                                                    uncertainties="naive")
    train_data = TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5],
                       [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    rf1.fit(train_data)
    rf2.fit(train_data)

    test_data = np.array([[-3], [-1], [0], [0.5], [1], [2]])
    mean1 = rf1.apply(TabularData(data=test_data)).mean
    mean2 = rf2.apply(TabularData(data=test_data)).mean
    assert np.allclose(mean1, mean2, atol=1e-6)
Example #6
    def subset(self,
               indices: Optional[np.ndarray] = None,
               duplicates: bool = False) -> TabularData:
        """Create finite subset of data.

        Parameters:
            indices: indices of the samples to include; if None, all samples are used
            duplicates: if False (default), the returned subset does not contain
                duplicate entries; if True, duplicates are kept. Both inputs
                and labels have to match for an entry to count as a duplicate.

        Returns:
            Finite dataset of vectors.
        """

        # indices is validated by the calls to samples() and labels()
        duplicates = params.boolean(duplicates)

        data = self.samples(indices)
        labels = self.labels(indices) if self.is_labeled else None

        if duplicates:
            return TabularData(data=data, labels=labels)

        # drop rows that appear more than once; inputs and labels are merged
        # so that both have to match for a row to count as a duplicate
        merged = data if labels is None else np.hstack(
            [data, np.reshape(labels, (len(labels), -1))])
        _, unique = np.unique(merged, axis=0, return_index=True)
        unique = np.sort(unique)  # keep the original sample order

        return TabularData(data=data[unique],
                           labels=labels[unique] if labels is not None else None)
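
A short usage sketch of the semantics above, assuming the `num_samples` property seen in Example #9:

import numpy as np

ds = TabularData(data=np.array([[0], [1], [1], [2]]),
                 labels=np.array([0, 1, 1, 2]))
assert ds.subset().num_samples == 3                  # duplicates removed by default
assert ds.subset(duplicates=True).num_samples == 4   # duplicates kept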
Example #7
    def _minimize(self, data: VectorSpaceData,
                  function_tracker: TrackedTransformation):
        def _clip_to_bounds(x):
            """Clip x to obey the bounds along each dimension. This is necessary because
            dual annealing does not respect bounds, but smlb is strict about bounds and will
            throw an exception if we try to sample outside of the domain.
            """
            for i in range(len(x)):
                lb = bounds[i, 0]
                ub = bounds[i, 1]
                x[i] = min(ub, max(x[i], lb))
            return x

        bounds = data.domain

        def func(x):
            return function_tracker.apply(TabularData(_clip_to_bounds(x).reshape(1, -1)))

        # split off a new random seed each time `optimize` is called
        seed = self.random.split(1)[0]
        # TODO: include a callback to record the results of each iteration. Store this info in
        #   TrackedTransformation and include it when creating the OptimizationTrajectory.
        self._minimization_algorithm(func, bounds, seed)
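
The per-coordinate loop in `_clip_to_bounds` can also be written as one vectorized call, assuming `bounds` is an `(n, 2)` array as the indexing above implies:

def clip_to_bounds_vectorized(x, bounds):
    # same effect as the loop above: clamp each coordinate into [lower, upper]
    return np.clip(x, bounds[:, 0], bounds[:, 1])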
Example #8
    def apply(self, data: Data) -> TabularData:
        """Compute matminer composition-based materials features.

        Parameters:
            data: material compositions, given as sum formula strings
                  Can be labeled, and labels will be retained

        Returns:
            TabularData or TabularLabeledData with matminer composition-based
            materials features as samples
        """

        data = params.instance(data, Data)

        inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
        features = self._mmfeatures.featurize_many(inputs_, pbar=False)
        features = np.asfarray(features)

        result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

        return result
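
A hedged usage sketch: only `apply` and `TabularData` appear in the snippet, so the featurizer's class name and constructor here are assumptions:

import numpy as np

featurizer = MatminerCompositionFeatures()   # hypothetical class name and constructor
formulas = TabularData(data=np.array(["Fe2O3", "NaCl"], dtype=object))
features = featurizer.apply(formulas)        # one row of composition features per formula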
Example #9
    def apply(self, data: Data) -> TabularData:
        """Compute selected molecular features.

        Parameters:
            data: molecular structures given as SMILES strings.
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with CDK molecular features as samples
        """

        data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

        failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

        # set up molecule SMILES
        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

        def parse_smiles(s: str, i: int):
            """Return parsed SMILES string or None on failure."""
            try:
                return parser.parseSmiles(self._samplef(s))
            except py4j.protocol.Py4JJavaError:
                # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
                failmode.handle_failure(i)
                return None  # internal sentinel value

        smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

        # compute descriptors
        # todo: the dtype of the columns could be set in advance by querying the descriptors
        #       currently, all values are stored as floating point numbers
        features = np.empty((data.num_samples, np.sum(self._arities)))
        index = 0

        def java_is_instance_of(object_, class_):
            return py4j.java_gateway.is_instance_of(
                self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
            )

        def check_arity(expected, actual):
            if expected != actual:
                raise BenchmarkError(
                    f"Invalid descriptor result arity (expected {expected}, was {actual})"
                )

        for descriptor, arity in zip(self._descriptors, self._arities):
            for i, smile in enumerate(smiles):
                if smile is None:  # SMILES parsing failed earlier; emit NaNs
                    features[i, index : index + arity] = float("nan")
                    continue

                try:
                    value = descriptor.calculate(smile).getValue()
                except py4j.protocol.Py4JJavaError:
                    failmode.handle_failure(i)
                    features[i, index : index + arity] = float("nan")
                    continue

                if java_is_instance_of(value, "IntegerResult"):
                    check_arity(arity, 1)
                    features[i, index] = int(value.intValue())
                elif java_is_instance_of(value, "DoubleResult"):
                    check_arity(arity, 1)
                    features[i, index] = float(value.doubleValue())
                elif java_is_instance_of(value, "BooleanResult"):
                    check_arity(arity, 1)
                    features[i, index] = bool(value.booleanValue())
                elif java_is_instance_of(value, "IntegerArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        int(value.get(j)) for j in range(value.length())
                    )
                elif java_is_instance_of(value, "DoubleArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        float(value.get(j)) for j in range(value.length())
                    )
                # there seems to be no BooleanArrayResult in CDK
                else:
                    name = value.getClass().getSimpleName()
                    raise BenchmarkError(f"Unsupported CDK result type '{name}'")
            index += arity

        result = (
            TabularData(data=features, labels=data.labels())
            if data.is_labeled
            else TabularData(data=features)
        )

        result = failmode.finalize(result)

        return result
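
The column bookkeeping above advances `index` by each descriptor's arity, so each descriptor occupies a contiguous block of columns. A stripped-down illustration of that layout with made-up arities:

import numpy as np

arities = (1, 3, 2)                              # e.g., one scalar and two array-valued descriptors
features = np.empty((4, int(np.sum(arities))))   # 4 samples, 1 + 3 + 2 = 6 feature columns

index = 0
for arity in arities:
    # each descriptor fills columns [index, index + arity) for every sample
    features[:, index:index + arity] = 0.0       # placeholder for the computed values
    index += arity

assert index == features.shape[1]                # all columns accounted for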
Example #10
    def select_best(self, data: TabularData,
                    scores: Sequence[float]) -> TabularData:
        """Select the best points given a tabular data set and a list of matching scores."""
        # np.argsort is ascending, so this keeps the `_num_seeds` lowest-scoring points
        best_indices = np.argsort(scores)[:self._num_seeds]
        return data.subset(best_indices)
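
A hedged usage sketch: `optimizer` stands in for an instance of the surrounding class, and lower scores are taken to be better because `np.argsort` sorts in ascending order:

import numpy as np

data = TabularData(data=np.array([[0.0], [1.0], [2.0], [3.0]]))
scores = [0.4, 0.1, 0.9, 0.2]
best = optimizer.select_best(data, scores)   # with _num_seeds == 2, keeps rows [1.0] and [3.0]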