def _make_moves(self, seeds_table: TabularData, domain: Sequence[Tuple[float, float]],
                num_dimensions: int) -> TabularData:
    """Produce a set of possible moves from given seed points.

    Parameters:
        seeds_table: the seed points as a tabular data source
        domain: the bounds of the domain, given as a sequence of bounds tuples (lower, upper)
        num_dimensions: the number of dimensions to explore along

    Returns:
        A tabular data source containing all of the possible next-step points
    """
    seeds: np.ndarray = seeds_table.samples()
    total_dimensions = seeds.shape[1]

    # Randomly select dimensions to vary
    dimension_indices = self.random.permutation(range(total_dimensions))[:num_dimensions]

    # For each seed and each dimension, generate uniformly-spaced samples, then stack everything
    candidates_array = np.vstack([
        self._move_along_dimension(seed, domain, d_idx)
        for seed in seeds
        for d_idx in dimension_indices
    ])

    # remove duplicates
    return TabularData(candidates_array).subset(duplicates=False)

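# A minimal sketch of the `_move_along_dimension` helper referenced above, assuming it produces
# uniformly spaced candidate points along a single dimension while holding the seed's other
# coordinates fixed. The attribute name `_num_samples_per_dim` is hypothetical and not taken
# from the source.
def _move_along_dimension(self, seed: np.ndarray, domain: Sequence[Tuple[float, float]],
                          d_idx: int) -> np.ndarray:
    lower, upper = domain[d_idx]
    # uniformly spaced values spanning the bounds of the chosen dimension
    values = np.linspace(lower, upper, self._num_samples_per_dim)
    # copy the seed once per candidate value, then overwrite the varied dimension
    candidates = np.tile(seed, (len(values), 1))
    candidates[:, d_idx] = values
    return candidates
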
def test_ExtremelyRandomizedTreesRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    rf = ExtremelyRandomizedTreesRegressionSklearn(rng=2, uncertainties="naive")
    train_data = TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    rf.fit(train_data)

    mean = rf.apply(TabularData(data=np.array([[-1], [0], [1]]))).mean
    assert np.allclose(mean, [-1, 0, 1], atol=0.2)

    stddev = rf.apply(TabularData(data=np.array([[-2], [0], [2]]))).stddev
    assert stddev[0] > stddev[1] < stddev[2]

    # without uncertainties
    rf = ExtremelyRandomizedTreesRegressionSklearn(rng=1)  # default for uncertainties is None
    rf.fit(train_data)
    preds = rf.apply(TabularData(data=np.array([[-1], [0], [1]])))
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)
    assert isinstance(preds, DeltaPredictiveDistribution)

def test_ExtremelyRandomizedTreesRegressionSklearn_1():
    """Simple examples."""

    # constant function
    # MH: for constant labels, expected uncertainties are zero
    train_data = TabularData(
        data=np.array([[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4]]),
        labels=np.array([1, 1, 1, 1, 1, 1, 1, 1, 1]),
    )
    valid_data = TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    rf = ExtremelyRandomizedTreesRegressionSklearn(n_estimators=10, uncertainties="naive", rng=0)
    preds = rf.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev
    assert np.allclose(mean, [1, 1, 1, 1, 1])
    assert np.allclose(stddev, [0, 0, 0, 0, 0])

    # delta distributions (zero standard deviation)
    rf = ExtremelyRandomizedTreesRegressionSklearn(n_estimators=10, uncertainties=None, rng=0)
    preds = rf.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev
    assert np.allclose(mean, [1, 1, 1, 1, 1])
    assert np.allclose(stddev, [0, 0, 0, 0, 0])
    assert isinstance(preds, DeltaPredictiveDistribution)

def test_ExtremelyRandomizedTreesRegressionSklearn_5():
    """Non-trivial test case, including standard deviation."""

    n, m, xlen = 150, 600, 10
    train_inputs = np.reshape(np.linspace(-xlen / 2, +xlen / 2, n), (n, 1))
    train_labels = (train_inputs * 2 + 1).flatten()
    train_data = TabularData(data=train_inputs, labels=train_labels)
    train_data = LabelNoise(noise=NormalNoise(rng=0)).fit(train_data).apply(train_data)

    valid_inputs = np.reshape(np.linspace(-xlen / 2, +xlen / 2, m), (m, 1))
    valid_labels = (valid_inputs * 2 + 1).flatten()
    valid_data = TabularData(data=valid_inputs, labels=valid_labels)
    valid_data = LabelNoise(noise=NormalNoise(rng=1)).fit(valid_data).apply(valid_data)

    rf = ExtremelyRandomizedTreesRegressionSklearn(rng=0, uncertainties="naive")
    preds = rf.fit(train_data).apply(valid_data)
    mae = MeanAbsoluteError().evaluate(valid_data.labels(), preds)

    # for perfect predictions, expect an MAE of 2/sqrt(pi) ~ 1.128, the mean absolute
    # difference between draws from two independent unit normal distributions
    assert np.allclose(mae, 1.13, atol=0.25)
    assert np.allclose(np.mean(preds.stddev), 1, atol=0.3)

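# Worked check of the expected MAE noted in the test above: for independent a, b ~ N(0, 1),
# the difference a - b ~ N(0, 2); for Z ~ N(0, sigma^2), E|Z| = sigma * sqrt(2/pi), giving
# sqrt(2) * sqrt(2/pi) = 2/sqrt(pi) ~ 1.128.
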
def test_ExtremelyRandomizedTreesRegressionSklearn_3():
    """Ensure predictions are identical independent of uncertainties method used."""

    rf1 = ExtremelyRandomizedTreesRegressionSklearn(rng=1, uncertainties=None)
    rf2 = ExtremelyRandomizedTreesRegressionSklearn(rng=1, uncertainties="naive")
    train_data = TabularData(
        data=np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]]),
        labels=np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]),
    )
    rf1.fit(train_data)
    rf2.fit(train_data)

    test_data = np.array([[-3], [-1], [0], [0.5], [1], [2]])
    mean1 = rf1.apply(TabularData(data=test_data)).mean
    mean2 = rf2.apply(TabularData(data=test_data)).mean
    assert np.allclose(mean1, mean2, atol=1e-6)

def subset(self, indices: Optional[np.ndarray] = None, duplicates: bool = False) -> TabularData:
    """Create finite subset of data.

    Parameters:
        indices: optional sequence of sample indices specifying which rows to include;
            by default, all samples are included
        duplicates: if False (default), the returned subset does not contain duplicate
            entries; if True, duplicates are kept. Both inputs and labels have to match
            for duplicates.

    Returns:
        Finite dataset of vectors.
    """
    # indices are validated by the calls to samples() and labels()
    duplicates = params.boolean(duplicates)

    data = self.samples(indices)
    labels = self.labels(indices) if self.is_labeled else None

    if not duplicates:
        # drop duplicate rows; a sample counts as a duplicate only if both its inputs
        # and its label (where present) match another sample
        if labels is None:
            _, unique_indices = np.unique(data, axis=0, return_index=True)
        else:
            joined = np.hstack([data, np.reshape(labels, (len(labels), -1))])
            _, unique_indices = np.unique(joined, axis=0, return_index=True)
        unique_indices = np.sort(unique_indices)
        data = data[unique_indices]
        labels = labels[unique_indices] if labels is not None else None

    return TabularData(data=data, labels=labels)

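# Usage sketch for `subset` (hypothetical values): with the default `duplicates=False`,
# rows whose inputs and labels both repeat are collapsed to a single entry.
#
#     ds = TabularData(data=np.array([[1], [1], [2]]), labels=np.array([0, 0, 1]))
#     ds.subset().num_samples                 # 2 -- the repeated ([1], 0) row is dropped
#     ds.subset(duplicates=True).num_samples  # 3 -- duplicates are kept
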
def _minimize(self, data: VectorSpaceData, function_tracker: TrackedTransformation):
    def _clip_to_bounds(x):
        """Clip x to obey the bounds along each dimension.

        This is necessary because dual annealing does not respect bounds, but smlb is
        strict about bounds and will throw an exception if we try to sample outside
        of the domain.
        """
        for i in range(len(x)):
            lb = bounds[i, 0]
            ub = bounds[i, 1]
            x[i] = min(ub, max(x[i], lb))
        return x

    bounds = data.domain
    func = lambda x: function_tracker.apply(TabularData(_clip_to_bounds(x).reshape(1, -1)))

    # split off a new random seed each time `optimize` is called
    seed = self.random.split(1)[0]

    # TODO: include a callback to record the results of each iteration. Store this info in
    # TrackedTransformation and include it when creating the OptimizationTrajectory.
    self._minimization_algorithm(func, bounds, seed)

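# Note on the clipping step above: an equivalent vectorized form (a sketch, not taken from the
# source) is possible when `bounds` is an (n, 2) NumPy array of (lower, upper) rows:
#
#     x_clipped = np.clip(x, bounds[:, 0], bounds[:, 1])
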
def apply(self, data: Data) -> TabularData:
    """Compute matminer composition-based materials features.

    Parameters:
        data: material compositions, given as sum formula strings.
            Can be labeled, and labels will be retained

    Returns:
        TabularData with matminer composition-based materials features as samples;
        labels are retained if the input data are labeled
    """
    data = params.instance(data, Data)

    inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
    features = self._mmfeatures.featurize_many(inputs_, pbar=False)
    features = np.asfarray(features)

    result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

    return result

def apply(self, data: Data) -> TabularData:
    """Compute selected molecular features.

    Parameters:
        data: molecular structures given as SMILES strings.
            Can be labeled, and labels will be retained

    Returns:
        TabularData with CDK molecular features as samples
    """
    data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

    failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

    # set up molecule SMILES
    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

    def parse_smiles(s: str, i: int):
        """Return parsed SMILES string or None on failure."""
        try:
            return parser.parseSmiles(self._samplef(s))
        except py4j.protocol.Py4JJavaError:
            # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
            failmode.handle_failure(i)
            return None  # internal sentinel value

    smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

    # compute descriptors
    # todo: the dtype of the columns could be set in advance by querying the descriptors
    #       currently, all values are stored as floating point numbers
    features = np.empty((data.num_samples, np.sum(self._arities)))
    index = 0

    def java_is_instance_of(object_, class_):
        return py4j.java_gateway.is_instance_of(
            self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
        )

    def check_arity(expected, actual):
        if expected != actual:
            raise BenchmarkError(
                f"Invalid descriptor result arity (expected {expected}, was {actual})"
            )

    for descriptor, arity in zip(self._descriptors, self._arities):
        for i, smile in enumerate(smiles):
            if smile is None:  # SMILES parsing failed for this sample
                features[i, index : index + arity] = float("nan")
                continue

            try:
                value = descriptor.calculate(smile).getValue()
            except py4j.protocol.Py4JJavaError:
                failmode.handle_failure(i)
                features[i, index : index + arity] = float("nan")
                continue

            if java_is_instance_of(value, "IntegerResult"):
                check_arity(arity, 1)
                features[i, index] = int(value.intValue())
            elif java_is_instance_of(value, "DoubleResult"):
                check_arity(arity, 1)
                features[i, index] = float(value.doubleValue())
            elif java_is_instance_of(value, "BooleanResult"):
                check_arity(arity, 1)
                features[i, index] = bool(value.booleanValue())
            elif java_is_instance_of(value, "IntegerArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    int(value.get(j)) for j in range(value.length())
                )
            elif java_is_instance_of(value, "DoubleArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    float(value.get(j)) for j in range(value.length())
                )
            # there seems to be no BooleanArrayResult in CDK
            else:
                name = value.getClass().getSimpleName()
                raise BenchmarkError(f"Unsupported CDK result type '{name}'")

        index += arity

    result = (
        TabularData(data=features, labels=data.labels())
        if data.is_labeled
        else TabularData(data=features)
    )
    result = failmode.finalize(result)

    return result

def select_best(self, data: TabularData, scores: Sequence[float]) -> TabularData:
    """Select the best points given a tabular data set and a list of matching scores."""

    best_indices = np.argsort(scores)[:self._num_seeds]
    return data.subset(best_indices)

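# Usage sketch (hypothetical values): `np.argsort` sorts ascending, so lower scores are treated
# as better and the `self._num_seeds` lowest-scoring rows are returned.
#
#     data = TabularData(data=np.array([[0.0], [1.0], [2.0]]))
#     best = self.select_best(data, scores=[0.7, 0.1, 0.4])
#     # with _num_seeds == 2, rows 1 and 2 (scores 0.1 and 0.4) are selected
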