def apply(self, data: Data) -> PredictiveDistribution:
    """Predicts new inputs.

    Parameters:
        data: finite indexed data to predict

    Returns:
        predictive normal distributions if predictive uncertainties were
        requested, otherwise delta distributions
    """

    data = params.instance(
        data, Data
    )  # todo: params.data(..., is_labeled=True, is_finite=True)
    xpred = params.real_matrix(data.samples())

    if self._with_uncertainties:
        try:
            preds, stddevs = self._model.predict(xpred, return_std=True)
            return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
        except Py4JJavaError as e:
            raise BenchmarkError("applying lolo model failed") from e
    else:
        try:
            preds = self._model.predict(xpred, return_std=False)
            return DeltaPredictiveDistribution(mean=preds)
        except Py4JJavaError as e:
            raise BenchmarkError("applying lolo model failed") from e
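# Hedged usage sketch (illustrative, not part of the class above): lolopy's
# random forest can return standard deviations alongside predictions via
# `return_std=True`, which apply() wraps into a NormalPredictiveDistribution.
# Requires a working Java runtime; all data and names below are placeholders.
def _example_lolo_predict_with_uncertainties():
    import numpy as np
    from lolopy.learners import RandomForestRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(-1, 1, size=(64, 2))
    y = X[:, 0] * X[:, 1] + 0.1 * rng.standard_normal(64)

    model = RandomForestRegressor()
    model.fit(X, y)

    # predictions and per-sample uncertainty estimates from the lolo ensemble
    preds, stddevs = model.predict(X[:5], return_std=True)
    return preds, stddevs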
def apply(
    self, data: Data
) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
    r"""Predicts new inputs.

    Parameters:
        data: finite indexed data to predict

    Returns:
        predictive normal distribution if uncertainties were requested,
        otherwise delta distribution
    """

    data = params.instance(data, Data)
    xpred = params.real_matrix(data.samples())

    # predict
    # scikit-learn's ExtraTreesRegressor.predict() method does not support
    # returning predictions for all trees in the ensemble. Therefore,
    # `preds = self._model.predict(xpred)` is insufficient.
    if self._uncertainties is None:
        preds = self._model.predict(xpred)
        return DeltaPredictiveDistribution(mean=preds)
    elif self._uncertainties == "naive":
        preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
        return NormalPredictiveDistribution(
            mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
        )
    else:
        raise BenchmarkError(
            "internal error, unknown parameter for uncertainties of "
            "ExtremelyRandomizedTreesRegressionSklearn"
        )
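# Hedged usage sketch (illustrative, not part of the class above): how the
# "naive" uncertainty estimate is formed from per-tree predictions of a fitted
# scikit-learn ensemble. All data and names below are placeholders.
def _example_naive_ensemble_uncertainty():
    import numpy as np
    from sklearn.ensemble import ExtraTreesRegressor

    rng = np.random.default_rng(0)
    X_train = rng.uniform(-1, 1, size=(100, 2))
    y_train = X_train[:, 0] ** 2 + 0.1 * rng.standard_normal(100)
    model = ExtraTreesRegressor(n_estimators=50, random_state=0).fit(X_train, y_train)

    X_new = rng.uniform(-1, 1, size=(5, 2))
    # one row of predictions per tree; shape (#trees, #samples)
    per_tree = np.asarray([tree.predict(X_new) for tree in model.estimators_])
    # the mean agrees with model.predict(X_new) up to floating-point rounding;
    # the standard deviation is the "naive" spread of the ensemble
    return per_tree.mean(axis=0), per_tree.std(axis=0)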
def apply(
    self, data: Data
) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
    r"""Predicts new inputs.

    Parameters:
        data: finite indexed data to predict

    Returns:
        predictive delta distribution if no uncertainties were requested;
        otherwise a normal distribution, correlated if correlations were requested
    """

    data = params.instance(
        data, Data
    )  # todo: params.data(..., is_finite=True, is_labeled=True)
    xpred = params.real_matrix(data.samples())

    # predict
    # scikit-learn's RandomForestRegressor.predict() method does not support
    # returning predictions for all trees in the ensemble. Therefore,
    # `preds = self._model.predict(xpred)` is insufficient.
    if self._uncertainties is None and self._correlations is None:
        preds = self._model.predict(xpred)
        return DeltaPredictiveDistribution(mean=preds)
    elif self._uncertainties == "naive":
        preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
        if self._correlations is None:
            return NormalPredictiveDistribution(
                mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
            )
        elif self._correlations == "naive":
            if (data.num_samples > 25000) and not self._force_corr:
                warn(
                    "Input correlations requested for >2.5E4 predictions."
                    " Correlation matrix will not be computed, because a matrix this large may"
                    " take up too much RAM. (2.5E4^2 entries * 8 bytes per entry / 1E6 bytes per MB = 5000 MB)."
                    " To force computation anyway, set `force_corr=True` in the learner constructor.",
                    UserWarning,
                )
                return NormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                )
            else:
                # Must handle the single-prediction case separately, since
                # np.corrcoef then returns a scalar rather than a 1x1 array.
                if preds.shape[1] == 1:
                    corr = np.array([[1.0]])
                else:
                    corr = np.corrcoef(preds, rowvar=False)
                return CorrelatedNormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0), corr=corr
                )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for correlations of "
                "RandomForestRegressionSklearn"
            )
    else:
        raise BenchmarkError(
            "internal error, unknown parameter for uncertainties of "
            "RandomForestRegressionSklearn"
        )
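# Hedged usage sketch (illustrative, not part of the class above): the "naive"
# correlation estimate correlates predictions at different inputs across
# ensemble members, including the single-input special case handled above.
# All data and names below are placeholders.
def _example_prediction_correlations():
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(0)
    X_train = rng.uniform(-1, 1, size=(100, 2))
    y_train = X_train.sum(axis=1) + 0.1 * rng.standard_normal(100)
    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_train, y_train)

    X_new = rng.uniform(-1, 1, size=(4, 2))
    preds = np.asarray([tree.predict(X_new) for tree in model.estimators_])

    # rowvar=False treats columns (inputs) as the variables, yielding a
    # (#samples, #samples) matrix; for a single input np.corrcoef returns
    # a scalar, hence the explicit 1x1 case
    corr = np.array([[1.0]]) if preds.shape[1] == 1 else np.corrcoef(preds, rowvar=False)
    return corr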
def apply(self, data: Data) -> NormalPredictiveDistribution:
    r"""Predicts new inputs.

    For Gaussian processes, both the noise-free predictive (posterior)
    distribution and the noise estimate are normally distributed.
    The predictive distribution with noise is the sum of the two.

    The $\alpha$ training noise specified at initialization time is not
    added at prediction time, and is thus not part of the noise model.
    The current implementation considers contributions from any WhiteKernel
    or other kernel that has a hyperparameter 'noise_level'.

    Limitations:
        It is a currently accepted shortcoming that WhiteKernels that are
        not 'first-level' sum members might yield wrong noise models.
        Examples:
            WhiteKernel(...) + other kernels will work as intended;
            kernel(...) * WhiteKernel(...) will not.
        Training data noise $\alpha$ is not added.

    Parameters:
        data: finite indexed data to predict

    Returns:
        predictive normal distribution with the following decomposition:
            predicted: sum of model and noise distribution
            noise_part: normal distribution for estimated noise
            signal_part: normal distribution for estimated model contribution;
                the Gaussian process' "predictive variance"; depends only on
                distance from the training data
    """

    data = params.instance(
        data, Data
    )  # todo: params.data(..., is_finite=True, is_labeled=True)
    xpred = params.real_matrix(data.samples())
    n = data.num_samples

    # predict
    preds, stddevs = self._model.predict(xpred, return_std=True)

    # noise
    # the noise variance is the sum of the noise_level hyperparameters of all
    # WhiteKernels; note that noise_level is a variance, not a standard deviation.
    # this assumes that the noise levels are independent
    noise = tuple(
        v for k, v in self._model.kernel_.get_params().items() if k.endswith("noise_level")
    )
    noise = np.ones(shape=n) * np.sum(noise)
    noise_part = NormalPredictiveDistribution(mean=np.zeros(shape=n), stddev=np.sqrt(noise))

    return NormalPredictiveDistribution(
        mean=preds,
        stddev=np.sqrt(np.square(stddevs) + noise),
        noise_part=noise_part,
        signal_part=NormalPredictiveDistribution(mean=preds, stddev=stddevs),
    )
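# Hedged usage sketch (illustrative, not part of the class above): extracting
# WhiteKernel noise variances from a fitted scikit-learn Gaussian process and
# mirroring the signal/noise decomposition of apply(). All data and names
# below are placeholders.
def _example_gp_noise_extraction():
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel

    rng = np.random.default_rng(0)
    X = rng.uniform(-1, 1, size=(50, 1))
    y = np.sin(3 * X[:, 0]) + 0.1 * rng.standard_normal(50)

    gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(noise_level=0.1)).fit(X, y)

    # hyperparameters of a sum kernel are prefixed, e.g. "k2__noise_level";
    # summing all entries ending in "noise_level" assumes independent noise
    noise_var = sum(
        v for k, v in gp.kernel_.get_params().items() if k.endswith("noise_level")
    )

    preds, stddevs = gp.predict(rng.uniform(-1, 1, size=(5, 1)), return_std=True)
    total_stddev = np.sqrt(np.square(stddevs) + noise_var)  # as in apply() above
    return preds, total_stddev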
def validate_data_interface(ds: smlb.Data) -> bool:
    """Tests for compliance with the Data interface.

    Runs tests that every Data-compliant class should satisfy.

    Returns:
        True

    Raises:
        AssertionError for failed tests
    """

    # actual or "virtual" abc inheritance
    assert isinstance(ds, smlb.Data)

    if ds.num_samples == float("inf"):
        # infinite data tests
        pass
    else:
        # finite data tests

        # integer-representable non-negative size
        assert int(ds.num_samples) == ds.num_samples
        assert ds.num_samples >= 0

        # all samples are returned
        assert len(ds.samples()) == ds.num_samples

        # subsets
        assert ds.subset([]).num_samples == 0
        assert ds.subset().num_samples <= ds.num_samples
        assert ds.subset(duplicates=True).num_samples == ds.num_samples

        # intersection with self
        assert smlb.intersection(ds, ds).num_samples <= ds.num_samples
        # assert smlb.intersection(ds, ds, duplicates=True).num_samples == ds.num_samples  # todo: support this as well

        # complement with self
        assert smlb.complement(ds, ds).num_samples == 0
        # assert smlb.complement(ds, ds, duplicates=True).num_samples == 0  # todo: support this as well

        if ds.is_labeled:
            # all labels are returned
            assert len(ds.labels()) == ds.num_samples

            # subsets
            assert ds.subset([]).is_labeled
            assert ds.subset().is_labeled

            # intersection
            assert smlb.intersection(ds, ds).is_labeled
            # assert smlb.intersection(ds, ds, duplicates=True).is_labeled  # todo: support this as well

            # complement
            assert smlb.complement(ds, ds).is_labeled
            # assert smlb.complement(ds, ds, duplicates=True).is_labeled  # todo: support this as well

    return True
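# Hedged usage sketch (illustrative): running the compliance tests on a small
# labeled dataset. Assumes smlb.TabularData with the constructor signature
# used elsewhere in this codebase; raises AssertionError on any failed test.
def _example_validate_tabular_data():
    import numpy as np
    import smlb

    ds = smlb.TabularData(
        data=np.arange(6, dtype=float).reshape(3, 2),
        labels=np.asarray([0.0, 1.0, 2.0]),
    )
    assert validate_data_interface(ds)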
def apply(self, data: Data) -> NormalPredictiveDistribution:
    """Evaluates the function on finite data, returning exact (zero-variance) predictions."""

    if not data.is_finite:
        raise InvalidParameterError(
            "a finite dataset", f"an infinite dataset of type {data.__class__}"
        )

    means = self._function.labels(data.samples())
    stddevs = np.zeros_like(means)
    return NormalPredictiveDistribution(means, stddevs)
def apply(
    self, data: Data
) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
    r"""Predicts new inputs.

    Parameters:
        data: finite indexed data to predict

    Returns:
        predictive normal distribution if uncertainties were requested,
        otherwise delta distribution
    """

    data = params.instance(
        data, Data
    )  # todo: params.data(..., is_finite=True, is_labeled=True)
    xpred = params.real_matrix(data.samples())

    # predict
    # scikit-learn's GradientBoostingRegressor.predict() method does not
    # support returning predictions for all members of the ensemble. Therefore,
    # `preds = self._model.predict(xpred)` is insufficient.
    if self._uncertainties is None:
        preds = self._model.predict(xpred)
        return DeltaPredictiveDistribution(mean=preds)
    elif self._uncertainties == "naive":
        # todo: there is a discrepancy between the ensemble mean and predictions
        #       until this has been resolved, naive uncertainties are not supported
        #       when fixing this, update parameter validation and unit tests
        raise NotImplementedError
        # # #stages x #samples matrix of predictions of the ensemble's stages
        # staged_preds = np.asfarray(tuple(self._model.staged_predict(xpred)))
        # # this does NOT yield the same predictions as self._model.predict(xpred)
        # mean, stddev = (
        #     np.mean(staged_preds, axis=0),
        #     np.std(staged_preds, axis=0),
        # )
        # return NormalPredictiveDistribution(mean=mean, stddev=stddev)
    else:
        raise BenchmarkError(
            "internal error, unknown parameter for uncertainties of "
            "GradientBoostedTreesRegressionSklearn"
        )
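# Hedged sketch (illustrative, not part of the class above): scikit-learn's
# staged_predict() yields *cumulative* predictions after each boosting stage,
# not independent per-member predictions, so their mean generally differs from
# predict(). This is a plausible source of the discrepancy noted in the
# commented-out code above. All data and names below are placeholders.
def _example_staged_predict_discrepancy():
    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(-1, 1, size=(100, 2))
    y = X[:, 0] - X[:, 1] + 0.1 * rng.standard_normal(100)
    model = GradientBoostingRegressor(n_estimators=50, random_state=0).fit(X, y)

    X_new = rng.uniform(-1, 1, size=(5, 2))
    staged = np.asarray(list(model.staged_predict(X_new)))  # (#stages, #samples)

    # early stages are far from the converged prediction, biasing the mean
    assert not np.allclose(staged.mean(axis=0), model.predict(X_new))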
def apply(self, data: Data) -> TabularData:
    """Compute matminer composition-based materials features.

    Parameters:
        data: material compositions, given as sum formula strings.
            Can be labeled, and labels will be retained

    Returns:
        TabularData with matminer composition-based materials features as
        samples; labeled if the input data was labeled
    """

    data = params.instance(data, Data)

    inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
    features = self._mmfeatures.featurize_many(inputs_, pbar=False)
    features = np.asfarray(features)

    result = TabularData(data=features, labels=data.labels() if data.is_labeled else None)

    return result
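# Hedged usage sketch (illustrative, not part of the class above): computing
# composition-based features with matminer directly, as the transform above
# does internally. The "magpie" preset is one common choice; all names below
# are placeholders.
def _example_matminer_composition_features():
    import numpy as np
    from matminer.featurizers.composition import ElementProperty
    from pymatgen.core import Composition

    featurizer = ElementProperty.from_preset("magpie")
    compositions = [Composition(s) for s in ("Fe2O3", "NaCl")]

    # one row of numeric features per composition
    features = np.asarray(featurizer.featurize_many(compositions, pbar=False))
    return features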
def fit(self, data: Data) -> "RandomForestRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn": """Fits the model using training data. Parameters: data: tabular labeled data to train on Returns: self (allows chaining) """ data = params.instance(data, Data) if not data.is_labeled: raise InvalidParameterError("labeled data", "unlabeled data") n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "GaussianProcessRegressionSklearn": """Fits the model using training data. Parameters: data: labeled data to train on; must derive from IndexedData and LabeledData Returns: self (allows chaining) """ data = params.instance( data, Data) # todo: params.data(..., is_finite=True, is_labeled=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) self._model.fit(xtrain, ytrain) return self
def fit(self, data: Data) -> "RandomForestRegressionLolo": """Fits the model using training data. Parameters: data: labeled tabular data to train on Returns: self (allows chaining) """ data = params.instance( data, Data ) # todo: params.data(..., is_labeled=True, is_finite=True) n = data.num_samples xtrain = params.real_matrix(data.samples(), nrows=n) ytrain = params.real_vector(data.labels(), dimensions=n) try: self._model.fit(xtrain, ytrain) except Py4JJavaError as e: raise BenchmarkError("training lolo model failed") from e return self
def apply(self, data: Data) -> TabularData:
    """Compute selected molecular features.

    Parameters:
        data: molecular structures given as SMILES strings.
            Can be labeled, and labels will be retained

    Returns:
        TabularData with CDK molecular features as samples
    """

    data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

    failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

    # set up molecule SMILES
    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

    def parse_smiles(s: str, i: int):
        """Return parsed SMILES string or None on failure."""
        try:
            return parser.parseSmiles(self._samplef(s))
        except py4j.protocol.Py4JJavaError:
            # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
            failmode.handle_failure(i)
            return None  # internal sentinel value

    smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

    # compute descriptors
    # todo: the dtype of the columns could be set in advance by querying the descriptors
    #       currently, all values are stored as floating point numbers
    features = np.empty((data.num_samples, np.sum(self._arities)))
    index = 0

    def java_is_instance_of(object_, class_):
        return py4j.java_gateway.is_instance_of(
            self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
        )

    def check_arity(expected, actual):
        if expected != actual:
            raise BenchmarkError(
                f"Invalid descriptor result arity (expected {expected}, was {actual})"
            )

    for descriptor, arity in zip(self._descriptors, self._arities):
        for i, smile in enumerate(smiles):
            if smile is None:  # parsing failed for this sample
                features[i, index : index + arity] = float("nan")
                continue

            try:
                value = descriptor.calculate(smile).getValue()
            except py4j.protocol.Py4JJavaError:
                failmode.handle_failure(i)
                features[i, index : index + arity] = float("nan")
                continue

            if java_is_instance_of(value, "IntegerResult"):
                check_arity(arity, 1)
                features[i, index] = int(value.intValue())
            elif java_is_instance_of(value, "DoubleResult"):
                check_arity(arity, 1)
                features[i, index] = float(value.doubleValue())
            elif java_is_instance_of(value, "BooleanResult"):
                check_arity(arity, 1)
                features[i, index] = bool(value.booleanValue())
            elif java_is_instance_of(value, "IntegerArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    int(value.get(j)) for j in range(value.length())
                )
            elif java_is_instance_of(value, "DoubleArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    float(value.get(j)) for j in range(value.length())
                )
            # there seems to be no BooleanArrayResult in CDK
            else:
                name = value.getClass().getSimpleName()
                raise BenchmarkError(f"Unsupported CDK result type '{name}'")

        index += arity

    result = (
        TabularData(data=features, labels=data.labels())
        if data.is_labeled
        else TabularData(data=features)
    )

    result = failmode.finalize(result)

    return result