def __init__(self, data: np.ndarray, labels: Optional[np.ndarray] = None, **kwargs):
    """Create a tabular dataset from NumPy arrays.

    Parameters:
        data: samples as a NumPy ndarray; the first axis indexes samples
        labels: labels as a NumPy ndarray; the first axis indexes samples.
            If None (default), the dataset is unlabeled.

    Raises:
        InvalidParameterError: for invalid arguments. In particular, the
            number of samples and labels must match, and if both arrays are
            structured, their column names must not overlap.

    Examples:
        From numerical NumPy data:
        ```
        TabularData(numpy.ndarray(...), ...)
        ```

        From a Pandas DataFrame:
        ```
        df = pandas.DataFrame(..., columns=[...])
        TabularData(df.to_records(index=False), labels=...)
        ```

        From mixed NumPy data, with column names (note use of tuples):
        ```
        a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', str), ('D', int)])
        TabularData(a, ...)
        ```
    """

    # validate argument types
    data = params.instance(data, np.ndarray)
    labels = params.optional_(labels, lambda arg: params.instance(arg, np.ndarray))

    if labels is not None:
        # every sample needs exactly one label
        num_samples, num_labels = data.shape[0], labels.shape[0]
        if num_samples != num_labels:
            raise InvalidParameterError(
                "same number of samples and labels",
                f"{num_samples} samples, {num_labels} labels",
            )

        # NumPy guarantees unique field names within each structured array,
        # but not across the data and label arrays taken together
        data_names, label_names = data.dtype.names, labels.dtype.names
        if is_sequence(data_names) and is_sequence(label_names):
            column_names = data_names + label_names
            if len(np.unique(column_names)) != len(column_names):
                raise InvalidParameterError(
                    "unique column names for samples and labels", column_names
                )

    self._data = data
    self._labels = labels

    super().__init__(**kwargs)
def __init__(
    self,
    target=None,
    configuration: Optional[PlotConfiguration] = None,
    axes_labels=(None, None, None, None),
    axes_scales=("linear", "linear"),
    **kwargs,
):
    """Initialize a plot-based Evaluation.

    Parameters:
        target: where the evaluation outcome is rendered to. Accepted are
            a filename, a matplotlib Axes, a (Figure, Axes) pair, a sequence
            of these, or None. For an Axes or a (Figure, Axes) pair, the
            evaluation adds to it; for None, a new rendering target is created.
        configuration: optional plot configuration controlling rendering details
        axes_labels: labels for all axes (bottom, left, top, right); None means
            an axis is not labeled. Shorter tuples are padded with None,
            so ('x', 'y') is valid.
        axes_scales: scales ("linear" or "log") for horizontal and vertical axes

    Examples:
        __init__(axes_labels=("bottom", "left", "top"))  # right is None
        __init__(axes_scales=("log", "log"))
    """

    configuration = params.any_(
        configuration, lambda arg: params.instance(arg, PlotConfiguration), params.none
    )
    super().__init__(configuration=configuration, **kwargs)

    # validators for the building blocks of a rendering target
    def is_axes(arg):
        return params.instance(arg, mpl.axes.Axes)

    def is_figure(arg):
        return params.instance(arg, mpl.figure.Figure)

    def is_figure_axes_pair(arg):
        return params.tuple_(arg, is_figure, is_axes, arity=2)

    # a single target: Axes, (Figure, Axes), or filename
    def single_target(arg):
        return params.any_(arg, is_axes, is_figure_axes_pair, params.string)

    # a sequence of single targets (None is not allowed inside a sequence)
    def target_sequence(arg):
        return params.tuple_(arg, single_target)

    self._target = params.any_(target, single_target, params.none, target_sequence)

    def label_or_none(arg):
        return params.any_(arg, params.string, params.none)

    self._axes_labels = params.tuple_(axes_labels, label_or_none, arity=4, default=None)

    def scale(arg):
        return params.enumeration(arg, {"linear", "log"})

    self._axes_scales = params.tuple_(axes_scales, scale, arity=2)

    self._figaxis = None
def _intersection(lhs: "TabularData", rhs: "TabularData", duplicates: bool = False) -> "TabularData":
    """Intersection specialized for TabularData.

    Samples are compared together with their labels if the data are labeled.
    Both datasets must be TabularData (or derived), and either both labeled
    or both unlabeled.

    Parameters:
        lhs: one of the two datasets to intersect ('left hand side')
        rhs: one of the two datasets to intersect ('right hand side')
        duplicates: if False (default), the result contains no duplicate
            entries (set intersection); if True, duplicates are taken into
            account (multiset intersection). Both inputs and labels have to
            match for duplicates.

    Returns:
        TabularData containing only samples present in both datasets,
        in the order in which they occur in lhs

    Raises:
        InvalidParameterError: if only one dataset is labeled, or if the
            joint dtypes of the two datasets differ
        NotImplementedError: if duplicates is requested (multiset
            intersection is not implemented)
    """

    lhs = params.instance(lhs, TabularData)
    rhs = params.instance(rhs, TabularData)
    duplicates = params.boolean(duplicates)

    # intersecting with an empty dataset gives an empty dataset;
    # subset() without arguments is used to return a copy
    for dataset in (lhs, rhs):
        if dataset.num_samples == 0:
            return dataset.subset()

    if lhs.is_labeled != rhs.is_labeled:
        raise InvalidParameterError("compatible TabularData", "mismatch in labeling")

    # one structured record per sample (data and labels) makes samples comparable
    joint_lhs = TabularData._joint_data_labels(lhs)
    joint_rhs = TabularData._joint_data_labels(rhs)
    if joint_lhs.dtype != joint_rhs.dtype:
        raise InvalidParameterError(
            "Matching TabularData", f"{joint_lhs.dtype.descr} and {joint_rhs.dtype.descr}"
        )

    if duplicates is not False:
        raise NotImplementedError(  # todo: implement
            "specialized multiset intersection not implemented for TabularData"
        )

    # intersect1d drops duplicates and sorts by value; sorting the indices
    # into lhs restores the original order of the samples
    _, lhs_indices, _ = np.intersect1d(joint_lhs, joint_rhs, return_indices=True)
    return lhs.subset(np.sort(lhs_indices))
def __init__(self, learner: Learner, scorer: Scorer, maximize: bool = True):
    """Initialize state.

    Parameters:
        learner: the Learner to store
        scorer: the Scorer to store
        maximize: whether the score is to be maximized (True, default)
            or minimized (False)
    """

    self._learner = params.instance(learner, Learner)
    self._scorer = params.instance(scorer, Scorer)
    self._maximize = params.boolean(maximize)

    # Optimizers minimize, so the sign is flipped when the goal is to maximize.
    # NOTE(review): reads self.maximize, presumably a property exposing
    # self._maximize; confirm against the class definition.
    self._direction = -1 if self.maximize else 1

    self._steps = []
def apply(self, data: Data) -> PredictiveDistribution:
    """Predicts new inputs.

    Parameters:
        data: finite indexed data to predict

    Returns:
        normal predictive distributions if the learner was set up to return
        predictive uncertainties, delta distributions otherwise

    Raises:
        BenchmarkError: if the underlying model raises a Py4JJavaError
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True, is_finite=True)
    xpred = params.real_matrix(data.samples())

    try:
        if self._with_uncertainties:
            preds, stddevs = self._model.predict(xpred, return_std=True)
            return NormalPredictiveDistribution(mean=preds, stddev=stddevs)

        preds = self._model.predict(xpred, return_std=False)
        return DeltaPredictiveDistribution(mean=preds)
    except Py4JJavaError as e:
        raise BenchmarkError("applying lolo model failed") from e
def __init__(
    self,
    internal_hp_optimization: bool = True,
    kernel: Optional[Kernel] = None,
    alpha: Union[float, Sequence] = 1e-5,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=0,
    normalize_y=False,
    random_state: Optional[int] = None,
    **kwargs
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        internal_hp_optimization: if True, hyperparameters are optimized "internally"
            by the Gaussian process, that is, scikit-learn optimizes hyperparameters
            and for smlb the learner has no hyperparameters;
            if False, hyperparameters are optimized by smlb (and scikit-learn does
            not optimize any hyperparameters). Only True is currently supported.
        kernel: scikit-learn kernel; if None, the sum of an RBF (Gaussian) kernel
            and a WhiteKernel, both with default parameters, is used
        alpha: regularization constant (scalar or vector); added as-is to kernel
            matrix diagonal. Equivalent to adding a "WhiteKernel"; the default is
            the corresponding value from scikit-learn's WhiteKernel, and different
            from scikit-learn's GaussianProcessRegressor.
        optimizer: hyperparameter optimization algorithm; used only if
            internal_hp_optimization is True
        n_restarts_optimizer: number of times optimizer is restarted; only used
            if internal_hp_optimization is True
        normalize_y: whether to subtract the mean of the labels
        random_state: integer seed, or None (default) to pass no seed
            to scikit-learn

    Raises:
        AssertionError: if internal_hp_optimization is not True

    See skl.gaussian_process.GaussianProcessRegressor parameters.
    """

    super().__init__(**kwargs)

    internal_hp_optimization = params.boolean(internal_hp_optimization)
    kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel), params.none)
    # incomplete check for alpha as dimension becomes known only at fitting time
    alpha = params.any_(
        alpha,
        lambda arg: params.real(arg, from_=0),
        lambda arg: params.real_vector(arg, domain=[0, np.inf]),
    )
    # todo: check optimizer, requires params.union (of string and callable) and params.function
    normalize_y = params.boolean(normalize_y)
    # None is the declared default, so it must pass validation
    random_state = params.optional_(random_state, lambda arg: params.integer(arg))

    if kernel is None:
        kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

    # external HP optimization not yet supported; raised explicitly (instead of
    # an assert statement) so that the check also runs under `python -O`
    if internal_hp_optimization is not True:
        raise AssertionError("external HP optimization not yet supported")

    self._model = skl.gaussian_process.GaussianProcessRegressor(
        kernel=kernel,
        alpha=alpha,
        optimizer=optimizer,
        n_restarts_optimizer=n_restarts_optimizer,
        normalize_y=normalize_y,
        random_state=random_state,
    )
def apply(
    self, data: Data
) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
    r"""Predicts new inputs.

    Parameters:
        data: finite indexed data to predict;

    Returns:
        delta predictive distribution if no uncertainties were requested;
        normal predictive distribution whose mean and standard deviation are
        taken over the predictions of the ensemble's trees for "naive"
        uncertainties

    Raises:
        BenchmarkError: if the stored uncertainty setting is neither None nor "naive"
    """

    data = params.instance(data, Data)

    xpred = params.real_matrix(data.samples())

    # predict
    # scikit-learn's ExtraTreesRegressor.predict() method does not support
    # returning predictions for all trees in the ensemble. Therefore,
    # `preds = self._model.predict(xpred)` is insufficient.

    if self._uncertainties is None:
        preds = self._model.predict(xpred)
        return DeltaPredictiveDistribution(mean=preds)
    elif self._uncertainties == "naive":
        # #trees x #samples matrix; np.asfarray was removed in NumPy 2.0,
        # np.asarray with a float dtype is the equivalent replacement
        preds = np.asarray(
            [tree.predict(xpred) for tree in self._model.estimators_], dtype=float
        )
        return NormalPredictiveDistribution(
            mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
        )
    else:
        raise BenchmarkError(
            "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
        )
def apply(self, data: Data) -> NormalPredictiveDistribution:
    r"""Predicts new inputs.

    Parameters:
        data: finite indexed data to predict;

    Returns:
        predictive distribution, depending on the learner's settings:
        a delta distribution if neither uncertainties nor correlations were
        requested; a normal distribution for "naive" uncertainties without
        correlations (or if correlations were skipped because of the number of
        predictions); a correlated normal distribution for "naive"
        uncertainties with "naive" correlations

    Raises:
        BenchmarkError: if the stored settings for uncertainties or
            correlations are not among the handled values
    """

    data = params.instance(
        data, Data
    )  # todo: params.data(..., is_finite=True, is_labeled=True)

    xpred = params.real_matrix(data.samples())

    # predict
    # scikit-learn's RandomForestRegressor.predict() method does not support
    # returning predictions for all trees in the ensemble. Therefore,
    # `preds = self._model.predict(xpred)` is insufficient.

    if self._uncertainties is None and self._correlations is None:
        preds = self._model.predict(xpred)
        return DeltaPredictiveDistribution(mean=preds)

    if self._uncertainties != "naive":
        raise BenchmarkError(
            "internal error, unknown parameter for uncertainties of RandomForestRegressionSklearn"
        )

    # #trees x #samples matrix of per-tree predictions; np.asfarray was removed
    # in NumPy 2.0, np.asarray with a float dtype is the equivalent replacement
    preds = np.asarray(
        [tree.predict(xpred) for tree in self._model.estimators_], dtype=float
    )
    mean, stddev = np.mean(preds, axis=0), np.std(preds, axis=0)

    if self._correlations is None:
        return NormalPredictiveDistribution(mean=mean, stddev=stddev)

    if self._correlations != "naive":
        raise BenchmarkError(
            "internal error, unknown parameter for correlations of RandomForestRegressionSklearn"
        )

    if (data.num_samples > 25000) and not self._force_corr:
        # fall back to uncorrelated predictions instead of building a huge matrix
        warn(
            "Input correlations requested for >2.5E4 predictions."
            " Correlation matrix will not be computed, because a matrix this large may"
            " take up too much RAM. (2.5E4^2 entries * 8 bytes per entry / 1E6 bytes per MB = 5000MB)."
            " To force computation anyway, set `force_corr = True` in learner constructor.",
            UserWarning,
        )
        return NormalPredictiveDistribution(mean=mean, stddev=stddev)

    # Must handle single-prediction separately, as in this case np.corrcoef
    # will return single number rather than 1x1 array.
    if preds.shape[1] == 1:
        corr = np.array([[1]])
    else:
        corr = np.corrcoef(preds, rowvar=False)

    return CorrelatedNormalPredictiveDistribution(mean=mean, stddev=stddev, corr=corr)
def apply(self, data: Data) -> NormalPredictiveDistribution:
    r"""Predicts new inputs.

    For Gaussian processes, both the noise-free predictive (posterior)
    distribution as well as the noise estimate are normally distributed.
    The predictive distribution with noise is the sum of the former two.

    The $\alpha$ training noise specified at initialization time is not added
    at prediction time, and thus not part of the noise model.
    The current implementation considers contributions from any WhiteKernel
    or other kernel that has a hyperparameter 'noise_level'.

    Limitations:
        It is a currently accepted shortcoming that WhiteKernels that are not
        'first-level' sum members might yield wrong noise models.

    Examples:
        WhiteKernel(...) + other kernels will work
        kernel(...) * WhiteKernel(...) will not work as intended

    Parameters:
        data: finite indexed data to predict;

    Returns:
        predictive normal distribution with the following decomposition:
            predicted: sum of model and noise distribution
            noise_part: normal distribution for estimated noise
            signal_part: normal distribution for estimated model contribution;
                the Gaussian process' "predictive variance"; depends only on
                distance from the training data
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

    xpred = params.real_matrix(data.samples())
    n = data.num_samples

    # model (signal) prediction
    preds, stddevs = self._model.predict(xpred, return_std=True)

    # noise: sum of all 'noise_level' hyperparameters of the fitted kernel;
    # noise_level is a variance (not a standard deviation), and the individual
    # noise levels are assumed to be independent
    kernel_params = self._model.kernel_.get_params()
    noise_levels = tuple(
        value for name, value in kernel_params.items() if name.endswith("noise_level")
    )
    noise_variance = np.ones(shape=n) * np.sum(noise_levels)

    noise_part = NormalPredictiveDistribution(
        mean=np.zeros(shape=n), stddev=np.sqrt(noise_variance)
    )

    # variances of independent signal and noise contributions add up
    total_stddev = np.sqrt(np.square(stddevs) + noise_variance)
    signal_part = NormalPredictiveDistribution(mean=preds, stddev=stddevs)

    return NormalPredictiveDistribution(
        mean=preds,
        stddev=total_stddev,
        noise_part=noise_part,
        signal_part=signal_part,
    )
def __init__(
    self,
    input_: TabularData,
    output: PredictiveDistribution,
    scores: Sequence[float],
    **kwargs
):
    """Initialize state.

    Parameters:
        input_: the samples evaluated during this step
        output: the predictive distribution obtained for input_
        scores: either a single score, or one score per sample in input_
    """

    super().__init__(**kwargs)

    self._input: TabularData = params.instance(input_, TabularData)
    self._output: PredictiveDistribution = params.instance(output, PredictiveDistribution)

    # total number of function evaluations during this step; at least one
    self._num_evaluations: int = params.integer(self._input.num_samples, from_=1)

    def single_score(arg):
        return params.sequence(arg, length=1, type_=float)

    def score_per_evaluation(arg):
        return params.sequence(arg, length=self._num_evaluations, type_=float)

    self._scores: Sequence[float] = params.any_(scores, single_score, score_per_evaluation)
def complement(lhs: "Data", rhs: "Data", duplicates: bool = False) -> "Data":
    """(Multi)set complement of two datasets.

    By default, duplicates are not retained.
    Specify 'duplicates=True' for multiset behaviour.

    The computation is delegated to a specialized '_complement' method,
    looked up first on the class of lhs and then on the class of rhs.

    Parameters:
        lhs: set A in A - B ('left hand side')
        rhs: set B in A - B ('right hand side')
        duplicates: if False (default), the returned data do not contain
            duplicate entries; if True, duplicates are taken into account.
            Both inputs and labels have to match for duplicates.

    Returns:
        Data containing all samples in lhs, but not in rhs

    Raises:
        NotImplementedError: if no specialized implementation exists or
            all of them failed; chained to the last failure, if any
    """

    lhs = params.instance(lhs, Data)
    rhs = params.instance(rhs, Data)

    # if either operand is empty, the complement is (a copy of) lhs
    if lhs.num_samples == 0 or rhs.num_samples == 0:
        return lhs.subset()

    # try the specialized implementations of both operands' classes in turn
    failure = None
    for operand in (lhs, rhs):
        cls = operand.__class__
        try:
            if hasattr(cls, "_complement"):
                return cls._complement(lhs, rhs, duplicates)
        except (NotImplementedError, InvalidParameterError) as e:
            failure = e

    # no specialized method found or succeeded
    raise NotImplementedError("generalized (multi)set complement not implemented") from failure
def apply(self, data: Data) -> Data:
    """Transforms data by adding noise to its labels.

    The passed dataset is left untouched. A deep copy is returned whose
    labels() method is replaced, on the copy only, by a wrapper that adds
    noise from this transformation's noise model each time labels are queried.

    Parameters:
        data: labeled data to transform

    Returns:
        transformed data

    Raises:
        InvalidParameterError if Data is not labeled
        BenchmarkError if the data object already carries one of the
            instance attributes that the patching would overwrite
    """

    data = params.instance(data, Data)
    if not data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    # The patch is applied to the object, not the class. The class-level
    # labels() stays available, so nothing has to be saved beyond the names
    # below, which must not already exist on the instance.
    reserved = ("_orig_labels", "labels", "_noise")
    clash = next((name for name in reserved if name in data.__dict__), None)
    if clash is not None:
        raise BenchmarkError(f"internal error: data object already has {clash} method")

    noisy = copy.deepcopy(data)

    # keep a handle on the copy's unpatched labels() and on the noise model
    noisy._orig_labels = noisy.labels
    noisy._noise = self._noise

    def labels(self, indices=None):
        """Query labels of a sequence of samples.

        This wrapper adds noise.

        Parameters:
            indices: a sequence of sample 'indices'.
                     See 'samples()' for details.

        Returns:
            a sequence of labels
        """

        clean = self._orig_labels(indices)
        return clean + self._noise.noise(clean.shape)

    # bind the wrapper to the copy so that it receives the copy as 'self'
    noisy.labels = labels.__get__(noisy)

    return noisy
def __init__(
    self,
    data: VectorSpaceData,
    model: Learner,
    scorer: Scorer,
    optimizers: Sequence[Optimizer],
    evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
    num_trials: int = 1,
    training_data: Optional[Data] = None,
):
    """Initialize workflow.

    Parameters:
        data: vector space data
        model: a Learner
        scorer: a Scorer
        optimizers: sequence of Optimizers
        evaluations: one or more Evaluations; by default, a single
            OptimizationTrajectoryPlot. Note that the default object is
            created once, when the function is defined, and is therefore
            shared by all instances relying on the default.
        num_trials: positive number of trials
        training_data: optional Data; None (default) if not given
    """

    def is_evaluation(arg):
        return params.instance(arg, Evaluation)

    def is_data(arg):
        return params.instance(arg, Data)

    self._data = params.instance(data, VectorSpaceData)
    self._scorer = params.instance(scorer, Scorer)
    self._model = params.instance(model, Learner)
    self._optimizers = params.sequence(optimizers, type_=Optimizer)
    self._evaluations = params.tuple_(evaluations, is_evaluation)
    self._num_trials = params.integer(num_trials, from_=1)
    self._training_data = params.optional_(training_data, is_data)
def __init__(
    self,
    data: Data,
    training: Sequence[Sampler],
    validation: Sampler,
    learners: Sequence[SupervisedLearner],
    features: DataValuedTransformation = IdentityFeatures(),
    metric: ScalarEvaluationMetric = RootMeanSquaredError(),
    evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
    progressf: Optional[Callable[[int, int], None]] = None,
):
    """Initialize workflow.

    Default objects in the signature are created once, when the function is
    defined, and are shared by all instances relying on the defaults.

    Parameters:
        data: labeled data
        training: sequence of Samplers, one for each training set size
        validation: Sampler for validation set
        learners: sequence of supervised regression algorithms
        features: any data-valued transformation; identity by default
        metric: evaluation metric to use; root mean squared error by default
        evaluations: one or more evaluations; a learning curve plot by default
        progressf: callable with two parameters, done iterations and total
            number of iterations; if None (default), progress is not reported

    Raises:
        InvalidParameterError: for invalid arguments, in particular unlabeled data
    """

    self._data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True)
    if not self._data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    self._training = params.sequence(training, type_=Sampler)
    self._validation = params.instance(validation, Sampler)
    self._learners = params.sequence(learners, type_=SupervisedLearner)
    # NOTE(review): validated against Features although the annotation says
    # DataValuedTransformation; confirm which one is intended
    self._features = params.instance(features, Features)
    self._metric = params.instance(metric, ScalarEvaluationMetric)

    def is_evaluation(arg):
        return params.instance(arg, Evaluation)

    self._evaluations = params.tuple_(evaluations, is_evaluation)

    def is_progress_callback(arg):
        return params.callable(arg, num_pos_or_kw=2)

    progressf = params.optional_(progressf, is_progress_callback)
    # substitute a no-op so that callers need not test for None
    self._progressf = (lambda *args: None) if progressf is None else progressf
def __init__(self, noise: Noise, **kwargs):
    """Initialize state.

    Parameters:
        noise: noise model; must be an instance of Noise
        **kwargs: passed on to the parent class initializer
    """

    super().__init__(**kwargs)

    noise_model = params.instance(noise, Noise)
    self._noise = noise_model
def __init__(self, configuration: Optional[EvaluationConfiguration] = None, **kwargs):
    """Initialize Evaluation.

    Parameters:
        configuration: optional configuration object controlling rendering
            details; if None, the result of self._default_configuration() is used
    """

    super().__init__(**kwargs)

    def is_configuration(arg):
        return params.instance(arg, EvaluationConfiguration)

    validated = params.any_(configuration, is_configuration, params.none)
    self._configuration = validated
    if validated is None:
        self._configuration = self._default_configuration()

    # internal handle on optional auxiliary outcome data
    self._auxiliary = {}
def __init__(self, noise_part=None, signal_part=None, **kwargs):
    """Initialize decompositions.

    Parameters:
        noise_part: estimated noise distribution; the aleatoric component.
            A PredictiveDistribution or None.
        signal_part: estimated signal distribution; the epistemic component.
            A PredictiveDistribution or None.
    """

    super().__init__(**kwargs)

    def distribution_or_none(arg):
        """Accept a PredictiveDistribution or None."""
        return params.any_(
            arg, lambda x: params.instance(x, PredictiveDistribution), params.none
        )

    self._noise_part = distribution_or_none(noise_part)
    self._signal_part = distribution_or_none(signal_part)
def fit(self, data: Data) -> Learner:
    """Fits the model to training data.

    This implementation only validates its argument.

    Parameters:
        data: labeled training data

    Returns:
        self (allows chaining)

    Raises:
        InvalidParameterError if data is not labeled
    """

    data = params.instance(data, Data)

    if data.is_labeled:
        return self

    raise InvalidParameterError("Labeled data", "unlabeled data")
def apply(
    self, data: Data
) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
    r"""Predicts new inputs.

    Only predictions without uncertainties are currently supported.

    Parameters:
        data: finite indexed data to predict;

    Returns:
        delta predictive distribution around the model's predictions

    Raises:
        NotImplementedError: if "naive" uncertainties were requested
        BenchmarkError: if the stored uncertainty setting is neither None nor "naive"
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

    xpred = params.real_matrix(data.samples())

    if self._uncertainties is None:
        return DeltaPredictiveDistribution(mean=self._model.predict(xpred))

    if self._uncertainties == "naive":
        # todo: there is a discrepancy between the ensemble mean and predictions;
        #       until this has been resolved, naive uncertainties are not supported.
        #       when fixing this, update parameter validation and unit tests.
        # The disabled approach took mean and standard deviation (axis=0) over the
        # #trees x #samples matrix np.asfarray(tuple(self._model.staged_predict(xpred))),
        # which does NOT yield the same predictions as self._model.predict(xpred).
        raise NotImplementedError

    # NOTE(review): the message names ExtremelyRandomizedTreesRegressionSklearn,
    # while the disabled code relies on staged_predict; confirm the class name.
    raise BenchmarkError(
        "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
    )
def apply(self, data: Data, **kwargs) -> Data:
    """Draw random subset of data.

    Samples are drawn without replacement.

    Parameters:
        data: finite dataset to sample from

    Returns:
        random subset of data

    Raises:
        InvalidParameterError: if data is not finite, or if the requested
            size is outside the range from 0 to the number of samples in data
    """

    data = params.instance(data, Data)
    if not data.is_finite:
        raise InvalidParameterError("finite Data", type(data).__name__)

    num_available = data.num_samples

    # the upper bound can only be validated once the data are known (see __init__)
    size = params.integer(self._size, from_=0, to=num_available)

    chosen = self.random.choice(num_available, size=size, replace=False)
    return data.subset(chosen)
def apply(self, data: Data, **kwargs) -> Data:
    """Draw random vectors.

    Vectors are drawn uniformly from the sampler's domain if one was
    specified, otherwise from the data's domain, and if that is None as
    well, from the unit hypercube.

    Parameters:
        data: Data to draw from

    Returns:
        subset of data for the drawn vectors

    Raises:
        BenchmarkError: if the domain is unbounded in any dimension
    """

    data = params.instance(data, Data)

    if self._domain is not None:
        # checks dimensionality, which is only known now (see __init__)
        domain = params.hypercube_domain(self._domain, dimensions=data.dimensions)
    elif data.domain is not None:
        domain = data.domain
    else:
        domain = np.asarray([[0, 1]] * data.dimensions)

    if any(low == -np.inf or high == np.inf for low, high in domain):
        raise BenchmarkError("can not sample from infinite domain")

    # draw all coordinates at once from [0, 1) and rescale them to the domain;
    # this avoids a Python loop over dimensions, for efficiency in high dimensions
    unit = self.random.uniform(size=(self._size, data.dimensions))
    lower = domain[:, 0]
    extent = domain[:, 1] - lower
    vectors = unit * extent + lower

    return data.subset(vectors)
def fit(self, data: Data) -> "RandomForestRegressionSklearn":
    """Fits the model using training data.

    Parameters:
        data: tabular labeled data to train on

    Returns:
        self (allows chaining)
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

    # samples must form a real matrix with one row per sample,
    # labels a real vector with one entry per sample
    num_samples = data.num_samples
    inputs = params.real_matrix(data.samples(), nrows=num_samples)
    targets = params.real_vector(data.labels(), dimensions=num_samples)

    self._model.fit(inputs, targets)

    return self
def apply(self, data: Data) -> TabularData:
    """Compute matminer composition-based materials features.

    Parameters:
        data: material compositions, given as sum formula strings.
            Can be labeled, and labels will be retained.

    Returns:
        TabularData with matminer composition-based materials features as
        samples; labeled with the labels of data if data is labeled
    """

    data = params.instance(data, Data)

    # map each sample to a composition object understood by the featurizer
    inputs_ = tuple(self._composition(self.samplef(s)) for s in data.samples())
    features = self._mmfeatures.featurize_many(inputs_, pbar=False)

    # np.asfarray was removed in NumPy 2.0;
    # np.asarray with a float dtype is the equivalent replacement
    features = np.asarray(features, dtype=float)

    labels = data.labels() if data.is_labeled else None
    return TabularData(data=features, labels=labels)
def _joint_data_labels(ds):
    """Single structured array for data and labels for comparison.

    Each sample, together with its label if the data are labeled, becomes one
    record of a one-dimensional structured array. Structured arrays can be
    used to run NumPy set methods on arrays with more than one dimension.

    Parameters:
        ds: TabularData, labeled or unlabeled

    Returns:
        one-dimensional structured NumPy array with one record per sample
    """

    ds = params.instance(ds, TabularData)
    num_samples = ds.num_samples

    def as_records(array, field_name):
        """View a homogeneous array, possibly high-dimensional, as 1-d structured array.

        All but the first axis are flattened; each resulting column becomes a
        field named field_name(column index).
        """
        table = np.reshape(array, (num_samples, -1))
        num_columns = np.prod(table.shape[1:])
        table = table.view([(field_name(i), array.dtype) for i in range(num_columns)])
        return np.reshape(table, num_samples)

    data = ds._data
    if is_sequence(data.dtype.names):  # already a structured array
        lhs = data
    else:
        # empty field names are used for the data columns
        lhs = as_records(data, lambda i: "")

    if not ds.is_labeled:
        return lhs

    labels = ds._labels
    if is_sequence(labels.dtype.names):  # already a structured array
        rhs = labels
    else:
        # label columns are named by their index
        rhs = as_records(labels, str)

    # lhs and rhs are structured arrays (or views) now. np.hstack fails for
    # these (as did numpy.lib.recfunctions.merge_arrays), so the joint array
    # is filled field by field.
    result = np.empty(num_samples, dtype=lhs.dtype.descr + rhs.dtype.descr)
    for part in (lhs, rhs):
        for name in part.dtype.names:
            result[name] = part[name]

    return result
def apply(self, data: VectorSpaceData):
    """Sample set from evenly-spaced grid in a vector space.

    A specified number of samples are drawn from the smallest evenly-spaced
    grid of sufficient size.

    If size does not correspond exactly to a k x ... x k grid, that is, if
    size is not a power of k, the next-largest grid of size k+1 x ... x k+1
    is created and some of its samples are removed. Here, k denotes the
    number of evenly-spaced samples per dimension.

    Parameters:
        data: vector space data to sample from

    Returns:
        sampled set
    """

    data = params.instance(data, VectorSpaceData)

    # build the full grid, then keep a random selection of the requested size
    per_dim = self.next_grid_size(data, self._size)
    grid = self.full_grid(data, samples_per_dim=per_dim, domain=self._domain)
    chosen = self.random.choice(len(grid), size=self._size, replace=False)

    return data.subset(grid[chosen])
def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
    """Fits the model using training data.

    Parameters:
        data: labeled data to train on;
            must derive from IndexedData and LabeledData

    Returns:
        self (allows chaining)
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

    count = data.num_samples

    # one row of inputs and one label per training sample
    inputs = params.real_matrix(data.samples(), nrows=count)
    outputs = params.real_vector(data.labels(), dimensions=count)

    self._model.fit(inputs, outputs)

    return self
def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
    """Fits the model using training data.

    Parameters:
        data: tabular labeled data to train on

    Returns:
        self (allows chaining)

    Raises:
        InvalidParameterError: if data is not labeled
    """

    data = params.instance(data, Data)
    if not data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    # one row of features and one target value per training sample
    size = data.num_samples
    features = params.real_matrix(data.samples(), nrows=size)
    targets = params.real_vector(data.labels(), dimensions=size)

    self._model.fit(features, targets)

    return self
def full_grid(self, data: VectorSpaceData, samples_per_dim: int, domain=None):
    """Full multi-dimensional evenly-spaced grid.

    For one sample per dimension, the result is a single vector, the mean of the domain.

    Parameters:
        data: sampled dataset
        samples_per_dim: number of evenly-spaced samples to take in each dimension;
            a positive integer
        domain: (sub)domain to sample from; by default, data's domain is used

    Returns:
        two-dimensional NumPy array where samples are rows
    """

    data = params.instance(data, VectorSpaceData)
    k = params.integer(samples_per_dim, above=0)  # positive integer
    domain = data.domain if domain is None else domain
    domain = params.hypercube_domain(domain, data.dimensions)

    # a single sample per dimension: the center of the domain
    if k == 1:
        return np.mean(domain, axis=1).reshape((1, -1))

    # k evenly-spaced locations per dimension, combined into all grid points
    locs = (np.linspace(xfrom, xto, k) for xfrom, xto in domain)

    # np.asfarray was removed in NumPy 2.0;
    # np.asarray with a float dtype is the equivalent replacement
    return np.asarray(list(itertools.product(*locs)), dtype=float)
def fit(self, data: Data) -> "RandomForestRegressionLolo":
    """Fits the model using training data.

    Parameters:
        data: labeled tabular data to train on

    Returns:
        self (allows chaining)

    Raises:
        BenchmarkError: if the underlying model raises a Py4JJavaError
    """

    data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True, is_finite=True)

    total = data.num_samples
    inputs = params.real_matrix(data.samples(), nrows=total)
    targets = params.real_vector(data.labels(), dimensions=total)

    try:
        self._model.fit(inputs, targets)
    except Py4JJavaError as e:
        raise BenchmarkError("training lolo model failed") from e

    return self
def apply(self, data: Data) -> TabularData:
    """Compute selected molecular features.

    SMILES strings that cannot be parsed, and descriptors that cannot be
    calculated, are reported to the failure mode handler; the corresponding
    feature entries are set to NaN.

    Parameters:
        data: molecular structures given as SMILES strings.
              Can be labeled, and labels will be retained

    Returns:
        TabularData with CDK molecular features as samples,
        after finalization by the failure mode handler

    Raises:
        BenchmarkError: if a descriptor returns a result of unsupported type
            or of unexpected arity
    """

    data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

    failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

    # set up molecule SMILES
    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

    def parse_smiles(s: str, i: int):
        """Return parsed SMILES string or None on failure."""
        try:
            return parser.parseSmiles(self._samplef(s))
        except py4j.protocol.Py4JJavaError:
            # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
            failmode.handle_failure(i)
            return None  # internal sentinel value

    smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

    # compute descriptors
    # todo: the dtype of the columns could be set in advance by querying the descriptors
    #       currently, all values are stored as floating point numbers
    features = np.empty((data.num_samples, np.sum(self._arities)))
    index = 0  # first feature column of the current descriptor

    def java_is_instance_of(object_, class_):
        return py4j.java_gateway.is_instance_of(
            self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
        )

    def check_arity(expected, actual):
        if expected != actual:
            raise BenchmarkError(
                f"Invalid descriptor result arity (expected {expected}, was {actual})"
            )

    for descriptor, arity in zip(self._descriptors, self._arities):
        for i, smile in enumerate(smiles):
            # molecule failed to parse; the failure was already reported by
            # parse_smiles. The check must be on the single molecule `smile`:
            # the tuple `smiles` itself is never None.
            if smile is None:
                features[i, index : index + arity] = float("nan")
                continue

            try:
                value = descriptor.calculate(smile).getValue()
            except py4j.protocol.Py4JJavaError:
                failmode.handle_failure(i)
                features[i, index : index + arity] = float("nan")
                continue

            if java_is_instance_of(value, "IntegerResult"):
                check_arity(arity, 1)
                features[i, index] = int(value.intValue())
            elif java_is_instance_of(value, "DoubleResult"):
                check_arity(arity, 1)
                features[i, index] = float(value.doubleValue())
            elif java_is_instance_of(value, "BooleanResult"):
                check_arity(arity, 1)
                features[i, index] = bool(value.booleanValue())
            elif java_is_instance_of(value, "IntegerArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    int(value.get(j)) for j in range(value.length())
                )
            elif java_is_instance_of(value, "DoubleArrayResult"):
                check_arity(arity, value.length())
                features[i, index : index + arity] = tuple(
                    float(value.get(j)) for j in range(value.length())
                )
            # there seems to be no BooleanArrayResult in CDK
            else:
                name = value.getClass().getSimpleName()
                raise BenchmarkError(f"Unsupported CDK result type '{name}'")
        index += arity

    result = (
        TabularData(data=features, labels=data.labels())
        if data.is_labeled
        else TabularData(data=features)
    )

    result = failmode.finalize(result)

    return result