def axes_scales(self, scales=(None, None), **kwargs):
    """Update the scale of the horizontal and/or vertical axis.

    Parameters:
        scales: pair (horizontal, vertical) of axis scales; each entry is
            "linear", "log" or None, where None keeps the current scale
            of that axis
        kwargs: keyword arguments forwarded to the set_xscale / set_yscale
            call of every axis whose scale is given explicitly

    Examples:
        axes_scales = (None, "log")  # change only vertical axis
    """

    def _scale_or_none(value):
        """Accept "linear", "log" or None."""
        return params.any_(
            value, lambda v: params.enumeration(v, {"linear", "log"}), params.none
        )

    requested = params.tuple_(scales, _scale_or_none, arity=2, default=None)
    horizontal = requested[0]
    vertical = requested[1]

    # the stored pair is replaced as a whole; unspecified entries keep their current value
    new_horizontal = horizontal if horizontal is not None else self.axes_scales[0]
    new_vertical = vertical if vertical is not None else self.axes_scales[1]
    self._scales = (new_horizontal, new_vertical)

    # only explicitly given axes are touched,
    # which allows passing kwargs that are specific to a single axis
    if horizontal is not None:
        self.ax.set_xscale(horizontal, **kwargs)
    if vertical is not None:
        self.ax.set_yscale(vertical, **kwargs)
def __init__(
    self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
):
    """Initialize generalized function plot.

    Parameters:
        visualization_type: how each generalized function (curve) is drawn;
            either one value used for all curves, or a sequence with one value
            per curve. Possible values:
            "points" (default), "box-whisker", "shaded-line"
        rectify: whether, and by how much, the values of each curve are displaced
            horizontally to visually separate markers of different curves that
            share a location. True requests automatic displacement, False (default)
            requests none, a non-negative number gives the displacement explicitly.
            For logarithmic horizontal axes the factor is applied in log-space.
        kwargs: passed on to the base class initializer

    Examples:
        # show three curves with automatic horizontal rectification
        __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
    """

    super().__init__(**kwargs)

    def _visualization(value):
        """Accept a single visualization type."""
        return params.enumeration(value, {"points", "box-whisker", "shaded-line"})

    def _visualization_sequence(value):
        """Accept a sequence of visualization types."""
        return params.tuple_(value, _visualization)

    def _non_negative_real(value):
        """Accept an explicit, non-negative rectification factor."""
        return params.real(value, from_=0)

    # the number of entries can only be checked against the data in evaluate()
    self._visualization_type = params.any_(
        visualization_type, _visualization, _visualization_sequence
    )
    self._rectify = params.any_(rectify, _non_negative_real, params.boolean)
def __init__(
    self,
    target=None,
    configuration: Optional[PlotConfiguration] = None,
    axes_labels=(None, None, None, None),
    axes_scales=("linear", "linear"),
    **kwargs,
):
    """Initialize Evaluation.

    Parameters:
        target: rendering target that the evaluation outcome is rendered to.
            One of: a filename, a matplotlib Axes, a (Figure, Axes) pair,
            or a sequence of these. Given Axes or (Figure, Axes) pairs are
            added to; None means that a new rendering target is created.
        configuration: optional plot configuration controlling rendering details
        axes_labels: labels for the (bottom, left, top, right) axes; None leaves
            an axis unlabeled. Shorter tuples are padded with None,
            so ('x', 'y') is valid.
        axes_scales: scales ("linear" or "log") of horizontal and vertical axis
        kwargs: passed on to the base class initializer

    Examples:
        __init__(axes_labels=("bottom", "left", "top"))  # right is None
        __init__(axes_scales=("log", "log"))
    """

    def _plot_configuration(value):
        return params.instance(value, PlotConfiguration)

    configuration = params.any_(configuration, _plot_configuration, params.none)
    super().__init__(configuration=configuration, **kwargs)

    # validators for a single rendering target: Axes, (Figure, Axes) or filename
    def _axes(value):
        return params.instance(value, mpl.axes.Axes)

    def _figure(value):
        return params.instance(value, mpl.figure.Figure)

    def _figure_axes_pair(value):
        return params.tuple_(value, _figure, _axes, arity=2)

    def _single_target(value):
        return params.any_(value, _axes, _figure_axes_pair, params.string)

    # a sequence of targets may not contain None
    def _target_sequence(value):
        return params.tuple_(value, _single_target)

    self._target = params.any_(target, _single_target, params.none, _target_sequence)

    def _label(value):
        return params.any_(value, params.string, params.none)

    self._axes_labels = params.tuple_(axes_labels, _label, arity=4, default=None)

    def _scale(value):
        return params.enumeration(value, {"linear", "log"})

    self._axes_scales = params.tuple_(axes_scales, _scale, arity=2)

    self._figaxis = None
def __init__(self, target: float, goal: str = "maximize", **kwargs):
    """Initialize state.

    Parameters:
        target: target value; must be a real number
        goal: either "maximize" (default) or "minimize";
            stored internally as direction +1 or -1, respectively
        kwargs: passed on to the base class initializer
    """

    super().__init__(**kwargs)

    self._target = params.real(target)

    # sign associated with each admissible goal
    direction_of_goal = {"maximize": 1, "minimize": -1}
    goal = params.enumeration(goal, {"maximize", "minimize"})
    self._direction = direction_of_goal[goal]
def __init__(
    self,
    rng: int = None,
    num_seeds: int = 1,
    resolution: int = 64,
    max_relative_jump: float = 1.0,
    dimensions_varied: Union[str, float, int] = "all",
    max_iters: Optional[int] = None,
    max_evals: Optional[int] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        rng: pseudo-random number generator seed
        num_seeds: the number of starting points, and the number of points
            chosen at the end of each iteration
        resolution: the number of points to sample along a single dimension
            for a single seed
        max_relative_jump: the maximum relative step size along a single dimension.
            If a given dimension has length `L` and a seed has value `x` along that
            dimension, then the candidates are `resolution` linearly spaced points
            from the range [x - max_relative_jump * L, x + max_relative_jump * L]
            (clipped by the bounds). `max_relative_jump` must be on (0, 1].
            For a value of 1, the entire range is always considered.
        dimensions_varied: how many randomly selected dimensions to explore with
            each step. 'all' indicates all dimensions. An integer directly specifies
            the number of dimensions. A float on (0, 1) indicates the fractional
            number of the total.
        max_iters: the maximum number of iterations
        max_evals: the maximum number of function evaluations (this is a soft
            maximum: once it is reached then the current iteration finishes)

    At least one of max_iters and max_evals must be given.

    Raises:
        InvalidParameterError: if neither max_iters nor max_evals is specified

    TODO: add tolerance stopping conditions
    """

    super().__init__(rng=rng, **kwargs)

    def _positive_integer(value):
        """Accept integers of at least 1."""
        return params.integer(value, from_=1)

    def _number_of_dimensions(value):
        return params.integer(value, above=0)

    def _fraction_of_dimensions(value):
        return params.real(value, above=0.0, below=1.0)

    def _all_dimensions(value):
        return params.enumeration(value, {"all"})

    self._num_seeds = params.integer(num_seeds, from_=1)
    self._resolution = params.integer(resolution, from_=2)
    self._max_relative_jump = params.real(max_relative_jump, above=0.0, to=1.0)
    self._dimensions_varied = params.any_(
        dimensions_varied, _number_of_dimensions, _fraction_of_dimensions, _all_dimensions
    )
    self._max_iters = params.optional_(max_iters, _positive_integer)
    self._max_evals = params.optional_(max_evals, _positive_integer)

    # without any stopping condition the optimization would never terminate
    no_stopping_condition = self._max_iters is None and self._max_evals is None
    if no_stopping_condition:
        raise InvalidParameterError("at least one stopping condition defined", "all Nones")
def element_data(element, property_):
    """Look up a property of a chemical element.

    Parameters:
        element: chemical element, given by either proton number (int)
            or abbreviation (str)
        property_: queried property; one of 'abbreviation', 'Z' (proton number)

    Returns:
        value of the queried property for the given element

    Raises:
        InvalidParameterError: for invalid parameters
    """

    validated_element = params.chemical_element(element)
    validated_property = params.enumeration(property_, {"Z", "abbreviation"})

    element_entry = _element_data[validated_element]
    return element_entry[validated_property]
def __init__(self, orient=None, **kwargs):
    """Initialize state.

    Parameters:
        orient: actively orients the metric towards minimization (-1) or
            maximization (+1); if None (default), the natural orientation
            of the metric is retained
        kwargs: passed on to the base class initializer

    Raises:
        InvalidParameterError: if trying to orient a metric
            that has no natural orientation
    """

    super().__init__(**kwargs)

    orient = params.enumeration(orient, {-1, +1, None})

    # a sign of +1 leaves _evaluate() unchanged
    self._sign = +1
    if orient is None:
        return

    if not self.has_orientation:
        raise InvalidParameterError("oriented metric", self.orientation)

    # +1 if desired and natural orientation agree, -1 if they disagree
    self._sign = orient * self.orientation
def __init__(
    self,
    fits: bool = True,
    fit_lambda: float = 1e-7,
    fit_weights: Optional[str] = None,
    base=10,
    **kwargs,
):
    """Initialize learning curve plot.

    Parameters:
        fits: if True, show estimated asymptotic fits
        fit_lambda: regularization strength for asymptotic fits;
            non-negative, defaults to 1e-7
        fit_weights: if and how to weight fits; one of
            None: no weighting,
            "variance": weigh by variance for each training set size
        base: base for logarithmic plotting; a real number of at least 2

    All parameters from base classes, in particular GeneralizedFunctionPlot and Plot.
    Unless passed explicitly, axes_scales defaults to ("log", "log") and axes_labels
    to ("training set size", "evaluation metric", None, None).
    """

    # learning curve-specific defaults apply only where the caller did not choose a value
    kwargs.setdefault("axes_scales", ("log", "log"))
    kwargs.setdefault("axes_labels", ("training set size", "evaluation metric", None, None))

    super().__init__(**kwargs)

    def _variance_weighting(value):
        return params.enumeration(value, {"variance"})

    self._fits = params.boolean(fits)
    self._fit_lambda = params.real(fit_lambda, from_=0)
    self._fit_weights = params.any_(fit_weights, _variance_weighting, params.none)
    self._base = params.real(base, from_=2)

    # logarithm and power with respect to the plotting base
    def _logarithm(x):
        return np.log(x) / np.log(self._base)

    def _power(x):
        return np.power(self._base, x)

    self._logf = _logarithm
    self._powf = _power
def evaluate(self, results, **kwargs):
    """Compute plot data for multiple generalized (set-valued) functions.

    Multiple curves C_1, ..., C_k can be drawn. Each curve C_i is specified by a
    sequence of 2-tuples, where the first value is the location on the horizontal
    axis, and the other value is a sequence of locations on the vertical axis.

    Each curve can be drawn in a different way (points, box-whisker, shaded-line).

    The result is stored in self._plotdata with one entry per curve:
        "points": array with rows (position, value),
        "box-whisker": tuple (positions, values, widths),
        "shaded-line": tuple (positions, values).
    If the curves contain no data at all, self._plotdata is an empty list.

    As a side effect, self._visualization_type is expanded to one entry per curve
    and self._rectify is replaced by the numerical rectification factor in use.

    Parameters:
        results: sequence of generalized functions data (curve data).
            Each datum is a sequence of tuples (x,fx), where
            x is a real number and fx is a sequence of real numbers.
        kwargs: passed on to the base class evaluate()

    Raises:
        InvalidParameterError: for invalid curve data, if the number of
            visualization types does not match the number of curves, or if
            rectification is requested for more curves than are supported

    Examples:
        # two curves sharing one horizontal location
        evaluate([
            [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
            [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))],  # curve 2
        ])
    """

    super().evaluate(results=results, **kwargs)

    # parameter validation
    tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
    curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
    results = params.tuple_(results, curve_testf)

    # _rectify evaluates to True if True or if > 0
    if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
        # report the number of curves actually passed, not the supported maximum
        raise InvalidParameterError(
            f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(results)} curves"
        )

    # finalize parameter validation for visualization_type
    if not is_sequence(self._visualization_type):
        self._visualization_type = (self._visualization_type,) * len(results)
    self._visualization_type = params.tuple_(
        self._visualization_type,
        lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
        arity=len(results),
        default="points",
    )

    # prepare plot

    # determine all distinct horizontal positions in the results data
    all_positions = np.unique([entry[0] for curve in results for entry in curve])

    # there is nothing to do without data to plot
    if len(all_positions) == 0:
        self._plotdata = []
        return

    # do not rectify if there is only a single horizontal position
    if len(all_positions) == 1 or self._rectify is False:
        self._rectify = 0.0

    # automatic determination of horizontal rectification factor
    #
    # the correct way to draw box-plots on a logarithmic horizontal axis is to have
    # different left-width and right-width of the boxes. However, matplotlib does not
    # support this. Because box widths are small compared to horizontal plot range,
    # it suffices to use the sum of left- and right-half widths.

    between_groups_spacing = 0.4
    in_group_spacing = 0.9  # box-whisker plots

    # displacements are applied in the space in which the axis is linear
    if self.axes_scales[0] == "linear":
        logf = lambda arg: arg
        powf = lambda arg: arg
    elif self.axes_scales[0] == "log":
        base = 10
        logf = lambda arg: np.log(arg) / np.log(base)
        powf = lambda arg: np.power(base, arg)

    if self._rectify is True:
        # diff(...) requires at least two horizontal locations; this is ensured above
        self._rectify = (
            between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
        )

    # determine positions
    self._plotdata = [None] * len(results)
    deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
    for (i, curve) in enumerate(results):
        # point markers, every single point is drawn
        if self._visualization_type[i] == "points":
            positions = powf(
                np.hstack(
                    [
                        logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                        for entry in curve
                    ]
                )
            )
            values = np.hstack([entry[1] for entry in curve])
            self._plotdata[i] = np.transpose([positions, values])

        # box-whisker plots
        elif self._visualization_type[i] == "box-whisker":
            # np.asfarray was removed in NumPy 2.0; asarray with dtype=float replaces it
            positions = np.asarray(
                [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve],
                dtype=float,
            )
            values = [entry[1] for entry in curve]
            # can't use rectify for width if 0; 1 is a wild guess
            # todo: if plot ranges have been set, a better default value could
            #       be 10% of horizontal plot range
            w = 1 if not self._rectify else self._rectify
            widths = powf((positions + w / 2) * in_group_spacing) - powf(
                (positions - w / 2) * in_group_spacing
            )
            positions = powf(positions)
            self._plotdata[i] = (positions, values, widths)

        # shaded lines are drawn at the original, non-displaced positions
        elif self._visualization_type[i] == "shaded-line":
            positions = np.asarray([entry[0] for entry in curve], dtype=float)
            values = [entry[1] for entry in curve]
            self._plotdata[i] = (positions, values)

        else:
            raise BenchmarkError("internal error, unknown visualization type")
def __init__(
    self,
    select: Optional[Sequence[str]] = None,
    failmode="raise",
    samplef: Callable[[Any], Any] = lambda arg: arg,
    java_gateway: Optional[CdkJavaGateway] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        select: which features to compute (by default, all). List of names, order matters.
            Presets are available as class constants:
            PRESET_ALL: all features
            PRESET_ROBUST: a subset of descriptors that are fast to compute and do not
                fail often (tested on QM9 and CEP datasets; see accompanying notebook)
        failmode: how to handle failed descriptor calculations, either due to rejected
            SMILES encodings or failing descriptor code. Possible values:
            "raise" [default]: raise a BenchmarkError
            "drop": drop the sample. Returned Data will have fewer samples
            ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries
                will be set to False for failures
            ("index", index): where `index` is an empty list to which the indices of
                failed entries will be appended
        samplef: a function accepting and returning a sample. This enables
            transformation of samples, for example, to select an entry by key
            if sample is a dictionary, or to turn a dictionary into a vector.
            Default is to return the sample unchanged.
        java_gateway: a gateway to a Java virtual machine;
            if None, a new CdkJavaGateway is created
        kwargs: passed on to the base class initializer

    Requires a CDK jar.
    """

    super().__init__(**kwargs)

    # parameters
    select = params.optional_(
        select,
        lambda arg: params.tuple_(
            arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
        ),
    )
    select = self.PRESET_ALL if select is None else select

    self._failmode = DataTransformationFailureMode.failmode(failmode)

    self._samplef = params.callable(samplef, num_pos_or_kw=1)

    self._java_gateway = params.optional_(
        java_gateway, lambda arg: params.instance(arg, JavaGateway)
    )
    if self._java_gateway is None:
        self._java_gateway = CdkJavaGateway()
    # from here on, only the wrapped gateway object is kept
    self._java_gateway = self._java_gateway.gateway

    def _instantiate(class_path):
        """Create an instance of the Java class with the given dotted name.

        Resolves the name by attribute access starting at the JVM view, which is
        what evaluating "jvm.<class_path>()" does, but without eval().
        Assumes DESCRIPTORS entries hold plain dotted class names.
        """
        java_object = self._java_gateway.jvm
        for part in class_path.split("."):
            java_object = getattr(java_object, part)
        return java_object()

    # set up descriptors
    self._descriptors = tuple(_instantiate(self.DESCRIPTORS[name][0]) for name in select)

    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    for descriptor in self._descriptors:
        descriptor.initialise(builder)

    # second entry of each DESCRIPTORS item, kept in the order of the selection
    self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
def __init__(
    self,
    rng: int = None,
    uncertainties: Optional[str] = None,
    n_estimators: int = 100,
    criterion: str = "mse",
    max_depth: Optional[int] = None,
    min_samples_split: Union[int, float] = 2,
    min_samples_leaf: Union[int, float] = 1,
    min_weight_fraction_leaf: float = 0.0,
    max_features: Union[int, float, str, None] = "auto",
    max_leaf_nodes: Optional[int] = None,
    min_impurity_decrease: float = 0.0,
    # min_impurity_split deprecated
    bootstrap: bool = True,
    n_jobs: Optional[int] = None,
    ccp_alpha: float = 0.0,
    max_samples: Optional[Union[int, float]] = None,
    **kwargs,
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        rng: passed on to the base class initializer
        uncertainties: whether and how to compute predictive uncertainties; choices are
            None; by default, ExtraTreesRegressor does not return predictive uncertainties;
            "naive"; uses the ensembles standard deviation
        n_estimators: number of decision trees
        criterion: either variance reduction ("mse", mean squared error),
            or, mean absolute error ("mae")
        max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
        min_samples_split: minimum number of samples required to split an internal node;
            float numbers indicate a fraction of number of training samples
        min_samples_leaf: minimum number of training samples required in a leaf node
            float numbers indicate a fraction of number of training samples
        min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
        max_features: number of features considered when splitting; integers directly specify
            the number, floating point values specify which fraction of all features to use;
            "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm
            of number of features
        max_leaf_nodes: maximum number of leaves a tree can have
        min_impurity_decrease: minimum impurity decrease required for splitting
        bootstrap: if False, the whole dataset is used to build trees
        n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
        ccp_alpha: complexity parameter for minimal cost-complexity pruning.
        max_samples: number of input samples to draw during bootstrap; integers directly
            specify the number, floating point values on (0, 1] specify which fraction
            of samples to use; all by default

    The sklearn.ExtraTreesRegressor parameters `oob_score`, `verbose`, `warm_start`
    are not considered.

    See skl.ensemble.ExtraTreesRegressor parameters.
    """

    super().__init__(rng=rng, **kwargs)

    # validate parameters

    self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

    n_estimators = params.integer(n_estimators, from_=1)
    criterion = params.enumeration(criterion, {"mse", "mae"})
    max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
    min_samples_split = params.any_(
        min_samples_split,
        lambda arg: params.integer(arg, from_=2),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_samples_leaf = params.any_(
        min_samples_leaf,
        lambda arg: params.integer(arg, from_=1),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
    max_features = params.any_(
        max_features,
        lambda arg: params.integer(arg, above=0),
        lambda arg: params.real(arg, above=0.0, to=1.0),
        lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
        params.none,
    )
    max_leaf_nodes = params.any_(
        max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
    )
    min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
    bootstrap = params.boolean(bootstrap)
    n_jobs = params.any_(
        n_jobs,
        lambda arg: params.integer(arg, from_=-1, to=-1),
        lambda arg: params.integer(arg, from_=1),
        params.none,
    )
    ccp_alpha = params.real(ccp_alpha, from_=0.0)
    # a fraction of 0.0 would select no samples and is rejected by sklearn,
    # so the lower bound is exclusive like for the other fractional parameters
    max_samples = params.any_(
        max_samples,
        lambda arg: params.integer(arg, from_=1),
        lambda arg: params.real(arg, above=0.0, to=1.0),
        params.none,
    )

    self._model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        n_jobs=n_jobs,
        ccp_alpha=ccp_alpha,
        max_samples=max_samples,
    )
def __init__(
    self,
    rng: int = None,
    strategy: str = "best1bin",
    maxiter: int = 1000,
    popsize: int = 15,
    tol: float = 0.01,
    mutation=(0.5, 1),
    recombination: float = 0.7,
    **kwargs
):
    """Initialize state.

    Scipy-specific parameters are passed through.

    Parameters:
        rng: integer seed. Will be used to generate a new seed each time
            the optimizer is run.
        strategy: The differential evolution strategy to use.
            See documentation for complete list and explanations.
        maxiter: The maximum number of generations over which the entire
            population is evolved.
        popsize: A multiplier for setting the total population size.
        tol: Relative tolerance for convergence.
        mutation: The mutation constant. Either a number between 0 and 2 or
            a tuple (min, max) in which case the mutation constant is randomly
            selected uniformly from between min and max with each generation.
        recombination: The recombination constant. Must be between 0 and 1.
    """

    super().__init__(rng=rng, **kwargs)

    self._strategy = params.enumeration(
        strategy,
        {
            "best1bin",
            "best1exp",
            "rand1exp",
            "randtobest1exp",
            "currenttobest1exp",
            "best2exp",
            "rand2exp",
            "randtobest1bin",
            "currenttobest1bin",
            "best2bin",
            "rand2bin",
            "rand1bin",
        },
    )
    self._maxiter = params.integer(maxiter, from_=1)
    self._popsize = params.integer(popsize, from_=1)
    self._tol = params.real(tol, above=0.0)

    def _mutation_constant(value, low=0.0):
        """Accept a mutation constant on [low, 2]."""
        return params.real(value, from_=low, to=2.0)

    def _mutation_interval(pair):
        """Accept a pair (min, max) of mutation constants with min <= max."""
        return params.tuple_(
            pair,
            _mutation_constant,
            lambda upper: _mutation_constant(upper, low=pair[0]),
            arity=2,
        )

    self._mutation = params.any_(mutation, _mutation_constant, _mutation_interval)
    self._recombination = params.real(recombination, from_=0.0, to=1.0)
def __init__(
    self,
    select: Union[str, Sequence[str]] = "all",
    samplef: Callable[[Any], Any] = lambda arg: arg,
    stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
    elemental_preset: str = "magpie",
    ionic_fast: bool = False,
    valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
    valence_props: Sequence[str] = ("avg", "frac"),
    **kwargs,
):
    """Initialize state.

    Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
    IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
    with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
    is the p_list parameter of Stoichiometry. For further details on these, see
    https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

    Parameters:
        select: which feature sets to compute (by default, all). Specifying
            multiple sets (e.g., ('stoichiometry', 'elemental')) selects both.
            Valid choices:
            'all': all features
            'stoichiometry': norms of stoichiometric features
            'elemental': element properties
            'ionic': ion properties
            'valence': valence orbital shell features
        samplef: a function accepting and returning a sample. This enables
            transformation of samples, for example, to select an entry by key
            if sample is a dictionary, or to turn a dictionary into a vector.
            Default is to return the sample unchanged.
        stoichiometry_p_list: list of L_p norms to compute
        elemental_preset: matminer preset to use. Valid choices include:
            'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
        ionic_fast: if True, assumes that elements exist in single oxidation state
        valence_orbitals: which valence orbitals to consider
        valence_props: whether to return average properties, fractional, or both
        kwargs: passed on to the base class initializer

    Raises:
        BenchmarkError: if the matminer or pymatgen packages can not be imported

    Requires the matminer package (see file documentation).
    """

    super().__init__(**kwargs)

    SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

    if select == "all":
        select = SELECT_SETS
    if isinstance(select, str):
        select = (select,)  # tuple(str,) yields tuple of characters in str

    select = params.tuple_(
        select,
        lambda arg: params.enumeration(arg, set(SELECT_SETS)),
    )

    self._stoichiometry_p_list = params.tuple_(
        stoichiometry_p_list, lambda p: params.integer(p, from_=0)
    )
    self._elemental_preset = params.enumeration(
        elemental_preset, {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"}
    )
    self._ionic_fast = params.boolean(ionic_fast)
    self._valence_orbitals = params.tuple_(
        valence_orbitals, lambda arg: params.enumeration(arg, {"s", "p", "d", "f"})
    )
    self._valence_props = params.tuple_(
        valence_props, lambda arg: params.enumeration(arg, {"avg", "frac"})
    )

    self.samplef = samplef  # todo: add callable to params

    # set up matminer
    # the optional dependencies are imported here so that their absence
    # only affects code that actually uses this class
    try:
        import matminer
        import matminer.featurizers
        import matminer.featurizers.base
        import matminer.featurizers.composition
        import matminer.featurizers.conversions
        import pymatgen

        # imported explicitly because 'import pymatgen' alone does not guarantee
        # that the pymatgen.core.composition submodule used below is loaded
        import pymatgen.core.composition
    except ModuleNotFoundError as e:
        raise BenchmarkError(
            f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
        ) from e

    self._composition = pymatgen.core.composition.Composition

    # set up features
    features = []

    if "stoichiometry" in select:
        features.append(
            matminer.featurizers.composition.Stoichiometry(p_list=self._stoichiometry_p_list)
        )

    if "elemental" in select:
        features.append(
            matminer.featurizers.composition.ElementProperty.from_preset(
                self._elemental_preset
            )
        )

    if "ionic" in select:
        features.append(matminer.featurizers.composition.IonProperty(fast=self._ionic_fast))

    if "valence" in select:
        features.append(
            matminer.featurizers.composition.ValenceOrbital(
                orbitals=self._valence_orbitals, props=self._valence_props
            )
        )

    self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(features)
def __init__(
    self,
    num_trees: int = -1,
    use_jackknife: bool = True,
    bias_learner: Optional[BaseLoloLearner] = None,
    leaf_learner: Optional[BaseLoloLearner] = None,
    subset_strategy: Union[str, int, float] = "auto",
    min_leaf_instances: int = 1,
    max_depth: int = 2 ** 30,
    uncertainty_calibration: bool = False,
    randomize_pivot_location: bool = False,
    # randomly_rotate_features: bool = False, currently in develop branch
    **kwargs
):
    """Initialize random forest model.

    See lolo Scala source code for initialization parameters:
    https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

    When using `uncertainty_calibration=False` (the default), the number of trees
    `num_trees` should be set to a multiple of the number n of training samples,
    `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
    `num_trees = 64` is sufficient.

    Parameters:
        num_trees: number of trees in the forest; -1 uses number of training samples
        use_jackknife: whether to use jackknife-based variance estimates
        bias_learner: algorithm used to model bias
        leaf_learner: algorithm used at each leaf of the random forest
        subset_strategy: strategy to determine number of features used at each split
            "auto": use the default for lolo (all features for regression,
                sqrt for classification)
            "log2": use the base 2 log of the number of features
            "sqrt": use the square root of the number of features
            integer: set the number of features explicitly
            float: use a certain fraction of the features
        min_leaf_instances: passed to lolo as min_leaf_instances; a positive integer
        max_depth: maximum depth of decision trees
        uncertainty_calibration: whether to empirically re-calibrate predicted
            uncertainties based on out-of-bag residuals
        randomize_pivot_location: whether to draw pivots randomly or
            always select the midpoint
        kwargs: passed on to the base class initializer

    The lolo parameter randomly_rotate_features is currently not supported here.

    Raises:
        BenchmarkError: if instantiating the lolo model fails on the Java side
    """

    super().__init__(**kwargs)

    def _positive_integer(value):
        return params.integer(value, above=0)

    def _minus_one(value):
        return params.integer(value, from_=-1, to=-1)

    def _optional_learner(value):
        """Accept a lolo learner or None."""
        return params.any_(
            value, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

    # validate parameters
    num_trees = params.any_(num_trees, _positive_integer, _minus_one)
    use_jackknife = params.boolean(use_jackknife)
    bias_learner = _optional_learner(bias_learner)
    leaf_learner = _optional_learner(leaf_learner)
    subset_strategy = params.any_(
        subset_strategy,
        lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
        _positive_integer,
        lambda s: params.real(s, above=0),
    )
    min_leaf_instances = params.integer(min_leaf_instances, above=0)
    # the default 2**30 works for 32 bit or larger architectures
    max_depth = params.integer(max_depth, above=0)
    uncertainty_calibration = params.boolean(uncertainty_calibration)
    randomize_pivot_location = params.boolean(randomize_pivot_location)

    # set up model
    model_arguments = dict(
        num_trees=num_trees,
        use_jackknife=use_jackknife,
        bias_learner=bias_learner,
        leaf_learner=leaf_learner,
        subset_strategy=subset_strategy,
        min_leaf_instances=min_leaf_instances,
        max_depth=max_depth,
        uncertainty_calibration=uncertainty_calibration,
        randomize_pivot_location=randomize_pivot_location,
    )
    try:
        self._model = RandomForestRegressor(**model_arguments)
    except Py4JJavaError as e:
        raise BenchmarkError("instantiating lolo model failed") from e

    # without the jackknife, deviations will be zero
    self._with_uncertainties = use_jackknife
def __init__(
    self,
    uncertainties: Optional[str] = None,
    loss: str = "ls",
    alpha: float = 0.9,
    learning_rate: float = 0.1,
    subsample: float = 1.0,
    n_estimators: int = 100,
    criterion: str = "mse",
    max_depth: int = 3,
    min_samples_split: Union[int, float] = 2,
    min_samples_leaf: Union[int, float] = 1,
    min_weight_fraction_leaf: float = 0.0,
    max_features: Union[int, float, str, None] = None,
    max_leaf_nodes: Optional[int] = None,
    min_impurity_decrease: float = 0.0,
    # min_impurity_split deprecated
    random_state: int = None,
    ccp_alpha: float = 0.0,
    init: Optional[Any] = None,
    validation_fraction: float = 0.1,
    n_iter_no_change: Optional[int] = None,
    tol: float = 0.0001,
    **kwargs,
):
    """Initialize state.

    sklearn-specific parameters are passed through to the implementation.

    Parameters:
        uncertainties: whether and how to compute predictive uncertainties;
            the only possible choice is None, as GradientBoostingRegressor
            does not return any predictive uncertainties
        loss: loss function to optimize; valid values are "ls" (least squares),
            "lad" (least absolute deviation), "huber" (Huber's loss),
            "quantile" (quantile regression). Use alpha parameter for huber and quantile.
        alpha: quantile for "huber" and "quantile" loss functions
        learning_rate: value by which to shrink contribution of consecutive trees;
            trade-off with n_estimators
        subsample: fraction of samples for fitting base learners; if <1 results in
            Stochastic Gradient Boosting. reducing subsample reduces variance and
            increases bias.
        n_estimators: number of decision trees
        criterion: either Friedman improved score ("friedman_rmse"),
            variance reduction ("mse", mean squared error),
            or, mean absolute error ("mae")
        max_depth: maximum depth of a tree; default is 3
        min_samples_split: minimum number of samples required to split an internal node;
            float numbers indicate a fraction of number of training samples
        min_samples_leaf: minimum number of training samples required in a leaf node
            float numbers indicate a fraction of number of training samples
        min_weight_fraction_leaf: minimum weighted fraction of weights required in a leaf node
        max_features: number of features considered when splitting; integers directly specify
            the number, floating point values specify which fraction of all features to use;
            "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm
            of number of features
        max_leaf_nodes: maximum number of leaves a tree can have
        min_impurity_decrease: minimum impurity decrease required for splitting
        random_state: pseudo-random number generator seed; an integer or None (default)
        ccp_alpha: complexity parameter for minimal cost-complexity pruning.
        init: estimator for initial predictions; can be 'zero' for constant zero predictions
        validation_fraction: fraction of training data to set aside for early stopping;
            only with n_iter_no_change
        n_iter_no_change: set to integer to stop after no improvement (beyond tol) for that
            many rounds
        tol: tolerance for early stopping; only improvements larger than tol are considered

    The sklearn.GradientBoostingRegressor parameters `oob_score`, `verbose`, `warm_start`
    are not considered.

    See skl.ensemble.GradientBoostingRegressor parameters.
    """

    super().__init__(**kwargs)

    # validate parameters

    self._uncertainties = params.enumeration(uncertainties, {None})

    loss = params.enumeration(loss, {"ls", "lad", "huber", "quantile"})
    alpha = params.real(alpha, above=0, below=1)
    learning_rate = params.real(learning_rate, above=0, to=1)
    subsample = params.real(subsample, above=0, to=1)
    n_estimators = params.integer(n_estimators, from_=1)
    criterion = params.enumeration(criterion, {"friedman_rmse", "mse", "mae"})
    max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
    min_samples_split = params.any_(
        min_samples_split,
        lambda arg: params.integer(arg, from_=2),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_samples_leaf = params.any_(
        min_samples_leaf,
        lambda arg: params.integer(arg, from_=1),
        lambda arg: params.real(arg, above=0.0, to=1.0),
    )
    min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
    max_features = params.any_(
        max_features,
        lambda arg: params.integer(arg, above=0),
        lambda arg: params.real(arg, above=0.0, to=1.0),
        lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
        params.none,
    )
    max_leaf_nodes = params.any_(
        max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
    )
    min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
    # the default value is None, which therefore has to pass validation as well
    random_state = params.any_(random_state, params.integer, params.none)
    ccp_alpha = params.real(ccp_alpha, from_=0.0)
    # no validation for init (no class signature validator)
    validation_fraction = params.real(validation_fraction, above=0, below=1)
    n_iter_no_change = params.any_(
        n_iter_no_change, lambda arg: params.integer(arg, from_=0), params.none
    )
    tol = params.real(tol, from_=0)

    self._model = skl.ensemble.GradientBoostingRegressor(
        loss=loss,
        alpha=alpha,
        learning_rate=learning_rate,
        subsample=subsample,
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state,
        ccp_alpha=ccp_alpha,
        init=init,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol,
    )