def __init__(
    self,
    target=None,
    configuration: Optional[PlotConfiguration] = None,
    axes_labels=(None, None, None, None),
    axes_scales=("linear", "linear"),
    **kwargs,
):
    """Set up a plot-producing evaluation.

    Parameters:
        target: where the evaluation outcome is rendered to. Accepted are
            a filename, a matplotlib Axes, a (Figure, Axes) pair, or a
            sequence of these. An existing Axes or (Figure, Axes) pair is
            added to; None requests a new rendering target.
        configuration: optional plot configuration controlling rendering details
        axes_labels: labels for the (bottom, left, top, right) axes; None
            leaves an axis unlabeled. Shorter tuples are padded with None,
            so ('x', 'y') is valid.
        axes_scales: scale, "linear" or "log", of the horizontal and the
            vertical axis

    Examples:
        __init__(axes_labels=("bottom", "left", "top"))  # right is None
        __init__(axes_scales=("log", "log"))
    """

    def configuration_f(arg):
        return params.instance(arg, PlotConfiguration)

    configuration = params.any_(configuration, configuration_f, params.none)
    super().__init__(configuration=configuration, **kwargs)

    def mpl_axes_f(arg):
        return params.instance(arg, mpl.axes.Axes)

    def mpl_figure_f(arg):
        return params.instance(arg, mpl.figure.Figure)

    def figure_axes_pair_f(arg):
        return params.tuple_(arg, mpl_figure_f, mpl_axes_f, arity=2)

    def single_target_f(arg):
        # Axes, (Figure, Axes) or filename
        return params.any_(arg, mpl_axes_f, figure_axes_pair_f, params.string)

    def target_sequence_f(arg):
        # sequence of single targets (None is not allowed inside a sequence)
        return params.tuple_(arg, single_target_f)

    self._target = params.any_(target, single_target_f, params.none, target_sequence_f)

    def label_f(arg):
        return params.any_(arg, params.string, params.none)

    # shorter label tuples are filled up with None
    self._axes_labels = params.tuple_(axes_labels, label_f, arity=4, default=None)

    def scale_f(arg):
        return params.enumeration(arg, {"linear", "log"})

    self._axes_scales = params.tuple_(axes_scales, scale_f, arity=2)

    self._figaxis = None
def _indices_testf(self, indices: Sequence[Any]):
    """Validate an optional sequence of sample indices.

    Parameters:
        indices: None, an empty sequence, or a sequence of integers from
            0 up to (excluding) self.num_samples

    Returns:
        the validated indices as a list, or whatever params.optional_
        yields when indices is not specified
    """

    def empty_f(arg):
        # empty set
        return params.tuple_(arg, None, arity=0)

    def index_f(arg):
        return params.integer(arg, from_=0, below=self.num_samples)

    def index_sequence_f(arg):
        return params.tuple_(arg, index_f)

    def as_list_f(arg):
        # NumPy indexing expects a list
        return list(params.any_(arg, empty_f, index_sequence_f))

    return params.optional_(indices, as_list_f)
def evaluate(self, results, **kwargs):
    """Compute the data for a learning curve plot.

    Parameters:
        results: sequence of curve data. Each curve datum is a sequence of
            tuples (n, fx), where n is the training set size (validated as
            a real number above 0) and fx is a sequence of real-valued
            performance values.
    """

    # parameter validation
    def size_f(arg):
        return params.real(arg, above=0)

    def point_f(arg):
        return params.tuple_(arg, size_f, params.real_vector, arity=2)

    def curve_f(arg):
        return params.tuple_(arg, point_f)

    results = params.tuple_(results, curve_f)

    super().evaluate(results=results, **kwargs)

    # fitted values are mapped back only if the vertical axis is logarithmic
    if self.axes_scales[1] == "log":
        ypowf = self._powf
    else:
        ypowf = lambda arg: arg

    # asymptotic estimates are optional
    if not self._fits:
        return

    asymptotic_fits = tuple(self.asymptotic_fit(curve) for curve in results)

    # evaluate each fit on 25 equidistant sizes spanning all observed sizes
    all_sizes = np.unique([point[0] for curve in results for point in curve])
    sizes = np.linspace(start=np.min(all_sizes), stop=np.max(all_sizes), num=25)

    self._fit_data = np.empty(shape=(len(results), 2, len(sizes)))
    for i, fit in enumerate(asymptotic_fits):
        offset, slope, _, _ = fit
        yvalues = [ypowf(offset + slope * self._logf(n)) for n in sizes]
        self._fit_data[i, 0, :] = sizes
        self._fit_data[i, 1, :] = yvalues

    keys = ("offset", "slope", "residuals", "variance")
    self.add_auxiliary(
        "asymptotic_fits",
        tuple(dict(zip(keys, fit)) for fit in asymptotic_fits),
    )
def evaluate(self, results, **kwargs):
    """Compute the data for an optimization trajectory plot.

    Parameters:
        results: sequence of curve data. Each curve datum is a sequence of
            tuples (index, scores), where index is the function evaluation
            number (validated as a real number above 0) and scores is a
            sequence of real numbers, the best scores found after that
            many evaluations.
    """

    def evaluation_number_f(arg):
        return params.real(arg, above=0)

    def point_f(arg):
        return params.tuple_(arg, evaluation_number_f, params.real_vector, arity=2)

    def curve_f(arg):
        return params.tuple_(arg, point_f)

    validated = params.tuple_(results, curve_f)
    super().evaluate(results=validated, **kwargs)
def axes_labels(self, labels=(None, None, None, None), **kwargs):
    """Set axes labels.

    Parameters:
        labels: labels for the bottom, left, top and right axes, in this
            order; a None entry keeps the current value for that axis.
            Shorter tuples are filled up with None.
        kwargs: additional keyword arguments passed on to the matplotlib
            label setters

    Raises:
        NotImplementedError: if a label for the top or right axis is given

    Examples:
        labels=(None, "y")  # set only left axis label
    """

    def label_f(arg):
        return params.any_(arg, params.string, params.none)

    labels = params.tuple_(labels, label_f, arity=4, default=None)

    # re-assign tuple as a whole; unspecified entries keep their current value
    merged = []
    for i, label in enumerate(labels):
        merged.append(self.axes_labels[i] if label is None else label)
    self._labels = tuple(merged)

    bottom, left, top, right = labels

    # only touch axes whose label was specified (not None);
    # this allows to pass kwargs specific to one axis
    if bottom is not None:
        self.ax.set_xlabel(bottom, fontdict=self._fontdict(), **kwargs)
    if left is not None:
        self.ax.set_ylabel(left, fontdict=self._fontdict(), **kwargs)

    if not (top is None and right is None):
        # todo; possible implementation via xtwin/ytwin, storing these axes in outcome
        raise NotImplementedError
def axes_scales(self, scales=(None, None), **kwargs):
    """Set axes scales.

    Parameters:
        scales: scales (None, "linear" or "log") for the horizontal and the
            vertical axis; a None entry keeps the current value for that axis
        kwargs: additional keyword arguments passed on to the matplotlib
            scale setters

    Examples:
        scales=(None, "log")  # change only vertical axis
    """

    def scale_f(arg):
        return params.enumeration(arg, {"linear", "log"})

    def scale_or_none_f(arg):
        return params.any_(arg, scale_f, params.none)

    scales = params.tuple_(scales, scale_or_none_f, arity=2, default=None)
    horizontal, vertical = scales

    # re-assign tuple as a whole; unspecified entries keep their current value
    self._scales = (
        self.axes_scales[0] if horizontal is None else horizontal,
        self.axes_scales[1] if vertical is None else vertical,
    )

    # only touch axes whose scale was specified (not None);
    # this allows to pass kwargs specific to one axis
    if horizontal is not None:
        self.ax.set_xscale(horizontal, **kwargs)
    if vertical is not None:
        self.ax.set_yscale(vertical, **kwargs)
def box_whisker(self, positions, values, color=0, widths=0.5, **kwargs):
    """Draw box-whisker plots.

    Whiskers span the full range of the samples (0th to 100th percentile),
    and the mean is marked with a star.

    Parameters:
        positions: where to place plots on horizontal axis
        values: samples for each location; one real vector per position
        color: color index into the configuration's color set
        widths: widths of boxes
        kwargs: additional keyword arguments passed on to matplotlib's boxplot
    """

    positions = params.real_vector(positions)
    values = params.tuple_(values, params.real_vector, arity=len(positions))
    color = params.integer(color, from_=0, below=len(self.configuration.color_set))
    widths = params.real_vector(widths, dimensions=len(positions), domain=(0, 999))

    # translate the color index into the actual color
    color = self.configuration.color(color)

    self.ax.boxplot(
        values,
        positions=positions,
        whis=(0, 100),
        bootstrap=None,
        widths=widths,
        notch=False,
        showmeans=True,
        boxprops={"color": color},
        whiskerprops={"color": color},
        capprops={"color": color},
        meanprops={"marker": "*", "markerfacecolor": color, "markeredgecolor": color},
        medianprops={"color": color},
        manage_ticks=False,
        **kwargs,
    )
def __init__(
    self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
):
    """Set up a plot of generalized (set-valued) functions.

    Parameters:
        visualization_type: how to draw the generalized functions; either
            one value used for all curves or a sequence with one value per
            curve. Possible values:
            "points" (default), "box-whisker", "shaded-line"
        rectify: whether and by how much the values of each curve are
            displaced horizontally, to visually disentangle markers of
            different curves at the same location. True requests automatic
            displacement, False (default) no displacement; a non-negative
            number sets the displacement explicitly. For a logarithmic
            horizontal axis, the rectification factor is applied in log-space.

    Examples:
        # show three curves with automatic horizontal rectification
        __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
    """

    super().__init__(**kwargs)

    # parameter validation

    def kind_f(arg):
        return params.enumeration(arg, {"points", "box-whisker", "shaded-line"})

    def kind_sequence_f(arg):
        # arity can only be tested in evaluate()
        return params.tuple_(arg, kind_f)

    self._visualization_type = params.any_(visualization_type, kind_f, kind_sequence_f)

    def displacement_f(arg):
        return params.real(arg, from_=0)

    self._rectify = params.any_(rectify, displacement_f, params.boolean)
def shaded_line(
    self,
    positions: np.ndarray,
    values: List[np.ndarray],
    color_idx: int = 0,
    label: Optional[str] = None,
    quantile_width: float = 0.5,
    alpha: float = 0.2,
    show_extrema: bool = True,
    **kwargs,
):
    """Draw the median as a line, with a shaded band of quantiles around it.

    Parameters:
        positions: 1-d array of point locations on the horizontal axis
        values: list of arrays, one per location, each holding all values
            at that location; len(values) must equal len(positions)
        color_idx: color index
        label: line label
        quantile_width: fraction of the range to shade, centered on the
            median. The default, 0.5, shades from the 25th to the 75th
            percentile.
        alpha: shading alpha level
        show_extrema: whether to draw dashed lines through the smallest and
            largest value at each location
        kwargs: additional keyword arguments passed on to every matplotlib
            drawing call made here
    """

    positions = params.real_vector(positions)
    values = params.tuple_(values, params.real_vector, arity=len(positions))
    color_idx = params.integer(color_idx, from_=0, below=len(self.configuration.color_set))
    quantile_width = params.real(quantile_width, from_=0, to=1)
    alpha = params.real(alpha, from_=0, to=1)

    color = self.configuration.color(color_idx)

    def per_location(statistic):
        """Apply statistic to the samples of each location."""
        return [statistic(samples) for samples in values]

    # shaded band is centered on the median
    lower_bound = 0.5 - quantile_width / 2.0
    upper_bound = 0.5 + quantile_width / 2.0

    median = per_location(np.median)
    lower_shading = per_location(lambda samples: np.quantile(samples, lower_bound))
    upper_shading = per_location(lambda samples: np.quantile(samples, upper_bound))

    self.ax.plot(positions, median, linestyle="-", color=color, label=label, **kwargs)
    self.ax.fill_between(
        positions, lower_shading, upper_shading, color=color, alpha=alpha, **kwargs
    )

    if show_extrema:
        envelopes = [per_location(np.min), per_location(np.max)]
        for envelope in envelopes:
            self.ax.plot(positions, envelope, linestyle="--", color=color, **kwargs)
def noise(self, shape=None):
    """Draw Gaussian noise of a given shape.

    Samples are drawn via self.random.normal, with mean self._mean and
    standard deviation self._stddev.

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            either a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing Gaussian noise

    Raises:
        InvalidParameterError: for invalid parameters
    """

    def positive_int_f(arg):
        return params.integer(arg, from_=1)

    def positive_int_tuple_f(arg):
        return params.tuple_(arg, positive_int_f)

    # a valid shape is either a positive integer or a tuple of positive integers
    size = params.any_(shape, positive_int_f, positive_int_tuple_f)

    return self.random.normal(self._mean, self._stddev, size=size)
def __init__(
    self,
    data: VectorSpaceData,
    model: Learner,
    scorer: Scorer,
    optimizers: Sequence[Optimizer],
    evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
    num_trials: int = 1,
    training_data: Optional[Data] = None,
):
    """Validate and store the components of the workflow.

    Parameters:
        data: a VectorSpaceData instance
        model: a Learner instance
        scorer: a Scorer instance
        optimizers: sequence of Optimizer instances
        evaluations: one or more Evaluation instances; by default a single
            optimization trajectory plot
        num_trials: number of trials, an integer of at least 1
        training_data: optional Data instance

    Raises:
        InvalidParameterError: for invalid parameters
    """

    # NOTE(review): the default for `evaluations` is created once, when the
    # function is defined, so all callers relying on the default appear to
    # share one plot instance - confirm this is intended.

    def evaluation_f(arg):
        return params.instance(arg, Evaluation)

    def data_f(arg):
        return params.instance(arg, Data)

    self._data = params.instance(data, VectorSpaceData)
    self._scorer = params.instance(scorer, Scorer)
    self._model = params.instance(model, Learner)
    self._optimizers = params.sequence(optimizers, type_=Optimizer)
    self._evaluations = params.tuple_(evaluations, evaluation_f)
    self._num_trials = params.integer(num_trials, from_=1)
    self._training_data = params.optional_(training_data, data_f)
def noise(self, shape=None):
    """Return constant "noise" of a given shape.

    Every entry of the returned array equals self._value.

    Parameters:
        shape: shape of noise vector, matrix or higher-order tensor;
            either a positive integer or a tuple of positive integers

    Returns:
        a numerical array of given shape containing a constant value

    Raises:
        InvalidParameterError: for invalid parameters
    """

    def positive_int_f(arg):
        return params.integer(arg, from_=1)

    def positive_int_tuple_f(arg):
        return params.tuple_(arg, positive_int_f)

    # a valid shape is either a positive integer or a tuple of positive integers
    validated_shape = params.any_(shape, positive_int_f, positive_int_tuple_f)

    return np.full(validated_shape, self._value)
def __init__(
    self,
    data: Data,
    training: Sequence[Sampler],
    validation: Sampler,
    learners: Sequence[SupervisedLearner],
    features: DataValuedTransformation = IdentityFeatures(),
    metric: ScalarEvaluationMetric = RootMeanSquaredError(),
    evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
    progressf: Optional[Callable[[int, int], None]] = None,
):
    """Validate and store the components of the workflow.

    Parameters:
        data: labeled data
        training: sequence of Samplers, one for each training set size
        validation: Sampler for validation set
        learners: sequence of supervised regression algorithms
        features: any data-valued transformation; identity by default
        metric: evaluation metric to use; root mean squared error by default
        evaluations: one or more evaluations; a learning curve plot by default
        progressf: callable with two parameters, done iterations and total
            number of iterations; if not specified, progress is not reported

    Raises:
        InvalidParameterError: for invalid parameters, including unlabeled data
    """

    # todo: params.data(..., is_labeled=True)
    self._data = params.instance(data, Data)
    if not self._data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")

    self._training = params.sequence(training, type_=Sampler)
    self._validation = params.instance(validation, Sampler)
    self._learners = params.sequence(learners, type_=SupervisedLearner)
    # NOTE(review): validated against Features although the annotation says
    # DataValuedTransformation - confirm which one is intended
    self._features = params.instance(features, Features)
    self._metric = params.instance(metric, ScalarEvaluationMetric)

    def evaluation_f(arg):
        return params.instance(arg, Evaluation)

    self._evaluations = params.tuple_(evaluations, evaluation_f)

    def progress_callable_f(arg):
        return params.callable(arg, num_pos_or_kw=2)

    progressf = params.optional_(progressf, progress_callable_f)
    # fall back to a callable that accepts anything and does nothing
    self._progressf = (lambda *args: None) if progressf is None else progressf
def __init__(
    self,
    select: Optional[Sequence[str]] = None,
    failmode="raise",
    samplef: Callable[[Any], Any] = lambda arg: arg,
    java_gateway: Optional[CdkJavaGateway] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        select: which features to compute (by default, all). List of names, order matters.
            Presets are available as class constants:
            PRESET_ALL: all features
            PRESET_ROBUST: a subset of descriptors that are fast to compute and do not fail
                often (tested on QM9 and CEP datasets; see accompanying notebook)
        failmode: how to handle failed descriptor calculations, either due to rejected
            SMILES encodings or failing descriptor code. Possible values:
            "raise" [default]: raise an exception
            "drop": drop the sample. Returned Data will have fewer samples
            ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries
                will be set to False for failures
            ("index", index): where `index` is an empty list to which the indices of
                failed entries will be appended
        samplef: a function accepting and returning a sample. This enables
            transformation of samples, for example, to select an entry by key
            if sample is a dictionary, or to turn a dictionary into a vector.
            Default is to return the sample unchanged.
        java_gateway: a gateway to a Java virtual machine; if not specified,
            a CdkJavaGateway is created

    Raises:
        InvalidParameterError: for invalid parameters

    Requires a CDK jar.
    """

    super().__init__(**kwargs)

    # parameters
    select = params.optional_(
        select,
        lambda arg: params.tuple_(
            arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
        ),
    )
    select = self.PRESET_ALL if select is None else select

    self._failmode = DataTransformationFailureMode.failmode(failmode)
    self._samplef = params.callable(samplef, num_pos_or_kw=1)

    self._java_gateway = params.optional_(
        java_gateway, lambda arg: params.instance(arg, JavaGateway)
    )
    if self._java_gateway is None:
        self._java_gateway = CdkJavaGateway()
    # from here on, only the wrapped gateway object is used
    self._java_gateway = self._java_gateway.gateway

    def new_java_object(class_path):
        """Instantiate a Java class, given by its dotted path, without arguments.

        Walks the attributes of the JVM view one path component at a time.
        This is the same lookup that the dotted Python expression performs,
        but does not require eval().
        Assumes class_path is a plain dotted Java class path - TODO confirm
        against the entries of DESCRIPTORS.
        """
        java_object = self._java_gateway.jvm
        for component in class_path.split("."):
            java_object = getattr(java_object, component)
        return java_object()

    # set up descriptors
    self._descriptors = tuple(new_java_object(self.DESCRIPTORS[name][0]) for name in select)

    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    for descriptor in self._descriptors:
        descriptor.initialise(builder)

    self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
def evaluate(self, results, **kwargs):
    """Compute plot data for multiple generalized (set-valued) functions.

    Multiple curves C_1, ..., C_k can be drawn. Each curve C_i is specified by a non-empty
    sequence of 2-tuples, where the first value is location on horizontal axis, and the
    other value is a sequence of locations on the vertical axis. Each curve can be drawn
    in a different way (points, box-whisker, shaded-line).

    The result is stored in self._plotdata, one entry per curve:
    an array of (position, value) rows for "points",
    a tuple (positions, values, widths) for "box-whisker", and
    a tuple (positions, values) for "shaded-line".

    Parameters:
        results: sequence of generalized functions data (curve data).
            Each datum is a sequence of tuples (x,fx), where
            x is a real number and fx is a sequence of real numbers.

    Raises:
        InvalidParameterError: for invalid parameters, including more curves
            than can be rectified

    Examples:
        # two curves sharing one horizontal location
        evaluate([
            [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
            [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))],  # curve 2
        ])
    """

    super().evaluate(results=results, **kwargs)

    # parameter validation
    tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
    curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
    results = params.tuple_(results, curve_testf)

    # _rectify evaluates to True if True or if > 0
    if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
        # report the actual number of curves, not the limit
        raise InvalidParameterError(
            f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(results)} curves"
        )

    # finalize parameter validation for visualization_type
    if not is_sequence(self._visualization_type):
        self._visualization_type = (self._visualization_type,) * len(results)
    self._visualization_type = params.tuple_(
        self._visualization_type,
        lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
        arity=len(results),
        default="points",
    )

    # prepare plot

    # determine all distinct horizontal positions in the results data
    all_positions = np.unique([entry[0] for curve in results for entry in curve])

    # there is nothing to do without data to plot
    if len(all_positions) == 0:
        self._plotdata = []
        return

    # do not rectify if there is only a single horizontal position
    if len(all_positions) == 1 or self._rectify is False:
        self._rectify = 0.0

    # automatic determination of horizontal rectification factor
    #
    # the correct way to draw box-plots on a logarithmic horizontal axis is to have
    # different left-width and right-width of the boxes. However, matplotlib does not
    # support this. Because box widths are small compared to horizontal plot range,
    # it suffices to use the sum of left- and right-half widths.

    between_groups_spacing = 0.4
    in_group_spacing = 0.9  # box-whisker plots

    # displacements are applied in log-space if the horizontal axis is logarithmic
    if self.axes_scales[0] == "linear":
        logf = lambda arg: arg
        powf = lambda arg: arg
    elif self.axes_scales[0] == "log":
        base = 10
        logf = lambda arg: np.log(arg) / np.log(base)
        powf = lambda arg: np.power(base, arg)

    if self._rectify is True:
        # diff(...) requires at least two horizontal locations; this is ensured above
        self._rectify = (
            between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
        )

    # determine positions
    self._plotdata = [None] * len(results)
    # NOTE(review): RECTIFY_DELTAS is indexed by the number of curves; confirm
    # that the bound check above matches its layout
    deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
    for i, curve in enumerate(results):
        # point markers, every single point is drawn
        if self._visualization_type[i] == "points":
            positions = powf(
                np.hstack(
                    [
                        logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                        for entry in curve
                    ]
                )
            )
            values = np.hstack([entry[1] for entry in curve])
            self._plotdata[i] = np.transpose([positions, values])

        # box-whisker plots
        elif self._visualization_type[i] == "box-whisker":
            # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent
            positions = np.asarray(
                [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve],
                dtype=float,
            )
            values = [entry[1] for entry in curve]
            # can't use rectify for width if 0; 1 is a wild guess
            # todo: if plot ranges have been set, a better default value could
            #       be 10% of horizontal plot range
            w = 1 if not self._rectify else self._rectify
            widths = powf((positions + w / 2) * in_group_spacing) - powf(
                (positions - w / 2) * in_group_spacing
            )
            positions = powf(positions)
            self._plotdata[i] = (positions, values, widths)

        # shaded lines are drawn at the original, undisplaced positions
        elif self._visualization_type[i] == "shaded-line":
            positions = np.asarray([entry[0] for entry in curve], dtype=float)
            values = [entry[1] for entry in curve]
            self._plotdata[i] = (positions, values)

        else:
            raise BenchmarkError("internal error, unknown visualization type")
def __init__(
    self,
    select: Union[str, Sequence[str]] = "all",
    samplef: Callable[[Any], Any] = lambda arg: arg,
    stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
    elemental_preset: str = "magpie",
    ionic_fast: bool = False,
    valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
    valence_props: Sequence[str] = ("avg", "frac"),
    **kwargs,
):
    """Initialize state.

    Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
    IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
    with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
    is the p_list parameter of Stoichiometry. For further details on these, see
    https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

    Parameters:
        select: which feature sets to compute (by default, all). Specifying
            multiple sets (e.g., ('stoichiometry', 'elemental')) selects both.
            Valid choices:
            'all': all features
            'stoichiometry': norms of stoichiometric features
            'elemental': element properties
            'ionic': ion properties
            'valence': valence orbital shell features
        samplef: a function accepting and returning a sample. This enables
            transformation of samples, for example, to select an entry by key
            if sample is a dictionary, or to turn a dictionary into a vector.
            Default is to return the sample unchanged.
        stoichiometry_p_list: list of L_p norms to compute
        elemental_preset: matminer preset to use. Valid choices include:
            'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
        ionic_fast: if True, assumes that elements exist in single oxidation state
        valence_orbitals: which valence orbitals to consider
        valence_props: whether to return average properties, fractional, or both

    Raises:
        InvalidParameterError: for invalid parameters
        BenchmarkError: if the matminer or pymatgen packages are not available

    Requires the matminer package (see file documentation).
    """

    super().__init__(**kwargs)

    SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

    if isinstance(select, str):
        # tuple(str,) yields tuple of characters in str
        select = SELECT_SETS if select == "all" else (select,)
    select = params.tuple_(select, lambda arg: params.enumeration(arg, set(SELECT_SETS)))

    self._stoichiometry_p_list = params.tuple_(
        stoichiometry_p_list, lambda p: params.integer(p, from_=0)
    )
    self._elemental_preset = params.enumeration(
        elemental_preset, {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"}
    )
    self._ionic_fast = params.boolean(ionic_fast)
    self._valence_orbitals = params.tuple_(
        valence_orbitals, lambda arg: params.enumeration(arg, {"s", "p", "d", "f"})
    )
    self._valence_props = params.tuple_(
        valence_props, lambda arg: params.enumeration(arg, {"avg", "frac"})
    )

    # the sample function must be callable with a single argument
    self.samplef = params.callable(samplef, num_pos_or_kw=1)

    # set up matminer; imports are local so that the packages are only
    # required if this class is actually used
    try:
        import matminer
        import matminer.featurizers
        import matminer.featurizers.base
        import matminer.featurizers.composition
        import matminer.featurizers.conversions
        import pymatgen
        import pymatgen.core.composition  # do not rely on matminer importing it
    except ModuleNotFoundError as e:
        raise BenchmarkError(
            f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
        ) from e

    self._composition = pymatgen.core.composition.Composition

    # set up features, in the fixed order of SELECT_SETS
    features = []

    if "stoichiometry" in select:
        features.append(
            matminer.featurizers.composition.Stoichiometry(p_list=self._stoichiometry_p_list)
        )

    if "elemental" in select:
        features.append(
            matminer.featurizers.composition.ElementProperty.from_preset(
                self._elemental_preset
            )
        )

    if "ionic" in select:
        features.append(matminer.featurizers.composition.IonProperty(fast=self._ionic_fast))

    if "valence" in select:
        features.append(
            matminer.featurizers.composition.ValenceOrbital(
                orbitals=self._valence_orbitals, props=self._valence_props
            )
        )

    self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(features)
def __init__(
    self,
    data: "pandas.DataFrame",  # noqa F821
    labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
    dtype: Optional[dict] = None,
    join: Optional[str] = None,
    filterf: Optional[Callable[[Any], bool]] = None,
    samplef: Optional[Callable[[Any], Any]] = None,
    labelf: Optional[Callable[[Any], Any]] = None,
    **kwargs,
):
    """Initialize dataset.

    Parameters control loading and preprocessing of the data. Order:
    1. joining
    2. filtering
    3. sample and label transform

    Joining and filtering operate on the combined table of samples and
    labels; the table is split into samples and labels only afterwards.

    Parameters:
        data: the samples in the form of a Pandas DataFrame.
        labels: the labels, either in the form of a Pandas DataFrame with same number of rows
            as data and different column names, or in the form of a list of column names,
            which are then split out from the data and used as labels. If not specified,
            the dataset is unlabeled.
        dtype: the NumPy data types to use for samples and labels, in the form of a dictionary
            with column names as keys and dtypes as values. Can be used to override dtype
            auto-detection for some or all columns.
        join: if specified, name of "column" to join by; this changes labels
            to be sequences of single-entry labels
        filterf: a function that accepts a sample and returns whether to keep it
            (True) or exclude it (False). Default retains all samples
        samplef: function accepting and returning a sample; applied to all samples
            as post-processing
        labelf: function accepting and returning a label; applied to all labels
            as post-processing

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of data and labels
            must match. If column names are given, they must be unique across data and labels,
            if any. A label function without labels is rejected as well.
    """

    import pandas as pd  # only import if class is used

    # parameter validation
    data = params.instance(data, pd.DataFrame)
    labels = params.optional_(
        labels,
        lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
            lambda arg: params.tuple_(arg, params.string),
        ),
    )
    dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
    join = params.optional_(join, params.string)
    # filterf, samplef and labelf must all accept a single argument
    singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
    filterf = params.optional_(filterf, singleargf)
    samplef = params.optional_(samplef, singleargf)
    labelf = params.optional_(labelf, singleargf)
    # a label function makes no sense without labels to apply it to
    if labels is None and labelf:
        raise InvalidParameterError(
            "matching labels and label function",
            "label function specified for unlabeled data")

    # process data
    data = data.reset_index(drop=True)

    # if labels are given as separate DataFrame, join them
    if isinstance(labels, pd.DataFrame):
        if len(data) != len(labels):
            raise InvalidParameterError(
                "matching data and labels",
                f"different number of rows ({len(data)} != {len(labels)})",
            )

        # both indices are reset so that the column-wise concat below aligns rows by position
        labels = labels.reset_index(drop=True)

        col_names = np.hstack((data.columns, labels.columns))
        if len(col_names) != len(pd.unique(col_names)):
            raise InvalidParameterError(
                "unique column names",
                f"{data.columns.values} and {labels.columns.values}")

        data = pd.concat([data, labels], axis=1)
        # from here on, labels holds the names of the label columns
        labels = labels.columns.values

    # 1. optional joining
    if join:
        # rows sharing a value in the join column are merged;
        # each remaining column then holds the list of the merged values
        groups = data.groupby(join, sort=False, as_index=False)
        data = groups.aggregate(lambda tdf: tdf.tolist())

    # 2. optional filtering
    if filterf:
        # filterf is applied to each row, which at this point still includes any label columns
        selection = data.apply(filterf, axis=1)
        data = data[selection]

    # split data and labels
    if labels is not None:
        # DataFrame column indexing requires list, not tuple
        data, labels = data.drop(columns=list(labels)), data[list(labels)]

    # 3. optional sample and label transform
    if samplef:
        data = data.apply(samplef, axis=1, result_type="reduce")
        # if the transform yields a Series, wrap it into a one-column DataFrame
        if isinstance(data, pd.Series):
            data = pd.DataFrame(data, columns=["Samples"])
    if labelf:
        labels = labels.apply(labelf, axis=1, result_type="reduce")
        # if the transform yields a Series, wrap it into a one-column DataFrame
        if isinstance(labels, pd.Series):
            labels = pd.DataFrame(labels, columns=["Labels"])

    # convert to NumPy structured array
    data = self._to_numpy(data, dtype=dtype)
    labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

    super().__init__(data=data, labels=labels, **kwargs)
def __init__(
    self,
    rng: int = None,
    strategy: str = "best1bin",
    maxiter: int = 1000,
    popsize: int = 15,
    tol: float = 0.01,
    mutation=(0.5, 1),
    recombination: float = 0.7,
    **kwargs
):
    """Initialize state.

    Scipy-specific parameters are passed through.

    Parameters:
        rng: integer seed. Will be used to generate a new seed each time
            the optimizer is run.
        strategy: the differential evolution strategy to use. See
            documentation for complete list and explanations.
        maxiter: the maximum number of generations over which the entire
            population is evolved; at least 1
        popsize: a multiplier for setting the total population size; at least 1
        tol: relative tolerance for convergence; must be positive
        mutation: the mutation constant. Either a number between 0 and 2,
            or a tuple (min, max) with 0 <= min <= max <= 2, in which case
            the mutation constant is randomly selected uniformly from
            between min and max with each generation.
        recombination: the recombination constant. Must be between 0 and 1.

    Raises:
        InvalidParameterError: for invalid parameters
    """

    super().__init__(rng=rng, **kwargs)

    allowed_strategies = {
        "best1bin",
        "best1exp",
        "rand1exp",
        "randtobest1exp",
        "currenttobest1exp",
        "best2exp",
        "rand2exp",
        "randtobest1bin",
        "currenttobest1bin",
        "best2bin",
        "rand2bin",
        "rand1bin",
    }
    self._strategy = params.enumeration(strategy, allowed_strategies)
    self._maxiter = params.integer(maxiter, from_=1)
    self._popsize = params.integer(popsize, from_=1)
    self._tol = params.real(tol, above=0.0)

    def mutation_constant_f(arg, lower=0.0):
        return params.real(arg, from_=lower, to=2.0)

    def mutation_interval_f(pair):
        def upper_end_f(arg):
            # the upper end of the interval must not be below its lower end
            return mutation_constant_f(arg, lower=pair[0])

        return params.tuple_(pair, mutation_constant_f, upper_end_f, arity=2)

    self._mutation = params.any_(mutation, mutation_constant_f, mutation_interval_f)
    self._recombination = params.real(recombination, from_=0.0, to=1.0)
def test_tuple_():
    """Tests tuple_ meta test.

    Covers non-tuple input, a single test applied to all entries, one test
    per entry, the arity parameter, and filling up with a default value.
    """

    testf = lambda arg: params.none(arg)

    # special case: no tuple
    with pytest.raises(InvalidParameterError):
        params.tuple_(None, lambda arg: arg)

    # special case: single test
    assert params.tuple_((None,), testf) == (None,)
    with pytest.raises(InvalidParameterError):
        # exercise tuple_ itself here, not any_
        params.tuple_(("_",), testf)

    # special case: 2-tuple
    assert params.tuple_((None, None), testf, testf) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", None), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, "_"), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", "_"), testf, testf)

    # arity parameter
    # compare the value; a bare truthiness check passes for any non-empty tuple
    assert params.tuple_((None, None), testf, arity=2) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None), testf, arity=3)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None, None), testf, arity=2)

    # default parameter
    assert params.tuple_((None,), testf, arity=3, default=None) == (None, None, None)

    # no arity, no default
    assert params.tuple_((None, None, None), testf) == (None, None, None)