def __init__(
    self,
    dimensions: int,
    function: Optional[Callable[[np.ndarray], Sequence[L]]] = None,
    domain: Optional[Sequence[Tuple[float, float]]] = None,
    **kwargs,
):
    """Initialize vector space data.

    If no function is specified, data are unlabeled.
    If a domain is specified, samples must be within that domain.

    Parameters:
        dimensions: dimensionality of vector space; positive finite integer
        function: a function that accepts a real matrix (vectors are rows) and returns
            a corresponding sequence of labels. If not specified, data are unlabeled.
        domain: domain in the form of a hypercube, if specified; given as a sequence of
            intervals [a, b], where a <= b. If only a single interval is specified,
            it is used for all dimensions.

    Raises:
        InvalidParameterError for invalid arguments.
    """

    self._dimensions = params.integer(dimensions, above=0)
    self._function = params.optional_(
        function, lambda arg: params.callable(arg, num_pos_or_kw=1)
    )
    self._domain = params.optional_(
        domain, lambda arg: params.hypercube_domain(arg, self._dimensions)
    )

    super().__init__(**kwargs)
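# Illustrative sketch (not part of the class) of the domain broadcast rule the
# docstring describes: a single interval [a, b] is expanded to all dimensions.
# The helper name `_broadcast_domain` is hypothetical; the real validation is
# performed by params.hypercube_domain.
import numpy as np

def _broadcast_domain(domain, dimensions: int) -> np.ndarray:
    domain = np.asarray(domain, dtype=float)
    if domain.shape == (2,):  # single interval [a, b]
        domain = np.tile(domain, (dimensions, 1))
    assert domain.shape == (dimensions, 2) and np.all(domain[:, 0] <= domain[:, 1])
    return domain

# _broadcast_domain([0, 1], 3) -> [[0., 1.], [0., 1.], [0., 1.]]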
def __init__(
    self,
    rng: Optional[int] = None,
    num_seeds: int = 1,
    resolution: int = 64,
    max_relative_jump: float = 1.0,
    dimensions_varied: Union[str, float, int] = "all",
    max_iters: Optional[int] = None,
    max_evals: Optional[int] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        rng: pseudo-random number generator seed
        num_seeds: the number of starting points, and the number of points chosen
            at the end of each iteration
        resolution: the number of points to sample along a single dimension for a single seed
        max_relative_jump: the maximum relative step size along a single dimension.
            If a given dimension has length `L` and a seed has value `x` along that dimension,
            then the candidates are `resolution` linearly spaced points from the range
            [x - max_relative_jump * L, x + max_relative_jump * L] (clipped by the bounds).
            `max_relative_jump` must be in (0, 1]. For a value of 1, the entire range
            is always considered.
        dimensions_varied: how many randomly selected dimensions to explore with each step.
            'all' indicates all dimensions. An integer directly specifies the number of
            dimensions. A float in (0, 1) indicates the fraction of the total.
        max_iters: the maximum number of iterations
        max_evals: the maximum number of function evaluations (this is a soft maximum:
            once it is reached, the current iteration finishes)

    TODO: add tolerance stopping conditions
    """

    super().__init__(rng=rng, **kwargs)

    self._num_seeds = params.integer(num_seeds, from_=1)
    self._resolution = params.integer(resolution, from_=2)
    self._max_relative_jump = params.real(max_relative_jump, above=0.0, to=1.0)
    self._dimensions_varied = params.any_(
        dimensions_varied,
        lambda arg: params.integer(arg, above=0),
        lambda arg: params.real(arg, above=0.0, below=1.0),
        lambda arg: params.enumeration(arg, {"all"}),
    )
    self._max_iters = params.optional_(max_iters, lambda arg: params.integer(arg, from_=1))
    self._max_evals = params.optional_(max_evals, lambda arg: params.integer(arg, from_=1))
    if self._max_iters is None and self._max_evals is None:
        raise InvalidParameterError("at least one stopping condition defined", "all Nones")
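# A runnable sketch of the candidate rule described in the docstring: for a seed
# value x on a dimension with bounds [lo, hi] of length L, take `resolution`
# linearly spaced points over [x - max_relative_jump * L, x + max_relative_jump * L]
# and clip them to the bounds. The helper name is hypothetical.
import numpy as np

def _candidates(x: float, lo: float, hi: float, resolution: int, max_relative_jump: float) -> np.ndarray:
    length = hi - lo
    grid = np.linspace(x - max_relative_jump * length, x + max_relative_jump * length, resolution)
    return np.clip(grid, lo, hi)

# _candidates(0.9, 0.0, 1.0, resolution=5, max_relative_jump=0.5)
# -> [0.4, 0.65, 0.9, 1.0, 1.0]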
def __init__(
    self,
    optimizer_names: Optional[List[str]] = None,
    log_scale: bool = False,
    quantile_width: float = 0.5,
    show_extrema: bool = True,
    **kwargs,
):
    """Initialize optimization trajectory plot.

    Parameters:
        optimizer_names: names of the optimizers being compared, if given
        log_scale: whether to use a logarithmic scale for the function-evaluations axis
        quantile_width: width of the quantile band to shade, between 0 and 1
        show_extrema: whether to show the extremal trajectories
    """

    self._optimizer_names = params.optional_(
        optimizer_names, lambda arg: params.sequence(arg, type_=str)
    )
    self._show_extrema = params.boolean(show_extrema)
    log_scale = params.boolean(log_scale)
    scale = "log" if log_scale else "linear"
    self._quantile_width = params.real(quantile_width, from_=0, to=1)

    kwargs["axes_scales"] = kwargs.get("axes_scales", (scale, "linear"))
    kwargs["axes_labels"] = kwargs.get(
        "axes_labels", ("function evaluations", "best score", None, None)
    )
    kwargs["rectify"] = False
    kwargs["visualization_type"] = "shaded-line"

    super().__init__(**kwargs)
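# A sketch of how quantile_width could translate into shaded-band edges,
# assuming the band covers the central `quantile_width` probability mass of
# the per-evaluation score distribution (an assumption; this class only
# validates the value and passes it on):
import numpy as np

scores = np.random.default_rng(0).normal(size=100)  # one evaluation's scores across trials
w = 0.5
lower, upper = np.quantile(scores, [0.5 - w / 2, 0.5 + w / 2])  # 25th and 75th percentiles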
def __init__(self, data: np.ndarray, labels: Optional[np.ndarray] = None, **kwargs):
    """Initialize dataset.

    Parameters:
        data: tabular data as a NumPy ndarray
        labels: tabular data as a NumPy ndarray. If not specified, the dataset is unlabeled.

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of
        data and labels must match.

    Examples:
        From numerical NumPy data:
        ```
        TabularData(numpy.ndarray(...), ...)
        ```

        From a Pandas DataFrame:
        ```
        df = pandas.DataFrame(..., columns=[...])
        TabularData(df.to_records(index=False), labels=...)
        ```

        From mixed NumPy data, with column names (note use of tuples):
        ```
        a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', 'U1'), ('D', int)])
        TabularData(a, ...)
        ```
    """

    # parameter validation
    data = params.instance(data, np.ndarray)
    labels = params.optional_(labels, lambda arg: params.instance(arg, np.ndarray))

    if labels is not None:
        # number of samples and labels must match
        if data.shape[0] != labels.shape[0]:
            raise InvalidParameterError(
                "same number of samples and labels",
                f"{data.shape[0]} samples, {labels.shape[0]} labels",
            )

        # uniqueness of "column" names, if any, is enforced by NumPy,
        # but only separately for data and labels
        if is_sequence(data.dtype.names) and is_sequence(labels.dtype.names):
            column_names = data.dtype.names + labels.dtype.names
            if len(column_names) != len(np.unique(column_names)):
                raise InvalidParameterError(
                    "unique column names for samples and labels", column_names
                )

    self._data, self._labels = data, labels

    super().__init__(**kwargs)
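# Runnable illustration of the column-name collision the validation above
# rejects; names are only compared when both data and labels are structured
# (record) arrays:
import numpy as np

data = np.array([("a", 1), ("b", 2)], dtype=[("C", "U1"), ("D", int)])
labels = np.array([(0.1,), (0.2,)], dtype=[("D", float)])  # "D" collides with data
# TabularData(data, labels=labels)  # would raise InvalidParameterError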
def __init__(
    self,
    labels_to_load: Optional[Union[str, List[str]]] = None,
    ignore_dubious: bool = False,
):
    """Initialize Ni-superalloy dataset with specified labels.

    Parameters:
        labels_to_load (str or List[str]): which labels to load. Options are
            'Yield Strength', 'Ultimate Tensile Strength', 'Stress Rupture Time',
            'Stress Rupture Stress', and 'Elongation'. If None, all labels are loaded.
        ignore_dubious: whether to ignore samples that are in some way questionable
    """

    labels_to_load = params.optional_(
        labels_to_load,
        lambda arg: params.any_(
            arg,
            params.string,
            lambda arg: params.sequence(arg, type_=str),
        ),
    )
    ignore_dubious = params.boolean(ignore_dubious)

    filepath = self.DEFAULT_PATH
    data, labels = self._load_data_and_labels(filepath, labels_to_load, ignore_dubious)
    super().__init__(data=data, labels=labels)
def _indices_testf(self, indices: Sequence[Any]):
    return params.optional_(
        indices,
        lambda arg: list(  # NumPy indexing expects a list
            params.any_(
                arg,
                lambda arg: params.tuple_(arg, None, arity=0),  # empty set
                lambda arg: params.tuple_(
                    arg,
                    lambda arg: params.integer(arg, from_=0, below=self.num_samples),
                ),
            )
        ),
    )
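# Why the validated indices are wrapped in list(...): NumPy "fancy" indexing
# with a list selects whole rows, whereas a tuple is interpreted as a
# multi-dimensional index.
import numpy as np

a = np.arange(12).reshape(4, 3)
a[[0, 2]]  # rows 0 and 2, shape (2, 3)
a[(0, 2)]  # the single element a[0, 2]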
def test_optional_():
    """Test optional_ meta test."""

    # only testf and None are valid
    assert params.optional_(None, params.integer) is None
    assert params.optional_(1, params.integer) == 1
    with pytest.raises(InvalidParameterError):
        params.optional_("x", params.integer)
    with pytest.raises(InvalidParameterError):
        params.optional_(1, lambda arg: params.integer(arg, above=1))

    # default value
    assert params.optional_(1, params.integer, default=2) == 1
    assert params.optional_(None, params.integer, default=2) == 2
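# For reference, the contract exercised by this test can be summarized by the
# following minimal stand-in for optional_ (a sketch, not the actual implementation):
def _optional_sketch(arg, testf, default=None):
    if arg is None:
        return default
    return testf(arg)  # testf raises InvalidParameterError on failure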
def __init__(self, size, domain: Optional[Any] = None, rng=None, **kwargs):
    """Initialize sampler.

    Parameters:
        size: number of vector samples to draw
        domain: (sub)domain to sample from; default is to use the data's domain
            if available, or the unit hypercube otherwise
        rng: pseudo-random number generator used

    Applying the sampler yields an IndexedFiniteData of vectors.
    """

    super().__init__(rng=rng, **kwargs)

    self._size = params.integer(size, from_=0)  # no upper bound on number of vectors to draw
    self._domain = params.optional_(domain, lambda arg: params.hypercube_domain(arg))
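# Illustrative sketch of drawing `size` vectors uniformly from a hypercube
# domain given as a sequence of [a, b] intervals; the actual sampling logic
# lives elsewhere in the class and may differ.
import numpy as np

def _draw_uniform(size: int, domain, rng=None) -> np.ndarray:
    domain = np.asarray(domain, dtype=float)  # shape (d, 2)
    gen = np.random.default_rng(rng)
    return gen.uniform(domain[:, 0], domain[:, 1], size=(size, len(domain)))

# _draw_uniform(3, [(0, 1), (-1, 1)], rng=42) -> three vectors in [0,1] x [-1,1]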
def __init__(
    self,
    data: VectorSpaceData,
    model: Learner,
    scorer: Scorer,
    optimizers: Sequence[Optimizer],
    evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
    num_trials: int = 1,
    training_data: Optional[Data] = None,
):
    self._data = params.instance(data, VectorSpaceData)
    self._scorer = params.instance(scorer, Scorer)
    self._model = params.instance(model, Learner)
    self._optimizers = params.sequence(optimizers, type_=Optimizer)
    self._evaluations = params.tuple_(
        evaluations, lambda arg: params.instance(arg, Evaluation)
    )
    self._num_trials = params.integer(num_trials, from_=1)
    self._training_data = params.optional_(
        training_data, lambda arg: params.instance(arg, Data)
    )
def __init__(
    self,
    data: Data,
    training: Sequence[Sampler],
    validation: Sampler,
    learners: Sequence[SupervisedLearner],
    features: DataValuedTransformation = IdentityFeatures(),
    metric: ScalarEvaluationMetric = RootMeanSquaredError(),
    evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
    progressf: Optional[Callable[[int, int], None]] = None,
):
    """Initialize workflow.

    Parameters:
        data: labeled data
        training: sequence of Samplers, one for each training set size
        validation: Sampler for the validation set
        learners: sequence of supervised regression algorithms
        features: any data-valued transformation
        metric: evaluation metric to use; root mean squared error by default
        evaluations: one or more evaluations; default is a learning curve plot
        progressf: callable with two parameters, the number of iterations done
            and the total number of iterations
    """

    self._data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True)
    if not self._data.is_labeled:
        raise InvalidParameterError("labeled data", "unlabeled data")
    self._training = params.sequence(training, type_=Sampler)
    self._validation = params.instance(validation, Sampler)
    self._learners = params.sequence(learners, type_=SupervisedLearner)
    self._features = params.instance(features, Features)
    self._metric = params.instance(metric, ScalarEvaluationMetric)
    self._evaluations = params.tuple_(
        evaluations, lambda arg: params.instance(arg, Evaluation)
    )
    self._progressf = params.optional_(
        progressf, lambda arg: params.callable(arg, num_pos_or_kw=2)
    )
    if self._progressf is None:
        self._progressf = lambda *args: None
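# Example of a progress callback matching the expected signature (two
# positional parameters: iterations done and total number of iterations):
def print_progress(done: int, total: int) -> None:
    print(f"{done}/{total} iterations completed")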
def best_score_trajectory(
    self, maximize: bool = True, length: Optional[int] = None
) -> Sequence[float]:
    """Calculate the best score found so far as a function of the number of function evaluations.

    Parameters:
        maximize: whether the goal is to maximize (true) or minimize (false) the score
        length: total length of the result. If larger than the actual number of function
            evaluations, the result is padded with the best value. If smaller, the result
            is truncated. If None, the result is returned as-is.

    Returns:
        A sequence of floats, each one corresponding to the best score found at that
        point in the optimization trajectory.
    """

    maximize = params.boolean(maximize)
    length = params.optional_(length, lambda arg: params.integer(arg, from_=1))

    best_score = np.empty(self.num_evaluations)
    idx = 0
    best_score_so_far = self.steps[0].scores[0]
    direction = 1.0 if maximize else -1.0

    for optimization_iter in self.steps:
        for eval_ in optimization_iter.scores:
            if eval_ * direction > best_score_so_far * direction:
                best_score_so_far = eval_
            # scores are multiplied by direction so the trajectory is non-decreasing
            best_score[idx] = best_score_so_far * direction
            idx += 1

    if length is not None:
        extra_padding = length - len(best_score)
        if extra_padding < 0:
            return best_score[:extra_padding]  # TODO: raise a warning?
        return np.pad(best_score, ((0, extra_padding),), mode="edge")
    return best_score
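# The length handling above, in isolation: padding repeats the last (best)
# value via np.pad's "edge" mode, and a negative extra_padding truncates.
import numpy as np

best = np.array([1.0, 3.0, 3.0, 4.0])
np.pad(best, ((0, 2),), mode="edge")  # [1., 3., 3., 4., 4., 4.]
best[:3 - len(best)]                  # [1., 3., 3.] for length = 3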
def __init__(
    self,
    select: Optional[Sequence[str]] = None,
    failmode="raise",
    samplef: Callable[[Any], Any] = lambda arg: arg,
    java_gateway: Optional[CdkJavaGateway] = None,
    **kwargs,
):
    """Initialize state.

    Parameters:
        select: which features to compute (by default, all). List of names, order matters.
            Presets are available as class constants:
            PRESET_ALL: all features
            PRESET_ROBUST: a subset of descriptors that are fast to compute and do not
                fail often (tested on QM9 and CEP datasets; see accompanying notebook)
        failmode: how to handle failed descriptor calculations, either due to rejected
            SMILES encodings or failing descriptor code. Possible values:
            "raise" [default]: raise an exception
            "drop": drop the sample. Returned Data will have fewer samples
            ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries
                will be set to False for failures
            ("index", index): where `index` is an empty list to which the indices of
                failed entries will be appended
        samplef: a function accepting and returning a sample. This enables transformation
            of samples, for example, to select an entry by key if the sample is a
            dictionary, or to turn a dictionary into a vector.
            Default is to return the sample unchanged.
        java_gateway: a gateway to a Java virtual machine. Requires a CDK jar.
    """

    super().__init__(**kwargs)

    # parameters
    select = params.optional_(
        select,
        lambda arg: params.tuple_(
            arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
        ),
    )
    select = self.PRESET_ALL if select is None else select
    self._failmode = DataTransformationFailureMode.failmode(failmode)
    self._samplef = params.callable(samplef, num_pos_or_kw=1)
    self._java_gateway = params.optional_(
        java_gateway, lambda arg: params.instance(arg, JavaGateway)
    )
    if self._java_gateway is None:
        self._java_gateway = CdkJavaGateway()
    self._java_gateway = self._java_gateway.gateway

    # set up descriptors
    self._descriptors = tuple(
        eval("self._java_gateway.jvm." + self.DESCRIPTORS[name][0] + "()")
        for name in select
    )
    builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
    for descriptor in self._descriptors:
        descriptor.initialise(builder)

    self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
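# A safer, functionally equivalent alternative to the eval(...) call above,
# resolving the dotted JVM class path with getattr instead (the helper name
# is hypothetical):
from functools import reduce

def _resolve_jvm_class(jvm, dotted_name: str):
    return reduce(getattr, dotted_name.split("."), jvm)

# descriptor = _resolve_jvm_class(self._java_gateway.jvm, self.DESCRIPTORS[name][0])()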
def __init__(
    self,
    data: "pandas.DataFrame",  # noqa: F821
    labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
    dtype: Optional[dict] = None,
    join: Optional[str] = None,
    filterf: Optional[Callable[[Any], bool]] = None,
    samplef: Optional[Callable[[Any], Any]] = None,
    labelf: Optional[Callable[[Any], Any]] = None,
    **kwargs,
):
    """Initialize dataset.

    Parameters control loading and preprocessing of the data. Order:
    1. joining
    2. filtering
    3. sample and label transform

    Parameters:
        data: the samples in the form of a Pandas DataFrame
        labels: the labels, either in the form of a Pandas DataFrame with the same
            number of rows as data and different column names, or in the form of a
            list of column names, which are then split out from the data and used
            as labels. If not specified, the dataset is unlabeled.
        dtype: the NumPy data types to use for samples and labels, in the form of a
            dictionary with column names as keys and dtypes as values. Can be used
            to override dtype auto-detection for some or all columns.
        join: if specified, name of "column" to join by; this changes labels to be
            sequences of single-entry labels
        filterf: a function that accepts a sample and returns whether to keep it
            (True) or exclude it (False). Default retains all samples.
        samplef: function accepting and returning a sample; applied to all samples
            as post-processing
        labelf: function accepting and returning a label; applied to all labels
            as post-processing

    Raises:
        InvalidParameterError for invalid arguments. In particular, numbers of data
        and labels must match. If column names are given, they must be unique across
        data and labels, if any.
    """

    import pandas as pd  # only import if class is used

    # parameter validation
    data = params.instance(data, pd.DataFrame)
    labels = params.optional_(
        labels,
        lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
            lambda arg: params.tuple_(arg, params.string),
        ),
    )
    dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
    join = params.optional_(join, params.string)
    singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
    filterf = params.optional_(filterf, singleargf)
    samplef = params.optional_(samplef, singleargf)
    labelf = params.optional_(labelf, singleargf)

    if labels is None and labelf:
        raise InvalidParameterError(
            "matching labels and label function",
            "label function specified for unlabeled data",
        )

    # process data
    data = data.reset_index(drop=True)

    # if labels are given as a separate DataFrame, join them
    if isinstance(labels, pd.DataFrame):
        if len(data) != len(labels):
            raise InvalidParameterError(
                "matching data and labels",
                f"different number of rows ({len(data)} != {len(labels)})",
            )

        labels = labels.reset_index(drop=True)

        col_names = np.hstack((data.columns, labels.columns))
        if len(col_names) != len(pd.unique(col_names)):
            raise InvalidParameterError(
                "unique column names",
                f"{data.columns.values} and {labels.columns.values}",
            )

        data = pd.concat([data, labels], axis=1)
        labels = labels.columns.values

    # 1. optional joining
    if join:
        groups = data.groupby(join, sort=False, as_index=False)
        data = groups.aggregate(lambda tdf: tdf.tolist())

    # 2. optional filtering
    if filterf:
        selection = data.apply(filterf, axis=1)
        data = data[selection]

    # split data and labels
    if labels is not None:
        # DataFrame column indexing requires a list, not a tuple
        data, labels = data.drop(columns=list(labels)), data[list(labels)]

    # 3. optional sample and label transform
    if samplef:
        data = data.apply(samplef, axis=1, result_type="reduce")
        if isinstance(data, pd.Series):
            data = pd.DataFrame(data, columns=["Samples"])
    if labelf:
        labels = labels.apply(labelf, axis=1, result_type="reduce")
        if isinstance(labels, pd.Series):
            labels = pd.DataFrame(labels, columns=["Labels"])

    # convert to NumPy structured array
    data = self._to_numpy(data, dtype=dtype)
    labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

    super().__init__(data=data, labels=labels, **kwargs)
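# Sketch of what the optional join step above does: rows sharing the join key
# are grouped, and every other column becomes a list of per-row values.
import pandas as pd

df = pd.DataFrame({"id": ["a", "a", "b"], "y": [1.0, 2.0, 3.0]})
df.groupby("id", sort=False, as_index=False).aggregate(lambda tdf: tdf.tolist())
#   id           y
# 0  a  [1.0, 2.0]
# 1  b       [3.0]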