Example #1
    def __init__(self,
                 dimensions: int,
                 function: Optional[Callable[[np.ndarray],
                                             Sequence[L]]] = None,
                 domain: Optional[Sequence[Tuple[float, float]]] = None,
                 **kwargs):
        """Initialize vector space data.

        If no function is specified, data are unlabeled.
        If a domain is specified, samples must be within that domain.

        Parameters:
            dimensions: dimensionality of vector space; positive finite integer
            function: a function that accepts a real matrix (vectors are rows)
                and returns a corresponding sequence of labels.
                If not specified, data are unlabeled.
            domain: domain in the form of a hypercube, if specified;
                given as a sequence of intervals [a,b], where a <= b.
                If only a single interval is specified it is used for all dimensions.

        Raises:
            InvalidParameterError for invalid arguments.
        """

        self._dimensions = params.integer(dimensions, above=0)
        self._function = params.optional_(
            function, lambda arg: params.callable(arg, num_pos_or_kw=1))
        self._domain = params.optional_(
            domain, lambda arg: params.hypercube_domain(arg, self._dimensions))

        super().__init__(**kwargs)
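The docstring's single-interval broadcast is easy to sketch in plain NumPy. The helper below is a hypothetical stand-in for `params.hypercube_domain`, with behavior inferred from the docstring, not the library's implementation:

```
import numpy as np

# Hypothetical stand-in for params.hypercube_domain (behavior inferred from
# the docstring): broadcast a single interval to all dimensions and check
# a <= b for every interval.
def hypercube_domain(domain, dimensions: int) -> np.ndarray:
    domain = np.asarray(domain, dtype=float)
    if domain.shape == (1, 2):  # single interval: use it for all dimensions
        domain = np.tile(domain, (dimensions, 1))
    if domain.shape != (dimensions, 2) or np.any(domain[:, 0] > domain[:, 1]):
        raise ValueError(f"invalid hypercube domain for {dimensions} dimensions")
    return domain

print(hypercube_domain([(0.0, 1.0)], dimensions=3))  # three [0, 1] intervals
```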
Example #2
    def __init__(
        self,
        rng: Optional[int] = None,
        num_seeds: int = 1,
        resolution: int = 64,
        max_relative_jump: float = 1.0,
        dimensions_varied: Union[str, float, int] = "all",
        max_iters: Optional[int] = None,
        max_evals: Optional[int] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            rng: pseudo-random number generator seed
            num_seeds: the number of starting points, and the number of points chosen at the end
                of each iteration
            resolution: the number of points to sample along a single dimension for a single seed
            max_relative_jump: the maximum relative step size along a single dimension. If a given
                dimension has length `L` and a seed has value `x` along that dimension, then the
                candidates are `resolution` linearly spaced points from the range
                [x - max_relative_jump * L, x + max_relative_jump * L] (clipped by the bounds).
                `max_relative_jump` must be in (0, 1].
                For a value of 1, the entire range is always considered.
            dimensions_varied: how many randomly selected dimensions to explore with each step.
                'all' indicates all dimensions. An integer directly specifies the number of
                dimensions. A float in (0, 1) specifies a fraction of the total number of dimensions.
            max_iters: the maximum number of iterations
            max_evals: the maximum number of function evaluations (a soft maximum:
                once it is reached, the current iteration still runs to completion)

        TODO: add tolerance stopping conditions
        """
        super().__init__(rng=rng, **kwargs)

        self._num_seeds = params.integer(num_seeds, from_=1)
        self._resolution = params.integer(resolution, from_=2)
        self._max_relative_jump = params.real(max_relative_jump,
                                              above=0.0,
                                              to=1.0)
        self._dimensions_varied = params.any_(
            dimensions_varied,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, below=1.0),
            lambda arg: params.enumeration(arg, {"all"}),
        )
        self._max_iters = params.optional_(
            max_iters, lambda arg: params.integer(arg, from_=1))
        self._max_evals = params.optional_(
            max_evals, lambda arg: params.integer(arg, from_=1))
        if self._max_iters is None and self._max_evals is None:
            raise InvalidParameterError(
                "at least one stopping condition defined", "all Nones")
Example #3
    def __init__(
        self,
        optimizer_names: Optional[List[str]] = None,
        log_scale: bool = False,
        quantile_width: float = 0.5,
        show_extrema: bool = True,
        **kwargs,
    ):
        self._optimizer_names = params.optional_(
            optimizer_names, lambda arg: params.sequence(arg, type_=str)
        )
        self._show_extrema = params.boolean(show_extrema)
        log_scale = params.boolean(log_scale)
        scale = "log" if log_scale else "linear"

        self._quantile_width = params.real(quantile_width, from_=0, to=1)

        kwargs["axes_scales"] = kwargs.get("axes_scales", (scale, "linear"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("function evaluations", "best score", None, None)
        )
        kwargs["rectify"] = False
        kwargs["visualization_type"] = "shaded-line"

        super().__init__(**kwargs)
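The `kwargs.get(key, default)` calls above implement overridable defaults: callers may supply their own axes settings, while `rectify` and `visualization_type` are forced unconditionally. A minimal illustration of the idiom (`dict.setdefault` is the equivalent one-step form):

```
# Caller-supplied kwargs win for "axes_scales"; "rectify" is overwritten
# unconditionally, mirroring the __init__ above.
kwargs = {"axes_scales": ("linear", "linear")}
kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "linear"))
kwargs.setdefault("axes_labels", ("function evaluations", "best score", None, None))
kwargs["rectify"] = False
print(kwargs["axes_scales"])  # ('linear', 'linear'): the caller's value is kept
```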
Example #4
    def __init__(self,
                 data: np.ndarray,
                 labels: Optional[np.ndarray] = None,
                 **kwargs):
        """Initialize dataset.

        Parameters:
            data: tabular data as a NumPy ndarray
            labels: corresponding labels as a NumPy ndarray. If not specified,
                the dataset is unlabeled.

        Raises:
            InvalidParameterError for invalid arguments. In particular,
                the numbers of samples and labels must match.

        Examples:
            From numerical NumPy data:
            ```
            TabularData(numpy.ndarray(...), ...)
            ```

            From a Pandas DataFrame:
            ```
            df = pandas.DataFrame(..., columns=[...])
            TabularData(df.to_records(index=False), labels=...)
            ```

            From mixed NumPy data, with column names (note use of tuples):
            ```
            a = numpy.array([('a', 1), ('b', 2)], dtype=[('C', str), ('D', int)])
            TabularData(a, ...)
            ```
        """

        # parameter validation
        data = params.instance(data, np.ndarray)
        labels = params.optional_(labels,
                                  lambda arg: params.instance(arg, np.ndarray))

        if labels is not None:
            # number of samples and labels must match
            if data.shape[0] != labels.shape[0]:
                raise InvalidParameterError(
                    "same number of samples and labels",
                    f"{data.shape[0]} samples, {labels.shape[0]} labels",
                )

            # uniqueness of "column" names, if any, is enforced by NumPy,
            # but only separately for data and labels
            if is_sequence(data.dtype.names) and is_sequence(
                    labels.dtype.names):
                column_names = data.dtype.names + labels.dtype.names
                if len(column_names) != len(np.unique(column_names)):
                    raise InvalidParameterError(
                        "unique column names for samples and labels",
                        column_names)

        self._data, self._labels = data, labels

        super().__init__(**kwargs)
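NumPy enforces unique field names within a single structured array, but not across two arrays, which is why the explicit check above is needed. A toy demonstration:

```
import numpy as np

# "A" appears both as a data column and as a label column; each array is
# valid on its own, so only the concatenated-names check catches the clash.
data = np.array([(1.0, 2.0)], dtype=[("A", float), ("B", float)])
labels = np.array([(3.0,)], dtype=[("A", float)])

column_names = data.dtype.names + labels.dtype.names  # ('A', 'B', 'A')
print(len(column_names) != len(np.unique(column_names)))  # True: duplicate
```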
Example #5
    def __init__(self,
                 labels_to_load: Optional[Union[str, List[str]]] = None,
                 ignore_dubious: bool = False):
        """Initialize Ni-superalloy dataset with specified labels.

        Parameters:
            labels_to_load (str or List[str]): which labels to load. Options are
                'Yield Strength', 'Ultimate Tensile Strength', 'Stress Rupture Time',
                'Stress Rupture Stress', and 'Elongation'.
                If None, then all labels are loaded.
            ignore_dubious: whether to ignore samples that have been flagged
                as questionable

        """

        labels_to_load = params.optional_(
            labels_to_load,
            lambda arg: params.any_(
                arg,
                params.string,
                lambda arg: params.sequence(arg, type_=str),
            ),
        )
        ignore_dubious = params.boolean(ignore_dubious)

        filepath = self.DEFAULT_PATH
        data, labels = self._load_data_and_labels(filepath, labels_to_load,
                                                  ignore_dubious)
        super().__init__(data=data, labels=labels)
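Since `labels_to_load` may be a single string or a sequence of strings, a typical next step is to normalize the single-string case to a one-element list so the loader handles only one shape; the helper below is a hypothetical sketch, not part of the dataset class:

```
from typing import List, Optional, Union

# Hypothetical normalization helper (not from the library).
def normalize_labels(labels_to_load: Optional[Union[str, List[str]]]) -> Optional[List[str]]:
    if labels_to_load is None:
        return None  # None means "load all labels"
    if isinstance(labels_to_load, str):
        return [labels_to_load]
    return list(labels_to_load)

print(normalize_labels("Yield Strength"))  # ['Yield Strength']
```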
Example #6
    def _indices_testf(self, indices: Sequence[Any]):
        return params.optional_(
            indices,
            lambda arg: list(
                params.any_(  # NumPy indexing expects a list
                    arg,
                    lambda arg: params.tuple_(arg, None, arity=0),  # empty set
                    lambda arg: params.tuple_(
                        arg, lambda arg: params.integer(
                            arg, from_=0, below=self.num_samples)),
                )),
        )
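The `list(...)` wrapper and its comment matter because NumPy treats the two container types differently: a list selects elements (fancy indexing), while a tuple is read as one index per dimension:

```
import numpy as np

a = np.arange(5) * 10
print(a[[0, 2]])  # fancy indexing with a list: [ 0 20]
try:
    a[(0, 2)]  # a tuple is interpreted as a 2-D index into this 1-D array
except IndexError as exc:
    print("tuple fails:", exc)
```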
Example #7
def test_optional_():
    """Test optional_ meta test."""

    # only testf and None are valid
    assert params.optional_(None, params.integer) is None
    assert params.optional_(1, params.integer) == 1
    with pytest.raises(InvalidParameterError):
        params.optional_("x", params.integer)
    with pytest.raises(InvalidParameterError):
        params.optional_(1, lambda arg: params.integer(arg, above=1))

    # default value
    assert params.optional_(1, params.integer, default=2) == 1
    assert params.optional_(None, params.integer, default=2) == 2
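These tests pin down the contract of `params.optional_` precisely; a minimal reimplementation consistent with them could look as follows (a sketch for illustration, not the library's code):

```
# Minimal sketch of the optional_ contract as pinned down by the tests above:
# None passes through (or is replaced by `default`); any other value must be
# accepted by testf, whose return value is used.
def optional_(arg, testf, default=None):
    if arg is None:
        return default
    return testf(arg)  # the real testf raises InvalidParameterError on failure

assert optional_(None, int) is None
assert optional_(1, int) == 1
assert optional_(None, int, default=2) == 2
```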
Example #8
    def __init__(self, size, domain: Optional[Any] = None, rng=None, **kwargs):
        """Initialize sampler.

        Parameters:
            size: number of vector samples to draw
            domain: (sub)domain to sample from; default is to use the data's domain
                if available, or the unit hypercube otherwise
            rng: pseudo-random number generator used

        Returns:
            IndexedFiniteData of vectors
        """

        super().__init__(rng=rng, **kwargs)

        self._size = params.integer(size, from_=0)  # no upper bound on number of vectors to draw
        self._domain = params.optional_(domain, lambda arg: params.hypercube_domain(arg))
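What such a sampler plausibly does with a validated hypercube domain can be sketched with NumPy's generator API; this is an assumed illustration of uniform sampling, not the class's implementation:

```
import numpy as np

# Assumed illustration: draw `size` vectors uniformly from per-dimension
# intervals; Generator.uniform broadcasts the per-column bounds.
domain = np.asarray([(0.0, 1.0), (-2.0, 2.0)])  # two dimensions
rng = np.random.default_rng(42)
samples = rng.uniform(domain[:, 0], domain[:, 1], size=(5, len(domain)))
print(samples.shape)  # (5, 2); column 0 in [0, 1], column 1 in [-2, 2]
```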
Example #9
    def __init__(
        self,
        data: VectorSpaceData,
        model: Learner,
        scorer: Scorer,
        optimizers: Sequence[Optimizer],
        evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
        num_trials: int = 1,
        training_data: Optional[Data] = None,
    ):
        self._data = params.instance(data, VectorSpaceData)
        self._scorer = params.instance(scorer, Scorer)
        self._model = params.instance(model, Learner)
        self._optimizers = params.sequence(optimizers, type_=Optimizer)
        self._evaluations = params.tuple_(
            evaluations, lambda arg: params.instance(arg, Evaluation)
        )
        self._num_trials = params.integer(num_trials, from_=1)
        self._training_data = params.optional_(
            training_data, lambda arg: params.instance(arg, Data)
        )
Example #10
    def __init__(
        self,
        data: Data,
        training: Sequence[Sampler],
        validation: Sampler,
        learners: Sequence[SupervisedLearner],
        features: DataValuedTransformation = IdentityFeatures(),
        metric: ScalarEvaluationMetric = RootMeanSquaredError(),
        evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
        progressf: Optional[Callable[[int, int], None]] = None,
    ):
        """Initialize workflow.

        Parameters:
            data: labeled data
            training: sequence of Samplers, one for each training set size
            validation: Sampler for validation set
            learners: sequence of supervised regression algorithms
            features: any data-valued transformation
            metric: evaluation metric to use; root mean squared error by default
            evaluations: one or more evaluations; default is a learning curve plot
            progressf: callable with two parameters, done iterations and total number of iterations
        """

        self._data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True)
        if not self._data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        self._training = params.sequence(training, type_=Sampler)
        self._validation = params.instance(validation, Sampler)
        self._learners = params.sequence(learners, type_=SupervisedLearner)
        self._features = params.instance(features, Features)
        self._metric = params.instance(metric, ScalarEvaluationMetric)
        self._evaluations = params.tuple_(
            evaluations, lambda arg: params.instance(arg, Evaluation)
        )
        self._progressf = params.optional_(
            progressf, lambda arg: params.callable(arg, num_pos_or_kw=2)
        )
        if self._progressf is None:
            self._progressf = lambda *args: None
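The `progressf` contract (two positional parameters: iterations done and total iterations) admits a one-line callback; when none is given, the no-op lambda at the end is substituted:

```
# A minimal progress callback matching the documented progressf signature.
def print_progress(done: int, total: int) -> None:
    print(f"finished {done} of {total} iterations")

print_progress(3, 10)  # finished 3 of 10 iterations
```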
Example #11
    def best_score_trajectory(
        self, maximize: bool = True, length: Optional[int] = None
    ) -> Sequence[float]:
        """Calculate the best score found so far as a function of number of function evaluations.

        Parameters:
            maximize: whether the goal is to maximize (true) or minimize (false) the score
            length: total length of the result. If larger than the actual number of function
                evaluations, the result will be padded with the best value. If smaller than the
                actual number of evaluations, the result will be truncated.
                If None, the result is returned as-is.

        Returns:
            A sequence of floats, each one corresponding to the best score found at that point
            in the optimization trajectory.
        """
        maximize = params.boolean(maximize)
        length = params.optional_(length, lambda arg: params.integer(arg, from_=1))

        best_score = np.empty(self.num_evaluations)
        idx = 0
        best_score_so_far = self.steps[0].scores[0]
        direction = 1.0 if maximize else -1.0

        for optimization_iter in self.steps:
            for eval_ in optimization_iter.scores:
                if eval_ * direction > best_score_so_far * direction:
                    best_score_so_far = eval_
                best_score[idx] = best_score_so_far * direction
                idx += 1

        if length is not None:
            extra_padding = length - len(best_score)
            if extra_padding < 0:
                return best_score[:extra_padding]  # TODO: Raise a warning?
            return np.pad(best_score, ((0, extra_padding),), mode="edge")
        else:
            return best_score
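The nested loop is a running best over the flattened score sequence; for the maximization case, the same trajectory, including the edge padding, can be expressed with NumPy primitives (a sketch, not the method itself):

```
import numpy as np

# Running best (maximization) over a flat score sequence, padded to length 8
# with the last best value, mirroring np.pad(..., mode="edge") above.
scores = np.array([0.3, 0.1, 0.7, 0.5, 0.9])
best = np.maximum.accumulate(scores)  # [0.3 0.3 0.7 0.7 0.9]
padded = np.pad(best, (0, 8 - len(best)), mode="edge")
print(padded)  # [0.3 0.3 0.7 0.7 0.9 0.9 0.9 0.9]
```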
Example #12
    def __init__(
        self,
        select: Optional[Sequence[str]] = None,
        failmode="raise",
        samplef: Callable[[Any], Any] = lambda arg: arg,
        java_gateway: Optional[CdkJavaGateway] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            select: which features to compute (by default, all). List of names, order matters.
                Presets are available as class constants:
                PRESET_ALL: all features
                PRESET_ROBUST: a subset of descriptors that are fast to compute and do not fail
                    often (tested on QM9 and CEP datasets; see accompanying notebook)
            failmode: how to handle failed descriptor calculations, either due to rejected SMILES
                encodings or failing descriptor code. Possible values:
                "raise" [default]: raise a Benchmarexception
                "drop": drop the sample. Returned Data will have fewer samples
                ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries will
                    be set to False for failures
                ("index", index): where `index` is an empty list to which the indices of failed
                    entries will be appended
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            java_gateway: a gateway to a Java virtual machine

        Requires a CDK jar.
        """

        super().__init__(**kwargs)

        # parameters
        select = params.optional_(
            select,
            lambda arg: params.tuple_(
                arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
            ),
        )
        select = self.PRESET_ALL if select is None else select
        self._failmode = DataTransformationFailureMode.failmode(failmode)
        self._samplef = params.callable(samplef, num_pos_or_kw=1)
        self._java_gateway = params.optional_(
            java_gateway, lambda arg: params.instance(arg, JavaGateway)
        )
        if self._java_gateway is None:
            self._java_gateway = CdkJavaGateway()
        self._java_gateway = self._java_gateway.gateway

        # set up descriptors
        self._descriptors = tuple(
            eval("self._java_gateway.jvm." + self.DESCRIPTORS[name][0] + "()") for name in select
        )

        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        for descriptor in self._descriptors:
            descriptor.initialise(builder)

        self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
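The `("mask", mask)` and `("index", index)` failure modes described in the docstring rely on caller-provided containers that are filled in as failures occur; the snippet below is a toy illustration of that convention, not the library's code:

```
import numpy as np

# Toy illustration: mark failed samples in a caller-supplied boolean mask
# and append their indices to a caller-supplied list.
samples = ["CCO", "not-a-smiles", "C", "also bad"]
mask = np.ones(len(samples), dtype=bool)
index = []
for i, s in enumerate(samples):
    failed = " " in s or s.startswith("not")  # stand-in for a real failure test
    if failed:
        mask[i] = False
        index.append(i)
print(mask, index)  # [ True False  True False] [1, 3]
```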
Example #13
    def __init__(
        self,
        data: "pandas.DataFrame",  # noqa F821
        labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
        dtype: Optional[dict] = None,
        join: Optional[str] = None,
        filterf: Optional[Callable[[Any], bool]] = None,
        samplef: Optional[Callable[[Any], Any]] = None,
        labelf: Optional[Callable[[Any], Any]] = None,
        **kwargs,
    ):
        """Initialize dataset.

        Parameters control loading and preprocessing of the data. Order:
        1. joining
        2. filtering
        3. sample and label transform

        Parameters:
            data: the samples in the form of a Pandas DataFrame.
            labels: the labels, either in the form of a Pandas DataFrame with same number of rows
                as data and different column names, or in the form of a list of column names,
                which are then split out from the data and used as labels. If not specified,
                the dataset is unlabeled.
            dtype: the NumPy data types to use for samples and labels, in the form of a dictionary
                with column names as keys and dtypes as values. Can be used to override dtype
                auto-detection for some or all columns.
            join: if specified, name of "column" to join by; this changes labels
                to be sequences of single-entry labels
            filterf: a function that accepts a sample and returns whether to keep it
                (True) or exclude it (False). Default retains all samples
            samplef: function accepting and returning a sample; applied to all samples
                as post-processing
            labelf: function accepting and returning a label; applied to all labels
                as post-processing

        Raises:
            InvalidParameterError for invalid arguments. In particular, the
                numbers of rows in data and labels must match, and column names,
                if given, must be unique across data and labels.
        """

        import pandas as pd  # only import if class is used

        # parameter validation
        data = params.instance(data, pd.DataFrame)
        labels = params.optional_(
            labels,
            lambda arg: params.any_(
                arg,
                lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
                lambda arg: params.tuple_(arg, params.string),
            ),
        )
        dtype = params.optional_(dtype,
                                 lambda arg: params.instance(arg, dict),
                                 default={})
        join = params.optional_(join, params.string)
        singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
        filterf = params.optional_(filterf, singleargf)
        samplef = params.optional_(samplef, singleargf)
        labelf = params.optional_(labelf, singleargf)

        if labels is None and labelf:
            raise InvalidParameterError(
                "matching labels and label function",
                "label function specified for unlabeled data")

        # process data
        data = data.reset_index(drop=True)

        # if labels are given as separate DataFrame, join them
        if isinstance(labels, pd.DataFrame):
            if len(data) != len(labels):
                raise InvalidParameterError(
                    "matching data and labels",
                    f"different number of rows ({len(data)} != {len(labels)})",
                )

            labels = labels.reset_index(drop=True)

            col_names = np.hstack((data.columns, labels.columns))
            if len(col_names) != len(pd.unique(col_names)):
                raise InvalidParameterError(
                    "unique column names",
                    f"{data.columns.values} and {labels.columns.values}")

            data = pd.concat([data, labels], axis=1)
            labels = labels.columns.values

        # 1. optional joining
        if join:
            groups = data.groupby(join, sort=False, as_index=False)
            data = groups.aggregate(lambda tdf: tdf.tolist())

        # 2. optional filtering
        if filterf:
            selection = data.apply(filterf, axis=1)
            data = data[selection]

        # split data and labels
        if labels is not None:
            # DataFrame column indexing requires list, not tuple
            data, labels = data.drop(columns=list(labels)), data[list(labels)]

        # 3. optional sample and label transform
        if samplef:
            data = data.apply(samplef, axis=1, result_type="reduce")
            if isinstance(data, pd.Series):
                data = pd.DataFrame(data, columns=["Samples"])
        if labelf:
            labels = labels.apply(labelf, axis=1, result_type="reduce")
            if isinstance(labels, pd.Series):
                labels = pd.DataFrame(labels, columns=["Labels"])

        # convert to NumPy structured array
        data = self._to_numpy(data, dtype=dtype)
        labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

        super().__init__(data=data, labels=labels, **kwargs)
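Of the three preprocessing steps, the join is the least obvious: grouping by the join column turns every other column into a list of per-row entries, which is what "changes labels to be sequences of single-entry labels" means. A toy pandas illustration of the same groupby/aggregate call:

```
import pandas as pd

# Mirrors the join step above on toy data: rows sharing a "formula" value
# are merged, and their "energy" entries become a list.
df = pd.DataFrame({"formula": ["A", "A", "B"], "energy": [1.0, 2.0, 3.0]})
groups = df.groupby("formula", sort=False, as_index=False)
joined = groups.aggregate(lambda tdf: tdf.tolist())
print(joined)
#   formula      energy
# 0       A  [1.0, 2.0]
# 1       B       [3.0]
```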