Ejemplo n.º 1
0
    def __init__(
        self,
        target=None,
        configuration: Optional[PlotConfiguration] = None,
        axes_labels=(None, None, None, None),
        axes_scales=("linear", "linear"),
        **kwargs,
    ):
        """Validate and store rendering target, axes labels and axes scales.

        Parameters:
            target: rendering target that evaluation outcome is rendered to;
                can be a single filename, or a matplotlib Axes or (Figure, Axes) pair,
                or a sequence thereof; if a matplotlib Axes or (Figure, Axes) pair,
                evaluation will add to it; if None, a new rendering target is created
            configuration: optional plot configuration controlling rendering details;
                forwarded to the parent initializer after validation
            axes_labels: labels for all axes (bottom, left, top, right), None to not
                label an axis; for shorter tuples remaining entries are assumed None,
                so ('x', 'y') is valid
            axes_scales: scales ("linear" or "log") for horizontal and vertical axes
            **kwargs: forwarded unchanged to the parent initializer

        Examples:
            __init__(axes_labels=("bottom", "left", "top"))  # right is None
            __init__(axes_scales=("log", "log"))
        """

        # --- configuration: a PlotConfiguration instance or None ---
        def plot_configuration(arg):
            return params.instance(arg, PlotConfiguration)

        configuration = params.any_(configuration, plot_configuration, params.none)
        super().__init__(configuration=configuration, **kwargs)

        # --- target: Axes, (Figure, Axes), filename, None, or sequence (without None) ---
        def axes(arg):
            return params.instance(arg, mpl.axes.Axes)

        def figure(arg):
            return params.instance(arg, mpl.figure.Figure)

        def figure_axes_pair(arg):
            return params.tuple_(arg, figure, axes, arity=2)

        def single_target(arg):
            return params.any_(arg, axes, figure_axes_pair, params.string)

        def target_sequence(arg):
            return params.tuple_(arg, single_target)

        self._target = params.any_(target, single_target, params.none, target_sequence)

        # --- axes labels: four entries, each a string or None ---
        def optional_string(arg):
            return params.any_(arg, params.string, params.none)

        self._axes_labels = params.tuple_(axes_labels, optional_string, arity=4, default=None)

        # --- axes scales: exactly two entries from {"linear", "log"} ---
        def scale(arg):
            return params.enumeration(arg, {"linear", "log"})

        self._axes_scales = params.tuple_(axes_scales, scale, arity=2)

        # nothing has been rendered yet
        self._figaxis = None
Ejemplo n.º 2
0
 def _indices_testf(self, indices: Sequence[Any]):
     """Validate an optional collection of sample indices.

     A non-None argument must be either an empty sequence or a sequence of
     integers in [0, self.num_samples); the validated result is converted
     to a list. Handling of None is delegated to params.optional_.

     Parameters:
         indices: sequence of sample indices, or None

     Returns:
         the result of params.optional_ for the given indices
     """

     def valid_index(arg):
         return params.integer(arg, from_=0, below=self.num_samples)

     def empty_selection(arg):
         # empty set
         return params.tuple_(arg, None, arity=0)

     def index_selection(arg):
         return params.tuple_(arg, valid_index)

     def as_index_list(arg):
         # NumPy indexing expects a list
         return list(params.any_(arg, empty_selection, index_selection))

     return params.optional_(indices, as_index_list)
Ejemplo n.º 3
0
Archivo: plots.py Proyecto: syam-s/smlb
    def evaluate(self, results, **kwargs):
        """Evaluate learning curve plot.

        Validates the curve data, passes it to the parent evaluation and, if
        fits are enabled (self._fits), computes one asymptotic fit per curve.
        Each fit is sampled at 25 evenly spaced sizes between the smallest and
        largest training set size and stored in self._fit_data with shape
        (number of curves, 2, 25); the fit parameters are registered as the
        auxiliary "asymptotic_fits".

        Parameters:
            results: sequence of curve data, where each curve datum is a sequence of tuples (n,fx)
                of training set size n (validated as a real number above 0) and performance
                values fx (sequence of real numbers).
            **kwargs: forwarded to the parent evaluate()
        """

        # parameter validation

        def size(arg):
            return params.real(arg, above=0)

        def size_values_pair(arg):
            return params.tuple_(arg, size, params.real_vector, arity=2)

        def curve(arg):
            return params.tuple_(arg, size_values_pair)

        results = params.tuple_(results, curve)

        super().evaluate(results=results, **kwargs)

        # on a logarithmic vertical axis, fitted values are mapped via self._powf
        if self.axes_scales[1] == "log":
            ypowf = self._powf
        else:

            def ypowf(arg):
                return arg

        # asymptotic estimates are only computed on request
        if not self._fits:
            return

        asymptotic_fits = tuple(self.asymptotic_fit(fdata) for fdata in results)

        all_sizes = np.unique([entry[0] for fdata in results for entry in fdata])
        smallest, largest = np.min(all_sizes), np.max(all_sizes)
        sizes = np.linspace(start=smallest, stop=largest, num=25)

        self._fit_data = np.empty(shape=(len(results), 2, len(sizes)))
        for row, (intercept, gradient, _, _) in enumerate(asymptotic_fits):
            fitted = [ypowf(intercept + gradient * self._logf(n)) for n in sizes]
            self._fit_data[row, 0, :] = sizes
            self._fit_data[row, 1, :] = fitted

        # each fit is a 4-tuple (offset, slope, residuals, variance)
        fit_keys = ("offset", "slope", "residuals", "variance")
        self.add_auxiliary(
            "asymptotic_fits",
            tuple(dict(zip(fit_keys, fit)) for fit in asymptotic_fits),
        )
Ejemplo n.º 4
0
    def evaluate(self, results, **kwargs):
        """Evaluate optimization trajectory plot.

        Validates the curve data and hands the validated data to the parent
        evaluate().

        Parameters:
            results: sequence of curve data, where each curve datum is a sequence of
                tuples (index, scores) of function evaluation number (validated as a
                real number above 0) and best scores found after that many
                evaluations (sequence of real numbers).
            **kwargs: forwarded to the parent evaluate()
        """

        def evaluation_number(arg):
            return params.real(arg, above=0)

        def index_scores_pair(arg):
            return params.tuple_(arg, evaluation_number, params.real_vector, arity=2)

        def curve(arg):
            return params.tuple_(arg, index_scores_pair)

        validated = params.tuple_(results, curve)
        super().evaluate(results=validated, **kwargs)
Ejemplo n.º 5
0
    def axes_labels(self, labels=(None, None, None, None), **kwargs):
        """Set axes labels.

        Parameters:
            labels: labels for bottom, left, top, right axes;
                None indicates to use the current value
            **kwargs: passed to matplotlib's set_xlabel / set_ylabel

        Raises:
            NotImplementedError: if a top or right label is given

        Examples:
            axes_labels = (None, "y")  # set only left axis label
        """

        def optional_string(arg):
            return params.any_(arg, params.string, params.none)

        labels = params.tuple_(labels, optional_string, arity=4, default=None)

        # merge with current values and re-assign the tuple as a whole
        # NOTE(review): reads self.axes_labels but writes self._labels; presumably this
        # method is a property setter whose getter is not visible here -- confirm that
        # the getter reads the same attribute that is written
        merged = []
        for position in range(4):
            requested = labels[position]
            merged.append(self.axes_labels[position] if requested is None else requested)
        self._labels = tuple(merged)

        # only touch axes whose label was specified (not None);
        # this allows to pass kwargs specific to one axis
        bottom, left = labels[0], labels[1]
        if bottom is not None:
            self.ax.set_xlabel(bottom, fontdict=self._fontdict(), **kwargs)
        if left is not None:
            self.ax.set_ylabel(left, fontdict=self._fontdict(), **kwargs)

        # todo; possible implementation via xtwin/ytwin, storing these axes in outcome
        if not (labels[2] is None and labels[3] is None):
            raise NotImplementedError
Ejemplo n.º 6
0
    def axes_scales(self, scales=(None, None), **kwargs):
        """Set axes scales.

        Parameters:
            scales: scales (None, "linear" or "log") for horizontal and vertical axes;
                None indicates to use the current value
            **kwargs: passed to matplotlib's set_xscale / set_yscale

        Examples:
            axes_scales = (None, "log")  # change only vertical axis
        """

        def scale(arg):
            return params.enumeration(arg, {"linear", "log"})

        def optional_scale(arg):
            return params.any_(arg, scale, params.none)

        scales = params.tuple_(scales, optional_scale, arity=2, default=None)
        horizontal, vertical = scales[0], scales[1]

        # merge with current values and re-assign the tuple as a whole
        # NOTE(review): reads self.axes_scales but writes self._scales; presumably this
        # method is a property setter whose getter is not visible here -- confirm that
        # the getter reads the same attribute that is written
        new_horizontal = self.axes_scales[0] if horizontal is None else horizontal
        new_vertical = self.axes_scales[1] if vertical is None else vertical
        self._scales = (new_horizontal, new_vertical)

        # only touch axes whose scale was specified (not None);
        # this allows to pass kwargs specific to one axis
        if horizontal is not None:
            self.ax.set_xscale(horizontal, **kwargs)
        if vertical is not None:
            self.ax.set_yscale(vertical, **kwargs)
Ejemplo n.º 7
0
    def box_whisker(self, positions, values, color=0, widths=0.5, **kwargs):
        """Draw box-whisker plots.

        Parameters:
            positions: where to place plots on horizontal axis
            values: samples for each location; one sequence of real numbers
                per entry in positions
            color: color index into the configuration's color set
            widths: widths of boxes
            **kwargs: additional keyword arguments passed to matplotlib's boxplot
        """

        positions = params.real_vector(positions)
        values = params.tuple_(values, params.real_vector, arity=len(positions))
        color = params.integer(color, from_=0, below=len(self.configuration.color_set))
        widths = params.real_vector(widths, dimensions=len(positions), domain=(0, 999))

        # translate color index into the actual color used for all plot elements
        color = self.configuration.color(color)

        self.ax.boxplot(
            values,
            positions=positions,
            whis=(0, 100),  # whiskers at 0th and 100th percentile
            bootstrap=None,
            widths=widths,
            notch=False,
            showmeans=True,
            boxprops={"color": color},
            whiskerprops={"color": color},
            capprops={"color": color},
            meanprops={"marker": "*", "markerfacecolor": color, "markeredgecolor": color},
            medianprops={"color": color},
            manage_ticks=False,  # leave existing axis ticks untouched
            **kwargs,
        )
Ejemplo n.º 8
0
    def __init__(
        self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
    ):
        """Initialize generalized function plot.

        Parameters:
            visualization_type: how to visualize generalized functions.
                Either single value or list of appropriate length.
                Possible values: "points" (default), "box-whisker", "shaded-line"
            rectify: whether and by how much each curves' values will be horizontally displaced
                to visually disentangle markers from different curves at the same location.
                True indicates automatic displacement, False indicates no displacement.
                If not specified, horizontal axis positions are not modified (default).
                If the horizontal axis scaling is logarithmic, the rectification factor
                is applied in log-space.
            **kwargs: forwarded to the parent initializer

        Examples:
            # show three curves with automatic horizontal rectification
            __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
        """

        super().__init__(**kwargs)

        # parameter validation

        def visualization(arg):
            return params.enumeration(arg, {"points", "box-whisker", "shaded-line"})

        def visualization_sequence(arg):
            return params.tuple_(arg, visualization)

        def non_negative_real(arg):
            return params.real(arg, from_=0)

        # a single visualization type or a sequence of them;
        # the sequence's arity can only be tested in evaluate()
        self._visualization_type = params.any_(
            visualization_type, visualization, visualization_sequence
        )

        # a non-negative displacement or a boolean switch
        self._rectify = params.any_(rectify, non_negative_real, params.boolean)
Ejemplo n.º 9
0
    def shaded_line(
        self,
        positions: np.ndarray,
        values: List[np.ndarray],
        color_idx: int = 0,
        label: Optional[str] = None,
        quantile_width: float = 0.5,
        alpha: float = 0.2,
        show_extrema: bool = True,
        **kwargs,
    ):
        """Draw the median as a line, with a shaded quantile band around it.

        Parameters:
            positions: 1-d array of point locations on the horizontal axis
            values: list of arrays, each one containing all of the values at a given location.
                len(values) must equal len(positions)
            color_idx: color index
            label: line label (applied to the median line only)
            quantile_width: fraction of the range to shade, in [0, 1]. For the default
                value, 0.5, shade from the 25th percentile to the 75th percentile.
            alpha: shading alpha level, in [0, 1]
            show_extrema: whether or not to draw dashed lines at the minimum and
                maximum of the values at each location
            **kwargs: passed to every matplotlib plot / fill_between call made here
        """
        positions = params.real_vector(positions)
        values = params.tuple_(values, params.real_vector, arity=len(positions))
        color_idx = params.integer(color_idx, from_=0, below=len(self.configuration.color_set))
        quantile_width = params.real(quantile_width, from_=0, to=1)
        alpha = params.real(alpha, from_=0, to=1)

        color = self.configuration.color(color_idx)

        # shaded band is centered on the median (quantile 0.5)
        lower_bound = 0.5 - quantile_width / 2.0
        upper_bound = 0.5 + quantile_width / 2.0

        def per_location(statistic, *args):
            """Apply a NumPy reduction to the samples of every location."""
            return [statistic(samples, *args) for samples in values]

        median = per_location(np.median)
        lower_shading = per_location(np.quantile, lower_bound)
        upper_shading = per_location(np.quantile, upper_bound)

        self.ax.plot(positions, median, linestyle="-", color=color, label=label, **kwargs)
        self.ax.fill_between(
            positions, lower_shading, upper_shading, color=color, alpha=alpha, **kwargs
        )

        if not show_extrema:
            return

        min_val = per_location(np.min)
        max_val = per_location(np.max)
        for extremum in (min_val, max_val):
            self.ax.plot(positions, extremum, linestyle="--", color=color, **kwargs)
Ejemplo n.º 10
0
Archivo: noise.py Proyecto: syam-s/smlb
    def noise(self, shape=None):
        """Return Gaussian noise of the requested shape.

        Samples are drawn via self.random.normal using the stored mean
        (self._mean) and standard deviation (self._stddev).

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor;
                a positive integer or a sequence of positive integers

        Returns:
            a numerical array of given shape containing independent
            identically distributed Gaussian noise

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # valid shapes are either a positive integer or a tuple of positive integers
        def positive_int(arg):
            return params.integer(arg, from_=1)

        def positive_int_tuple(arg):
            return params.tuple_(arg, positive_int)

        shape = params.any_(shape, positive_int, positive_int_tuple)

        return self.random.normal(self._mean, self._stddev, size=shape)
Ejemplo n.º 11
0
 def __init__(
     self,
     data: VectorSpaceData,
     model: Learner,
     scorer: Scorer,
     optimizers: Sequence[Optimizer],
     evaluations: Sequence[Evaluation] = (OptimizationTrajectoryPlot(),),
     num_trials: int = 1,
     training_data: Optional[Data] = None,
 ):
     """Validate and store the components of the workflow.

     Parameters:
         data: a VectorSpaceData instance
         model: a Learner instance
         scorer: a Scorer instance
         optimizers: sequence of Optimizer instances
         evaluations: sequence of Evaluation instances
         num_trials: integer, at least 1
         training_data: optional Data instance
     """

     # NOTE(review): the default OptimizationTrajectoryPlot instance is created once,
     # when the function is defined, and is therefore shared by every object that
     # relies on the default -- confirm this sharing is intended

     def evaluation(arg):
         return params.instance(arg, Evaluation)

     def data_instance(arg):
         return params.instance(arg, Data)

     self._data = params.instance(data, VectorSpaceData)
     self._scorer = params.instance(scorer, Scorer)
     self._model = params.instance(model, Learner)
     self._optimizers = params.sequence(optimizers, type_=Optimizer)
     self._evaluations = params.tuple_(evaluations, evaluation)
     self._num_trials = params.integer(num_trials, from_=1)
     self._training_data = params.optional_(training_data, data_instance)
Ejemplo n.º 12
0
Archivo: noise.py Proyecto: syam-s/smlb
    def noise(self, shape=None):
        """Return no noise.

        An array filled with the stored constant (self._value) is returned.

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor;
                a positive integer or a sequence of positive integers

        Returns:
            a numerical array of given shape containing a constant value

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # valid shapes are either a positive integer or a tuple of positive integers
        def positive_int(arg):
            return params.integer(arg, from_=1)

        def positive_int_tuple(arg):
            return params.tuple_(arg, positive_int)

        validated_shape = params.any_(shape, positive_int, positive_int_tuple)

        return np.full(validated_shape, self._value)
Ejemplo n.º 13
0
    def __init__(
        self,
        data: Data,
        training: Sequence[Sampler],
        validation: Sampler,
        learners: Sequence[SupervisedLearner],
        features: DataValuedTransformation = IdentityFeatures(),
        metric: ScalarEvaluationMetric = RootMeanSquaredError(),
        evaluations: Sequence[Evaluation] = (LearningCurvePlot(),),  # todo: add table
        progressf: Optional[Callable[[int, int], None]] = None,
    ):
        """Initialize workflow.

        Parameters:
            data: labeled data
            training: sequence of Samplers, one for each training set size
            validation: Sampler for validation set
            learners: sequence of supervised regression algorithms
            features: any data-valued transformation
            metric: evaluation metric to use; root mean squared error by default
            evaluations: one or more evaluations; default is a learning curve plot
            progressf: callable with two parameters, done iterations and total number of
                iterations; if None, a function that does nothing is used

        Raises:
            InvalidParameterError: if data is not labeled
        """

        # NOTE(review): default instances (features, metric, evaluations) are created once,
        # at function definition time, and shared by all objects using the defaults
        # NOTE(review): features is annotated DataValuedTransformation but validated
        # against Features -- confirm which one is intended

        def evaluation(arg):
            return params.instance(arg, Evaluation)

        def two_argument_callable(arg):
            return params.callable(arg, num_pos_or_kw=2)

        self._data = params.instance(data, Data)  # todo: params.data(..., is_labeled=True)
        if not self._data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")

        self._training = params.sequence(training, type_=Sampler)
        self._validation = params.instance(validation, Sampler)
        self._learners = params.sequence(learners, type_=SupervisedLearner)
        self._features = params.instance(features, Features)
        self._metric = params.instance(metric, ScalarEvaluationMetric)
        self._evaluations = params.tuple_(evaluations, evaluation)

        # fall back to a no-op so that callers never need to test for None
        progressf = params.optional_(progressf, two_argument_callable)
        self._progressf = (lambda *args: None) if progressf is None else progressf
Ejemplo n.º 14
0
    def __init__(
        self,
        select: Optional[Sequence[str]] = None,
        failmode="raise",
        samplef: Callable[[Any], Any] = lambda arg: arg,
        java_gateway: Optional[CdkJavaGateway] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            select: which features to compute (by default, all). List of names, order matters.
                Presets are available as class constants:
                PRESET_ALL: all features
                PRESET_ROBUST: a subset of descriptors that are fast to compute and do not fail
                    often (tested on QM9 and CEP datasets; see accompanying notebook)
            failmode: how to handle failed descriptor calculations, either due to rejected SMILES
                encodings or failing descriptor code. Possible values:
                "raise" [default]: raise an exception
                "drop": drop the sample. Returned Data will have fewer samples
                ("mask", mask): where `mask` is a NumPy array with dtype bool whose entries will
                    be set to False for failures
                ("index", index): where `index` is an empty list to which the indices of failed
                    entries will be appended
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            java_gateway: a gateway to a Java virtual machine; if None, a new
                CdkJavaGateway is created
            **kwargs: forwarded to the parent initializer

        Requires a CDK jar.
        """

        super().__init__(**kwargs)

        # parameters
        select = params.optional_(
            select,
            lambda arg: params.tuple_(
                arg, lambda arg: params.enumeration(arg, self.DESCRIPTORS.keys())
            ),
        )
        select = self.PRESET_ALL if select is None else select
        self._failmode = DataTransformationFailureMode.failmode(failmode)
        self._samplef = params.callable(samplef, num_pos_or_kw=1)
        self._java_gateway = params.optional_(
            java_gateway, lambda arg: params.instance(arg, JavaGateway)
        )
        if self._java_gateway is None:
            self._java_gateway = CdkJavaGateway()
        # from here on, only the wrapped gateway object is needed
        self._java_gateway = self._java_gateway.gateway

        def instantiate(class_path):
            """Resolve a dotted Java class path on the JVM view and call its constructor.

            Walking the path with getattr performs the same attribute accesses as the
            dotted expression, without evaluating a string as Python code.
            Assumes class_path is a plain dotted name -- verify against DESCRIPTORS.
            """
            target = self._java_gateway.jvm
            for part in class_path.split("."):
                target = getattr(target, part)
            return target()

        # set up descriptors; first entry of a DESCRIPTORS value is the Java class path
        self._descriptors = tuple(instantiate(self.DESCRIPTORS[name][0]) for name in select)

        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        for descriptor in self._descriptors:
            descriptor.initialise(builder)

        # second entry of a DESCRIPTORS value is stored as the descriptor's arity
        self._arities = tuple(self.DESCRIPTORS[name][1] for name in select)
Ejemplo n.º 15
0
    def evaluate(self, results, **kwargs):
        """Compute plot data for multiple generalized (set-valued) functions.

        Multiple curves C_1, ..., C_k can be drawn.
        Each curve C_i is specified by a non-empty sequence of 2-tuples,
        where the first value is location on horizontal axis, and the
        other value is a sequence of locations on the vertical axis.

        Each curve can be drawn in a different way (points, box-whisker, shaded-line).

        The result is stored in self._plotdata, one entry per curve:
        an array of (position, value) rows for "points",
        a tuple (positions, values, widths) for "box-whisker", and
        a tuple (positions, values) for "shaded-line".

        Note that self._visualization_type and self._rectify are overwritten
        with their finalized values.

        Parameters:
            results: sequence of generalized functions data (curve data).
                     Each datum is a sequence of tuples (x,fx), where
                     x is a real number and fx is a sequence of real numbers.
            **kwargs: forwarded to the parent evaluate()

        Raises:
            InvalidParameterError: for invalid results, or if rectification is
                requested for more curves than supported

        Examples:
            # two curves sharing one horizontal location
            evaluate([
                [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
                [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))], # curve 2
            ])
        """

        super().evaluate(results=results, **kwargs)

        # parameter validation

        tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        # _rectify evaluates to True if True or if > 0
        # NOTE(review): RECTIFY_DELTAS is indexed by len(results) further below; confirm
        # that this bound matches the largest valid index
        if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
            # report the number of curves actually passed, not the limit
            raise InvalidParameterError(
                f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(results)} curves"
            )

        # finalize parameter validation for visualization_type
        if not is_sequence(self._visualization_type):
            self._visualization_type = (self._visualization_type,) * len(results)
        self._visualization_type = params.tuple_(
            self._visualization_type,
            lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
            arity=len(results),
            default="points",
        )

        # prepare plot

        # determine all distinct horizontal positions in the results data
        all_positions = np.unique([entry[0] for curve in results for entry in curve])

        # there is nothing to do without data to plot
        if len(all_positions) == 0:
            self._plotdata = []
            return

        # do not rectify if there is only a single horizontal position
        if len(all_positions) == 1 or self._rectify is False:
            self._rectify = 0.0

        # automatic determination of horizontal rectification factor
        #
        # the correct way to draw box-plots on a logarithmic horizontal axis is to have
        # different left-width and right-width of the boxes. However, matplotlib does not
        # support this. Because box widths are small compared to horizontal plot range,
        # it suffices to use the sum of left- and right-half widths.
        between_groups_spacing = 0.4
        in_group_spacing = 0.9  # box-whisker plots
        if self.axes_scales[0] == "linear":
            logf = lambda arg: arg
            powf = lambda arg: arg
        elif self.axes_scales[0] == "log":
            base = 10
            logf = lambda arg: np.log(arg) / np.log(base)
            powf = lambda arg: np.power(base, arg)

        if self._rectify is True:
            # diff(...) requires at least two horizontal locations; this is ensured above
            self._rectify = (
                between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
            )

        # determine positions
        self._plotdata = [None] * len(results)
        deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
        for (i, curve) in enumerate(results):
            # point markers, every single point is drawn
            if self._visualization_type[i] == "points":
                positions = powf(
                    np.hstack(
                        [
                            logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                            for entry in curve
                        ]
                    )
                )
                values = np.hstack([entry[1] for entry in curve])
                self._plotdata[i] = np.transpose([positions, values])
            # box-whisker plots
            elif self._visualization_type[i] == "box-whisker":
                # np.asfarray was removed in NumPy 2.0; asarray with float dtype is equivalent
                positions = np.asarray(
                    [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve],
                    dtype=float,
                )
                values = [entry[1] for entry in curve]
                # can't use rectify for width if 0; 1 is a wild guess
                # todo: if plot ranges have been set, a better default value could
                #       be 10% of horizontal plot range
                w = 1 if not self._rectify else self._rectify
                widths = powf((positions + w / 2) * in_group_spacing) - powf(
                    (positions - w / 2) * in_group_spacing
                )
                positions = powf(positions)
                self._plotdata[i] = (positions, values, widths)
            # shaded lines; positions are used as given, without rectification
            elif self._visualization_type[i] == "shaded-line":
                positions = np.asarray([entry[0] for entry in curve], dtype=float)
                values = [entry[1] for entry in curve]
                self._plotdata[i] = (positions, values)
            else:
                raise BenchmarkError("internal error, unknown visualization type")
Ejemplo n.º 16
0
    def __init__(
            self,
            select: Union[str, Sequence[str]] = "all",
            samplef: Callable[[Any], Any] = lambda arg: arg,
            stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
            elemental_preset: str = "magpie",
            ionic_fast: bool = False,
            valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
            valence_props: Sequence[str] = ("avg", "frac"),
            **kwargs,
    ):
        """Initialize state.

        Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
        IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
        with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
        is the p_list parameter of Stoichiometry. For further details on these, see
        https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

        Parameters:
            select: which feature sets to compute (by default, all). Specifying
                multiple sets (e.g., ('stoichiometry', 'elemental') selects both).
                Valid choices:
                'all': all features
                'stoichiometry': norms of stoichiometric features
                'elemental': element properties
                'ionic': ion properties
                'valence': valence orbital shell features
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            stoichiometry_p_list: list of L_p norms to compute
            elemental_preset: matminer preset to use. Valid choices include:
                'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
            ionic_fast: if True, assumes that elements exist in single oxidation state
            valence_orbitals: which valence orbitals to consider
            valence_props: whether to return average properties, fractional, or both
            **kwargs: forwarded to the parent initializer

        Raises:
            BenchmarkError: if matminer or pymatgen cannot be imported

        Requires the matminer package (see file documentation).
        """

        super().__init__(**kwargs)

        SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

        # normalize select to a sequence of feature set names
        if select == "all":
            select = SELECT_SETS
        elif isinstance(select, str):
            # tuple(str,) yields tuple of characters in str
            select = (select,)

        def feature_set(arg):
            return params.enumeration(arg, set(SELECT_SETS))

        def norm_order(p):
            return params.integer(p, from_=0)

        def orbital(arg):
            return params.enumeration(arg, {"s", "p", "d", "f"})

        def valence_prop(arg):
            return params.enumeration(arg, {"avg", "frac"})

        select = params.tuple_(select, feature_set)

        self._stoichiometry_p_list = params.tuple_(stoichiometry_p_list, norm_order)
        self._elemental_preset = params.enumeration(
            elemental_preset,
            {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"},
        )
        self._ionic_fast = params.boolean(ionic_fast)
        self._valence_orbitals = params.tuple_(valence_orbitals, orbital)
        self._valence_props = params.tuple_(valence_props, valence_prop)

        self.samplef = samplef  # todo: add callable to params

        # set up matminer; imported here so the dependency is only needed if the class is used
        try:
            import matminer
            import matminer.featurizers
            import matminer.featurizers.base
            import matminer.featurizers.composition
            import matminer.featurizers.conversions
            import pymatgen
        except ModuleNotFoundError as e:
            raise BenchmarkError(
                f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
            ) from e

        self._composition = pymatgen.core.composition.Composition

        # set up features; featurizers are created lazily, only for selected sets,
        # and always in the order stoichiometry, elemental, ionic, valence
        mm_composition = matminer.featurizers.composition
        factories = (
            (
                "stoichiometry",
                lambda: mm_composition.Stoichiometry(p_list=self._stoichiometry_p_list),
            ),
            (
                "elemental",
                lambda: mm_composition.ElementProperty.from_preset(self._elemental_preset),
            ),
            (
                "ionic",
                lambda: mm_composition.IonProperty(fast=self._ionic_fast),
            ),
            (
                "valence",
                lambda: mm_composition.ValenceOrbital(
                    orbitals=self._valence_orbitals, props=self._valence_props
                ),
            ),
        )
        features = [create() for name, create in factories if name in select]

        self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(features)
Ejemplo n.º 17
0
    def __init__(
        self,
        data: "pandas.DataFrame",  # noqa F821
        labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
        dtype: Optional[dict] = None,
        join: Optional[str] = None,
        filterf: Optional[Callable[[Any], bool]] = None,
        samplef: Optional[Callable[[Any], Any]] = None,
        labelf: Optional[Callable[[Any], Any]] = None,
        **kwargs,
    ):
        """Initialize dataset from tabular Pandas data.

        Loading and preprocessing steps are applied in this fixed order:
        1. joining
        2. filtering
        3. sample and label transform

        Parameters:
            data: samples, given as a Pandas DataFrame.
            labels: either a Pandas DataFrame with as many rows as data and column
                names different from those of data, or a sequence of names of
                columns of data that are split off and used as labels.
                If not given, the dataset is unlabeled.
            dtype: dictionary with column names as keys and NumPy dtypes as values;
                can be used to override dtype auto-detection for some or all
                columns. Defaults to an empty dictionary.
            join: if given, name of the column to group rows by; remaining columns
                are aggregated into lists, so labels become sequences of
                single-entry labels.
            filterf: function that takes a sample (row) and returns True to keep it
                or False to drop it. By default all samples are kept.
            samplef: function that takes and returns a sample; applied to every
                sample as post-processing.
            labelf: function that takes and returns a label; applied to every
                label as post-processing.

        Raises:
            InvalidParameterError for invalid arguments. In particular, a label
                function requires labels, data and a labels DataFrame must have
                the same number of rows, and their column names must be unique
                across both.
        """

        import pandas as pd  # deferred so pandas is only imported if this class is used

        # ---- parameter validation ----

        def dataframe(arg):
            return params.instance(arg, pd.DataFrame)

        def column_names(arg):
            return params.tuple_(arg, params.string)

        def dataframe_or_column_names(arg):
            # the DataFrame test is tried before the tuple_ test
            return params.any_(arg, dataframe, column_names)

        def dictionary(arg):
            return params.instance(arg, dict)

        def unary_callable(arg):
            return params.callable(arg, num_pos_or_kw=1)

        data = dataframe(data)
        labels = params.optional_(labels, dataframe_or_column_names)
        dtype = params.optional_(dtype, dictionary, default={})
        join = params.optional_(join, params.string)
        filterf = params.optional_(filterf, unary_callable)
        samplef = params.optional_(samplef, unary_callable)
        labelf = params.optional_(labelf, unary_callable)

        # a label transform is meaningless without labels
        if labels is None and labelf:
            raise InvalidParameterError(
                "matching labels and label function",
                "label function specified for unlabeled data")

        # ---- data processing ----

        # use a fresh 0..n-1 index so that rows of data and labels line up
        data = data.reset_index(drop=True)

        # labels passed as their own DataFrame are merged into data and
        # from then on tracked by their column names only
        if isinstance(labels, pd.DataFrame):
            if len(data) != len(labels):
                raise InvalidParameterError(
                    "matching data and labels",
                    f"different number of rows ({len(data)} != {len(labels)})",
                )

            labels = labels.reset_index(drop=True)

            all_columns = np.hstack((data.columns, labels.columns))
            if len(pd.unique(all_columns)) != len(all_columns):
                raise InvalidParameterError(
                    "unique column names",
                    f"{data.columns.values} and {labels.columns.values}")

            label_names = labels.columns.values
            data = pd.concat([data, labels], axis=1)
            labels = label_names

        # 1. optional joining: one row per group, other columns become lists
        if join:
            grouped = data.groupby(join, sort=False, as_index=False)
            data = grouped.aggregate(lambda column: column.tolist())

        # 2. optional filtering: keep rows for which filterf is true
        if filterf:
            keep = data.apply(filterf, axis=1)
            data = data[keep]

        # separate label columns from sample columns
        if labels is not None:
            # DataFrame column indexing requires list, not tuple
            label_columns = list(labels)
            samples_only = data.drop(columns=label_columns)
            labels = data[label_columns]
            data = samples_only

        # 3. optional sample and label transform
        def transform_rows(frame, func, column):
            """Apply func to each row; wrap a Series result as one-column DataFrame."""
            result = frame.apply(func, axis=1, result_type="reduce")
            if isinstance(result, pd.Series):
                result = pd.DataFrame(result, columns=[column])
            return result

        if samplef:
            data = transform_rows(data, samplef, "Samples")
        if labelf:
            labels = transform_rows(labels, labelf, "Labels")

        # convert to NumPy structured array
        data = self._to_numpy(data, dtype=dtype)
        if labels is not None:
            labels = self._to_numpy(labels, dtype=dtype)

        super().__init__(data=data, labels=labels, **kwargs)
Ejemplo n.º 18
0
    def __init__(self,
                 rng: int = None,
                 strategy: str = "best1bin",
                 maxiter: int = 1000,
                 popsize: int = 15,
                 tol: float = 0.01,
                 mutation=(0.5, 1),
                 recombination: float = 0.7,
                 **kwargs):
        """Initialize state.

        Validates and stores the settings below. Scipy-specific parameters
        are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time
                the optimizer is run.
            strategy: name of the differential evolution strategy; must be one
                of the strategies enumerated in this method. See documentation
                for the complete list and explanations.
            maxiter: maximum number of generations over which the entire
                population is evolved.
            popsize: multiplier for setting the total population size.
            tol: relative tolerance for convergence.
            mutation: the mutation constant. Either a single number between
                0 and 2, or a pair (min, max), in which case the mutation
                constant is randomly selected uniformly from between min and
                max with each generation.
            recombination: the recombination constant. Must be between 0 and 1.

        """
        super().__init__(rng=rng, **kwargs)

        self._strategy = params.enumeration(
            strategy,
            {
                "best1bin",
                "best1exp",
                "rand1exp",
                "randtobest1exp",
                "currenttobest1exp",
                "best2exp",
                "rand2exp",
                "randtobest1bin",
                "currenttobest1bin",
                "best2bin",
                "rand2bin",
                "rand1bin",
            },
        )

        self._maxiter = params.integer(maxiter, from_=1)
        self._popsize = params.integer(popsize, from_=1)
        self._tol = params.real(tol, above=0.0)

        def mutation_constant(value, lower=0.0):
            """Validate a single mutation constant within [lower, 2]."""
            return params.real(value, from_=lower, to=2.0)

        def mutation_interval(bounds):
            """Validate a (min, max) pair of mutation constants."""

            def upper_bound(value):
                # the maximum may not be smaller than the minimum
                return mutation_constant(value, lower=bounds[0])

            return params.tuple_(bounds, mutation_constant, upper_bound, arity=2)

        # a single constant is tried first, then a (min, max) pair
        self._mutation = params.any_(mutation, mutation_constant, mutation_interval)
        self._recombination = params.real(recombination, from_=0.0, to=1.0)
Ejemplo n.º 19
0
def test_tuple_():
    """Tests tuple_ meta test.

    Covers rejection of non-tuple arguments, a single test applied to all
    entries, one test per entry, the arity parameter, and filling up with
    a default value.
    """

    # element test used throughout: accepts only None
    testf = params.none

    # special case: no tuple
    with pytest.raises(InvalidParameterError):
        params.tuple_(None, lambda arg: arg)

    # special case: single test
    assert params.tuple_((None,), testf) == (None,)
    with pytest.raises(InvalidParameterError):
        # exercise tuple_ itself (not any_) with an entry failing the single test
        params.tuple_(("_",), testf)

    # special case: 2-tuple
    assert params.tuple_((None, None), testf, testf) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", None), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, "_"), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", "_"), testf, testf)

    # arity parameter
    # compare the returned value; a bare truthiness check passes for any non-empty tuple
    assert params.tuple_((None, None), testf, arity=2) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None), testf, arity=3)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None, None), testf, arity=2)

    # default parameter
    assert params.tuple_((None,), testf, arity=3, default=None) == (None, None, None)

    # no arity, no default
    assert params.tuple_((None, None, None), testf) == (None, None, None)