Esempio n. 1
0
 def check(this: DataPlaceHolders, what: str) -> List:
     """Validate a group of data placeholders and return it as a list.

     Parameters
     ----------
     this
         Value to validate. It is normalized with ``listify`` before any
         check is applied.

     what
         Label used to refer to the argument in the error messages.

     Returns
     -------
     list
         The listified value, once it has passed both checks.

     Raises
     ------
     ValueError
         If ``is_data_placeholder_list`` rejects the listified value, or if
         the listified value contains repeated elements.
     """
     placeholders = listify(this)
     if not is_data_placeholder_list(placeholders):
         raise ValueError("{} must be of type DataPlaceholder.".format(what))
     # A set collapses repeated elements, so a smaller size means duplicates.
     n_distinct = len(set(placeholders))
     if n_distinct != len(placeholders):
         raise ValueError("{} must be unique.".format(what))
     return placeholders
Esempio n. 2
0
 def _compute_node(self, node, Xs, cache):
     """Run a node's compute function and record its outputs in the cache.

     Parameters
     ----------
     node
         Node to compute; its ``compute_func`` attribute is called.

     Xs
         List of input data for the node. It is passed through ``unlistify``
         before being handed to the compute function.

     cache
         Store of intermediate results, updated via ``self._update_cache``.
     """
     # TODO: Emit a warning when a computed output is already in the cache.
     # That situation arises when recomputing a step for which a subset of
     # its outputs was already passed in the inputs.
     # TODO: Some regressors accept extra options in their predict method and
     # return a tuple of arrays.
     # https://scikit-learn.org/stable/glossary.html#term-predict
     computed = listify(node.compute_func(unlistify(Xs)))
     self._update_cache(cache, computed, node)
Esempio n. 3
0
    def predict(
        self,
        X: Union[ArrayLikes, DataDict],
        output_names: Optional[Union[str, List[str]]] = None,
    ) -> ArrayLikes:
        """Apply the model to the given input data and return its outputs.

        Parameters
        ----------
        X
            Input data, in the same format accepted by the ``fit`` method.

        output_names
            Optional name (or list of names) of the outputs to compute. Any
            final or intermediate output can be requested through the name
            of its associated data placeholder, which is handy for
            debugging. When omitted, the outputs specified at instantiation
            are returned.

        Returns
        -------
        array-like or list of array-like
            The single computed output when exactly one output is requested,
            otherwise the list of computed outputs in the requested order.

        Raises
        ------
        ValueError
            If ``output_names`` contains repeated names.
        """
        # Bring the inputs into normalized form before anything else
        normalized_inputs = self._normalize_data(X, self._internal_inputs)

        # Work out which data placeholders have to be produced
        if output_names is not None:
            names = listify(output_names)
            if len(names) != len(set(names)):
                raise ValueError("output_names must be unique.")
            requested = [self.get_data_placeholder(name) for name in names]
        else:
            requested = self._internal_outputs

        # Unused inputs are tolerated on purpose: it lets the caller debug
        # different outputs without adapting the inputs each time.
        required_nodes = self._get_required_nodes(
            normalized_inputs,
            [],
            requested,
            allow_unused_inputs=True,
            follow_targets=False,
        )

        # Intermediate results live here, seeded with the input data
        computed = {}  # type: Dict[DataPlaceholder, ArrayLike]
        computed.update(normalized_inputs)

        for node in required_nodes:
            node_inputs = [computed[placeholder] for placeholder in node.inputs]
            self._compute_node(node, node_inputs, computed)

        results = [computed[placeholder] for placeholder in requested]
        return results[0] if len(results) == 1 else results
Esempio n. 4
0
 def _fit_compute_node(self, node, Xs, ys, cache, **fit_params):
     """Fit and compute a node in one call, then cache its outputs.

     Parameters
     ----------
     node
         Node to process; its ``fit_compute_func`` attribute is called.

     Xs
         List of input data, passed through ``unlistify`` before the call.

     ys
         List of target data. When it is empty (falsy) no target argument is
         passed at all; otherwise it is passed through ``unlistify``.

     cache
         Store of intermediate results, updated via ``self._update_cache``.

     **fit_params
         Extra keyword arguments forwarded to ``fit_compute_func``.
     """
     # TODO: same as _compute_node TODO?
     with_targets = bool(ys)
     fit_and_compute = node.fit_compute_func
     # Targets are only part of the call when some were actually provided.
     positional = [unlistify(Xs)]
     if with_targets:
         positional.append(unlistify(ys))
     computed = listify(fit_and_compute(*positional, **fit_params))
     self._update_cache(cache, computed, node)
Esempio n. 5
0
def test_split(x, indices_or_sections, teardown):
    """Check that a model built on ``Split`` agrees with ``np.split``.

    ``x`` and ``indices_or_sections`` are supplied by the test harness;
    ``teardown`` is requested but not used in the body (presumably a cleanup
    fixture -- confirm against the test module).
    """
    model_input = Input()
    split_outputs = Split(indices_or_sections, axis=0)(model_input)
    model = Model(model_input, split_outputs)

    # Reference result computed directly with NumPy along the first axis
    expected_parts = np.split(x, indices_or_sections, axis=0)
    # predict may hand back a bare array, so normalize it to a list
    predicted_parts = listify(model.predict(x))

    for actual, expected in safezip2(predicted_parts, expected_parts):
        assert_array_equal(actual, expected)
Esempio n. 6
0
    def _compute_step(step, Xs, cache):
        """Compute a step on its inputs and store the outputs in the cache.

        Parameters
        ----------
        step
            Step to compute. Its ``compute`` method receives the elements of
            ``Xs`` as separate positional arguments.

        Xs
            List of input data for the step.

        cache
            Mapping updated in place with (output placeholder, data) pairs.

        Raises
        ------
        RuntimeError
            If pairing ``step.outputs`` with the computed data (or updating
            the cache with those pairs) raises ``ValueError``. The message
            reports a mismatch between the number of output data elements
            and the number of outputs of the step.
        """
        # TODO: Emit a warning when a computed output is already in the cache.
        # That situation arises when recomputing a step for which a subset of
        # its outputs was already passed in the inputs.
        # TODO: Some regressors accept extra options in their predict method
        # and return a tuple of arrays.
        # https://scikit-learn.org/stable/glossary.html#term-predict
        results = listify(step.compute(*Xs))

        try:
            cache.update(safezip2(step.outputs, results))
        except ValueError as e:
            template = (
                "The number of output data elements ({}) does not match "
                "the number of {} outputs ({})."
            )
            n_results = len(results)
            n_outputs = len(step.outputs)
            raise RuntimeError(
                template.format(n_results, step.name, n_outputs)
            ) from e
Esempio n. 7
0
    def _normalize_list(
        data: ArrayLikes, data_placeholders: List[DataPlaceholder]
    ) -> Dict[DataPlaceholder, ArrayLike]:
        """Map data placeholders to the data passed as a list or single array.

        Parameters
        ----------
        data
            Data to normalize. It is passed through ``listify`` first.

        data_placeholders
            Placeholders to pair, in order, with the listified data.

        Returns
        -------
        dict
            Dictionary built from the (placeholder, data) pairs.

        Raises
        ------
        ValueError
            If building the pairs raises ``ValueError``; it is re-raised
            with a message reporting the number of arrays received and the
            number of placeholders expected.
        """
        arrays = listify(data)

        try:
            return dict(safezip2(data_placeholders, arrays))
        except ValueError as e:
            # TODO: Improve this message
            template = (
                "When passing inputs/outputs as a list or a single array, "
                "the number of arrays must match the number of inputs/outputs "
                "specified at instantiation. "
                "Got {}, expected: {}."
            )
            raise ValueError(
                template.format(len(arrays), len(data_placeholders))
            ) from e
Esempio n. 8
0
def test_listify(x, expected):
    """Check that ``listify(x)`` equals ``expected`` for the supplied case."""
    result = listify(x)
    assert result == expected
Esempio n. 9
0
    def __call__(
        self,
        inputs: Union[DataPlaceholder, List[DataPlaceholder]],
        targets: Optional[Union[DataPlaceholder,
                                List[DataPlaceholder]]] = None,
        *,
        compute_func: Union[str, Callable[..., Any]] = "auto",
        fit_compute_func: Optional[Union[str, Callable[..., Any]]] = "auto",
        trainable: bool = True
    ) -> Union[DataPlaceholder, List[DataPlaceholder]]:
        """Connect the step to input(s) coming from previous steps and
        produce the output(s) that further steps will consume.

        The same step can be called on different inputs and targets in order
        to reuse it (akin to shared layers and nodes in Keras), with a
        different ``compute_func``/``trainable`` configuration per call. This
        works through "ports": every call creates a new port and ties the
        given configuration to it. The configuration at each port is
        available through the ``get_*_at(port)`` methods.

        Parameters
        ----------
        inputs
            Input(s) to the step.

        targets
            Target(s) to the step.

        compute_func
            Function used to compute the step while the model graph is
            executed. With ``"auto"`` (default) the ``predict`` method is
            used, or else the ``transform`` method (in that order). A name
            string selects the method with that name. A callable is used
            directly.

            The number of inputs and outputs of the function must match those
            of the step. This is not checked here; a mismatch raises an error
            during graph execution.

            scikit-learn classes usually implement ``predict`` (Estimators)
            or ``transform`` (Transformers); this argument makes it possible
            to choose, for example, ``predict_proba`` as the compute function.

        fit_compute_func
            Function used to fit AND compute the step while the model graph
            is executed.

            With ``"auto"`` (default) the ``fit_predict`` method is used, or
            else the ``fit_transform`` method (in that order), provided they
            are implemented; otherwise the feature is disabled. A name string
            selects the method with that name. A callable is used directly.
            ``None`` makes the graph execution ignore it.

            The number of inputs, outputs and targets of the function must
            match those of the step. This is not checked here; a mismatch
            raises an error during graph execution.

            By default, when a model is fit, the graph engine does two things
            for each step: 1) it executes ``fit`` to fit the step, and 2) it
            executes ``compute_func`` to compute the outputs needed by
            successor steps. When a step specifies a ``fit_compute_func``,
            that function is used instead to fit and compute the outputs in a
            single call. This can be useful for

            1. taking advantage of ``fit_transform`` implementations that are
               more efficient than separate ``fit`` and ``transform`` calls,
            2. using transductive estimators,
            3. implementing training protocols such as that of stacked
               classifiers, where the first-stage classifier might compute
               out-of-fold predictions.

        trainable
            Whether the step is trainable (True) or not (False). The flag is
            only meaningful for steps with a fit method. Setting
            ``trainable=False`` allows skipping the step when fitting a
            Model, which is useful to freeze pre-trained steps.

        Returns
        -------
        DataPlaceholder
            Output(s) of the step: a single placeholder when the step has
            one output, otherwise a list of placeholders.

        Raises
        ------
        ValueError
            If ``inputs`` or ``targets`` (when given) are not data
            placeholders.

        RuntimeError
            If targets are passed to a step that has no ``fit`` method.
        """

        inputs = listify(inputs)
        if not is_data_placeholder_list(inputs):
            raise ValueError("inputs must be of type DataPlaceholder.")

        if targets is None:
            targets = []
        else:
            if not hasattr(self, "fit"):
                raise RuntimeError(
                    "Cannot pass targets to steps that do not have a fit method."
                )

            # TODO: Consider inspecting the fit signature to find out whether the
            # step needs a target (i.e. fit(self, X, y)) or not (i.e.
            # fit(self, X, y=None)). A default of None for the target might not be
            # a reliable signal, though: there could be estimators (perhaps
            # semi-supervised) that take both target data and None. sklearn also
            # has meta-estimators (e.g. Pipeline) and meta-transformers (e.g.
            # SelectFromModel) that accept both target data and None.
            #
            # Such an inspection could nevertheless simplify the API by rejecting
            # unnecessary targets early (e.g. passing targets to PCA) or warning
            # about missing targets (e.g. not passing targets to
            # LogisticRegression with trainable=True). It would also avoid the
            # unintuitive logic that allows superfluous targets during step call,
            # model instantiation and model fit.
            #
            # | requires target |   trainable   | passed target |   result   |
            # ----------------------------------------------------------------
            # |       yes       |      True     |      yes      |     ok     |
            # |       yes       |      True     |      no       |    warn    |
            # |       yes       |      False    |      yes      |    warn    |
            # |       yes       |      False    |      no       |     ok     |
            # |       no        |        -      |      yes      |    error   |
            # |       no        |        -      |      no       |     ok     |

            if not trainable:
                warnings.warn(
                    UserWarning(
                        "You are passing targets to a non-trainable step."
                    )
                )

            targets = listify(targets)
            if not is_data_placeholder_list(targets):
                raise ValueError(
                    "If specified, targets must be of type DataPlaceholder."
                )

        outputs = self._build_outputs()

        # Each call is recorded as a new node carrying its own configuration.
        self._nodes.append(
            Node(
                self,
                inputs,
                outputs,
                targets,
                getattr(self, "fit", None),
                self._check_compute_func(compute_func),
                self._check_fit_compute_func(fit_compute_func),
                trainable,
            )
        )

        if self._n_outputs != 1:
            # Hand back a shallow copy so the stored outputs are not modified
            # by the idiom of passing a variable holding an output to another
            # step and overwriting the variable with the new output:
            #     zs = SomeMultiOutputStep()(...)
            #     zs[i] = SomeStep()(zs[i])
            return list(outputs)
        return outputs[0]
Esempio n. 10
0
    def predict(
        self,
        X: Union[ArrayLikes, DataDict],
        output_names: Optional[Union[str, List[str]]] = None,
    ) -> ArrayLikes:
        """Compute the model outputs for the given input data.

        **Models are query-able**. That is, outputs other than those
        specified at model instantiation can be requested. This makes it
        possible to query intermediate outputs and eases debugging.

        Parameters
        ----------
        X
            Input data, in the same format accepted by the fit function.

        output_names
            Optional name (or list of names) of the outputs to compute. Any
            final or intermediate output can be requested through the name
            of its associated data placeholder. When omitted, the outputs
            specified at instantiation are returned.

        Returns
        -------
        The single computed output when exactly one output is requested,
        otherwise the list of computed outputs in the requested order.

        Raises
        ------
        ValueError
            If ``output_names`` contains repeated names.
        """
        # Bring the inputs into normalized form before anything else
        normalized_inputs = self._normalize_data(X, self._internal_inputs)

        # Work out which data placeholders have to be produced
        if output_names is not None:
            names = listify(output_names)
            if len(names) != len(set(names)):
                raise ValueError("output_names must be unique.")
            requested = [self.get_data_placeholder(name) for name in names]
        else:
            requested = self._internal_outputs

        # Unused inputs are tolerated on purpose: it lets the caller debug
        # different outputs without adapting the inputs each time.
        required_steps = self._get_required_steps(
            normalized_inputs,
            [],
            requested,
            allow_unused_inputs=True,
            follow_targets=False,
        )

        # Intermediate results live here, seeded with the input data
        computed = {}  # type: Dict[DataPlaceholder, ArrayLike]
        computed.update(normalized_inputs)

        for step in required_steps:
            step_inputs = [computed[placeholder] for placeholder in step.inputs]
            self._compute_step(step, step_inputs, computed)

        results = [computed[placeholder] for placeholder in requested]
        return results[0] if len(results) == 1 else results
Esempio n. 11
0
    def __call__(
        self,
        inputs: Union[DataPlaceholder, List[DataPlaceholder]],
        targets: Optional[Union[DataPlaceholder,
                                List[DataPlaceholder]]] = None,
    ) -> Union[DataPlaceholder, List[DataPlaceholder]]:
        """Connect the step to input(s) coming from previous steps and
        produce the output(s) that further steps will consume.

        Parameters
        ----------
        inputs
            Input(s) to the step.

        targets
            Target(s) to the step.

        Returns
        -------
        DataPlaceholder
            Output(s) of the step: a single placeholder when the step has
            one output, otherwise a list of placeholders.

        Raises
        ------
        ValueError
            If ``inputs`` or ``targets`` (when given) are not data
            placeholders.

        RuntimeError
            If targets are passed to a step that has no ``fit`` method.

        Notes
        -----
        Reusing a step by calling it on different inputs and targets (akin
        to shared layers and nodes in Keras) is currently not supported. A
        second call on different inputs overrides the connectivity set by
        the first call. Support for shareable steps might be added in future
        releases.
        """
        inputs = listify(inputs)
        if not is_data_placeholder_list(inputs):
            raise ValueError("inputs must be of type DataPlaceholder.")

        if targets is None:
            targets = []
        else:
            if not hasattr(self, "fit"):
                raise RuntimeError(
                    "Cannot pass targets to steps that do not have a fit method."
                )

            # TODO: Consider inspecting the fit signature to find out whether the
            # step needs a target (i.e. fit(self, X, y)) or not (i.e.
            # fit(self, X, y=None)). A default of None for the target might not be
            # a reliable signal, though: there could be estimators (perhaps
            # semi-supervised) that take both target data and None. sklearn also
            # has meta-estimators (e.g. Pipeline) and meta-transformers (e.g.
            # SelectFromModel) that accept both target data and None.
            #
            # Such an inspection could nevertheless simplify the API by rejecting
            # unnecessary targets early (e.g. passing targets to PCA) or warning
            # about missing targets (e.g. not passing targets to
            # LogisticRegression with trainable=True). It would also avoid the
            # unintuitive logic that allows superfluous targets during step call,
            # model instantiation and model fit.
            #
            # | requires target |   trainable   | passed target |   result   |
            # ----------------------------------------------------------------
            # |       yes       |      True     |      yes      |     ok     |
            # |       yes       |      True     |      no       |    warn    |
            # |       yes       |      False    |      yes      |    warn    |
            # |       yes       |      False    |      no       |     ok     |
            # |       no        |        -      |      yes      |    error   |
            # |       no        |        -      |      no       |     ok     |

            if not self.trainable:
                warnings.warn(
                    UserWarning(
                        "You are passing targets to a non-trainable step."
                    )
                )

            targets = listify(targets)
            if not is_data_placeholder_list(targets):
                raise ValueError(
                    "If specified, targets must be of type DataPlaceholder."
                )

        # Record the connectivity of this call, replacing any previous one.
        self._inputs = inputs
        self._targets = targets
        self._outputs = self._build_outputs()

        if self._n_outputs != 1:
            # Hand back a shallow copy so self._outputs is not modified by the
            # idiom of passing a variable holding an output to another step
            # and overwriting the variable with the new output:
            #     zs = SomeMultiOutputStep()(...)
            #     zs[i] = SomeStep()(zs[i])
            return list(self.outputs)
        return self._outputs[0]