Example #1
    def _calc_all_intracls_dists(
        cls,
        N: np.ndarray,
        y: np.ndarray,
        dist_metric: str = "euclidean",
        get_max_dist: bool = True,
        cls_inds: t.Optional[np.ndarray] = None,
        classes: t.Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """Calculate all intraclass (internal to a class) distances."""
        if cls_inds is None:
            if classes is None:
                classes = np.unique(y)

            cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

        intracls_dists = np.array(
            [
                cls._calc_intracls_dists(
                    N[cur_class, :],
                    dist_metric=dist_metric,
                    get_max_dist=get_max_dist,
                ) for cur_class in cls_inds
            ],
            dtype=object,
        )

        return intracls_dists
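For orientation, a minimal standalone sketch of the same pattern follows. The helper name max_intraclass_dists and the direct use of scipy's pdist are illustrative assumptions, not the pymfe internals (which can also return all distances instead of only the maximum):

    import numpy as np
    from scipy.spatial.distance import pdist

    def max_intraclass_dists(N, y, metric="euclidean"):
        # One boolean row per distinct class marking its member instances.
        cls_inds = np.array([y == cls for cls in np.unique(y)])
        # Maximum pairwise distance within each class.
        return np.array([pdist(N[rows, :], metric=metric).max()
                         for rows in cls_inds])

    N = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0], [6.0, 7.0]])
    y = np.array([0, 0, 1, 1])
    print(max_intraclass_dists(N, y))  # one distance per class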
Example #2
    def _calc_pwise_norm_intercls_dist(
        cls,
        N: np.ndarray,
        y: np.ndarray,
        dist_metric: str = "euclidean",
        classes: t.Optional[np.ndarray] = None,
        cls_inds: t.Optional[np.ndarray] = None,
    ) -> t.List[np.ndarray]:
        """Calculate all pairwise normalized interclass distances."""
        if cls_inds is None:
            if classes is None:
                classes = np.unique(y)

            cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

        intercls_dists = [
            cls._calc_normalized_intercls_dist(
                N[cls_inds[id_cls_a, :], :],
                N[cls_inds[id_cls_b, :], :],
                dist_metric=dist_metric,
            ) for id_cls_a, id_cls_b in itertools.combinations(
                np.arange(cls_inds.shape[0]), 2)
        ]

        return intercls_dists
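The pairwise iteration over class combinations can be sketched standalone with scipy's cdist; note that the normalization performed by _calc_normalized_intercls_dist is internal to pymfe and is deliberately elided here:

    import itertools

    import numpy as np
    from scipy.spatial.distance import cdist

    def pairwise_intercls_dists(N, y, metric="euclidean"):
        cls_inds = np.array([y == cls for cls in np.unique(y)])
        # One flattened array of raw distances per unordered class pair.
        return [
            cdist(N[cls_inds[a, :], :], N[cls_inds[b, :], :],
                  metric=metric).ravel()
            for a, b in itertools.combinations(range(cls_inds.shape[0]), 2)
        ]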
Example #3
    def precompute_complexity(cls,
                              y: t.Optional[np.ndarray] = None,
                              **kwargs) -> t.Dict[str, t.Any]:
        """Precompute some useful things to support feature-based measures.

        Parameters
        ----------
        y : :obj:`np.ndarray`, optional
            Target attribute.

        **kwargs
            Additional arguments. May contain values precomputed by other
            precompute methods, which can help speed up this precomputation.

        Returns
        -------
        :obj:`dict`
            With the following precomputed items:
                - ``ovo_comb`` (list): List of all class OVO combinations,
                  i.e., all pairs of distinct class indices
                  ([(0, 1), (0, 2), ...]).
                - ``cls_inds`` (:obj:`np.ndarray`): Boolean array which
                  indicates whether each example belongs to each class. The
                  rows represent the distinct classes, and the columns
                  represent the instances.
                - ``classes`` (:obj:`np.ndarray`): distinct classes in the
                  fitted target attribute.
                - ``class_freqs`` (:obj:`np.ndarray`): The number of examples
                  in each class. The indices represent the classes.
        """
        precomp_vals = {}  # type: t.Dict[str, t.Any]

        if (y is not None and not {"classes", "class_freqs"}.issubset(kwargs)):
            sub_dic = MFEGeneral.precompute_general_class(y)
            precomp_vals.update(sub_dic)

        classes = kwargs.get("classes", precomp_vals.get("classes"))

        if y is not None and "cls_inds" not in kwargs:
            cls_inds = _utils.calc_cls_inds(y, classes)
            precomp_vals["cls_inds"] = cls_inds

        if y is not None and "ovo_comb" not in kwargs:
            ovo_comb = cls._calc_ovo_comb(classes)
            precomp_vals["ovo_comb"] = ovo_comb

        return precomp_vals
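The guard pattern above (compute only the values that **kwargs does not already carry) is what lets precompute methods chain cheaply. A hypothetical, self-contained sketch of just that control flow:

    import numpy as np

    def precompute_sketch(y=None, **kwargs):
        precomp = {}
        # Fill in only the values the caller has not already precomputed.
        if y is not None and not {"classes", "class_freqs"}.issubset(kwargs):
            classes, class_freqs = np.unique(y, return_counts=True)
            precomp["classes"] = classes
            precomp["class_freqs"] = class_freqs

        classes = kwargs.get("classes", precomp.get("classes"))

        if y is not None and "cls_inds" not in kwargs:
            precomp["cls_inds"] = np.array([y == cls for cls in classes])

        return precomp

    y = np.array([0, 1, 1, 2])
    first = precompute_sketch(y)            # computes everything
    second = precompute_sketch(y, **first)  # everything cached: returns {}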
Example #4
    def precompute_clustering_class(cls,
                                    y: t.Optional[np.ndarray] = None,
                                    **kwargs) -> t.Dict[str, t.Any]:
        """Precompute distinct classes and its frequencies from ``y``.

        Parameters
        ----------
        y : :obj:`np.ndarray`, optional
            Instance cluster index (or target attribute).

        **kwargs
            Additional arguments. May contain values precomputed by other
            precompute methods, which can help speed up this precomputation.

        Returns
        -------
        :obj:`dict`
            The following precomputed items are returned:
                * ``classes`` (:obj:`np.ndarray`): distinct classes of
                  ``y``, if ``y`` is not :obj:`NoneType`.
                * ``class_freqs`` (:obj:`np.ndarray`): class frequencies of
                  ``y``, if ``y`` is not :obj:`NoneType`.
                * ``cls_inds`` (:obj:`np.ndarray`): Boolean array which
                  indicates whether each example belongs to each class. The
                  rows represent the distinct classes, and the columns
                  represent the instances.
        """
        precomp_vals = {}

        if y is not None and not {"classes", "class_freqs"}.issubset(kwargs):
            classes, class_freqs = np.unique(y, return_counts=True)

            precomp_vals["classes"] = classes
            precomp_vals["class_freqs"] = class_freqs

        classes = kwargs.get("classes", precomp_vals.get("classes"))

        if y is not None and "cls_inds" not in kwargs:
            cls_inds = _utils.calc_cls_inds(y, classes)
            precomp_vals["cls_inds"] = cls_inds

        return precomp_vals
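As a concrete illustration of the cls_inds layout described in the docstring (rows are classes, columns are instances), assuming _utils.calc_cls_inds has the usual one-row-per-class semantics:

    import numpy as np

    y = np.array(["a", "b", "a", "c", "b"])
    classes, class_freqs = np.unique(y, return_counts=True)

    # cls_inds[i, j] is True iff instance j belongs to classes[i].
    cls_inds = np.array([y == cls for cls in classes])

    print(classes)              # ['a' 'b' 'c']
    print(class_freqs)          # [2 2 1]
    print(cls_inds.astype(int))
    # [[1 0 1 0 0]
    #  [0 1 0 0 1]
    #  [0 0 0 1 0]]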
Example #5
    def _get_class_representatives(
            cls,
            N: np.ndarray,
            y: np.ndarray,
            representative: t.Union[t.Sequence, np.ndarray, str] = "mean",
            cls_inds: t.Optional[np.ndarray] = None,
            classes: t.Optional[np.ndarray] = None) -> np.ndarray:
        """Get a representative instance for each distinct class.

        If the ``representative`` argument is a string, then it must name
        a statistical method to be applied to the attributes of the
        instances of each class in ``N`` to construct the class
        representative instance (currently only ``mean`` and ``median``
        are supported). If ``representative`` is a sequence, then its
        shape must be (number_of_classes, number_of_attributes), i.e.,
        there must be one class representative for each distinct class,
        and every class representative must have the same dimension as
        the instances in ``N``.
        """
        if classes is None:
            classes = np.unique(y)

        if isinstance(representative, str):
            center_method = {
                "mean": np.mean,
                "median": np.median,
            }.get(representative)

            if center_method is None:
                raise ValueError("'representative' must be 'mean' or "
                                 "'median'. Got '{}'.".format(representative))

            if cls_inds is None:
                cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

            representative = [
                center_method(N[cur_class, :], axis=0)
                for cur_class in cls_inds
            ]

        elif not isinstance(representative,
                            (collections.abc.Sequence, np.ndarray)):
            raise TypeError("'representative' type must be string "
                            "or a sequence or a numpy array. "
                            "Got '{}'.".format(type(representative)))

        representative_arr = np.asarray(representative)

        num_repr, repr_dim = representative_arr.shape
        _, num_attr = N.shape

        if num_repr != classes.size:
            raise ValueError("There must exist one class representative "
                             "for every distinct class. (Expected '{}', "
                             "got '{}'.)".format(classes.size, num_repr))

        if repr_dim != num_attr:
            raise ValueError("The dimension of each class representative "
                             "must match the instances dimension. (Expected "
                             "'{}', got '{}'.)".format(num_attr, repr_dim))

        return representative_arr
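The string branch boils down to a per-class column-wise statistic; a minimal sketch assuming the same one-representative-per-class contract (the helper name class_representatives is hypothetical):

    import numpy as np

    def class_representatives(N, y, method=np.mean):
        # One representative (e.g. per-attribute mean) per distinct class.
        return np.array([method(N[y == cls, :], axis=0)
                         for cls in np.unique(y)])

    N = np.array([[0.0, 2.0], [2.0, 4.0], [10.0, 10.0], [12.0, 14.0]])
    y = np.array([0, 0, 1, 1])
    print(class_representatives(N, y))             # [[ 1.  3.] [11. 12.]]
    print(class_representatives(N, y, np.median))  # same here (2 per class)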
Example #6
    def ft_n4(cls,
              N: np.ndarray,
              y: np.ndarray,
              cls_inds: t.Optional[np.ndarray] = None,
              metric_n4: str = "minkowski",
              p_n4: int = 2,
              n_neighbors_n4: int = 1,
              random_state: t.Optional[int] = None) -> np.ndarray:
        """Compute the non-linearity of the NN Classifier.

        Parameters
        ----------
        N : :obj:`np.ndarray`
            Numerical fitted data.

        y : :obj:`np.ndarray`
            Target attribute.

        cls_inds : :obj:`np.ndarray`, optional
            Boolean array which indicates the examples of each class.
            The rows represent the distinct classes, and the columns
            represent the instances.

        metric_n4 : str, optional (default = "minkowski")
            The distance metric used in the internal kNN classifier. See the
            documentation of the ``sklearn.neighbors.DistanceMetric`` class
            for a list of available metrics.

        p_n4 : int, optional (default = 2)
            Power parameter for the Minkowski metric. When p = 1, this is
            equivalent to using manhattan_distance (l1), and
            euclidean_distance (l2) for p = 2. For arbitrary p,
            minkowski_distance (l_p) is used. Please check the
            ``sklearn.neighbors.KNeighborsClassifier`` documentation for
            more information.

        n_neighbors_n4 : int, optional (default = 1)
            Number of nearest neighbors used by the internal kNN classifier.

        random_state : int, optional
            If given, set the random seed before computing the randomized
            data interpolation.

        Returns
        -------
        :obj:`np.ndarray`
            Misclassifications of the NN classifier in the interpolated
            dataset.

        References
        ----------
        .. [1] Ana C. Lorena, Luís P. F. Garcia, Jens Lehmann, Marcilio C. P.
           Souto, and Tin K. Ho. How Complex is your classification problem?
           A survey on measuring classification complexity (V2). (2019)
           (Cited on page 9-11). Published in ACM Computing Surveys (CSUR),
           Volume 52 Issue 5, October 2019, Article No. 107.
        """
        if cls_inds is None:
            classes = np.unique(y)
            cls_inds = _utils.calc_cls_inds(y, classes)

        # 0-1 feature scaling
        N = sklearn.preprocessing.MinMaxScaler(
            feature_range=(0, 1)).fit_transform(N)

        if random_state is not None:
            np.random.seed(random_state)

        N_test = np.zeros(N.shape, dtype=N.dtype)
        y_test = np.zeros(y.shape, dtype=y.dtype)

        ind_cur = 0

        for inds_cur_cls in cls_inds:
            N_cur_cls = N[inds_cur_cls, :]
            subset_size = N_cur_cls.shape[0]

            # Currently an instance is allowed to 'interpolate with itself',
            # which yields the instance itself as the result.
            sample_a = N_cur_cls[np.random.choice(subset_size, subset_size), :]
            sample_b = N_cur_cls[np.random.choice(subset_size, subset_size), :]

            rand_delta = np.random.random_sample(N_cur_cls.shape)

            N_subset_interp = sample_a + (sample_b - sample_a) * rand_delta

            ind_next = ind_cur + subset_size
            N_test[ind_cur:ind_next, :] = N_subset_interp
            y_test[ind_cur:ind_next] = y[inds_cur_cls]
            ind_cur = ind_next

        knn = sklearn.neighbors.KNeighborsClassifier(
            n_neighbors=n_neighbors_n4, p=p_n4, metric=metric_n4).fit(N, y)

        y_pred = knn.predict(N_test)

        misclassifications = np.not_equal(y_test, y_pred).astype(int)

        # In the literature this measure is the mean misclassification rate.
        # It is kept here as a per-instance meta-feature, so post-processing
        # can be used to obtain the mean as well as other summary measures.
        return misclassifications
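The heart of N4 is the within-class linear interpolation that builds the synthetic test set. A standalone sketch of just that step, using NumPy's Generator API instead of the global seed (the function name is hypothetical):

    import numpy as np

    def interpolate_within_class(N_cls, rng):
        # Draw two random same-class instances per original instance and
        # take a random point on the segment between them.
        n = N_cls.shape[0]
        sample_a = N_cls[rng.integers(n, size=n), :]
        sample_b = N_cls[rng.integers(n, size=n), :]
        rand_delta = rng.random(N_cls.shape)
        return sample_a + (sample_b - sample_a) * rand_delta

    rng = np.random.default_rng(0)
    N_cls = np.array([[0.0, 0.0], [1.0, 0.0], [0.5, 1.0]])
    print(interpolate_within_class(N_cls, rng))  # 3 synthetic instances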