def _calc_all_intracls_dists(
        cls,
        N: np.ndarray,
        y: np.ndarray,
        dist_metric: str = "euclidean",
        get_max_dist: bool = True,
        cls_inds: t.Optional[np.ndarray] = None,
        classes: t.Optional[np.ndarray] = None,
) -> np.ndarray:
    """Calculate all intraclass (internal to a class) distances."""
    if cls_inds is None:
        if classes is None:
            classes = np.unique(y)

        cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

    # Compute the intraclass distances independently for each class.
    # The per-class results may differ in shape, hence ``dtype=object``.
    intracls_dists = np.array(
        [
            cls._calc_intracls_dists(
                N[cur_class, :],
                dist_metric=dist_metric,
                get_max_dist=get_max_dist,
            )
            for cur_class in cls_inds
        ],
        dtype=object,
    )

    return intracls_dists
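# Usage sketch for ``_calc_all_intracls_dists`` (hedged: it assumes this is
# a classmethod of an enclosing extractor class, called ``MFEComplexity``
# here purely for illustration, and that the companion helper
# ``_calc_intracls_dists`` is defined on the same class). It illustrates
# that the result holds one entry per distinct class:
#
# >>> import numpy as np
# >>> from sklearn.datasets import load_iris
# >>> N, y = load_iris(return_X_y=True)
# >>> dists = MFEComplexity._calc_all_intracls_dists(N=N, y=y)
# >>> len(dists) == np.unique(y).size  # one entry per class
# True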
def _calc_pwise_norm_intercls_dist(
        cls,
        N: np.ndarray,
        y: np.ndarray,
        dist_metric: str = "euclidean",
        classes: t.Optional[np.ndarray] = None,
        cls_inds: t.Optional[np.ndarray] = None,
) -> t.List[np.ndarray]:
    """Calculate all pairwise normalized interclass distances."""
    if cls_inds is None:
        if classes is None:
            classes = np.unique(y)

        cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

    intercls_dists = [
        cls._calc_normalized_intercls_dist(
            N[cls_inds[id_cls_a, :], :],
            N[cls_inds[id_cls_b, :], :],
            dist_metric=dist_metric,
        )
        for id_cls_a, id_cls_b in itertools.combinations(
            np.arange(cls_inds.shape[0]), 2)
    ]

    return intercls_dists
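# Usage sketch for ``_calc_pwise_norm_intercls_dist`` (hedged: the class
# name ``MFEComplexity`` is illustrative, and the companion helper
# ``_calc_normalized_intercls_dist`` is assumed to exist on the same
# class). One distance array is produced per unordered class pair, so a
# problem with k classes yields k * (k - 1) / 2 entries:
#
# >>> import numpy as np
# >>> from sklearn.datasets import load_iris
# >>> N, y = load_iris(return_X_y=True)
# >>> inter = MFEComplexity._calc_pwise_norm_intercls_dist(N=N, y=y)
# >>> len(inter)  # 3 classes -> 3 pairs: (0, 1), (0, 2), (1, 2)
# 3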
def precompute_complexity(cls,
                          y: t.Optional[np.ndarray] = None,
                          **kwargs) -> t.Dict[str, t.Any]:
    """Precompute some useful things to support feature-based measures.

    Parameters
    ----------
    y : :obj:`np.ndarray`, optional
        Target attribute.

    **kwargs
        Additional arguments. May include values precomputed by other
        precomputation methods, which are reused here to speed up this
        precomputation.

    Returns
    -------
    :obj:`dict`
        With the following precomputed items:
            - ``ovo_comb`` (list): list of all class OVO (One-Versus-One)
              combinations, i.e., all pairs of distinct class indices
              ([(0, 1), (0, 2), ...]).
            - ``cls_inds`` (:obj:`np.ndarray`): boolean array which
              indicates whether each example belongs to each class. The
              rows represent the distinct classes, and the columns
              represent the instances.
            - ``classes`` (:obj:`np.ndarray`): distinct classes in the
              fitted target attribute.
            - ``class_freqs`` (:obj:`np.ndarray`): the number of examples
              in each class. The indices represent the classes.
    """
    precomp_vals = {}  # type: t.Dict[str, t.Any]

    if y is not None and not {"classes", "class_freqs"}.issubset(kwargs):
        sub_dic = MFEGeneral.precompute_general_class(y)
        precomp_vals.update(sub_dic)

    classes = kwargs.get("classes", precomp_vals.get("classes"))

    if y is not None and "cls_inds" not in kwargs:
        cls_inds = _utils.calc_cls_inds(y, classes)
        precomp_vals["cls_inds"] = cls_inds

    if y is not None and "ovo_comb" not in kwargs:
        ovo_comb = cls._calc_ovo_comb(classes)
        precomp_vals["ovo_comb"] = ovo_comb

    return precomp_vals
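# Usage sketch for ``precompute_complexity`` (hedged: the enclosing class
# name ``MFEComplexity`` is an assumption, and the expected keys assume
# ``MFEGeneral.precompute_general_class`` contributes ``classes`` and
# ``class_freqs``). It shows the kwargs-reuse pattern: passing previously
# precomputed items back in suppresses recomputation:
#
# >>> import numpy as np
# >>> y = np.array([0, 0, 1, 1, 2])
# >>> vals = MFEComplexity.precompute_complexity(y=y)
# >>> sorted(vals)
# ['class_freqs', 'classes', 'cls_inds', 'ovo_comb']
# >>> MFEComplexity.precompute_complexity(y=y, **vals)  # nothing left to do
# {}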
def precompute_clustering_class(cls,
                                y: t.Optional[np.ndarray] = None,
                                **kwargs) -> t.Dict[str, t.Any]:
    """Precompute the distinct classes and their frequencies from ``y``.

    Parameters
    ----------
    y : :obj:`np.ndarray`, optional
        Instance cluster index (or target attribute).

    **kwargs
        Additional arguments. May include values precomputed by other
        precomputation methods, which are reused here to speed up this
        precomputation.

    Returns
    -------
    :obj:`dict`
        The following precomputed items are returned:
            * ``classes`` (:obj:`np.ndarray`): distinct classes of ``y``,
              if ``y`` is not :obj:`NoneType`.
            * ``class_freqs`` (:obj:`np.ndarray`): class frequencies of
              ``y``, if ``y`` is not :obj:`NoneType`.
            * ``cls_inds`` (:obj:`np.ndarray`): boolean array which
              indicates whether each example belongs to each class. The
              rows represent the distinct classes, and the columns
              represent the instances.
    """
    precomp_vals = {}  # type: t.Dict[str, t.Any]

    if y is not None and not {"classes", "class_freqs"}.issubset(kwargs):
        classes, class_freqs = np.unique(y, return_counts=True)
        precomp_vals["classes"] = classes
        precomp_vals["class_freqs"] = class_freqs

    classes = kwargs.get("classes", precomp_vals.get("classes"))

    if y is not None and "cls_inds" not in kwargs:
        cls_inds = _utils.calc_cls_inds(y, classes)
        precomp_vals["cls_inds"] = cls_inds

    return precomp_vals
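# Usage sketch for ``precompute_clustering_class`` (hedged: the enclosing
# class name ``MFEClustering`` is an assumption; adjust to the actual
# class). The same kwargs-reuse pattern as above applies:
#
# >>> import numpy as np
# >>> y = np.array([0, 0, 1, 1, 1])
# >>> vals = MFEClustering.precompute_clustering_class(y=y)
# >>> vals["classes"]
# array([0, 1])
# >>> vals["class_freqs"]
# array([2, 3])
# >>> vals["cls_inds"].shape  # (num_classes, num_instances)
# (2, 5)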
def _get_class_representatives(
        cls,
        N: np.ndarray,
        y: np.ndarray,
        representative: t.Union[t.Sequence, np.ndarray, str] = "mean",
        cls_inds: t.Optional[np.ndarray] = None,
        classes: t.Optional[np.ndarray] = None) -> np.ndarray:
    """Get a representative instance for each distinct class.

    If the ``representative`` argument is a string, it must name a
    statistical method to be applied to the attributes of the instances
    of each class in ``N`` in order to construct the class representative
    instances (currently, only ``mean`` and ``median`` are supported).

    If ``representative`` is a sequence, its shape must be
    (number_of_classes, number_of_attributes), i.e., there must be one
    representative for each distinct class, and every class
    representative must have the same dimension as the instances in
    ``N``.
    """
    if classes is None:
        classes = np.unique(y)

    if isinstance(representative, str):
        center_method = {
            "mean": np.mean,
            "median": np.median,
        }.get(representative)

        if center_method is None:
            raise ValueError("'representative' must be 'mean' or "
                             "'median'. Got '{}'.".format(representative))

        if cls_inds is None:
            cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

        # Build one representative per class by aggregating the
        # attributes of all instances of that class.
        representative = [
            center_method(N[cur_class, :], axis=0)
            for cur_class in cls_inds
        ]

    elif not isinstance(representative,
                        (collections.abc.Sequence, np.ndarray)):
        raise TypeError("'representative' type must be string "
                        "or a sequence or a numpy array. "
                        "Got '{}'.".format(type(representative)))

    representative_arr = np.asarray(representative)
    num_repr, repr_dim = representative_arr.shape
    _, num_attr = N.shape

    if num_repr != classes.size:
        raise ValueError("There must be exactly one class representative "
                         "for every distinct class. (Expected '{}', "
                         "got '{}'.)".format(classes.size, num_repr))

    if repr_dim != num_attr:
        raise ValueError("The dimension of each class representative "
                         "must match the instance dimension. (Expected "
                         "'{}', got '{}'.)".format(num_attr, repr_dim))

    return representative_arr
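# Usage sketch for ``_get_class_representatives`` (hedged: the class name
# ``MFEComplexity`` is illustrative). Both a statistic name and an
# explicit (num_classes, num_attributes) array are accepted:
#
# >>> import numpy as np
# >>> N = np.array([[0.0, 0.0], [2.0, 2.0], [4.0, 4.0], [6.0, 6.0]])
# >>> y = np.array([0, 0, 1, 1])
# >>> MFEComplexity._get_class_representatives(N, y, representative="mean")
# array([[1., 1.],
#        [5., 5.]])
# >>> MFEComplexity._get_class_representatives(
# ...     N, y, representative=np.array([[0.0, 0.0], [9.0, 9.0]]))
# array([[0., 0.],
#        [9., 9.]])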
def ft_n4(cls,
          N: np.ndarray,
          y: np.ndarray,
          cls_inds: t.Optional[np.ndarray] = None,
          metric_n4: str = "minkowski",
          p_n4: int = 2,
          n_neighbors_n4: int = 1,
          random_state: t.Optional[int] = None) -> np.ndarray:
    """Compute the non-linearity of the NN Classifier.

    Parameters
    ----------
    N : :obj:`np.ndarray`
        Numerical fitted data.

    y : :obj:`np.ndarray`
        Target attribute.

    cls_inds : :obj:`np.ndarray`, optional
        Boolean array which indicates the examples of each class. The
        rows represent each distinct class, and the columns represent the
        instances.

    metric_n4 : str, optional (default = "minkowski")
        The distance metric used in the internal kNN classifier. See the
        documentation of the ``sklearn.neighbors.DistanceMetric`` class
        for a list of available metrics.

    p_n4 : int, optional (default = 2)
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used. Please check the
        ``sklearn.neighbors.KNeighborsClassifier`` documentation for more
        information.

    n_neighbors_n4 : int, optional (default = 1)
        Number of neighbors used by the internal kNN classifier.

    random_state : int, optional
        If given, set the random seed before computing the randomized
        data interpolation.

    Returns
    -------
    :obj:`np.ndarray`
        Misclassifications of the NN classifier in the interpolated
        dataset.

    References
    ----------
    .. [1] Ana C. Lorena, Luís P. F. Garcia, Jens Lehmann, Marcilio C. P.
       Souto, and Tin K. Ho. How Complex is your classification problem?
       A survey on measuring classification complexity (V2). (2019)
       (Cited on pages 9-11). Published in ACM Computing Surveys (CSUR),
       Volume 52, Issue 5, October 2019, Article No. 107.
    """
    if cls_inds is None:
        classes = np.unique(y)
        cls_inds = _utils.calc_cls_inds(y, classes)

    # 0-1 feature scaling
    N = sklearn.preprocessing.MinMaxScaler(
        feature_range=(0, 1)).fit_transform(N)

    if random_state is not None:
        np.random.seed(random_state)

    N_test = np.zeros(N.shape, dtype=N.dtype)
    y_test = np.zeros(y.shape, dtype=y.dtype)
    ind_cur = 0

    for inds_cur_cls in cls_inds:
        N_cur_cls = N[inds_cur_cls, :]
        subset_size = N_cur_cls.shape[0]

        # Draw two random samples (with replacement) from the current
        # class. An instance is currently allowed to 'interpolate with
        # itself', in which case the result is the instance itself.
        sample_a = N_cur_cls[np.random.choice(subset_size, subset_size), :]
        sample_b = N_cur_cls[np.random.choice(subset_size, subset_size), :]

        # Random interpolation weights in [0, 1).
        rand_delta = np.random.random_sample(N_cur_cls.shape)

        N_subset_interp = sample_a + (sample_b - sample_a) * rand_delta

        ind_next = ind_cur + subset_size
        N_test[ind_cur:ind_next, :] = N_subset_interp
        y_test[ind_cur:ind_next] = y[inds_cur_cls]
        ind_cur = ind_next

    knn = sklearn.neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors_n4,
        p=p_n4,
        metric=metric_n4).fit(N, y)

    y_pred = knn.predict(N_test)

    misclassifications = np.not_equal(y_test, y_pred).astype(int)

    # The measure is defined in the literature as the mean of these
    # values. It is formulated here as a meta-feature instead, so use
    # the post-processing step to obtain the mean and other summary
    # measures.
    return misclassifications
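# Usage sketch for ``ft_n4`` (hedged: the enclosing class name
# ``MFEComplexity`` is illustrative). The return value is one 0/1
# misclassification flag per interpolated test instance; the scalar N4
# value reported in the literature is their mean:
#
# >>> import numpy as np
# >>> from sklearn.datasets import load_iris
# >>> N, y = load_iris(return_X_y=True)
# >>> miscls = MFEComplexity.ft_n4(N=N, y=y, random_state=42)
# >>> miscls.shape == y.shape  # one flag per interpolated instance
# True
# >>> n4 = miscls.mean()  # the scalar N4 measure from the literature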