Esempio n. 1
0
def test_construct_maj_int_min_when_wrong_strategy():
    """``construct_maj_int_min`` must raise ``ValueError`` for the strategy 'WRONG_STRATEGY'."""
    # label -> how many times that label occurs in the synthetic dataset
    occurrences = {0: 5, 1: 6, 3: 7, 5: 10, 8: 2000}

    labels = []
    for label, count in occurrences.items():
        labels.extend([label] * count)
    y = np.array(labels)
    # Shuffle so the labels are not grouped by class.
    np.random.shuffle(y)

    with pytest.raises(ValueError):
        construct_maj_int_min(y, strategy='WRONG_STRATEGY')
Esempio n. 2
0
    def _initialize_algorithm(self, X, y):
        """
        Prepare the instance state derived from the training data.

        Fills in ``self.maj_int_min`` (from ``y``) and ``self.cost`` (via
        ``self._estimate_cost_matrix``) only when they are still ``None``,
        exposes the three class groups as attributes and resets the
        per-feature ``stds`` / ``means`` lists to ones / zeros.

        :param X:
            array-like whose ``shape[1]`` gives the number of features
        :param y:
            labels passed to ``construct_maj_int_min`` and ``_estimate_cost_matrix``
        """
        if self.maj_int_min is None:
            self.maj_int_min = construct_maj_int_min(y)

        class_groups = self.maj_int_min
        self.majority_classes = class_groups['maj']
        self.intermediate_classes = class_groups['int']
        self.minority_classes = class_groups['min']

        # One entry per feature (column of X).
        n_features = X.shape[1]
        unit_stds = [1] * n_features
        zero_means = [0] * n_features
        self.stds = unit_stds
        self.means = zero_means

        if self.cost is None:
            self.cost = self._estimate_cost_matrix(y)
Esempio n. 3
0
def test_construct_maj_int_min_when_correct_and_average_strategy():
    """With ``strategy='average'`` the single huge class is 'maj', the rest 'min', none 'int'."""
    # label -> how many times that label occurs in the synthetic dataset
    occurrences = {0: 5, 1: 6, 3: 7, 5: 10, 8: 2000}

    labels = []
    for label, count in occurrences.items():
        labels.extend([label] * count)
    y = np.array(labels)
    # Shuffle so the labels are not grouped by class.
    np.random.shuffle(y)

    groups = construct_maj_int_min(y, strategy='average')

    intermediate = groups['int']
    assert len(intermediate) == 0

    minority = groups['min']
    assert len(minority) == 4
    for expected_label in [0, 1, 3, 5]:
        assert expected_label in minority

    majority = groups['maj']
    assert len(majority) == 1
    assert majority[0] == 8
Esempio n. 4
0
def test_construct_maj_int_min_when_correct_and_median_strategy():
    """With ``strategy='median'`` the median-sized class is 'int', smaller 'min', larger 'maj'."""
    # label -> how many times that label occurs; class 3 (7 samples) has the median size
    occurrences = {0: 5, 1: 6, 3: 7, 5: 10, 8: 12}

    labels = []
    for label, count in occurrences.items():
        labels.extend([label] * count)
    y = np.array(labels)
    # Shuffle so the labels are not grouped by class.
    np.random.shuffle(y)

    groups = construct_maj_int_min(y, strategy='median')

    intermediate = groups['int']
    assert len(intermediate) == 1
    assert intermediate[0] == 3

    minority = groups['min']
    assert len(minority) == 2
    for expected_label in [0, 1]:
        assert expected_label in minority

    majority = groups['maj']
    assert len(majority) == 2
    for expected_label in [5, 8]:
        assert expected_label in majority
Esempio n. 5
0
    def _fit_resample(self, X, y):
        """
        Oversample the selected classes of ``(X, y)`` and return the enlarged dataset.

        Classes listed in ``self.class_balances['min']`` are oversampled (every
        class is considered when that entry is ``None``). For each such class the
        samples returned by ``self._choose_samples`` are centred, projected with
        PCA, handed to ``self._MDO_oversampling`` and the generated points are
        mapped back to the original feature space and appended to copies of
        ``X`` and ``y``.

        Side effects: sets ``self.class_balances`` when it is ``None``, fits
        ``self.knn`` on ``X`` and stores ``X`` / ``y`` on the instance.

        :param X:
            two dimensional numpy array (number of samples x number of features) with float numbers
        :param y:
            one dimensional numpy array with labels for rows in X
        :return:
            resampled X, resampled y
        """
        if self.class_balances is None:
            self.class_balances = construct_maj_int_min(y)

        self.knn.fit(X)
        self.X, self.y = X, y

        oversampled_X, oversampled_y = self.X.copy(), self.y.copy()
        quantities = Counter(self.y)
        # Every oversampled class is grown towards the size of the largest class.
        goal_quantity = int(max(quantities.values()))
        minority_classes = self.class_balances['min']

        for class_label in set(self.y):
            if minority_classes is not None and class_label not in minority_classes:
                continue

            chosen_minor_class_samples_to_oversample, weights = self._choose_samples(
                class_label)
            if len(chosen_minor_class_samples_to_oversample) == 0:
                continue

            # Number of new samples to create for this class.
            oversampling_rate = int(
                (goal_quantity - quantities[class_label]) * self.prop)
            if oversampling_rate <= 0:
                continue

            if len(chosen_minor_class_samples_to_oversample) == 1:
                # A single sample carries no variance to model: duplicate it.
                oversampled_set = np.repeat(
                    chosen_minor_class_samples_to_oversample,
                    oversampling_rate,
                    axis=0)
            else:
                chosen_samples_features_mean = np.mean(
                    chosen_minor_class_samples_to_oversample, axis=0)
                zero_mean_samples = chosen_minor_class_samples_to_oversample - chosen_samples_features_mean

                n_components = min(zero_mean_samples.shape)
                pca = PCA(n_components=n_components).fit(zero_mean_samples)

                uncorrelated_samples = pca.transform(zero_mean_samples)
                # np.cov squeezes its result, so with a single component it
                # returns a 0-d array that np.diag rejects; atleast_2d keeps the
                # covariance a matrix so one-feature data works too.
                variables_variance = np.diag(np.atleast_2d(
                    np.cov(uncorrelated_samples, rowvar=False)))

                oversampled_set = self._MDO_oversampling(
                    uncorrelated_samples, variables_variance,
                    oversampling_rate, weights)
                # Back from PCA space to the original (un-centred) feature space.
                oversampled_set = pca.inverse_transform(
                    oversampled_set) + chosen_samples_features_mean

            oversampled_X = np.vstack((oversampled_X, oversampled_set))
            oversampled_y = np.hstack(
                (oversampled_y,
                 np.array([class_label] * oversampling_rate)))

        return oversampled_X, oversampled_y