コード例 #1
0
def test_ratio_float_error():
    """Check that the ratios -10 and 10 are rejected with a ValueError."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    error_regex = "When 'ratio' is a float, it should in the range"
    # The negative value is tried first, then the positive one.
    for bad_ratio in (-10, 10):
        with raises(ValueError, match=error_regex):
            check_ratio(bad_ratio, target, 'under-sampling')
コード例 #2
0
def test_ratio_dict_over_sampling():
    """Check dict ratios with over-sampling.

    The first call expects, for each class, the requested count minus the
    count already present in ``y`` (70 - 50, 100 - 100, 70 - 25). The second
    call requests 140 samples for class 2, which already holds 100, and is
    expected to emit a ``UserWarning``.
    """
    y = np.array([1] * 50 + [2] * 100 + [3] * 25)
    ratio = {1: 70, 2: 100, 3: 70}
    ratio_ = check_ratio(ratio, y, 'over-sampling')
    assert ratio_ == {1: 20, 2: 0, 3: 45}
    ratio = {1: 70, 2: 140, 3: 70}
    # Raw strings: "\(" and "\)" are invalid escape sequences in a normal
    # string literal; the regex itself is unchanged.
    expected_msg = (r"After over-sampling, the number of samples \(140\) in"
                    r" class 2 will be larger than the number of samples in"
                    r" the majority class \(class #2 -> 100\)")
    # Pass the pattern by keyword so it is always treated as the regex to
    # match, whichever ``warns`` helper is in scope.
    with warns(UserWarning, match=expected_msg):
        check_ratio(ratio, y, 'over-sampling')
コード例 #3
0
def test_ratio_dict_over_sampling():
    """Check how a dict ratio is resolved for over-sampling.

    With requested counts that are at least the existing class counts, the
    expected result is the per-class difference. Requesting 140 samples for
    class 2 (100 present) is expected to trigger a ``UserWarning``.
    """
    y = np.array([1] * 50 + [2] * 100 + [3] * 25)

    ratio = {1: 70, 2: 100, 3: 70}
    ratio_ = check_ratio(ratio, y, 'over-sampling')
    assert ratio_ == {1: 20, 2: 0, 3: 45}

    ratio = {1: 70, 2: 140, 3: 70}
    # The pattern escapes literal parentheses, so it must be a raw string:
    # "\(" is an invalid escape sequence in an ordinary literal.
    expected_msg = (
        r"After over-sampling, the number of samples \(140\) in class 2"
        r" will be larger than the number of samples in the majority"
        r" class \(class #2 -> 100\)"
    )
    # ``match`` is given by keyword so the string is never mistaken for a
    # positional callable argument.
    with warns(UserWarning, match=expected_msg):
        check_ratio(ratio, y, 'over-sampling')
コード例 #4
0
    def fit(self, X, y):
        """Compute the class statistics and cluster labels prior to sampling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        self : object
            Return self.

        """
        # Validate the inputs, then store their hashes and the resolved ratio.
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        y = check_target_type(y)
        self.X_hash_, self.y_hash_ = hash_X_y(X, y)
        self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)

        # Cluster the input space.
        # NOTE(review): presumably ``self.clusterer`` is a sequence of
        # (name, estimator) pairs -- confirm against the constructor.
        clusterer = self.clusterer[0][1]
        self.clustering_labels_ = clusterer.fit_predict(X, y)

        # The majority class is the first key of ``ratio_`` mapped to 0;
        # every other key is treated as a minority class.
        zero_count_labels = [
            label
            for label, n_samples in self.ratio_.items()
            if n_samples == 0
        ]
        majority_label = zero_count_labels[0]
        minority_labels = [
            label
            for label in self.ratio_.keys()
            if label != majority_label
        ]

        # Clusters imbalance ratios.
        # NOTE(review): ``minority_labels`` and ``weights`` are not used yet.
        weights = pd.DataFrame()

        return self
コード例 #5
0
def test_ratio_callable():
    """Check ``check_ratio`` when the ratio is given as a callable."""
    target = np.repeat([1, 2, 3], [50, 100, 25])

    def ratio_func(y):
        # Ask, for every class, for as many samples as the largest class has.
        counts = Counter(y)
        largest = max(counts.values())
        return dict.fromkeys(counts, int(largest))

    resolved = check_ratio(ratio_func, target, 'over-sampling')
    assert resolved == {1: 50, 2: 0, 3: 75}
コード例 #6
0
def test_ratio_dict_over_sampling():
    """Check dict ratios with over-sampling, including the warning case.

    The second dict requests 140 samples for class 2 and is expected to make
    ``check_ratio`` emit a ``UserWarning`` with the message below.
    """
    target = np.repeat([1, 2, 3], [50, 100, 25])

    requested = {1: 70, 2: 100, 3: 70}
    resolved = check_ratio(requested, target, 'over-sampling')
    assert_equal(resolved, {1: 20, 2: 0, 3: 45})

    requested = {1: 70, 2: 140, 3: 70}
    warning_msg = ("After over-sampling, the number of"
                   " samples (140) in class 2 will be larger than the"
                   " number of samples in the majority class (class #2"
                   " -> 100)")
    assert_warns_message(UserWarning, warning_msg,
                         check_ratio, requested, target, 'over-sampling')
コード例 #7
0
def test_ratio_callable():
    """A callable ratio is expected to be resolved like the dict it returns."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)

    def ratio_func(y):
        # Every class is asked to reach the size of the largest class.
        class_counts = Counter(y)
        wanted = int(max(class_counts.values()))
        balanced = {}
        for key in class_counts.keys():
            balanced[key] = wanted
        return balanced

    outcome = check_ratio(ratio_func, labels, 'over-sampling')
    assert outcome == {1: 50, 2: 0, 3: 75}
コード例 #8
0
def test_ratio_callable_args():
    """Check a callable ratio that needs an extra keyword argument."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    multiplier = {1: 1.5, 2: 1, 3: 3}

    def ratio_func(y, multiplier):
        """Scale the count of each class by its entry in ``multiplier``."""
        scaled = {}
        for key, values in Counter(y).items():
            scaled[key] = int(values * multiplier[key])
        return scaled

    resolved = check_ratio(
        ratio_func, target, 'over-sampling', multiplier=multiplier
    )
    assert resolved == {1: 25, 2: 0, 3: 50}
コード例 #9
0
def test_ratio_callable_args():
    """``multiplier`` is passed to ``check_ratio`` for use by the callable."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    multiplier = {1: 1.5, 2: 1, 3: 3}

    def ratio_func(y, multiplier):
        """Return per-class counts scaled by the matching multiplier."""
        class_counts = Counter(y)
        return {key: int(values * multiplier[key])
                for key, values in class_counts.items()}

    outcome = check_ratio(ratio_func, labels, 'over-sampling',
                          multiplier=multiplier)
    assert outcome == {1: 25, 2: 0, 3: 50}
コード例 #10
0
def test_check_ratio_error():
    """Check the ValueError raised for three kinds of invalid arguments.

    The cases are, in order: an unknown sampling type, a target holding a
    single value, and an unknown ratio string.
    """
    cases = [
        ("'sampling_type' should be one of",
         'auto', np.array([1, 2, 3]), 'rnd'),
        ("The target 'y' needs to have more than 1 class.",
         'auto', np.ones((10, )), 'over-sampling'),
        ("When 'ratio' is a string, it needs to be one of",
         'rnd', np.array([1, 2, 3]), 'over-sampling'),
    ]
    for error_regex, ratio, target, sampling_type in cases:
        with raises(ValueError, match=error_regex):
            check_ratio(ratio, target, sampling_type)
コード例 #11
0
def test_check_ratio_error():
    """Invalid arguments to ``check_ratio`` are expected to raise ValueError."""
    # Unknown sampling type.
    bad_sampling_type = "'sampling_type' should be one of"
    with raises(ValueError, match=bad_sampling_type):
        check_ratio('auto', np.array([1, 2, 3]), 'rnd')

    # Target made of a single repeated value.
    single_class = "The target 'y' needs to have more than 1 class."
    with raises(ValueError, match=single_class):
        check_ratio('auto', np.ones((10, )), 'over-sampling')

    # Unknown ratio string.
    bad_ratio_string = "When 'ratio' is a string, it needs to be one of"
    with raises(ValueError, match=bad_ratio_string):
        check_ratio('rnd', np.array([1, 2, 3]), 'over-sampling')
コード例 #12
0
def test_ratio_dict_error():
    """Check the ValueError raised for four invalid dict ratios.

    The cases are, in order: a negative count, a class absent from the
    target, an over-sampling request below the existing count, and an
    under-sampling request above the existing count.
    """
    target = np.repeat([1, 2, 3], [50, 100, 25])

    over_regex = ("With over-sampling methods, the number of samples in a"
                  " class should be greater or equal to the original number"
                  " of samples. Originally, there is 50 samples and 45"
                  " samples are asked.")
    under_regex = ("With under-sampling methods, the number of samples in a"
                   " class should be less or equal to the original number of"
                   " samples. Originally, there is 25 samples and 70 samples"
                   " are asked.")
    cases = [
        ({1: -100, 2: 50, 3: 25}, 'under-sampling',
         "in a class cannot be negative."),
        ({10: 10}, 'over-sampling', "are not present in the data."),
        ({1: 45, 2: 100, 3: 70}, 'over-sampling', over_regex),
        ({1: 45, 2: 100, 3: 70}, 'under-sampling', under_regex),
    ]
    for requested, sampling_type, error_regex in cases:
        with raises(ValueError, match=error_regex):
            check_ratio(requested, target, sampling_type)
コード例 #13
0
def test_ratio_dict_error():
    """Invalid dict ratios are expected to raise ValueError."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)

    # A negative number of samples.
    negative_count = {1: -100, 2: 50, 3: 25}
    with raises(ValueError, match="in a class cannot be negative."):
        check_ratio(negative_count, labels, 'under-sampling')

    # A class that does not appear in the target.
    unknown_class = {10: 10}
    with raises(ValueError, match="are not present in the data."):
        check_ratio(unknown_class, labels, 'over-sampling')

    # The same dict is invalid for both sampling types: it asks for 45
    # samples where 50 exist and for 70 where 25 exist.
    inconsistent = {1: 45, 2: 100, 3: 70}

    over_regex = ("With over-sampling methods, the number of samples in a"
                  " class should be greater or equal to the original number"
                  " of samples. Originally, there is 50 samples and 45"
                  " samples are asked.")
    with raises(ValueError, match=over_regex):
        check_ratio(inconsistent, labels, 'over-sampling')

    under_regex = ("With under-sampling methods, the number of samples in a"
                   " class should be less or equal to the original number of"
                   " samples. Originally, there is 25 samples and 70 samples"
                   " are asked.")
    with raises(ValueError, match=under_regex):
        check_ratio(inconsistent, labels, 'under-sampling')
コード例 #14
0
def test_ratio_dict_under_sampling():
    """With under-sampling, a dict ratio is expected back unchanged."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    requested = {1: 30, 2: 45, 3: 25}
    resolved = check_ratio(requested, target, 'under-sampling')
    assert resolved == requested
コード例 #15
0
def test_ratio_majority_under_sampling():
    """'majority' under-sampling is expected to target class 2 only."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio('majority', target, 'under-sampling')
    # Class 2 is the largest class (100); 25 is the smallest class size.
    assert resolved == {2: 25}
コード例 #16
0
def test_ratio_minority_over_sampling():
    """'minority' over-sampling is expected to target class 3 only."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio('minority', target, 'over-sampling')
    # Class 3 is the smallest class (25); 75 is its gap to the largest (100).
    assert resolved == {3: 75}
コード例 #17
0
def test_ratio_not_minority_under_sampling():
    """'not minority' and 'auto' are expected to agree for under-sampling."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    expected = {1: 25, 2: 25}
    for keyword in ('not minority', 'auto'):
        assert check_ratio(keyword, target, 'under-sampling') == expected
コード例 #18
0
def test_ratio_not_minority_over_sampling():
    """'not minority' over-sampling is expected to cover classes 1 and 2."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio('not minority', target, 'over-sampling')
    # Class 3, the smallest class, is absent from the expected result.
    assert resolved == {1: 50, 2: 0}
コード例 #19
0
def test_ratio_all_over_sampling():
    """'all' and 'auto' are expected to agree for over-sampling."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    expected = {1: 50, 2: 0, 3: 75}
    for keyword in ('all', 'auto'):
        assert_equal(check_ratio(keyword, target, 'over-sampling'), expected)
コード例 #20
0
def test_ratio_majority_under_sampling():
    """Check the 'majority' ratio string with under-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    expected = {2: 25}
    assert check_ratio('majority', labels, 'under-sampling') == expected
コード例 #21
0
def test_ratio_all_under_sampling():
    """'all' under-sampling is expected to set every class to 25 samples."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio('all', target, 'under-sampling')
    # 25 is the size of the smallest class in ``target``.
    assert resolved == {1: 25, 2: 25, 3: 25}
コード例 #22
0
def test_ratio_not_minority_over_sampling():
    """Check the 'not minority' ratio string with over-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    expected = {1: 50, 2: 0}
    assert check_ratio('not minority', labels, 'over-sampling') == expected
コード例 #23
0
def test_ratio_not_minority_under_sampling():
    """Check 'not minority' and 'auto' ratio strings with under-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)

    from_not_minority = check_ratio('not minority', labels, 'under-sampling')
    assert from_not_minority == {1: 25, 2: 25}

    # 'auto' is expected to give the same result here.
    from_auto = check_ratio('auto', labels, 'under-sampling')
    assert from_auto == {1: 25, 2: 25}
コード例 #24
0
def test_ratio_minority_over_sampling():
    """Check the 'minority' ratio string with over-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    expected = {3: 75}
    assert check_ratio('minority', labels, 'over-sampling') == expected
コード例 #25
0
def test_ratio_dict_under_sampling():
    """Check that a valid under-sampling dict compares equal to the result."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    wanted = {1: 30, 2: 45, 3: 25}
    assert check_ratio(wanted, labels, 'under-sampling') == wanted
コード例 #26
0
def test_ratio_float_over_sampling():
    """Check a float ratio of 0.5 with over-sampling."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio(0.5, target, 'over-sampling')
    # Class 2, the largest class, is absent from the expected result.
    assert_equal(resolved, {1: 0, 3: 25})
コード例 #27
0
def test_ratio_all_over_sampling():
    """'all' and 'auto' are expected to give the same over-sampling ratio."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    expected = {1: 50, 2: 0, 3: 75}
    assert check_ratio('all', target, 'over-sampling') == expected
    assert check_ratio('auto', target, 'over-sampling') == expected
コード例 #28
0
def test_check_ratio(ratio, sampling_type, expected_ratio, target):
    """Check the deprecation warning and the value returned by check_ratio.

    NOTE(review): the arguments presumably come from a parametrization or
    fixtures defined outside this block.
    """
    deprecation = pytest.warns(
        DeprecationWarning, match="check_ratio is deprecated"
    )
    with deprecation:
        resolved = check_ratio(ratio, target, sampling_type)
        assert resolved == expected_ratio
コード例 #29
0
def test_ratio_all_over_sampling():
    """Check the 'all' and 'auto' ratio strings with over-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    keywords = ('all', 'auto')
    for keyword in keywords:
        outcome = check_ratio(keyword, labels, 'over-sampling')
        assert outcome == {1: 50, 2: 0, 3: 75}
コード例 #30
0
def test_ratio_majority_over_sampling():
    """'majority' with over-sampling is expected to raise ValueError."""
    target = np.array([1, 2, 3])
    with raises(
        ValueError,
        match="'ratio'='majority' cannot be used with over-sampler.",
    ):
        check_ratio('majority', target, 'over-sampling')
コード例 #31
0
def test_ratio_majority_over_sampling():
    """Check the error raised for the 'majority' ratio with over-sampling."""
    pattern = "'ratio'='majority' cannot be used with over-sampler."
    expect_value_error = raises(ValueError, match=pattern)
    with expect_value_error:
        check_ratio('majority', np.array([1, 2, 3]), 'over-sampling')
コード例 #32
0
def test_check_ratio(ratio, sampling_type, expected_ratio, target):
    """check_ratio is expected to warn about deprecation and still work.

    NOTE(review): where the four arguments are supplied from is not visible
    in this block.
    """
    warning_regex = "check_ratio is deprecated"
    with pytest.warns(DeprecationWarning, match=warning_regex):
        computed = check_ratio(ratio, target, sampling_type)
        assert computed == expected_ratio
コード例 #33
0
def test_ratio_all_under_sampling():
    """Check the 'all' ratio string with under-sampling."""
    labels = np.array([1] * 50 + [2] * 100 + [3] * 25)
    expected = {1: 25, 2: 25, 3: 25}
    assert check_ratio('all', labels, 'under-sampling') == expected
コード例 #34
0
def test_ratio_float_under_sampling():
    """Check a float ratio of 0.5 with under-sampling."""
    target = np.repeat([1, 2, 3], [50, 100, 25])
    resolved = check_ratio(0.5, target, 'under-sampling')
    # Class 3, the smallest class, is absent from the expected result.
    assert_equal(resolved, {1: 50, 2: 50})