def test_iht_fit_sample_half():
    """Test the fit sample routine with a 0.7 ratio"""
    # NOTE(review): the test name says "half" but the ratio used below is
    # 0.7 - confirm which one is intended.

    # Resample the data
    ratio = 0.7
    iht = InstanceHardnessThreshold(ESTIMATOR, ratio=ratio,
                                    random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    # Expected samples and labels after resampling
    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.30126957, -0.66268378],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    """Check the samples, labels and indices returned by ``fit_sample``.

    The sampler is created with ``return_indices=True`` so ``fit_sample``
    hands back a third value next to the resampled arrays.
    """
    sampler = InstanceHardnessThreshold(
        ESTIMATOR, return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
        [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
        [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
        [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    expected_idx = np.array([0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14])

    # Compare in the same order as before: samples, labels, indices.
    comparisons = (
        (X_out, expected_X),
        (y_out, expected_y),
        (kept_idx, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
 def under_samplin(self):
     """Under-sample ``self.X`` / ``self.y`` with InstanceHardnessThreshold.

     The resampled arrays are stored on ``self.X_resampled`` and
     ``self.y_resampled``; nothing is returned.
     """
     from imblearn.under_sampling import InstanceHardnessThreshold
     from sklearn.linear_model import LogisticRegression

     classifier = LogisticRegression(solver='lbfgs', multi_class='auto')
     sampler = InstanceHardnessThreshold(random_state=0,
                                         estimator=classifier)
     resampled = sampler.fit_resample(self.X, self.y)
     self.X_resampled, self.y_resampled = resampled
Beispiel #4
0
def test_iht_fit_resample_wrong_class_obj():
    """A ``KMeans`` estimator must make ``fit_resample`` raise ValueError."""
    from sklearn.cluster import KMeans

    sampler = InstanceHardnessThreshold(estimator=KMeans(),
                                        random_state=RND_SEED)
    expected_message = "Invalid parameter `estimator`"
    with pytest.raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)
def test_iht_wrong_estimator():
    """``fit_sample`` must raise NotImplementedError for estimator 'rnd'."""
    sampler_kwargs = dict(estimator='rnd', ratio=0.7, random_state=RND_SEED)
    sampler = InstanceHardnessThreshold(**sampler_kwargs)
    with raises(NotImplementedError):
        sampler.fit_sample(X, Y)
Beispiel #6
0
def test_iht_fit_resample_half():
    """Check the output shapes when three samples per class are requested."""
    per_class_counts = {0: 3, 1: 3}
    sampler = InstanceHardnessThreshold(
        NB(), sampling_strategy=per_class_counts, random_state=RND_SEED)
    X_out, y_out = sampler.fit_resample(X, Y)

    # 3 + 3 requested samples -> 6 rows in total.
    n_expected = sum(per_class_counts.values())
    assert X_out.shape == (n_expected, 2)
    assert y_out.shape == (n_expected, )
def test_iht_fit_resample_half():
    """Check the output shapes for a per-class ``sampling_strategy`` dict."""
    per_class_counts = {0: 6, 1: 8}
    sampler = InstanceHardnessThreshold(
        ESTIMATOR, sampling_strategy=per_class_counts, random_state=RND_SEED)
    X_out, y_out = sampler.fit_resample(X, Y)

    # 6 + 8 requested samples -> 14 rows in total.
    n_expected = sum(per_class_counts.values())
    assert X_out.shape == (n_expected, 2)
    assert y_out.shape == (n_expected, )
def test_iht_sample_wrong_X():
    """``sample`` must raise RuntimeError when the data passed to it differs
    from the data used for fitting."""
    sampler = InstanceHardnessThreshold(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data unrelated to what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_iht_sample_wrong_X():
    """Fitting on one dataset and sampling another must raise RuntimeError."""
    fitted = InstanceHardnessThreshold(random_state=RND_SEED)
    fitted.fit(X, Y)

    mismatched_X = np.random.random((100, 40))
    mismatched_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, mismatched_X, mismatched_y)
Beispiel #10
0
def sample_func():
    """Generate a two-class dataset, under-sample it with
    InstanceHardnessThreshold and print the class counts before and after."""
    dataset_options = dict(
        n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
        n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
        n_samples=1000, random_state=10)
    X, y = make_classification(**dataset_options)
    print('Original dataset shape %s' % Counter(y))

    # Apply convert_neg_class to every label.
    y = np.vectorize(convert_neg_class)(y)

    sampler = InstanceHardnessThreshold(random_state=42)
    _, y_res = sampler.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_res))
Beispiel #11
0
def test_iht_reproducibility():
    """The selected indices must be identical for every estimator seed."""
    from sklearn.datasets import load_digits

    X_digits, y_digits = load_digits(return_X_y=True)

    def _sampled_indices(seed):
        # Only the estimator's seed varies; the sampler's seed is fixed.
        forest = RandomForestClassifier(n_estimators=10, random_state=seed)
        sampler = InstanceHardnessThreshold(estimator=forest,
                                            random_state=RND_SEED)
        sampler.fit_resample(X_digits, y_digits)
        return sampler.sample_indices_.copy()

    runs = [_sampled_indices(seed) for seed in range(5)]

    # Compare each run with the one that follows it.
    for previous, current in zip(runs[:-1], runs[1:]):
        assert_array_equal(previous, current)
def test_iht_fit_sample():
    """Compare the output of ``fit_sample`` with the stored reference arrays."""
    sampler = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    # Reference arrays are read from the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'iht_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'iht_y.npy'))

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample():
    """``fit_sample`` must reproduce the arrays saved as ``iht_x.npy`` and
    ``iht_y.npy``."""
    sampler = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    resampled = sampler.fit_sample(X, Y)
    X_out, y_out = resampled

    here = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        # Load one reference array from the ``data`` directory.
        return np.load(os.path.join(here, 'data', filename))

    expected_X = _reference('iht_x.npy')
    expected_y = _reference('iht_y.npy')
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
Beispiel #14
0
def instance_hardness_thresold(X,
                               y,
                               visualize=False,
                               pca2d=True,
                               pca3d=True,
                               tsne=True,
                               pie_evr=True):
    """Under-sample ``(X, y)`` with InstanceHardnessThreshold.

    Parameters
    ----------
    X, y
        Data and labels forwarded to ``fit_resample``.
    visualize : bool, default False
        When truthy, plot the resampled data with
        ``hist_over_and_undersampling`` and ``pca_general``.
    pca2d, pca3d, pie_evr : bool, default True
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    sampler = InstanceHardnessThreshold(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    # Plain truthiness test instead of comparing against the literal True.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_iht_fit_sample():
    """Check the samples and labels returned by ``fit_sample``."""
    sampler = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.18430329, 0.52328473],
        [-0.28305528, 0.30284991],
    ])
    # Six samples labelled 0 followed by six labelled 1.
    expected_y = np.array([0] * 6 + [1] * 6)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit_sample_linear_svm():
    """Compare ``fit_sample`` with the 'linear-svm' estimator against the
    stored reference arrays."""
    sampler = InstanceHardnessThreshold('linear-svm', random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'iht_x_svm.npy'))
    expected_y = np.load(os.path.join(data_dir, 'iht_y_svm.npy'))

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample_gradient_boosting():
    """Compare ``fit_sample`` with the 'gradient-boosting' estimator against
    the stored reference arrays."""
    sampler = InstanceHardnessThreshold('gradient-boosting',
                                        random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'iht_x_gb.npy'))
    expected_y = np.load(os.path.join(data_dir, 'iht_y_gb.npy'))

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit():
    """Check the class information stored on the sampler by ``fit``."""
    sampler = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    sampler.fit(X, Y)

    # Values of ``min_c_`` and ``maj_c_`` after fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Entries of ``stats_c_`` for keys 0 and 1.
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_multiclass_error():
    """``fit`` must emit a UserWarning when the target is not binary."""
    non_binary_targets = (
        # continuous case
        np.linspace(0, 1, 5000),
        # multiclass case
        np.array([0] * 2000 + [1] * 2000 + [2] * 1000),
    )
    for target in non_binary_targets:
        sampler = InstanceHardnessThreshold(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, target)
def test_iht_fit_sample_decision_tree():
    """Compare ``fit_sample`` with the 'decision-tree' estimator against the
    stored reference arrays."""
    sampler = InstanceHardnessThreshold('decision-tree',
                                        random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'iht_x_dt.npy'))
    expected_y = np.load(os.path.join(data_dir, 'iht_y_dt.npy'))

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample_gradient_boosting():
    """``fit_sample`` with 'gradient-boosting' must reproduce ``iht_x_gb.npy``
    and ``iht_y_gb.npy``."""
    estimator_name = 'gradient-boosting'
    sampler = InstanceHardnessThreshold(estimator_name, random_state=RND_SEED)
    resampled = sampler.fit_sample(X, Y)
    X_out, y_out = resampled

    here = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        # Load one reference array from the ``data`` directory.
        return np.load(os.path.join(here, 'data', filename))

    expected_X = _reference('iht_x_gb.npy')
    expected_y = _reference('iht_y_gb.npy')
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit_sample_linear_svm():
    """``fit_sample`` with 'linear-svm' must reproduce ``iht_x_svm.npy`` and
    ``iht_y_svm.npy``."""
    estimator_name = 'linear-svm'
    sampler = InstanceHardnessThreshold(estimator_name, random_state=RND_SEED)
    resampled = sampler.fit_sample(X, Y)
    X_out, y_out = resampled

    here = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        # Load one reference array from the ``data`` directory.
        return np.load(os.path.join(here, 'data', filename))

    expected_X = _reference('iht_x_svm.npy')
    expected_y = _reference('iht_y_svm.npy')
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit():
    """``fit`` must populate ``min_c_``, ``maj_c_`` and ``stats_c_``."""
    fitted = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    fitted.fit(X, Y)

    # Checked in order: min_c_, maj_c_, then stats_c_ for keys 0 and 1.
    assert_equal(fitted.min_c_, 0)
    assert_equal(fitted.maj_c_, 1)
    class_counts = fitted.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
def test_iht_fit_sample_decision_tree():
    """``fit_sample`` with 'decision-tree' must reproduce ``iht_x_dt.npy`` and
    ``iht_y_dt.npy``."""
    estimator_name = 'decision-tree'
    sampler = InstanceHardnessThreshold(estimator_name, random_state=RND_SEED)
    resampled = sampler.fit_sample(X, Y)
    X_out, y_out = resampled

    here = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        # Load one reference array from the ``data`` directory.
        return np.load(os.path.join(here, 'data', filename))

    expected_X = _reference('iht_x_dt.npy')
    expected_y = _reference('iht_y_dt.npy')
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit_resample():
    """Check the samples and labels returned by ``fit_resample``."""
    sampler = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_out, y_out = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.18430329, 0.52328473],
        [-0.28305528, 0.30284991],
    ])
    # Six samples labelled 0 followed by six labelled 1.
    expected_y = np.array([0] * 6 + [1] * 6)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit_sample_knn():
    """Check the output of ``fit_sample`` with the 'knn' estimator."""
    sampler = InstanceHardnessThreshold('knn', random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [0.91542919, -0.65453327],
        [-0.65571327, 0.42412021],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [0.20246714, -0.34727125],
    ])
    # Six samples labelled 0 followed by six labelled 1.
    expected_y = np.array([0] * 6 + [1] * 6)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_iht_fit_sample_linear_svm():
    """Check the output of ``fit_sample`` with the 'linear-svm' estimator."""
    sampler = InstanceHardnessThreshold('linear-svm', random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [0.91542919, -0.65453327],
        [-0.03852113, 0.40910479],
        [-0.43877303, 1.07366684],
        [-0.18430329, 0.52328473],
        [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample_with_indices():
    """Compare samples, labels and indices from ``fit_sample`` with the
    stored reference arrays."""
    sampler = InstanceHardnessThreshold(
        ESTIMATOR, return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'iht_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'iht_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'iht_idx.npy'))

    comparisons = (
        (X_out, expected_X),
        (y_out, expected_y),
        (kept_idx, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
Beispiel #29
0
def iht(X, Y):
    """Flag the samples kept by InstanceHardnessThreshold under-sampling.

    ``Y`` is converted to an integer array, the sampler (backed by a
    logistic-regression estimator) is fitted on ``(X, Y)``, and the
    positions listed in its ``sample_indices_`` are marked.

    Parameters
    ----------
    X
        Samples forwarded to ``fit_resample``.
    Y
        Labels; converted with ``np.array(Y, dtype=int)``.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer array of length
        ``len(Y)`` holding 1 for every kept sample and 0 otherwise.
    """
    from sklearn.linear_model import LogisticRegression
    from imblearn.under_sampling import InstanceHardnessThreshold

    # Named ``sampler`` so the local no longer shadows this function's name.
    sampler = InstanceHardnessThreshold(random_state=0,
                                        estimator=LogisticRegression(
                                            solver='lbfgs',
                                            multi_class='auto'))
    Y = np.array(Y, dtype=int)
    sampler.fit_resample(X, Y)

    # Vectorised membership test; replaces a Python loop that scanned the
    # whole index array once per sample.
    positions = np.arange(len(Y))
    mask = np.isin(positions, sampler.sample_indices_).astype(int)
    return True, mask
def test_iht_fit_sample_with_indices():
    """``fit_sample`` with ``return_indices=True`` must reproduce
    ``iht_x.npy``, ``iht_y.npy`` and ``iht_idx.npy``."""
    sampler = InstanceHardnessThreshold(
        ESTIMATOR, return_indices=True, random_state=RND_SEED)
    resampled = sampler.fit_sample(X, Y)
    X_out, y_out, kept_idx = resampled

    here = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        # Load one reference array from the ``data`` directory.
        return np.load(os.path.join(here, 'data', filename))

    expected_X = _reference('iht_x.npy')
    expected_y = _reference('iht_y.npy')
    expected_idx = _reference('iht_idx.npy')
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(kept_idx, expected_idx)
Beispiel #31
0
    def get_sampler(self):
        """Build the resampler selected by ``self.sampler``.

        ``self.random_seed`` and ``self.njobs`` are passed to the samplers
        that take them. Returns ``None`` when ``self.sampler`` matches none
        of the known names.
        """
        choice = self.sampler

        if choice == 'random-over-sampler':
            return RandomOverSampler(random_state=self.random_seed)
        if choice == 'adasyn':
            return ADASYN(random_state=self.random_seed, n_jobs=self.njobs)
        if choice == 'smote':
            return SMOTE(random_state=self.random_seed, n_jobs=self.njobs)
        if choice == 'svm-smote':
            return SVMSMOTE(random_state=self.random_seed,
                            n_jobs=self.njobs)
        if choice == 'random-under-sampler':
            return RandomUnderSampler(random_state=self.random_seed)
        if choice == 'tomek-links':
            return TomekLinks(n_jobs=self.njobs)
        if choice == 'near-miss':
            return NearMiss(n_jobs=self.njobs)
        if choice == 'instance-hardness':
            return InstanceHardnessThreshold(random_state=self.random_seed,
                                             n_jobs=self.njobs)

        # Unknown name: no sampler.
        return None
Beispiel #32
0
def iht(X, Y):
    """Flag the samples kept by InstanceHardnessThreshold under-sampling.

    ``Y`` is converted to an integer array, the sampler (backed by a
    logistic-regression estimator) is fitted on ``(X, Y)``, and the
    positions listed in its ``sample_indices_`` are marked.

    Parameters
    ----------
    X
        Samples forwarded to ``fit_resample``; ``len(X)`` sets the mask
        length.
    Y
        Labels; converted with ``np.array(Y, dtype=int)``.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer array of length
        ``len(X)`` holding 1 for every kept sample and 0 otherwise.
    """
    from sklearn.linear_model import LogisticRegression
    from imblearn.under_sampling import InstanceHardnessThreshold

    # Named ``sampler`` so the local no longer shadows this function's name.
    sampler = InstanceHardnessThreshold(random_state=0,
                                        estimator=LogisticRegression(
                                            solver='lbfgs',
                                            multi_class='auto'))
    Y = np.array(Y, dtype=int)
    sampler.fit_resample(X, Y)

    # Vectorised membership test; replaces a Python loop that scanned the
    # whole index array once per sample while appending to a list.
    positions = np.arange(len(X))
    mask = np.isin(positions, sampler.sample_indices_).astype(int)
    return True, mask
Beispiel #33
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler chosen by name.

    Prints the list of available sampler names and the class counts before
    and after resampling.

    Parameters
    ----------
    X, y
        Data and labels forwarded to the sampler's ``fit_resample``.
    sampler : str, default "RandomUnderSampler"
        Name of the sampler class to use; must be one of the keys of the
        table below.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.

    Raises
    ------
    KeyError
        If ``sampler`` is not a known sampler name.
    """
    # Name -> sampler class. Classes are stored rather than instances so
    # that only the requested sampler is constructed. There are currently no
    # per-sampler parameters; each one is built with its defaults.
    sampler_classes = {
        "RandomUnderSampler": RandomUnderSampler,
        "ClusterCentroids": ClusterCentroids,
        "NearMiss": NearMiss,
        "InstanceHardnessThreshold": InstanceHardnessThreshold,
        "CondensedNearestNeighbour": CondensedNearestNeighbour,
        "EditedNearestNeighbours": EditedNearestNeighbours,
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours,
        "AllKNN": AllKNN,
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule,
        "OneSidedSelection": OneSidedSelection,
    }

    # list of all samplers, in case you want to iterate all of them
    print(list(sampler_classes))

    resampler = sampler_classes[sampler]()

    # show y class count before and after resample
    print("before", sorted(Counter(y).items()))

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = resampler.fit_resample(X, y)

    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
def test_iht_init():
    """Constructor arguments must be stored unchanged on the sampler."""
    init_kwargs = {'sampling_strategy': 'auto', 'random_state': RND_SEED}
    sampler = InstanceHardnessThreshold(ESTIMATOR, **init_kwargs)

    # Checked in order: sampling_strategy, then random_state.
    for attribute, expected in init_kwargs.items():
        assert getattr(sampler, attribute) == expected
def test_iht_sample_wt_fit():
    """Calling ``sample`` on a sampler that was never fitted must raise
    RuntimeError."""
    unfitted = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    sample_args = (X, Y)
    assert_raises(RuntimeError, unfitted.sample, *sample_args)
def test_iht_wrong_estimator():
    """``fit_sample`` must raise NotImplementedError for estimator 'rnd'."""
    sampler_kwargs = dict(estimator='rnd', ratio=0.7, random_state=RND_SEED)
    sampler = InstanceHardnessThreshold(**sampler_kwargs)
    assert_raises(NotImplementedError, sampler.fit_sample, X, Y)
class ResamplingAlgorithms(Enum):
    """Enumeration of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    RO = ('Random Over-sampling', RandomOverSampler(random_state=1))
    SMOTE = ('Smote', SMOTE(random_state=1))
    ADASYN = ('ADASYN', ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ('SMOTEBoost', smote_boost.SMOTEBoost())
    RU = ('Random Under-sampling', RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ('ClusterCentroids', ClusterCentroids(random_state=1))
    TOMEK_LINKS = ('TomekLinks', TomekLinks())
    NM1 = ('NM1', NearMiss(version=1))
    NM2 = ('NM2', NearMiss(version=2))
    NM3 = ('NM3', NearMiss(version=3))
    CNN = ('CNN', CondensedNearestNeighbour(random_state=1))
    OSS = ('OneSidedSelection', OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals *name*.

        Falls back to ``ResamplingAlgorithms.RO`` when no member matches.
        """
        for algorithm in ResamplingAlgorithms:
            display_name = algorithm.value[0]
            if display_name == name:
                return algorithm
        return ResamplingAlgorithms.RO
def test_iht_fit_sample_wrong_class_obj():
    """A ``KMeans`` estimator must make ``fit_sample`` raise ValueError."""
    from sklearn.cluster import KMeans

    sampler = InstanceHardnessThreshold(estimator=KMeans(),
                                        random_state=RND_SEED)
    expected_message = "Invalid parameter `estimator`"
    assert_raises_regex(ValueError, expected_message,
                        sampler.fit_sample, X, Y)
def test_iht_fit_sample_class_obj():
    """Check ``fit_sample`` when the estimator is passed as a
    ``GradientBoostingClassifier`` instance."""
    classifier = GradientBoostingClassifier(random_state=RND_SEED)
    sampler = InstanceHardnessThreshold(estimator=classifier,
                                        random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [0.91542919, -0.65453327],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.18430329, 0.52328473],
        [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
Beispiel #40
0
def test_iht_fit_sample_gradient_boosting():
    """Check the output of ``fit_sample`` with the 'gradient-boosting'
    estimator."""
    sampler = InstanceHardnessThreshold('gradient-boosting',
                                        random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [0.91542919, -0.65453327],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.18430329, 0.52328473],
        [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample_linear_svm():
    """Check the output of ``fit_sample`` with the 'linear-svm' estimator."""
    sampler = InstanceHardnessThreshold('linear-svm', random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [0.91542919, -0.65453327],
        [-0.03852113, 0.40910479],
        [-0.43877303, 1.07366684],
        [-0.18430329, 0.52328473],
        [-0.65571327, 0.42412021],
        [-0.28305528, 0.30284991],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_fit_sample_knn():
    """Check the output of ``fit_sample`` with the 'knn' estimator."""
    sampler = InstanceHardnessThreshold('knn', random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.3879569, 0.6894251],
        [-0.09322739, 1.28177189],
        [-0.77740357, 0.74097941],
        [0.91542919, -0.65453327],
        [-0.43877303, 1.07366684],
        [-0.85795321, 0.82980738],
        [-0.30126957, -0.66268378],
        [-0.65571327, 0.42412021],
        [0.20246714, -0.34727125],
        [1.06446472, -1.09279772],
        [0.30543283, -0.02589502],
        [-0.00717161, 0.00318087],
    ])
    expected_y = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

    for actual, expected in ((X_out, expected_X), (y_out, expected_y)):
        assert_array_equal(actual, expected)
def test_iht_init():
    """``ratio`` and ``random_state`` must be stored unchanged on the
    sampler."""
    init_kwargs = {'ratio': 'auto', 'random_state': RND_SEED}
    sampler = InstanceHardnessThreshold(ESTIMATOR, **init_kwargs)

    # Checked in order: ratio, then random_state.
    for attribute, expected in init_kwargs.items():
        assert getattr(sampler, attribute) == expected
def test_iht_fit_resample_wrong_class_obj():
    """A ``KMeans`` estimator must make ``fit_resample`` raise ValueError."""
    from sklearn.cluster import KMeans

    sampler = InstanceHardnessThreshold(estimator=KMeans(),
                                        random_state=RND_SEED)
    expected_message = "Invalid parameter `estimator`"
    with raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)
# Fit a 2-component PCA on X; the projection is used only for plotting
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# A 2x2 grid of subplots
f, axs = plt.subplots(2, 2)

# Flatten the grid of axes into a single list
axs = [a for ax in axs for a in ax]
# One panel per ratio; 0.0 is used as a marker for "plot the original data"
for ax, ratio in zip(axs, [0.0, 0.1, 0.3, 0.5]):
    if ratio == 0.0:
        # Original data, one scatter call per class
        ax.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                    alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
                    linewidth=0.15)
        ax.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                   alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
                   linewidth=0.15)
        ax.set_title('Original set')
    else:
        # Resample with the current ratio and project the result with the
        # PCA fitted on the original data
        iht = InstanceHardnessThreshold(ratio=ratio)
        X_res, y_res = iht.fit_sample(X, y)
        X_res_vis = pca.transform(X_res)

        ax.scatter(X_res_vis[y_res == 0, 0], X_res_vis[y_res == 0, 1],
                   label="Class #0", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[0], linewidth=0.15)
        ax.scatter(X_res_vis[y_res == 1, 0], X_res_vis[y_res == 1, 1],
                   label="Class #1", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[2], linewidth=0.15)
        ax.set_title('Instance Hardness Threshold ({})'.format(ratio))

plt.show()
# Fit a 2-component PCA on X; the projection is used only for plotting
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# A 2x2 grid of subplots
f, axs = plt.subplots(2, 2)

# Flatten the grid of axes into a single list
axs = [a for ax in axs for a in ax]
# One panel per sampling strategy; 0 is used as a marker for "plot the
# original data", the dicts are passed to the sampler as sampling_strategy
for ax, sampling_strategy in zip(axs, (0,
                                       {1: 25, 0: 10},
                                       {1: 14, 0: 10},
                                       {1: 10, 0: 10})):
    if sampling_strategy == 0:
        # plot_resampling is defined elsewhere; its two return values are
        # reused as legend handles below
        c0, c1 = plot_resampling(ax, X_vis, y, 'Original set')
    else:
        iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy,
                                        estimator=LogisticRegression(),
                                        return_indices=True)
        X_res, y_res, idx_res = iht.fit_resample(X, y)
        # Project the resampled data with the PCA fitted on the original data
        X_res_vis = pca.transform(X_res)
        plot_resampling(ax, X_res_vis, y_res,
                        'Instance Hardness Threshold ({})'
                        .format(sampling_strategy))
        # plot samples which have been removed
        idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                           idx_res)
        c3 = ax.scatter(X_vis[idx_samples_removed, 0],
                        X_vis[idx_samples_removed, 1],
                        alpha=.2, label='Removed samples')

# c3 is bound inside the else branch above, so the legend uses the handle
# from the last resampled panel
plt.figlegend((c0, c1, c3), ('Class #0', 'Class #1', 'Removed samples'),
              loc='lower center', ncol=3, labelspacing=0.)