def test_fit_resample_auto():
    """Check ``fit_resample`` with the 'auto' strategy against fixed subsets.

    Two subsets are expected; both end with the same eight label-0 rows.
    """
    cascade = BalanceCascade(sampling_strategy='auto',
                             random_state=RND_SEED,
                             return_indices=True)
    X_resampled, y_resampled, idx_under = cascade.fit_resample(X, Y)

    # Label-0 rows (and their indices) shared by the tail of every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    shared_idx = [0, 2, 3, 4, 11, 12, 17, 19]

    # Label-1 rows selected for each subset.
    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    second_rows = [
        [0.28893132, -0.38761769], [0.83680821, 1.72827342],
        [0.3084254, 0.33299982], [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049], [0.77481731, 0.60935141],
        [-0.18410027, -0.45194484], [1.15514042, 0.0129463],
    ]

    expected_X = np.array([first_rows + shared_rows,
                           second_rows + shared_rows])
    expected_y = np.array([[1] * 8 + [0] * 8] * 2)
    expected_idx = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13] + shared_idx,
        [9, 6, 7, 8, 16, 1, 14, 10] + shared_idx,
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_fit_sample_auto_early_stop():
    """Check ``fit_sample`` with ratio='auto' when capped at one subset."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True, n_max_subset=1)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    # Single expected subset: eight label-0 rows followed by eight label-1.
    label0_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    label1_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    expected_X = np.array([label0_rows + label1_rows])
    expected_y = np.array([[0] * 8 + [1] * 8])
    expected_idx = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def deep_ensemble_merged(smote=None):
    """Build one merged training set from BalanceCascade subsets.

    The module-level ``features`` / ``labels_1d`` are resampled with a
    BalanceCascade driven by a decision tree, every returned subset is
    concatenated into a single array pair, the pair is shuffled, and the
    labels are passed through ``to_categorical(ys, 2)``.

    Parameters
    ----------
    smote : object, optional
        When given, its ``fit_sample(Xs, ys)`` is applied to the merged
        data before the final shuffle.

    Returns
    -------
    tuple
        ``(Xs, ys)`` - merged samples and the output of ``to_categorical``.
    """
    dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
    ensembler = BalanceCascade(estimator=dt, n_max_subset=10,
                               random_state=KFOLD_SEED)
    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels_1d)
    print(X_res.shape, y_res.shape)
    print("training")

    # Merge sample batches, reporting the running shape after each one.
    Xs = None
    ys = None
    for X_batch, y_batch in zip(X_res, y_res):
        X_batch = np.array(X_batch)
        y_batch = np.array(y_batch)
        if Xs is None:
            Xs, ys = X_batch, y_batch
        else:
            Xs = np.concatenate((Xs, X_batch))
            ys = np.concatenate((ys, y_batch))
        print(Xs.shape, ys.shape)

    # NOTE(review): assumes `shuffle` is sklearn.utils.shuffle, which returns
    # shuffled copies instead of working in place; the original discarded the
    # return value, so the data was never actually shuffled.
    Xs, ys = shuffle(Xs, ys)

    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)
    Xs, ys = shuffle(Xs, ys)
    ys = to_categorical(ys, 2)
    return Xs, ys
def test_give_classifier_wrong_obj():
    """An integer passed as ``estimator`` must make ``fit_sample`` fail."""
    not_an_estimator = 2
    cascade = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=not_an_estimator)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        cascade.fit_sample(X, Y)
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample a training set with the sampler selected by name.

    Parameters
    ----------
    X_train, y_train
        Training samples and labels, forwarded unchanged to the sampler's
        ``fit_sample``.
    Sampling_Function : str
        Name of the sampler configuration; must be a key of the table
        defined in the body.

    Returns
    -------
    tuple
        ``(X_train_res, y_train_res)`` as returned by ``fit_sample``.

    Raises
    ------
    ValueError
        If ``Sampling_Function`` is not a known name (previously this
        surfaced as an ``UnboundLocalError`` on ``us``).
    """
    # Each entry is a zero-argument factory so that only the requested
    # sampler is ever instantiated.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1,
                             size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2,
                             size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        # NOTE(review): builds EditedNearestNeighbours, exactly like the
        # entry above - confirm whether RepeatedEditedNearestNeighbours
        # was intended here.
        'RepeatedEditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest',
                                   random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }

    if Sampling_Function not in factories:
        raise ValueError(
            "Unknown Sampling_Function {!r}; expected one of: {}".format(
                Sampling_Function, ", ".join(sorted(factories))))

    us = factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def test_fit_sample_auto_early_stop():
    """Check ``fit_sample`` with ratio='auto' capped at four subsets.

    Expected values are read from the ``.npy`` fixtures in the ``data``
    directory next to this file.
    """
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True, n_max_subset=4)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_n_sub.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_n_sub.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx_n_sub.npy'))

    # Compare subset by subset.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
def test_fit_sample_auto_early_stop():
    """Verify the single subset produced with ratio='auto', n_max_subset=1."""
    sampler = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             n_max_subset=1)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    rows_label0 = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    rows_label1 = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    want_X = np.array([rows_label0 + rows_label1])
    want_y = np.array([[0] * 8 + [1] * 8])
    want_idx = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])

    # Check each returned array against its reference.
    assert_array_equal(X_resampled, want_X)
    assert_array_equal(y_resampled, want_y)
    assert_array_equal(idx_under, want_idx)
def test_fit_sample_half():
    """Check ``fit_sample`` with a float ratio of 0.8 (one expected subset)."""
    cascade = BalanceCascade(ratio=0.8, random_state=RND_SEED)
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    # Ten label-1 rows come first, then the eight label-0 rows.
    label1_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
        [0.9281014, 0.53085498], [0.3084254, 0.33299982],
    ]
    label0_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    expected_X = np.array([label1_rows + label0_rows])
    expected_y = np.array([[1] * 10 + [0] * 8])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def ensemble_adaboost(feat, label):
    """Resample ``feat``/``label`` with an 'adaboost' BalanceCascade.

    Prints the type and class counts of ``label`` before resampling and
    the shape of the resampled labels afterwards, then returns the
    ``(features, labels)`` pair produced by ``fit_sample``.
    """
    print(type(label))
    print(Counter(label))

    sampler = BalanceCascade(random_state=19, estimator='adaboost')
    resampled_feat, resampled_label = sampler.fit_sample(feat, label)

    print(resampled_label.shape)
    return resampled_feat, resampled_label
def test_fit_sample_half():
    """Check ``fit_sample`` when the ratio is a per-class count dict."""
    class_counts = {0: 8, 1: 10}
    cascade = BalanceCascade(ratio=class_counts, random_state=RND_SEED)
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    # Eight label-0 rows followed by ten label-1 rows.
    label0_rows = [
        [-0.41635887, -0.38299653], [0.53366841, -0.30312976],
        [1.25192108, -0.22367336], [1.70580611, -0.11219234],
        [1.52091956, -0.49283504], [0.11622591, -0.0317206],
        [1.31301027, -0.92648734], [0.88407872, 0.35454207],
    ]
    label1_rows = [
        [0.3084254, 0.33299982], [0.08711622, 0.93259929],
        [-0.28162401, -2.10400981], [-0.14374509, 0.27370049],
        [0.9281014, 0.53085498], [-0.18410027, -0.45194484],
        [0.77481731, 0.60935141], [1.15514042, 0.0129463],
        [-1.11515198, -0.93689695], [0.70472253, -0.73309052],
    ]
    expected_X = np.array([label0_rows + label1_rows])
    expected_y = np.array([[0] * 8 + [1] * 10])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_fit_sample_auto_gradient_boosting():
    """Check ``fit_sample`` with ratio='auto' and the 'gradient-boosting'
    classifier against the stored ``.npy`` fixtures."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True,
                             classifier='gradient-boosting')
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_gb.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_gb.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx_gb.npy'))

    # Compare subset by subset.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
def test_give_classifier_obj():
    """Check ``fit_sample`` when a RandomForestClassifier instance is given
    as ``estimator``."""
    forest = RandomForestClassifier(random_state=RND_SEED)
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=False, estimator=forest)
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    # One expected subset: eight label-1 rows, then eight label-0 rows.
    label1_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    label0_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    expected_X = np.array([label1_rows + label0_rows])
    expected_y = np.array([[1] * 8 + [0] * 8])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_give_classifier_obj():
    """Passing an estimator object should yield the reference subset."""
    rf = RandomForestClassifier(random_state=RND_SEED)
    sampler = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=False,
                             estimator=rf)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    rows_label1 = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    rows_label0 = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    want_X = np.array([rows_label1 + rows_label0])
    want_y = np.array([[1] * 8 + [0] * 8])

    assert_array_equal(X_resampled, want_X)
    assert_array_equal(y_resampled, want_y)
def test_fit_sample_auto_linear_svm():
    """Check ``fit_sample`` with ratio='auto' and the 'linear-svm'
    classifier; two subsets are expected."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=False, classifier='linear-svm')
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    # Label-0 rows closing every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    # Label-1 rows opening each subset.
    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    second_rows = [
        [1.15514042, 0.0129463], [0.9281014, 0.53085498],
        [0.3084254, 0.33299982], [0.28893132, -0.38761769],
        [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
        [0.70472253, -0.73309052], [0.77481731, 0.60935141],
    ]
    expected_X = np.array([first_rows + shared_rows,
                           second_rows + shared_rows])
    expected_y = np.array([[1] * 8 + [0] * 8] * 2)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_sample_wrong_X():
    """``sample`` must raise ``RuntimeError`` when given data that differs
    from what the object was fitted on."""
    cascade = BalanceCascade(random_state=RND_SEED)
    cascade.fit(X, Y)

    # Unrelated data: 100 random samples with 40 features, 50 per label.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, cascade.sample, other_X, other_y)
def test_rf_wth_bootstrap():
    """Check ``fit_sample`` with ratio='auto', a random-forest estimator
    and ``bootstrap=False``; two subsets of different size are expected."""
    forest = RandomForestClassifier(random_state=RND_SEED)
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True, estimator=forest,
                             bootstrap=False)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    # Label-0 rows (and their indices) opening every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    shared_idx = [0, 2, 3, 4, 11, 12, 17, 19]

    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    second_rows = [
        [1.15514042, 0.0129463], [0.77481731, 0.60935141],
        [0.3084254, 0.33299982], [0.28893132, -0.38761769],
        [0.9281014, 0.53085498],
    ]

    # Subsets differ in length, hence the object arrays.
    X_gt = np.array([np.array(shared_rows + first_rows),
                     np.array(shared_rows + second_rows)], dtype=object)
    y_gt = np.array([np.array([0] * 8 + [1] * 8),
                     np.array([0] * 8 + [1] * 5)], dtype=object)
    idx_gt = np.array([
        np.array(shared_idx + [10, 18, 8, 16, 6, 14, 5, 13]),
        np.array(shared_idx + [10, 1, 7, 9, 15]),
    ], dtype=object)

    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
def test_give_classifier_wrong_obj():
    """An integer passed as ``estimator`` must make ``fit_resample`` fail."""
    not_an_estimator = 2
    cascade = BalanceCascade(sampling_strategy='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=not_an_estimator)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        cascade.fit_resample(X, Y)
def test_rf_wth_bootstrap():
    """Random-forest estimator with ``bootstrap=False``: compare both
    returned subsets (samples, labels, indices) with reference values."""
    rf = RandomForestClassifier(random_state=RND_SEED)
    sampler = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=rf,
                             bootstrap=False)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    head_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    head_idx = [0, 2, 3, 4, 11, 12, 17, 19]
    tail_rows_a = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    tail_rows_b = [
        [1.15514042, 0.0129463], [0.77481731, 0.60935141],
        [0.3084254, 0.33299982], [0.28893132, -0.38761769],
        [0.9281014, 0.53085498],
    ]

    # Object arrays because the two subsets have different lengths.
    X_gt = np.array([np.array(head_rows + tail_rows_a),
                     np.array(head_rows + tail_rows_b)], dtype=object)
    y_gt = np.array([np.array([0] * 8 + [1] * 8),
                     np.array([0] * 8 + [1] * 5)], dtype=object)
    idx_gt = np.array([
        np.array(head_idx + [10, 18, 8, 16, 6, 14, 5, 13]),
        np.array(head_idx + [10, 1, 7, 9, 15]),
    ], dtype=object)

    # Check each array
    for k in range(X_gt.size):
        assert_array_equal(X_resampled[k], X_gt[k])
        assert_array_equal(y_resampled[k], y_gt[k])
        assert_array_equal(idx_under[k], idx_gt[k])
def test_multiclass_error():
    """``fit`` should emit a ``UserWarning`` for non-binary targets."""
    non_binary_targets = (
        np.linspace(0, 1, 20),                    # continuous case
        np.array([0] * 3 + [1] * 2 + [2] * 15),   # multiclass case
    )
    for target in non_binary_targets:
        cascade = BalanceCascade(random_state=RND_SEED)
        assert_warns(UserWarning, cascade.fit, X, target)
def test_fit_sample_auto_early_stop_2():
    """Check ``fit_sample`` with ratio='auto' capped at two subsets."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True, n_max_subset=2)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    # Label-0 rows (and their indices) opening every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    shared_idx = [0, 2, 3, 4, 11, 12, 17, 19]

    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    second_rows = [
        [1.15514042, 0.0129463], [0.70472253, -0.73309052],
        [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
        [0.3084254, 0.33299982], [0.28893132, -0.38761769],
        [0.9281014, 0.53085498],
    ]

    # Subsets differ in length, hence the object arrays.
    X_gt = np.array([np.array(shared_rows + first_rows),
                     np.array(shared_rows + second_rows)], dtype=object)
    y_gt = np.array([np.array([0] * 8 + [1] * 8),
                     np.array([0] * 8 + [1] * 7)], dtype=object)
    idx_gt = np.array([
        np.array(shared_idx + [6, 11, 4, 10, 2, 8, 1, 7]),
        np.array(shared_idx + [6, 4, 8, 0, 3, 5, 9]),
    ], dtype=object)

    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
def unbalanceProcess(params, X_train, y_train):
    """Resample the training set with BalanceCascade and keep subset 0.

    The per-class target counts are 20% of the number of label-0 samples
    (truncated to int) for class 0 and the full count of the remaining
    samples for class 1. ``params['random-state']`` seeds the sampler.

    Returns the first subset as ``(X_train_res, y_train_res)``.
    """
    zero_count = np.sum(y_train == 0)
    other_count = y_train.shape[0] - zero_count
    class_targets = {0: int(zero_count * 0.2), 1: int(other_count * 1)}

    int_labels = y_train.astype("int")
    base_model = LogisticRegression(solver='sag', max_iter=200,
                                    random_state=0)
    cascade = BalanceCascade(sampling_strategy=class_targets,
                             random_state=params['random-state'],
                             n_max_subset=10,
                             estimator=base_model)
    X_subsets, y_subsets = cascade.fit_sample(X_train, int_labels)

    # Only the first generated subset is handed back to the caller.
    return X_subsets[0], y_subsets[0]
def test_give_classifier_wrong_obj():
    """An integer passed as ``estimator`` must raise a ``ValueError`` whose
    message matches "Invalid parameter `estimator`"."""
    not_an_estimator = 2
    cascade = BalanceCascade(ratio='auto',
                             random_state=RND_SEED,
                             return_indices=True,
                             estimator=not_an_estimator)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        cascade.fit_sample, X, Y)
def test_bc_fit():
    """``fit`` should record the class labels and per-class counts."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED)
    cascade.fit(X, Y)

    # Class labels computed during fitting.
    assert_equal(cascade.min_c_, 0)
    assert_equal(cascade.maj_c_, 1)

    # Per-class sample counts.
    class_counts = cascade.stats_c_
    assert_equal(class_counts[0], 8)
    assert_equal(class_counts[1], 12)
def test_bc_fit():
    """After ``fit``, the class labels and counts must match the data."""
    sampler = BalanceCascade(ratio='auto', random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class labels computed during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class sample counts.
    counts = sampler.stats_c_
    assert_equal(counts[0], 500)
    assert_equal(counts[1], 4500)
def test_init_wrong_classifier():
    """An unknown classifier name must make ``fit_sample`` raise
    ``NotImplementedError``."""
    unknown_name = 'rnd'
    cascade = BalanceCascade(classifier=unknown_name)
    assert_raises(NotImplementedError, cascade.fit_sample, X, Y)
def cross_validation_ensenble(name):
    """Run 5-fold cross-validation of XGBoost on BalanceCascade subsets.

    Loads the pickle '../data/conv_pred/train_data2_<name>.pickle',
    vectorises ``data['X']`` with a ``DictVectorizer`` and, for every fold,
    resamples the training part with BalanceCascade, fits one
    ``XGBClassifier`` per subset, averages the predicted probabilities and
    scores them with ``eval``. The mean precision, recall and f-value over
    the folds are printed.

    NOTE(review): ``eval`` is called with ``(y_test, final_result)`` and
    unpacked into four values, so it is presumably a project-level scoring
    helper shadowing the builtin - confirm.
    """
    pickle_path = '../data/conv_pred/train_data2_' + name + '.pickle'
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    cv = 5
    kf = KFold(n_splits=cv)
    precision_total = 0
    recall_total = 0
    f_value_total = 0

    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        n_members = 1
        bc = BalanceCascade(estimator=LogisticRegression(),
                            n_max_subset=n_members)
        bc_x, bc_y = bc.fit_sample(X_train, y_train)

        # One model per subset.
        members = [xgb.XGBClassifier(n_estimators=500, max_delta_step=1)]
        for k in range(n_members):
            members[k].fit(bc_x[k], bc_y[k])
        member_probs = [members[k].predict_proba(X_test)
                        for k in range(n_members)]

        # Average the two class probabilities over the ensemble members.
        final_result = [
            [sum(member_probs[k][row][col] / n_members
                 for k in range(n_members))
             for col in (0, 1)]
            for row in range(len(member_probs[0]))
        ]

        precision, recall, f_value, _ = eval(y_test, final_result)
        precision_total += precision
        recall_total += recall
        f_value_total += f_value

    print('\n')
    print('final precision : ', str(precision_total / cv))
    print('final recall : ', str(recall_total / cv))
    print('final f-value : ', str(f_value_total / cv))
def test_fit_sample_auto():
    """Check ``fit_sample`` with ratio='auto' against fixed subsets.

    Two subsets are expected; both end with the same eight label-0 rows.
    """
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    # Label-0 rows (and their indices) shared by the tail of every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    shared_idx = [0, 2, 3, 4, 11, 12, 17, 19]

    # Label-1 rows selected for each subset.
    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    second_rows = [
        [0.28893132, -0.38761769], [0.83680821, 1.72827342],
        [0.3084254, 0.33299982], [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049], [0.77481731, 0.60935141],
        [-0.18410027, -0.45194484], [1.15514042, 0.0129463],
    ]

    expected_X = np.array([first_rows + shared_rows,
                           second_rows + shared_rows])
    expected_y = np.array([[1] * 8 + [0] * 8] * 2)
    expected_idx = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13] + shared_idx,
        [9, 6, 7, 8, 16, 1, 14, 10] + shared_idx,
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_bc_fit_invalid_ratio():
    """``fit_sample`` must raise ``RuntimeError`` when the requested
    balancing ratio is smaller than the one present in the data."""
    tiny_ratio = 1. / 10000.
    cascade = BalanceCascade(ratio=tiny_ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, cascade.fit_sample, X, Y)
def test_bc_init():
    """The constructor should store its arguments and default values."""
    wanted_ratio = 1.
    cascade = BalanceCascade(ratio=wanted_ratio, random_state=RND_SEED)

    # Explicitly passed arguments.
    assert_equal(cascade.ratio, wanted_ratio)
    # Defaults for the arguments that were left out.
    assert_equal(cascade.bootstrap, True)
    assert_equal(cascade.n_max_subset, None)
    assert_equal(cascade.random_state, RND_SEED)
def test_fit_sample_half():
    """Check ``fit_sample`` with a float ratio of 0.8 and
    ``bootstrap=False``; two subsets of different size are expected."""
    cascade = BalanceCascade(ratio=0.8, random_state=RND_SEED,
                             bootstrap=False)
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    # Label-0 rows opening every subset.
    shared_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    first_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
        [0.9281014, 0.53085498], [0.3084254, 0.33299982],
    ]
    second_rows = [
        [1.15514042, 0.0129463], [0.70472253, -0.73309052],
        [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
        [0.28893132, -0.38761769],
    ]

    # Subsets differ in length, hence the object arrays.
    X_gt = np.array([np.array(shared_rows + first_rows),
                     np.array(shared_rows + second_rows)], dtype=object)
    y_gt = np.array([np.array([0] * 8 + [1] * 10),
                     np.array([0] * 8 + [1] * 5)], dtype=object)

    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
def test_sample_wt_fit():
    """Calling ``sample`` on an unfitted object must raise
    ``RuntimeError``."""
    unfitted = BalanceCascade(ratio='auto', random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def test_fit_sample_half():
    """Check ``fit_sample`` with a 0.5 ratio against the stored ``.npy``
    fixtures."""
    cascade = BalanceCascade(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = cascade.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x_05.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y_05.npy'))

    # Compare subset by subset.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
def smot(train_x, train_y, feature_columns):
    """Resample the training data with BalanceCascade and rebuild pandas
    objects from the result.

    Prints the class counts of ``train_y`` before and after resampling.
    Returns ``(train_x, train_y)`` where ``train_x`` is a DataFrame built
    from the resampled samples with ``feature_columns`` as column names and
    ``train_y`` is a Series holding element ``[0]`` of every item of the
    resampled labels.

    NOTE(review): this takes the first element of each entry of ``y_res``
    and feeds ``X_res`` straight into a DataFrame - confirm that this
    matches the shape ``fit_sample`` returns for BalanceCascade.
    """
    from sklearn.ensemble import RandomForestClassifier
    from imblearn.ensemble import BalanceCascade

    sampler = BalanceCascade(random_state=42,
                             classifier=RandomForestClassifier())
    print('Détail du nombre par CLASSE Y {}'.format(Counter(train_y)))

    X_res, y_res = sampler.fit_sample(train_x, train_y)
    first_labels = [entry[0] for entry in y_res]
    train_y = pd.Series(first_labels)
    print(' Détail du nombre par CLASSE Y {}'.format(Counter(train_y)))

    # Rebuild the feature DataFrame
    train_x = pd.DataFrame(X_res, columns=feature_columns)
    return train_x, train_y
def test_fit_resample_auto_early_stop():
    """Check ``fit_resample`` with a LinearSVC estimator capped at one
    subset."""
    svc = LinearSVC(random_state=RND_SEED)
    cascade = BalanceCascade(sampling_strategy='auto',
                             random_state=RND_SEED,
                             return_indices=False,
                             estimator=svc,
                             n_max_subset=1)
    X_resampled, y_resampled = cascade.fit_resample(X, Y)

    # One expected subset: eight label-1 rows, then eight label-0 rows.
    label1_rows = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
    ]
    label0_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    expected_X = np.array([label1_rows + label0_rows])
    expected_y = np.array([[1] * 8 + [0] * 8])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_bc_init():
    """Verify the attributes set when the object is constructed."""
    given_ratio = 1.
    sampler = BalanceCascade(ratio=given_ratio, random_state=RND_SEED)

    assert_equal(sampler.ratio, given_ratio)
    assert_equal(sampler.bootstrap, True)
    assert_equal(sampler.n_max_subset, None)
    assert_equal(sampler.random_state, RND_SEED)
def test_fit_sample_half():
    """Ratio 0.8 without bootstrap: compare both returned subsets with
    their reference samples and labels."""
    sampler = BalanceCascade(ratio=0.8,
                             random_state=RND_SEED,
                             bootstrap=False)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    head_rows = [
        [0.11622591, -0.0317206], [1.25192108, -0.22367336],
        [0.53366841, -0.30312976], [1.52091956, -0.49283504],
        [0.88407872, 0.35454207], [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
    ]
    tail_rows_a = [
        [1.15514042, 0.0129463], [0.08711622, 0.93259929],
        [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
        [0.9281014, 0.53085498], [0.3084254, 0.33299982],
    ]
    tail_rows_b = [
        [1.15514042, 0.0129463], [0.70472253, -0.73309052],
        [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
        [0.28893132, -0.38761769],
    ]

    # Object arrays because the two subsets have different lengths.
    X_gt = np.array([np.array(head_rows + tail_rows_a),
                     np.array(head_rows + tail_rows_b)], dtype=object)
    y_gt = np.array([np.array([0] * 8 + [1] * 10),
                     np.array([0] * 8 + [1] * 5)], dtype=object)

    # Check each array
    for k in range(X_gt.size):
        assert_array_equal(X_resampled[k], X_gt[k])
        assert_array_equal(y_resampled[k], y_gt[k])
def test_bc_fit_single_class():
    """``fit`` should emit a ``UserWarning`` when the target holds a single
    class."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED)

    # Target made only of zeros, one entry per sample of X.
    all_zero_target = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, cascade.fit, X, all_zero_target)
def test_fit_sample_auto():
    """Check ``fit_sample`` with ratio='auto' against the stored ``.npy``
    fixtures (samples, labels and indices)."""
    cascade = BalanceCascade(ratio='auto', random_state=RND_SEED,
                             return_indices=True)
    X_resampled, y_resampled, idx_under = cascade.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_gt = np.load(os.path.join(data_dir, 'bc_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'bc_y.npy'))
    idx_gt = np.load(os.path.join(data_dir, 'bc_idx.npy'))

    # Compare subset by subset.
    for subset in range(X_gt.size):
        assert_array_equal(X_resampled[subset], X_gt[subset])
        assert_array_equal(y_resampled[subset], y_gt[subset])
        assert_array_equal(idx_under[subset], idx_gt[subset])
def test_bc_bad_ratio():
    """``fit`` must raise ``ValueError`` for every unsupported ratio."""
    bad_ratios = (
        -1.0,       # negative ratio
        100.0,      # ratio greater than 1
        'rnd',      # unknown string
        [.5, .5],   # list, which is not supported
    )
    for bad_ratio in bad_ratios:
        cascade = BalanceCascade(ratio=bad_ratio)
        assert_raises(ValueError, cascade.fit, X, Y)
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Balance Cascade method bc = BalanceCascade() X_resampled, y_resampled = bc.fit_resample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) for iy, e in enumerate(X_res_vis): ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],