def test_give_classifier_wrong_obj():
    """An integer passed as ``estimator`` must make ``fit_resample`` fail.

    The sampler is constructed outside the ``raises`` block, so the
    ``ValueError`` is expected from the resampling call itself.
    """
    not_an_estimator = 2
    cascade = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
        estimator=not_an_estimator,
    )

    with raises(ValueError, match="Invalid parameter `estimator`"):
        cascade.fit_resample(X, Y)
def test_give_classifier_wrong_obj():
    """Check that a non-estimator ``estimator`` value raises ``ValueError``."""
    # Keyword arguments gathered in one place; ``estimator=2`` is the
    # deliberately invalid entry under test.
    params = {
        'sampling_strategy': 'auto',
        'random_state': RND_SEED,
        'return_indices': True,
        'estimator': 2,
    }
    sampler = BalanceCascade(**params)

    with raises(ValueError, match="Invalid parameter `estimator`"):
        sampler.fit_resample(X, Y)
def test_fit_resample_auto():
    """Resample with the ``'auto'`` strategy and compare against fixed output.

    With ``return_indices=True`` the sampler returns the resampled data, the
    resampled labels and the selected indices; each is compared with a stored
    two-subset reference.
    """
    cascade = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
    )
    X_res, y_res, idx_res = cascade.fit_resample(X, Y)

    # The trailing eight rows (label 0) are the same in both expected subsets.
    shared_rows = [
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]
    # Leading eight rows (label 1) of the first expected subset.
    first_rows = [
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
    ]
    # Leading eight rows (label 1) of the second expected subset.
    second_rows = [
        [0.28893132, -0.38761769],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.77481731, 0.60935141],
        [-0.18410027, -0.45194484],
        [1.15514042, 0.0129463],
    ]
    expected_X = np.array([first_rows + shared_rows,
                           second_rows + shared_rows])

    subset_labels = [1] * 8 + [0] * 8
    expected_y = np.array([subset_labels, subset_labels])

    shared_idx = [0, 2, 3, 4, 11, 12, 17, 19]
    expected_idx = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13] + shared_idx,
        [9, 6, 7, 8, 16, 1, 14, 10] + shared_idx,
    ])

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(idx_res, expected_idx)
def test_fit_resample_half():
    """Resample with an explicit per-class target and check the fixed output.

    The strategy asks for 8 samples of class 0 and 10 samples of class 1; the
    stored reference contains a single subset of 18 rows.
    """
    per_class_counts = {0: 8, 1: 10}
    cascade = BalanceCascade(sampling_strategy=per_class_counts,
                             random_state=RND_SEED)
    X_res, y_res = cascade.fit_resample(X, Y)

    expected_X = np.array([[
        [-0.41635887, -0.38299653],
        [0.53366841, -0.30312976],
        [1.25192108, -0.22367336],
        [1.70580611, -0.11219234],
        [1.52091956, -0.49283504],
        [0.11622591, -0.0317206],
        [1.31301027, -0.92648734],
        [0.88407872, 0.35454207],
        [0.3084254, 0.33299982],
        [0.08711622, 0.93259929],
        [-0.28162401, -2.10400981],
        [-0.14374509, 0.27370049],
        [0.9281014, 0.53085498],
        [-0.18410027, -0.45194484],
        [0.77481731, 0.60935141],
        [1.15514042, 0.0129463],
        [-1.11515198, -0.93689695],
        [0.70472253, -0.73309052],
    ]])
    # Eight label-0 entries followed by ten label-1 entries.
    expected_y = np.array([[0] * 8 + [1] * 10])

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
def test_give_classifier_obj():
    """Resample with a user-supplied classifier and check the fixed output.

    A seeded ``RandomForestClassifier`` instance is handed to the sampler as
    ``estimator``; the resampled data and labels are compared with a stored
    single-subset reference.
    """
    sampling_strategy = 'auto'
    # Pin the forest size explicitly: leaving ``n_estimators`` unset makes the
    # fitted model (and therefore the stored reference) depend on the
    # installed scikit-learn default. NOTE(review): 10 matches the other
    # version of this test and the historical default -- confirm the
    # reference arrays were generated with it.
    estimator = RandomForestClassifier(n_estimators=10, random_state=RND_SEED)
    bc = BalanceCascade(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        return_indices=False,
        estimator=estimator,
    )
    X_resampled, y_resampled = bc.fit_resample(X, Y)

    X_gt = np.array([[
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_resample_half():
    """Verify the output for a dict strategy of ``{0: 8, 1: 10}``.

    The reference is written as two row groups (label 0, then label 1) that
    are joined into the single expected subset.
    """
    # Expected rows carrying label 0.
    rows_label0 = [
        [-0.41635887, -0.38299653],
        [0.53366841, -0.30312976],
        [1.25192108, -0.22367336],
        [1.70580611, -0.11219234],
        [1.52091956, -0.49283504],
        [0.11622591, -0.0317206],
        [1.31301027, -0.92648734],
        [0.88407872, 0.35454207],
    ]
    # Expected rows carrying label 1.
    rows_label1 = [
        [0.3084254, 0.33299982],
        [0.08711622, 0.93259929],
        [-0.28162401, -2.10400981],
        [-0.14374509, 0.27370049],
        [0.9281014, 0.53085498],
        [-0.18410027, -0.45194484],
        [0.77481731, 0.60935141],
        [1.15514042, 0.0129463],
        [-1.11515198, -0.93689695],
        [0.70472253, -0.73309052],
    ]
    want_X = np.array([rows_label0 + rows_label1])
    want_y = np.array([[0] * len(rows_label0) + [1] * len(rows_label1)])

    balancer = BalanceCascade(
        sampling_strategy={0: 8, 1: 10},
        random_state=RND_SEED,
    )
    got_X, got_y = balancer.fit_resample(X, Y)

    assert_array_equal(got_X, want_X)
    assert_array_equal(got_y, want_y)
def test_give_classifier_obj():
    """Pass a classifier instance as ``estimator`` and check the fixed output.

    A ten-tree, seeded ``RandomForestClassifier`` is supplied; indices are not
    requested, so only data and labels are returned and compared.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=RND_SEED)
    balancer = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=False,
        estimator=forest,
    )
    got_X, got_y = balancer.fit_resample(X, Y)

    # Expected rows with label 1, then expected rows with label 0.
    rows_label1 = [
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
    ]
    rows_label0 = [
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]
    want_X = np.array([rows_label1 + rows_label0])
    want_y = np.array([[1] * 8 + [0] * 8])

    assert_array_equal(got_X, want_X)
    assert_array_equal(got_y, want_y)
def test_fit_resample_auto():
    """Check data, labels and indices produced by the ``'auto'`` strategy.

    The stored reference holds two subsets of sixteen rows each; it is
    declared first and the sampler is run afterwards.
    """
    want_X = np.array([
        [
            [1.15514042, 0.0129463],
            [0.08711622, 0.93259929],
            [0.70472253, -0.73309052],
            [-0.14374509, 0.27370049],
            [0.83680821, 1.72827342],
            [-0.18410027, -0.45194484],
            [-0.28162401, -2.10400981],
            [-1.11515198, -0.93689695],
            [0.11622591, -0.0317206],
            [1.25192108, -0.22367336],
            [0.53366841, -0.30312976],
            [1.52091956, -0.49283504],
            [0.88407872, 0.35454207],
            [1.31301027, -0.92648734],
            [-0.41635887, -0.38299653],
            [1.70580611, -0.11219234],
        ],
        [
            [0.28893132, -0.38761769],
            [0.83680821, 1.72827342],
            [0.3084254, 0.33299982],
            [0.70472253, -0.73309052],
            [-0.14374509, 0.27370049],
            [0.77481731, 0.60935141],
            [-0.18410027, -0.45194484],
            [1.15514042, 0.0129463],
            [0.11622591, -0.0317206],
            [1.25192108, -0.22367336],
            [0.53366841, -0.30312976],
            [1.52091956, -0.49283504],
            [0.88407872, 0.35454207],
            [1.31301027, -0.92648734],
            [-0.41635887, -0.38299653],
            [1.70580611, -0.11219234],
        ],
    ])
    # Both subsets carry eight label-1 entries followed by eight label-0.
    want_y = np.array([[1] * 8 + [0] * 8 for _ in range(2)])
    want_idx = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
        [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19],
    ])

    balancer = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
    )
    got_X, got_y, got_idx = balancer.fit_resample(X, Y)

    assert_array_equal(got_X, want_X)
    assert_array_equal(got_y, want_y)
    assert_array_equal(got_idx, want_idx)
def sampler(df_orig, *args, **kwargs):
    """Rebalance the quote data with ``BalanceCascade`` and return a frame.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Input data; must contain the ``QuoteConversion_Flag`` (target) and
        ``Quote_ID`` columns. Replaced by the contents of ``prep(1).csv``
        when ``'file'`` is passed in ``args``.
    *args
        Passing the string ``'file'`` enables file mode: the input is read
        from ``prep(1).csv`` and the result is also written to
        ``sampled(2).csv``.
    **kwargs
        Accepted for call compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        ``QuoteConversion_Flag`` followed by the feature columns of the
        resampled data, with ``Quote_ID`` cast to ``int64``.
    """
    file_mode = 'file' in args
    if file_mode:
        df_orig = pd.read_csv("prep(1).csv")

    target = df_orig.QuoteConversion_Flag
    # Keep the feature frame so the resampled array can be relabelled with
    # the columns it was actually built from, rather than a hard-coded list
    # that has to match the input in both count and order.
    features = df_orig.drop(['QuoteConversion_Flag'], axis=1)

    # Show the class distribution before resampling as [class, count] pairs.
    print("Before cascade: ", sorted(Counter(target).items()))

    bc = BalanceCascade(random_state=42)
    X_resampled, y_resampled = bc.fit_resample(features.values, target)

    # BalanceCascade produces an ensemble of balanced subsets (3-D data,
    # 2-D labels), while a DataFrame needs 2-D data: keep the first subset.
    # NOTE(review): the remaining subsets are discarded -- confirm that a
    # single subset is what downstream code expects.
    if getattr(X_resampled, 'ndim', 2) == 3:
        X_resampled, y_resampled = X_resampled[0], y_resampled[0]

    # Report the distribution of the resampled labels (the original code
    # counted a single element of the *input* target here).
    print("Balanced Cascade: %s" % Counter(y_resampled))

    data = pd.DataFrame(data=X_resampled, columns=features.columns)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    # Put the target first, then the features.
    synth = pd.concat([target, data], axis=1)
    synth.Quote_ID = synth.Quote_ID.astype("int64")

    if file_mode:
        synth.to_csv("sampled(2).csv", index=False)
    return synth
print(__doc__)

# Build a two-class toy dataset with class weights 0.3 / 0.7.
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=200,
    random_state=10,
)

# Project onto two principal components so the data can be drawn in 2D.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Run Balance Cascade and project every returned subset with the same PCA.
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = [pca.transform(subset) for subset in X_resampled]

# Side-by-side axes: original data on the left, resampled sets on the right.
f, (ax1, ax2) = plt.subplots(1, 2)

is_class0 = y == 0
is_class1 = y == 1

ax1.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)
ax1.scatter(X_vis[is_class1, 0], X_vis[is_class1, 1],
            label="Class #1", alpha=0.5)
ax1.set_title('Original set')

# Class #0 is drawn once from the original projection; class #1 is drawn
# once per resampled subset.
ax2.scatter(X_vis[is_class0, 0], X_vis[is_class0, 1],
            label="Class #0", alpha=0.5)
for set_no, projected in enumerate(X_res_vis):
    kept_class1 = y_resampled[set_no] == 1
    ax2.scatter(projected[kept_class1, 0], projected[kept_class1, 1],
                label="Class #1 - set #{}".format(set_no), alpha=0.5)
n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Balance Cascade method bc = BalanceCascade() X_resampled, y_resampled = bc.fit_resample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) for iy, e in enumerate(X_res_vis): ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],