Exemple #1
0
def test_give_classifier_wrong_obj():
    """An integer passed as ``estimator`` must make ``fit_resample`` raise."""
    cascade = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
        estimator=2,  # deliberately not an estimator object
    )
    expected_message = "Invalid parameter `estimator`"
    with raises(ValueError, match=expected_message):
        cascade.fit_resample(X, Y)
def test_give_classifier_wrong_obj():
    """Check that a non-estimator ``estimator`` value is rejected.

    The sampler is built with the integer ``2`` as its estimator and the
    call to ``fit_resample`` is expected to raise ``ValueError``.
    """
    invalid_estimator = 2
    params = dict(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
        estimator=invalid_estimator,
    )
    balance_cascade = BalanceCascade(**params)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        balance_cascade.fit_resample(X, Y)
Exemple #3
0
def test_fit_resample_auto():
    """Compare ``fit_resample`` output under ``'auto'`` with stored values.

    The reference data holds two subsets of 16 samples each: eight rows
    labelled 1 followed by eight rows labelled 0. The rows labelled 0 are
    the same in both subsets, so they are written once and reused.
    """
    cascade = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=True,
    )
    X_resampled, y_resampled, idx_under = cascade.fit_resample(X, Y)

    # Rows labelled 1 in the first expected subset.
    first_class_1_rows = [
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
    ]
    # Rows labelled 1 in the second expected subset.
    second_class_1_rows = [
        [0.28893132, -0.38761769],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.77481731, 0.60935141],
        [-0.18410027, -0.45194484],
        [1.15514042, 0.0129463],
    ]
    # Rows labelled 0, shared by both expected subsets.
    class_0_rows = [
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]
    expected_X = np.array([
        first_class_1_rows + class_0_rows,
        second_class_1_rows + class_0_rows,
    ])
    expected_y = np.array([
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    ])
    expected_idx = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
        [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19],
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Exemple #4
0
def test_fit_resample_half():
    """Compare ``fit_resample`` output for a per-class dict strategy.

    The strategy requests 8 samples of class 0 and 10 of class 1; the
    reference data is a single subset of 18 rows with matching labels.
    """
    requested_counts = {0: 8, 1: 10}
    cascade = BalanceCascade(
        sampling_strategy=requested_counts,
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = cascade.fit_resample(X, Y)

    expected_rows = [
        # rows labelled 0
        [-0.41635887, -0.38299653],
        [0.53366841, -0.30312976],
        [1.25192108, -0.22367336],
        [1.70580611, -0.11219234],
        [1.52091956, -0.49283504],
        [0.11622591, -0.0317206],
        [1.31301027, -0.92648734],
        [0.88407872, 0.35454207],
        # rows labelled 1
        [0.3084254, 0.33299982],
        [0.08711622, 0.93259929],
        [-0.28162401, -2.10400981],
        [-0.14374509, 0.27370049],
        [0.9281014, 0.53085498],
        [-0.18410027, -0.45194484],
        [0.77481731, 0.60935141],
        [1.15514042, 0.0129463],
        [-1.11515198, -0.93689695],
        [0.70472253, -0.73309052],
    ]
    expected_X = np.array([expected_rows])
    expected_y = np.array(
        [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemple #5
0
def test_give_classifier_obj():
    """Check ``fit_resample`` when an explicit estimator object is supplied.

    A seeded ``RandomForestClassifier`` is passed as ``estimator`` and the
    single returned subset is compared with stored reference values.
    """
    sampling_strategy = 'auto'
    # Pin the forest size explicitly: the expected arrays below are only
    # meaningful for a fixed model, and the library default for
    # ``n_estimators`` is not stable across scikit-learn releases.
    estimator = RandomForestClassifier(n_estimators=10, random_state=RND_SEED)
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=False,
                        estimator=estimator)
    X_resampled, y_resampled = bc.fit_resample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_resample_half():
    """Resample with explicit per-class counts and compare to stored data.

    ``{0: 8, 1: 10}`` is used as the sampling strategy; the expected result
    is one subset whose first 8 rows are labelled 0 and last 10 labelled 1.
    """
    balance_cascade = BalanceCascade(
        sampling_strategy={0: 8, 1: 10}, random_state=RND_SEED)
    X_resampled, y_resampled = balance_cascade.fit_resample(X, Y)

    X_expected = np.array([[
        [-0.41635887, -0.38299653],
        [0.53366841, -0.30312976],
        [1.25192108, -0.22367336],
        [1.70580611, -0.11219234],
        [1.52091956, -0.49283504],
        [0.11622591, -0.0317206],
        [1.31301027, -0.92648734],
        [0.88407872, 0.35454207],
        [0.3084254, 0.33299982],
        [0.08711622, 0.93259929],
        [-0.28162401, -2.10400981],
        [-0.14374509, 0.27370049],
        [0.9281014, 0.53085498],
        [-0.18410027, -0.45194484],
        [0.77481731, 0.60935141],
        [1.15514042, 0.0129463],
        [-1.11515198, -0.93689695],
        [0.70472253, -0.73309052],
    ]])
    y_expected = np.array([
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    ])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
def test_give_classifier_obj():
    """Resample with a user-supplied random forest and compare to stored data.

    The forest is built with 10 trees and the shared seed; indices are not
    requested, so ``fit_resample`` is unpacked into two values.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=RND_SEED)
    cascade = BalanceCascade(
        sampling_strategy='auto',
        random_state=RND_SEED,
        return_indices=False,
        estimator=forest,
    )
    X_resampled, y_resampled = cascade.fit_resample(X, Y)

    expected_X = np.array([[
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]])
    expected_y = np.array([
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_fit_resample_auto():
    """Resample with the ``'auto'`` strategy and compare to stored data.

    Indices are requested, so three values are returned. Each expected
    array describes two subsets of 16 samples.
    """
    balance_cascade = BalanceCascade(
        sampling_strategy='auto', random_state=RND_SEED, return_indices=True)
    X_resampled, y_resampled, idx_under = balance_cascade.fit_resample(X, Y)

    subset_0 = [
        [1.15514042, 0.0129463],
        [0.08711622, 0.93259929],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.83680821, 1.72827342],
        [-0.18410027, -0.45194484],
        [-0.28162401, -2.10400981],
        [-1.11515198, -0.93689695],
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]
    subset_1 = [
        [0.28893132, -0.38761769],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [-0.14374509, 0.27370049],
        [0.77481731, 0.60935141],
        [-0.18410027, -0.45194484],
        [1.15514042, 0.0129463],
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
    ]
    X_expected = np.array([subset_0, subset_1])
    y_expected = np.array([
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    ])
    idx_expected = np.array([
        [10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
        [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19],
    ])

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
    assert_array_equal(idx_under, idx_expected)
Exemple #9
0
def sampler(df_orig, *args, **kwargs):
    """Under-sample the quote data with ``BalanceCascade``.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame holding a ``QuoteConversion_Flag`` target column plus the
        feature columns listed in ``fields`` below. It is replaced by the
        contents of ``prep(1).csv`` when ``'file'`` is passed positionally.
    *args
        If the string ``'file'`` is among the positional arguments, the
        input is read from ``prep(1).csv`` and the result is also written
        to ``sampled(2).csv``.
    **kwargs
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        ``QuoteConversion_Flag`` followed by the feature columns of the
        resampled data, with ``Quote_ID`` cast to ``int64``.
    """
    use_files = 'file' in args
    if use_files:
        df_orig = pd.read_csv("prep(1).csv")

    target = df_orig.QuoteConversion_Flag
    data = df_orig.drop(['QuoteConversion_Flag'], axis=1).values

    # Show the class distribution before resampling as [class, count] pairs.
    print("Before cascade: ", sorted(Counter(target).items()))

    # Column names given to the resampled feature matrix.
    fields = [
        'Quote_ID', 'Field_info1', 'Field_info2', 'Field_info3', 'Field_info4',
        'Coverage_info1', 'Coverage_info2', 'Coverage_info3', 'Sales_info1',
        'Sales_info2', 'Sales_info3', 'Sales_info4', 'Sales_info5',
        'Personal_info1', 'Personal_info2', 'Personal_info3', 'Personal_info4',
        'Property_info1', 'Property_info3', 'Property_info4', 'Property_info5',
        'Geographic_info1', 'Geographic_info2', 'Geographic_info3',
        'Geographic_info4', 'Geographic_info5'
    ]

    bc = BalanceCascade(random_state=42)
    X_resampled, y_resampled = bc.fit_resample(data, target)

    # The sampler can return a stack of subsets (3-D X, 2-D y), which cannot
    # be turned into a single DataFrame. Keep the first subset in that case.
    # NOTE(review): confirm that using only the first subset is intended.
    if X_resampled.ndim == 3:
        X_resampled, y_resampled = X_resampled[0], y_resampled[0]

    # Report the distribution of the resampled labels; the previous code
    # indexed the original target and handed a scalar to Counter.
    print("Balanced Cascade: %s" % Counter(y_resampled))

    data = pd.DataFrame(data=X_resampled, columns=fields)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    # Target column first, then the features.
    synth = pd.concat([target, data], axis=1)
    synth['Quote_ID'] = synth['Quote_ID'].astype("int64")

    if use_files:
        synth.to_csv("sampled(2).csv", index=False)

    return synth
print(__doc__)

# Build a synthetic two-class dataset with 30% / 70% class weights.
X, y = make_classification(
    n_samples=200,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_clusters_per_class=1,
    n_classes=2,
    weights=[0.3, 0.7],
    class_sep=2,
    flip_y=0,
    random_state=10)

# Instantiate a two-component PCA, used only to get a 2D view of the data.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Resample with Balance Cascade and project every returned subset with the
# PCA that was fitted on the original data.
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# One row, two columns of axes: original data on the left, subsets on the
# right.
f, (ax1, ax2) = plt.subplots(1, 2)

# Boolean masks selecting each class in the original data.
is_class_0 = y == 0
is_class_1 = y == 1

ax1.scatter(X_vis[is_class_0, 0], X_vis[is_class_0, 1],
            label="Class #0", alpha=0.5)
ax1.scatter(X_vis[is_class_1, 0], X_vis[is_class_1, 1],
            label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[is_class_0, 0], X_vis[is_class_0, 1],
            label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    # Points labelled 1 within subset number ``iy``.
    subset_class_1 = y_resampled[iy] == 1
    ax2.scatter(e[subset_class_1, 0], e[subset_class_1, 1],
                label="Class #1 - set #{}".format(iy), alpha=0.5)
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=200,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Balance Cascade method
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0],
                e[y_resampled[iy] == 1, 1],