Beispiel #1
0
def smoter(df):
    """Rebalance the quote data set with SMOTE followed by ENN cleaning.

    The ``QuoteConversion_Flag`` column of ``df`` is used as the class label
    and every other column as a feature.  Class counts are printed before
    and after resampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing a ``QuoteConversion_Flag`` column.  The
        remaining columns must line up with the module-level ``FIELDS``
        list, which is used to label the resampled features.

    Returns
    -------
    data : pandas.DataFrame
        Resampled feature matrix with columns ``FIELDS``.
    target : pandas.DataFrame
        Resampled labels in a single ``QuoteConversion_Flag`` column.
    """
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    # ENN takes no ``random_state``: the argument was deprecated in
    # imbalanced-learn 0.4 and removed in 0.6, where passing it raises
    # TypeError.  Randomness is controlled on the SMOTEENN wrapper instead.
    enn = ENN(sampling_strategy="not majority",
              kind_sel="mode",
              n_neighbors=5,
              n_jobs=-1)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    # Wrap the resampled arrays back into labelled frames.
    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    return data, target
Beispiel #2
0
def balance_smote(features, target):
    """Resample ``features``/``target`` with SMOTE followed by ENN.

    Both samplers are chained with ``make_pipeline`` and applied through a
    single ``fit_resample`` call.  Per the original author's note this is
    meant for the training split only; the test split is left untouched.

    Returns
    -------
    tuple
        ``(features, target)`` after resampling.
    """
    # only balancing training data so test data doesn't overfit
    steps = (
        SMOTE(
            sampling_strategy=0.3,
            k_neighbors=5,
            random_state=config.RANDOM_STATE,
        ),
        ENN(n_neighbors=5),
    )
    resampler = make_pipeline(*steps)

    balanced_features, balanced_target = resampler.fit_resample(features, target)
    return balanced_features, balanced_target
Beispiel #3
0
def renn_sampling(X, Y):
    """Under-sample a 3-D feature array with Edited Nearest Neighbours.

    ``X`` is flattened to 2-D for the sampler and the surviving rows are
    reshaped back to the original trailing dimensions.  The shape of ``X``
    is printed before and after resampling.

    Parameters
    ----------
    X : ndarray
        Array of shape ``(nsamples, nx, ny)``.
    Y : ndarray
        Labels, one per sample.

    Returns
    -------
    X : ndarray
        Kept samples, shape ``(n_kept, nx, ny)``.
    Y : ndarray
        Kept labels, shape ``(n_kept, 1)``.
    """
    # NOTE(review): ``return_indices`` / ``fit_sample`` belong to the older
    # imbalanced-learn API this snippet was written for -- confirm the
    # installed version still provides them.
    enn = ENN(return_indices=True)
    nsamples, nx, ny = X.shape
    print(X.shape)
    X = X.reshape((nsamples, nx * ny))

    # The indices of the kept samples are returned but not needed here.
    X, Y, _ = enn.fit_sample(X, Y)

    # Only the row count changes; restore the saved trailing dimensions.
    # (The previous ``ny/nx`` produced a float on Python 3, which
    # ``reshape`` rejects.)
    nsamples = X.shape[0]
    print(X.shape)
    X = X.reshape((nsamples, nx, ny))
    Y = Y.reshape((nsamples, 1))
    return X, Y
Beispiel #4
0
def test_pipeline_fit_then_sample_of_three_samplers_with_sampler_last_estimator():
    """``fit_sample`` must agree with ``fit`` followed by ``sample``.

    Two identical pipelines (random under-sampler, ENN, the same random
    under-sampler again) are built from the same sampler objects; one is run
    in a single step and the other in two steps, and the outputs are compared.
    """
    dataset_options = dict(
        n_classes=2, class_sep=2, weights=[0.1, 0.9],
        n_informative=3, n_redundant=1, flip_y=0,
        n_features=20, n_clusters_per_class=1,
        n_samples=50000, random_state=0,
    )
    X, y = make_classification(**dataset_options)

    under_sampler = RandomUnderSampler(random_state=42)
    cleaner = ENN()

    # One-shot path.
    one_shot = make_pipeline(under_sampler, cleaner, under_sampler)
    X_one_shot, y_one_shot = one_shot.fit_sample(X, y)

    # Two-step path on a freshly assembled pipeline.
    two_step = make_pipeline(under_sampler, cleaner, under_sampler)
    two_step.fit(X, y)
    X_two_step, y_two_step = two_step.sample(X, y)

    assert_array_equal(X_one_shot, X_two_step)
    assert_array_equal(y_one_shot, y_two_step)
Beispiel #5
0
    def fit_resample(self, X, y):
        """
        Apply SMOTEENN in standardized feature space.

        X is standardized with a freshly fitted ``StandardScaler``, resampled
        with SMOTEENN, and the resampled features are mapped back through the
        scaler's inverse transform so they are expressed in the same "units"
        as the input.

        Parameters
        ----------
        X : ndarray
            Dense feature matrix, one observation per row.
        y : ndarray
            1-D array of responses.

        Returns
        -------
        X_resampled, y_resampled : ndarray, ndarray
            Resampled features (de-standardized) and responses.
        """
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Component samplers, configured from this object's settings.
        over_sampler = SMOTE(
            random_state=self.random_state,
            k_neighbors=self.k_smote,
            sampling_strategy=self.sampling_strategy_smote,
        )
        cleaner = ENN(
            sampling_strategy=self.sampling_strategy_enn,
            n_neighbors=self.k_enn,
            kind_sel=self.kind_sel_enn,
        )
        combined = SMOTEENN(
            sampling_strategy=self.sampling_strategy_smoteenn,
            random_state=self.random_state,
            smote=over_sampler,
            enn=cleaner,
        )

        X_scaled_resampled, y_resampled = combined.fit_resample(X_scaled, y)

        # Undo the standardization before handing the features back.
        X_resampled = scaler.inverse_transform(X_scaled_resampled)
        return X_resampled, y_resampled
# Tail of the preceding experiment: report metrics for `predictions` against
# `Y_test`. `t0`, `predictions` and `GB_model` are assigned above this point.
print(classification_report(Y_test,predictions))
print("Balanced Accuracy:",balanced_accuracy_score(Y_test, predictions))
# print("best parameters Random Forest",model_rf.best_params_)
# Elapsed time since `t0`, printed as whole minutes and seconds.
# NOTE(review): `pl` is presumably a module that re-exports `time` -- confirm.
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
print("best parameters:",GB_model.best_params_)
# `plot_confusion_matrix` is a helper defined elsewhere; the list supplies the
# two class labels.
plot_confusion_matrix(confusion_matrix(Y_test,predictions),['Dolphin','Non-Dolphin'])
#13th Test----------------------MLP-------------------------------- simple
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours as ENN
from imblearn.combine import SMOTETomek

from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score

# Hyper-parameter grid for an MLP grid search. The fixed model trained below
# does not consume it; it is kept so a search can be re-enabled.
param={"activation": ['identity', 'logistic', 'tanh', 'relu'], "alpha": [0.0001, 0.001, 0.01, 0.1,1], "learning_rate_init": [0.0001, 0.001, 0.01, 0.1,1] , "solver":['lbfgs', 'sgd', 'adam']}
t0 = pl.time.time()

# Resample the training split with Edited Nearest Neighbours.
# `fit_resample` replaces `fit_sample`, which was removed in
# imbalanced-learn 0.8. The `.values` access below already requires a
# release that returns pandas objects, and those all provide `fit_resample`.
# (A SMOTETomek sampler was previously built here and immediately
# overwritten by ENN; that dead assignment is dropped.)
sample = ENN()
Xtrain_sample, Ytrain_sample = sample.fit_resample(X_train, Y_train)

# Fixed-configuration MLP. A GridSearchCV wrapper used to be assigned to the
# same name and overwritten on the next line, so it never ran.
mlp_model = MLPClassifier(random_state=49,  max_iter=5000, hidden_layer_sizes=(6,4), activation='relu', alpha=0.0001,  solver='lbfgs')
mlp_model.fit(Xtrain_sample, Ytrain_sample.values.ravel())
predictions = mlp_model.predict(X_test)

# Compute the confusion matrix once and reuse it for printing and plotting.
conf_mat = confusion_matrix(Y_test, predictions)
print(conf_mat)
print(classification_report(Y_test,predictions))
print("Balanced Accuracy:",balanced_accuracy_score(Y_test, predictions))

# Elapsed time for resampling, training and evaluation.
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
plot_confusion_matrix(conf_mat,['Dolphin','Non-Dolphin'])