from collections import Counter

import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours as ENN


def smoter(df):
    IDs = df.Quote_ID  # kept for reference; not used downstream
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    ####
    # ENN
    ####
    # ENN is deterministic, so it takes no random_state; the original call
    # passed one, which raises a TypeError on current imbalanced-learn.
    enn = ENN(sampling_strategy="not majority", kind_sel="mode",
              n_neighbors=5, n_jobs=-1)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    ####
    # Tomeks
    ####
    # smote_tomek = SMOTETomek(random_state=0)
    # X_resampled, y_resampled = smote_tomek.fit_resample(data, target)
    # print("Using SMOTE: ", sorted(Counter(y_resampled).items()))

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])
    return data, target
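# Minimal usage sketch (assumptions: RANDOM_STATE and FIELDS are the
# module-level constants smoter() reads, and the toy frame below is
# hypothetical, not from the original project).
RANDOM_STATE = 0
FIELDS = ['Quote_ID', 'Field1', 'Field2']

demo = pd.DataFrame({
    'Quote_ID': range(100),
    'Field1': [i * 0.5 for i in range(100)],
    'Field2': [i % 7 for i in range(100)],
    'QuoteConversion_Flag': [0] * 90 + [1] * 10,
})
X_bal, y_bal = smoter(demo)
print(X_bal.shape, sorted(Counter(y_bal['QuoteConversion_Flag']).items()))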
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours as ENN
from imblearn.pipeline import make_pipeline


def balance_smote(features, target):
    # Balance only the training split, so the test data stays untouched and
    # evaluation is not biased by synthetic samples.
    oversample = SMOTE(sampling_strategy=0.3, k_neighbors=5,
                       random_state=config.RANDOM_STATE)
    undersample = ENN(n_neighbors=5)
    pipeline = make_pipeline(oversample, undersample)
    features, target = pipeline.fit_resample(features, target)
    return features, target
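# Sketch of calling balance_smote on synthetic data (assumption: config is a
# project module exposing RANDOM_STATE; a stand-in class is used here so the
# snippet runs on its own).
from collections import Counter
from sklearn.datasets import make_classification


class config:  # stand-in for the project's config module
    RANDOM_STATE = 42


X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
X_bal, y_bal = balance_smote(X, y)
print(sorted(Counter(y_bal).items()))  # minority raised to a 0.3 ratio, then ENN-cleaned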
from imblearn.under_sampling import EditedNearestNeighbours as ENN


def renn_sampling(X, Y):
    """Flatten 3-D samples, clean them with ENN, and restore the shape."""
    enn = ENN()
    nsamples, nx, ny = X.shape
    print(X.shape)
    # ENN expects a 2-D feature matrix, so flatten each (nx, ny) sample.
    X = X.reshape((nsamples, nx * ny))
    # fit_sample/return_indices belong to the legacy API; fit_resample replaces
    # them, and the kept indices are available via enn.sample_indices_ if needed.
    X, Y = enn.fit_resample(X, Y)
    nsamples = X.shape[0]
    print(X.shape)
    # Restore the per-sample shape; the original `ny / nx` reshape used float
    # division, which fails under Python 3.
    X = X.reshape((nsamples, nx, ny))
    Y = Y.reshape((nsamples, 1))
    return X, Y
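# Despite its name, renn_sampling applies single-pass ENN; a sketch of the
# repeated variant the name suggests (assuming that was the intent) swaps in
# RepeatedEditedNearestNeighbours:
import numpy as np
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

renn = RepeatedEditedNearestNeighbours(n_neighbors=3, max_iter=100)
X_demo = np.random.rand(200, 20)          # already flattened to 2-D
y_demo = np.array([0] * 150 + [1] * 50)
X_res, y_res = renn.fit_resample(X_demo, y_demo)
print(X_res.shape, np.bincount(y_res))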
from numpy.testing import assert_array_equal
from sklearn.datasets import make_classification
from imblearn.under_sampling import EditedNearestNeighbours as ENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline


def test_pipeline_fit_then_sample_of_three_samplers_with_sampler_last_estimator():
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
        n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
        n_samples=50000, random_state=0)

    rus = RandomUnderSampler(random_state=42)
    enn = ENN()
    pipeline = make_pipeline(rus, enn, rus)
    # fit_sample/sample are the pre-0.4 imbalanced-learn pipeline API.
    X_fit_sample_resampled, y_fit_sample_resampled = pipeline.fit_sample(X, y)

    pipeline = make_pipeline(rus, enn, rus)
    pipeline.fit(X, y)
    X_fit_then_sample_resampled, y_fit_then_sample_resampled = pipeline.sample(X, y)

    assert_array_equal(X_fit_sample_resampled, X_fit_then_sample_resampled)
    assert_array_equal(y_fit_sample_resampled, y_fit_then_sample_resampled)
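# On imbalanced-learn >= 0.4 the pipeline exposes fit_resample instead of
# fit_sample/sample; a hypothetical companion test (not from the original
# suite) exercising the modern single-call API:
def test_pipeline_fit_resample_modern_api():
    X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                               n_samples=5000, random_state=0)
    pipeline = make_pipeline(RandomUnderSampler(random_state=42), ENN(),
                             RandomUnderSampler(random_state=42))
    X_res, y_res = pipeline.fit_resample(X, y)
    assert X_res.shape[0] == y_res.shape[0]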
def fit_resample(self, X, y):
    """Resample the dataset.

    First standardize X, then perform SMOTEENN, then de-standardize so the
    results come back in the same "units" as the input.

    Parameters
    ----------
    X : ndarray
        Dense feature matrix where rows are observations.
    y : ndarray
        1-D array of responses.

    Returns
    -------
    X_resampled, y_resampled : ndarray, ndarray
        Resampled X and y in the original "units" of X.
    """
    ss = StandardScaler()
    X_std = ss.fit_transform(X)
    sm = SMOTEENN(
        sampling_strategy=self.sampling_strategy_smoteenn,
        random_state=self.random_state,
        smote=SMOTE(
            random_state=self.random_state,
            k_neighbors=self.k_smote,
            sampling_strategy=self.sampling_strategy_smote,
        ),
        enn=ENN(
            sampling_strategy=self.sampling_strategy_enn,
            n_neighbors=self.k_enn,
            kind_sel=self.kind_sel_enn,
        ),
    )
    X_res, y_res = sm.fit_resample(X_std, y)
    return ss.inverse_transform(X_res), y_res
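# Sketch of the wrapper class this method presumably belongs to (the class
# name and constructor defaults are assumptions; the attribute names are taken
# from the method body above). Standardizing first matters because SMOTE and
# ENN both rely on Euclidean k-NN, so unscaled features would dominate the
# distances.
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours as ENN


class StandardizedSMOTEENN:  # hypothetical name
    def __init__(self, sampling_strategy_smoteenn='auto',
                 sampling_strategy_smote='auto', sampling_strategy_enn='all',
                 k_smote=5, k_enn=3, kind_sel_enn='all', random_state=None):
        self.sampling_strategy_smoteenn = sampling_strategy_smoteenn
        self.sampling_strategy_smote = sampling_strategy_smote
        self.sampling_strategy_enn = sampling_strategy_enn
        self.k_smote = k_smote
        self.k_enn = k_enn
        self.kind_sel_enn = kind_sel_enn
        self.random_state = random_state

    # fit_resample(self, X, y) as defined above would be a method here.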
print(classification_report(Y_test, predictions))
print("Balanced Accuracy:", balanced_accuracy_score(Y_test, predictions))
# print("best parameters Random Forest", model_rf.best_params_)
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
print("best parameters:", GB_model.best_params_)
plot_confusion_matrix(confusion_matrix(Y_test, predictions),
                      ['Dolphin', 'Non-Dolphin'])

# 13th Test ---------------------- MLP -------------------------------- simple
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours as ENN
from imblearn.combine import SMOTETomek

param = {"activation": ['identity', 'logistic', 'tanh', 'relu'],
         "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
         "learning_rate_init": [0.0001, 0.001, 0.01, 0.1, 1],
         "solver": ['lbfgs', 'sgd', 'adam']}

t0 = pl.time.time()
# SMOTETomek was tried first, then replaced by plain ENN (the original code
# rebound `sample`, silently discarding the SMOTETomek instance).
# sample = SMOTETomek(random_state=49, sampling_strategy='minority')
sample = ENN()
Xtrain_sample, Ytrain_sample = sample.fit_resample(X_train, Y_train)

# The grid search was likewise superseded by a fixed-hyperparameter MLP.
# mlp_model = GridSearchCV(
#     MLPClassifier(random_state=49, max_iter=5000, hidden_layer_sizes=(5, 2),
#                   activation='identity', alpha=0.0001,
#                   learning_rate_init=0.0001, solver='lbfgs'),
#     param, cv=3)
mlp_model = MLPClassifier(random_state=49, max_iter=5000,
                          hidden_layer_sizes=(6, 4), activation='relu',
                          alpha=0.0001, solver='lbfgs')
mlp_model.fit(Xtrain_sample, Ytrain_sample.values.ravel())
predictions = mlp_model.predict(X_test)

print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))
print("Balanced Accuracy:", balanced_accuracy_score(Y_test, predictions))
# print("best parameters MLP", mlp_model.best_params_)
t1 = pl.time.time() - t0
print("Time taken: {:.0f} min {:.0f} secs".format(*divmod(t1, 60)))
plot_confusion_matrix(confusion_matrix(Y_test, predictions),
                      ['Dolphin', 'Non-Dolphin'])
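# An alternative sketch, not from the original script: wrapping the sampler
# and the MLP in an imblearn Pipeline lets GridSearchCV resample inside each
# CV fold rather than once before the search, which avoids leaking synthetic
# or cleaned samples into validation folds (X_train/Y_train/param come from
# the script above).
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

search = GridSearchCV(
    Pipeline([('enn', ENN()),
              ('mlp', MLPClassifier(random_state=49, max_iter=5000,
                                    hidden_layer_sizes=(6, 4)))]),
    {'mlp__' + k: v for k, v in param.items()}, cv=3)
search.fit(X_train, Y_train.values.ravel())
print("best parameters MLP:", search.best_params_)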