def oversample(self):
    """Resample the training data to correct class imbalance.

    If the ratio between the two largest classes exceeds 4:1, the
    majority class is first randomly undersampled (to a 0.5 sampling
    strategy) and then SMOTE oversampling is applied; otherwise only
    SMOTE oversampling is used.

    The chosen pipeline steps are stored on ``self.imb_pipeline_steps``
    so they are automatically re-applied during the ``search`` method
    if this method is run first (see
    https://stackoverflow.com/questions/50245684/using-smote-with-gridsearchcv-in-scikit-learn).

    Returns:
        tuple: ``(os_X, os_y)`` — the resampled feature matrix and
        label vector, as returned by imblearn's ``fit_resample``.

    Raises:
        ValueError: if ``self.y_train`` contains fewer than two
            distinct classes, in which case resampling is undefined.
    """
    _, counts = np.unique(self.y_train, return_counts=True)
    # Guard: the previous implementation indexed the second-largest
    # count unconditionally, raising an opaque IndexError when y_train
    # held a single class. Fail loudly and clearly instead.
    if len(counts) < 2:
        raise ValueError(
            "oversample requires at least two classes in y_train"
        )

    sorted_counts = sorted(counts, reverse=True)
    # Imbalance ratio between the two most populated classes.
    ratio = sorted_counts[0] / sorted_counts[1]

    if ratio > 4:
        # Heavily imbalanced: shrink the majority class first so SMOTE
        # does not have to synthesize an extreme number of samples.
        pipeline_steps = [
            ('under_sample', RandomUnderSampler(sampling_strategy=0.5)),
            ('over_sample', SMOTE()),
        ]
    else:
        pipeline_steps = [('over_sample', SMOTE())]

    pipeline = Pipeline(steps=pipeline_steps)
    os_X, os_y = pipeline.fit_resample(self.X_train, self.y_train)

    # Saved so the search method can rebuild the same imbalance pipeline
    # inside cross-validation automatically.
    self.imb_pipeline_steps = pipeline_steps

    # The resampled data is returned for direct use; the stored pipeline
    # steps are what the grid search consumes.
    return os_X, os_y
}  # closes the `parameters` hyperparameter grid opened before this chunk

model = RandomForestClassifier()

# Accumulators across all repeats x folds: per-fold confusion matrices
# and per-fold accuracy scores.
conf_matrix_list_of_arrays = []
scores=[]
# Repeat the full 8-fold stratified-group CV 10 times; metrics are
# aggregated over all 80 fits.
for i in range(10):
    for fold_ind, (train_index, test_index) in enumerate(stratified_group_k_fold(X, y, ids, k=8)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Group ids for each side of the split (computed but not used below).
        train_groups, test_groups = ids[train_index], ids[test_index]
        # Dimensionality reduction: keep 1/5 of the original feature count.
        # Fitted on the training fold only, then applied to both sides,
        # so no test-fold information leaks into the projection.
        ipca = IncrementalPCA(n_components=X_train.shape[1]//5, batch_size=120)
        ipca.fit(X_train)
        X_train=ipca.transform(X_train)
        X_test=ipca.transform(X_test)
        # `pipeline` is defined elsewhere in the file; per the trailing
        # comment it performs SMOTE resampling — applied to the training
        # fold only. TODO confirm its exact steps against its definition.
        X_train, y_train = pipeline.fit_resample(X_train, y_train)#Smote
        # Inner 5-fold grid search over `parameters` (defined above).
        clf = GridSearchCV(model, parameters,cv=5, n_jobs=4)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        conf_matrix = confusion_matrix(y_test, pred)
        conf_matrix_list_of_arrays.append(conf_matrix)
        score=accuracy_score(y_test, pred)
        scores.append(score)
# Element-wise mean confusion matrix over all repeats and folds.
mean_of_conf_matrix_arrays = np.mean(conf_matrix_list_of_arrays, axis=0)
# `f` is an output file handle opened elsewhere in the file.
print(mean_of_conf_matrix_arrays,file=f)
print('Accuracy: %.7f (%.7f)' % (np.mean(scores), np.std(scores)),file=f)
f.close()
# Second experiment: the same repeated stratified-group CV, but over two
# feature views (`Xlog` and `Xeye`), each reduced with its own PCA and
# resampled independently. NOTE(review): this block is cut off below —
# the inner loop body continues past the end of this chunk.
conf_matrix_list_of_arrays = []
scores=[]
for i in range(10):
    for fold_ind, (train_index, test_index) in enumerate(stratified_group_k_fold(Xlog, y, ids, k=8)):
        print("Fold ", fold_ind)
        # Same row split applied to both feature views and the labels.
        Xlog_train, Xlog_test = Xlog[train_index], Xlog[test_index]
        Xeye_train, Xeye_test = Xeye[train_index], Xeye[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Group ids for each side of the split (computed but not used
        # in the visible portion of the loop).
        train_groups, test_groups = ids[train_index], ids[test_index]
        # print(data,X_train.shape)
        # PCA retaining 99.99% of the variance for the "log" view,
        # fitted on the training fold only (no test-fold leakage).
        pca = PCA(n_components=0.9999)
        pca.fit(Xlog_train)
        Xlog_train=pca.transform(Xlog_train)
        Xlog_test=pca.transform(Xlog_test)
        # `pipeline` is defined elsewhere; per the trailing comment it
        # performs SMOTE resampling — TODO confirm against its definition.
        Xlog_train, ylog_train = pipeline.fit_resample(Xlog_train, y_train)#Smote
        # print(data,X_train.shape)
        # Same PCA + resampling treatment for the "eye" view; note `pca`
        # is rebound here, so the log-view projection is no longer held.
        pca = PCA(n_components=0.9999)
        pca.fit(Xeye_train)
        Xeye_train=pca.transform(Xeye_train)
        Xeye_test=pca.transform(Xeye_test)
        Xeye_train, yeye_train = pipeline.fit_resample(Xeye_train, y_train)#Smote
        # Inner 5-fold grid search for the log view; `rf` and
        # `parameters_rf` are defined elsewhere in the file.
        clf1 = GridSearchCV(rf, parameters_rf,cv=5, n_jobs=4)
        clf1.fit(Xlog_train, ylog_train)
        # Training-set confusion matrix for the best estimator
        # (a fit diagnostic, not a test-set metric).
        y_pred = clf1.best_estimator_.predict(Xlog_train)
        m1=confusion_matrix(ylog_train, y_pred)
        # Test-fold class probabilities for the log view; the rest of
        # the loop body lies beyond this chunk.
        pred1 = clf1.predict_proba(Xlog_test)