def test_svm_smote():
    """SVMSMOTE built with default helpers must match one built from
    explicitly configured k/m neighbor searchers and SVM estimator."""
    default_sampler = SVMSMOTE(random_state=42)
    explicit_sampler = SVMSMOTE(
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
        svm_estimator=SVC(random_state=42),
    )
    X_default, y_default = default_sampler.fit_sample(X, Y)
    X_explicit, y_explicit = explicit_sampler.fit_sample(X, Y)
    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
def create_model_from_training_data(self):
    """Fit the sentiment classifier on TF-IDF features of the training
    comments, after balancing classes with SVM-SMOTE.

    Returns the fitted classifier produced by ``self.get_classifier()``.
    """
    print("Training classifier model..")
    texts, ratings = [], []
    for item in self.training_data:
        texts.append(preprocess_text(item.text))
        ratings.append(item.rating)
    # Discard stopwords, apply stemming, and drop terms present in
    # fewer than 3 comments.
    self.vectorizer = TfidfVectorizer(
        tokenizer=tokenize_and_stem,
        sublinear_tf=True,
        max_df=0.5,
        stop_words=mystop_words,
        min_df=3,
    )
    feature_matrix = self.vectorizer.fit_transform(texts).toarray()
    labels = np.array(ratings)
    # SVM-SMOTE oversampling to improve the minority-class ratio.
    sampler = SVMSMOTE(sampling_strategy=0.5, random_state=None,
                       k_neighbors=15, m_neighbors=15, out_step=.0001,
                       svm_estimator=None, n_jobs=1)
    balanced_X, balanced_y = sampler.fit_sample(feature_matrix, labels)
    model = self.get_classifier()
    model.fit(balanced_X, balanced_y)
    return model
def getData(splitData=True, useImbalancer=False, useStratify=False):
    """Load DataSource/binary.csv and return model-ready arrays.

    :param splitData: if True, return an 80/20 train/test split;
        otherwise return the full data as the training set.
    :param useImbalancer: if True (and splitting), undersample the
        majority class of the training split with Tomek links.
    :param useStratify: if True, stratify the split on the target.
    :return: (X_train, X_test, y_train, y_test) when splitData is True,
        else (X_train, y_train); targets are flattened with ravel().
    """
    global standard_scaler
    data = pd.read_csv(filepath_or_buffer="DataSource/binary.csv")
    X = data.values[:, 1:-1]
    # One-hot encode 'rank' (drop_first avoids the dummy-variable trap).
    rank_dummy = pd.get_dummies(data['rank'], drop_first=True).to_numpy()
    X = np.concatenate((X, rank_dummy), axis=1)
    # First column is the target.
    y = data.values[:, 0].reshape(-1, 1)
    stratify = y if useStratify else None
    if splitData:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=101, shuffle=True,
            stratify=stratify)
    else:
        X_train = X
        y_train = y
    if useImbalancer and splitData:
        # Undersample the majority class of the training split only.
        tl = TomekLinks(sampling_strategy='majority')
        X_train, y_train = tl.fit_sample(X=X_train, y=y_train)
        # NOTE(review): a block of commented-out per-pass debug prints
        # ("After 1st pass: ", ... "After 6th pass: ") was garbled by a
        # secrets scrubber here and has been removed; it had no effect
        # on behavior.
    unique, counts = np.unique(y_train, return_counts=True)
    # print("y_train\n", np.asarray((unique, counts)).T)
    if splitData:
        unique, counts = np.unique(y_test, return_counts=True)
        # print("y_test\n", np.asarray((unique, counts)).T)
    if splitData:
        return X_train, X_test, y_train.ravel(), y_test.ravel()
    return X_train, y_train.ravel()
def roc_curves(df, number_of_matches):
    """Fit a one-vs-rest elastic-net logistic model on the first played
    matches and write the class-2 ROC curve (FPR/TPR) to reticulate1.csv.

    :param df: match dataframe with feature columns and 'home_result'.
    :param number_of_matches: how many leading rows to treat as played.
    """
    number_of_matches = int(number_of_matches)
    # NOTE(review): iloc[0:n-1] excludes row n-1, so only n-1 matches are
    # used — confirm whether the last played match should be included.
    df_played_matches = df.iloc[0:number_of_matches - 1]
    base_clf = LogisticRegression(max_iter=300, multi_class='multinomial',
                                  solver='saga', penalty='elasticnet',
                                  l1_ratio=.95)
    classifier = OneVsRestClassifier(base_clf)
    Data = df_played_matches[['home_pos', 'visitor_pos', 'spi1', 'spi2',
                              'draw%', 'home_form', 'visitor_form',
                              'importance1', 'importance2', 'xG1', 'xG2']]
    Target = df_played_matches['home_result']
    y = np.asarray(Target)
    # Encode the three outcomes as integer labels 0/1/2.
    enc = LabelEncoder()
    y = enc.fit(y).transform(y)
    X = np.asarray(Data)
    n_classes = 3
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state=5)
    from imblearn.over_sampling import SVMSMOTE
    # Keep the sampler in its own variable — the original rebound the
    # imported class name SVMSMOTE to an instance, shadowing the class.
    sampler = SVMSMOTE()
    columns = Data.columns
    up_sampled_X, up_sampled_y = sampler.fit_sample(X_train, y_train)
    up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
    up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])
    # Robust scaling fitted on the oversampled training data only.
    scaler = RobustScaler()
    scaler.fit(up_sampled_X)
    X_train = scaler.transform(up_sampled_X)
    X_test = scaler.transform(X_test)
    # Binarize labels so per-class ROC curves can be computed.
    y_train = label_binarize(np.asarray(up_sampled_y), classes=[0, 1, 2])
    y_test = label_binarize(np.asarray(y_test), classes=[0, 1, 2])
    y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
    fpr, tpr, roc_auc = dict(), dict(), dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Micro-average over all classes.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # Persist only the class-2 curve for downstream plotting.
    dataset = pd.DataFrame({'FPR': fpr[2], 'TPR': tpr[2]})
    dataset.to_csv("reticulate1.csv")
def oversampling(X_i, X_p1, X_p2, y_i, strategy='auto', ratio=None, sampling_type='svm'):
    '''Jointly oversample three aligned feature matrices.

    The matrices are concatenated column-wise so each synthetic sample
    stays consistent across the three feature groups, resampled, then
    split back into the original column groups.

    ratio = # majority class instances / # minority class instances

    Parameters
    ----------
    X_i, X_p1, X_p2 : 2-D arrays with the same number of rows.
    y_i : class labels, one per row.
    strategy : unused; kept for backward compatibility.
    ratio : majority/minority ratio; None uses the sampler's default.
    sampling_type : 'svm' for SVMSMOTE, anything else for RandomOverSampler.

    Returns
    -------
    X_i_res, X_p1_res, X_p2_res : resampled column groups.
    y_res : resampled labels.
    X_res : full resampled concatenated matrix.
    '''
    _, ei = X_i.shape
    _, e1 = X_p1.shape
    X_total = np.concatenate((X_i, X_p1, X_p2), axis=1)
    if sampling_type == 'svm':
        from imblearn.over_sampling import SVMSMOTE
        # 'is None' (not '== None') per PEP 8; also avoid naming the
        # sampler 'os', which shadows the stdlib module.
        if ratio is None:
            sampler = SVMSMOTE(random_state=0)
        else:
            sampler = SVMSMOTE(sampling_strategy=1 / ratio, random_state=0)
    else:
        from imblearn.over_sampling import RandomOverSampler
        if ratio is None:
            sampler = RandomOverSampler(random_state=0)
        else:
            sampler = RandomOverSampler(sampling_strategy=1 / ratio,
                                        random_state=0)
    X_res, y_res = sampler.fit_sample(X_total, y_i)
    # Split the resampled matrix back into its three column groups.
    X_i_res = X_res[:, 0:ei]
    X_p1_res = X_res[:, ei:ei + e1]
    X_p2_res = X_res[:, ei + e1:]
    return X_i_res, X_p1_res, X_p2_res, y_res, X_res
def _SMOTE_SVM(self):
    """Oversample the training split with SVM-SMOTE and store the
    balanced data back on self.x_train / self.y_train."""
    print("before SMOTE df", self.x_train.shape)
    sampler = SVMSMOTE(
        k_neighbors=5, m_neighbors=5, random_state=self.seed
    )  # sampling_strategy=0.8
    self.X_train_smote, self.y_train_smote = sampler.fit_sample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])
    label_col = "Local Relapse Y(1) /N(0)"
    self.x_train = pd.DataFrame(self.X_train_smote,
                                columns=self.x_train.columns)
    self.y_train = pd.DataFrame(self.y_train_smote, columns=[label_col])
    print("len new x_train after smote: \n", len(self.x_train))
    positives = self.y_train.loc[self.y_train[label_col] == 1]
    print("number positive responses y_train:\n", len(positives))
def train_models(pos="F", adj=1, clusters=0, classifier=2, imb=1):
    '''
    train models
    :param pos: position (F/D)
    :param adj: apply league/era adjustments
    :param clusters: level of clustering to include
    :param classifier: which classifier to predict
    :param imb: 1 to apply oversampling, 2 to apply undersampling
    :return:
    '''
    import warnings
    warnings.simplefilter(action='ignore', category=Warning)

    def _drop_cols(frame, excluded):
        # Keep every column not listed in `excluded`.
        return frame[[c for c in frame.columns if c not in excluded]]

    conn = sqlite3.connect('nhl_draft.db')
    X_train_original = pd.read_sql_query(
        '''SELECT * FROM X_TRAIN{}{}'''.format(str(classifier), pos), conn)
    X_test_original = pd.read_sql_query(
        '''SELECT * FROM X_TEST{}{}'''.format(str(classifier), pos), conn)
    y_train = pd.read_sql_query(
        '''SELECT * FROM Y_TRAIN{}{}'''.format(str(classifier), pos), conn)
    y_test = pd.read_sql_query(
        '''SELECT * FROM Y_TEST{}{}'''.format(str(classifier), pos), conn)

    # Keep either the raw per-game columns or the adjusted ones.
    if adj == 1:
        adj_drop = ['g_gp17', 'a_gp17', 'g_gp18', 'a_gp18']
    else:
        adj_drop = ['adj_p_a17', 'adj_p_a18', 'adj_g_a17', 'adj_g_a18',
                    'adj_a_a17', 'adj_a_a18']
    X_train = _drop_cols(X_train_original, adj_drop)
    X_test = _drop_cols(X_test_original, adj_drop)

    # Keep only the requested clustering granularity.
    cluster_drop = {
        1: ['clusters100', 'clusters200'],
        2: ['clusters50', 'clusters200'],
        3: ['clusters50', 'clusters100'],
    }.get(clusters, ['clusters50', 'clusters100', 'clusters200'])
    X_train = _drop_cols(X_train, cluster_drop)
    X_test = _drop_cols(X_test, cluster_drop)

    X_train, X_test = one_hot_encoding(X_train, X_test, clusters)

    final_drop = ['player_id', 'adj_g_a17', 'adj_g_a18', 'adj_a_a17',
                  'adj_a_a18']
    X_train = _drop_cols(X_train, final_drop)
    X_test = _drop_cols(X_test, final_drop)

    X_train, X_test, selected_feat = select_features(
        X_train, y_train, X_test, GradientBoostingClassifier(),
        threshold=0.01)

    models = [
        VotingClassifier(estimators=[('gbc', GradientBoostingClassifier()),
                                     ('gnb', GaussianNB())],
                         voting='soft')
    ]
    for model in models:
        print(type(model).__name__)
        if imb == 1:
            # Alternatives: SMOTE, SMOTENC, BorderlineSMOTE, SVMSMOTE,
            # ADASYN, KMeansSMOTE, RandomOverSampler
            smt = SVMSMOTE()
        elif imb == 2:
            smt = NearMiss()
        cv = StratifiedKFold(n_splits=3)
        for train_idx, test_idx in cv.split(X_train, y_train):
            X_train1, X_test1 = X_train[train_idx], X_train[test_idx]
            try:
                y_train1, y_test1 = (y_train.loc[train_idx],
                                     y_train.loc[test_idx])
            except (AttributeError, KeyError):
                # Plain arrays have no .loc; fall back to positional.
                y_train1, y_test1 = y_train[train_idx], y_train[test_idx]
            # Resample only the inner training fold, never the fold used
            # for evaluation.
            if imb != 0:
                X_train1, y_train1 = smt.fit_sample(X_train1, y_train1)
            clf = model
            clf.fit(X_train1, y_train1)
            y_train_pred = clf.predict(X_test1)
            print(confusion_matrix(y_test1, y_train_pred))
            print("Train f1: {}".format(f1_score(y_test1, y_train_pred)))
            print("Train Precision:", precision_score(y_test1, y_train_pred))
            print("Train Recall:", recall_score(y_test1, y_train_pred))
            print("Train Accuracy:", accuracy_score(y_test1, y_train_pred))
        # Final fit on the (optionally resampled) full training set.
        if imb != 0:
            X_train, y_train = smt.fit_sample(X_train, y_train)
        clf = model
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
        print(confusion_matrix(y_test, y_pred))
        print("Test f1: {}".format(f1_score(y_test, y_pred)))
        print("Test Precision:", precision_score(y_test, y_pred))
        print("Test Recall:", recall_score(y_test, y_pred))
        print("Test Accuracy:", accuracy_score(y_test, y_pred))
        print()
        print(selected_feat)
        try:
            print(clf.feature_importances_)
        except AttributeError:
            # Not every estimator exposes feature_importances_.
            pass
        clf.feature_names = selected_feat
        # save_model(clf, 'models/clf{}_{}2.sav'.format(classifier, pos.lower()))
    return
# NOTE(review): fragment of a cross-validation loop body — `train_index`,
# `test_index`, `features`, `target`, `X_train`, `y_train` and the
# score/confusion-matrix accumulators (km_*, svm_sm_*) are defined by
# enclosing code outside this view; the final call is also truncated here.
# Ada Boost on the current (KMeans-SMOTE-resampled) training fold.
adaBoost = AdaBoostClassifier(random_state=0)
adaBoost.fit(X_train, y_train)
res = adaBoost.predict(features[test_index])
# NOTE(review): f1_score is called as (res, target[...]) — predictions in
# the y_true slot; f1 is symmetric only for binary labels, confirm intent.
km_scores['AB'] += metrics.f1_score(res, target[test_index])
km_con_mat['AB'] += confusion_matrix(y_true=target[test_index], y_pred=res)
# Gradient Boost Classifier
gradBoost = GradientBoostingClassifier(random_state=0)
gradBoost.fit(X_train, y_train)
res = gradBoost.predict(features[test_index])
km_scores['GB'] += metrics.f1_score(res, target[test_index])
km_con_mat['GB'] += confusion_matrix(y_true=target[test_index], y_pred=res)
# SVM Smote: rebuild the training fold with SVM-SMOTE oversampling.
svm_smote = SVMSMOTE(random_state=0)
X_train, y_train = svm_smote.fit_sample(features[train_index], target[train_index])
# Logistic Regression
logistic = LogisticRegression(random_state=0)
logistic.fit(X_train, y_train)
res = logistic.predict(features[test_index])
svm_sm_scores['LR'] += metrics.f1_score(res, target[test_index])
svm_sm_con_mat['LR'] += confusion_matrix(y_true=target[test_index], y_pred=res)
# # Ada Boost Classifier
adaBoost = AdaBoostClassifier(random_state=0)
adaBoost.fit(X_train, y_train)
res = adaBoost.predict(features[test_index])
svm_sm_scores['AB'] += metrics.f1_score(res, target[test_index])
svm_sm_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
def runSMOTEvariationsGen(self, folder):
    """
    Create files with SMOTE preprocessing and without preprocessing.

    :param folder: cross-validation folders.
    :return:
    """
    def _write_resampled(X_res, y_res, dataset, fold, suffix):
        # Stack features and labels back together and save one CSV per fold.
        y_res = y_res.reshape(len(y_res), 1)
        newtrain = pd.DataFrame(np.hstack([X_res, y_res]))
        newtrain.to_csv(os.path.join(folder, dataset, str(fold),
                                     ''.join([dataset, suffix])),
                        header=False,
                        index=False)

    smote = SMOTE()
    borderline1 = BorderlineSMOTE(kind='borderline-1')
    borderline2 = BorderlineSMOTE(kind='borderline-2')
    smoteSVM = SVMSMOTE()
    geometric_smote = GeometricSMOTE(n_jobs=-1)
    # NOTE(review): `datasets` is resolved from module scope — confirm.
    for dataset in datasets:  # biclass e multiclass
        for fold in range(5):
            path = os.path.join(folder, dataset, str(fold),
                                ''.join([dataset, "_train.csv"]))
            train = np.genfromtxt(path, delimiter=',')
            # Last column of the CSV is the label.
            X = train[:, 0:train.shape[1] - 1]
            Y = train[:, train.shape[1] - 1]
            # SMOTE
            print("SMOTE..." + dataset)
            X_res, y_res = smote.fit_sample(X, Y)
            _write_resampled(X_res, y_res, dataset, fold, "_SMOTE.csv")
            # SMOTE BORDERLINE1
            print("Borderline1..." + dataset)
            X_res, y_res = borderline1.fit_sample(X, Y)
            _write_resampled(X_res, y_res, dataset, fold, "_Borderline1.csv")
            # SMOTE BORDERLINE2
            print("Borderline2..." + dataset)
            X_res, y_res = borderline2.fit_sample(X, Y)
            _write_resampled(X_res, y_res, dataset, fold, "_Borderline2.csv")
            # SMOTE SVM
            print("SMOTE SVM..." + dataset)
            X_res, y_res = smoteSVM.fit_sample(X, Y)
            _write_resampled(X_res, y_res, dataset, fold, "_smoteSVM.csv")
            # GEOMETRIC SMOTE
            print("GEOMETRIC SMOTE..." + dataset)
            X_res, y_res = geometric_smote.fit_resample(X, Y)
            _write_resampled(X_res, y_res, dataset, fold,
                             "_Geometric_SMOTE.csv")
# Unscaled Features X = data_with_targets.drop([target_variable], axis=1) # Target Variable y = data_with_targets[target_variable] ##################################################### ### SMOTE Sampling to deal with imbalance classes ### ##################################################### # Setting Seed Value seed = 81 smote = SVMSMOTE(random_state=seed) resampled_X, resampled_y = smote.fit_sample(X, y) ########################################## ### Splitting into Train and Test sets ### ########################################## X_train, X_test, y_train, y_test = model_selection.train_test_split( resampled_X, resampled_y, test_size=0.3, stratify=resampled_y, random_state=seed) #################################### ### Scalling Train and Test sets ### ####################################
def upsampleSvmSmote(params, X, y):
    """Oversample (X, y) with an SVM-SMOTE sampler configured by `params`.

    :param params: keyword arguments forwarded to SVMSMOTE.
    :param X: feature matrix.
    :param y: labels.
    :return: the resampled (X, y) pair.
    """
    sampler = SVMSMOTE(**params)
    resampled_X, resampled_y = sampler.fit_sample(X, y)
    return resampled_X, resampled_y
# BLSM (Borderline SMOTE) from imblearn.over_sampling import BorderlineSMOTE sm2 = BorderlineSMOTE(random_state = 42) # BLSM 알고리즘 적용 X_train_res2, y_train_res2 = sm2.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용 lgbm_clf3 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier lgbm_clf3.fit(X_train_res2, y_train_res2) # 학습 진행 y_pred3 = lgbm_clf3.predict(X_test) # 평가 데이터셋 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred3)) # 혼돈행렬 print('\n') print("Model Evaluation Result: \n", classification_report(y_test, y_pred3)) # 전체적인 성능 평가 # SVMSMOTE from imblearn.over_sampling import SVMSMOTE sm3 = SVMSMOTE(random_state = 42) # SVMSMOTE 알고리즘 적용 X_train_res3, y_train_res3 = sm3.fit_sample(X_train, y_train.ravel()) # Over Sampling 적용 lgbm_clf4 = lgbm.LGBMClassifier(n_estimators = 50, random_state = 42) # LGB Classifier lgbm_clf4.fit(X_train_res3, y_train_res3) # 학습 진행 y_pred4 = lgbm_clf4.predict(X_test) # 평가 데이터셋 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, y_pred4)) # 혼돈행렬 print('\n') print("Model Evaluation Result: \n", classification_report(y_test, y_pred4)) # 전체적인 성능 평가 # BLSM을 이용해서 Oversampling 한 학습 데이터 셋 : X_train_res2, y_train_res2 from sklearn.linear_model import LogisticRegression lr_model = LogisticRegression(C = 1e+10) # sklearn 의 Logistic Regression은 기본적으로 Ridge 정규화가 포함되어 있기 때문에, # 정규화 텀을 억제하는 C를 크게 적용한다 (C:Inverse of regularization strength) lr_model.fit(X_train_res2, y_train_res2) # 로지스틱 회귀 모형 학습 lr_predict = lr_model.predict(X_test) # 학습 결과를 바탕으로 검증 데이터를 예측 print("Confusion_Matrix: \n", confusion_matrix(y_test, lr_predict)) # 혼돈행렬
# - On these **97 examples** we'll perform **SVMSMOTE** and **train a statistical model.** That model will **predict** on this **validation set.** # # # - We've **dedicated module** to work on **Imbalanced dataset.** By **imblearn's SVMSMOTE** it becomes easy to perform **SVMSMOTE.** # # # - We will use **"not majority"** as our **sampling strategy** parameter as it will **not sample majority** but all **minority classes.** We've to do this in case of **multi-class Imbalanced data.** # In[68]: #Applying SVMSMOTE from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE smote = SVMSMOTE(sampling_strategy='not majority') x_s_res, y_s_res = smote.fit_sample(x_train, y_train) print(y_train.value_counts(), '\n') np.bincount(y_s_res) # - We can examine above that :<b> # - Class 1 : 35 Samples # - Class 2 : 22 Samples # - Class 3 : 22 Samples # - Class 0 : 08 Samples # - Class 4 : 06 Samples # # # - And After **SVMSMOTE application** we get **26 Samples of one Class & 36 Samples of rest of the classes.** # ### Building a Statistical model using SVM Classifier
# Feature columns used for training and testing.
X_train = df_train[[
    'home_pos', 'visitor_pos', 'spi1', 'spi2', 'win%', 'loss%',
    'importance1', 'importance2', 'xG1', 'xG2'
]]
y_train = df_train['home_result']
X_test = df_test[[
    'home_pos', 'visitor_pos', 'spi1', 'spi2', 'win%', 'loss%',
    'importance1', 'importance2', 'xG1', 'xG2'
]]
from imblearn.over_sampling import SVMSMOTE
# NOTE(review): this rebinds the class name SVMSMOTE to an instance,
# shadowing the import — rename the instance (e.g. `sampler`) if the
# class is ever needed again after this point.
SVMSMOTE = SVMSMOTE()
columns = X_train.columns
up_sampled_X, up_sampled_y = SVMSMOTE.fit_sample(X_train, y_train)
up_sampled_X = pd.DataFrame(data=up_sampled_X, columns=columns)
up_sampled_y = pd.DataFrame(data=up_sampled_y, columns=['home_result'])
from sklearn import preprocessing
# Scale to unit variance; with_mean=False skips centering.
scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(up_sampled_X)
X_train = scaler.transform(up_sampled_X)
X_test = scaler.transform(X_test)
classifier = LogisticRegression(max_iter=10000,
                                multi_class='multinomial',
                                solver='saga',
                                penalty='elasticnet',
                                l1_ratio=.5)
def balance_dataset(df, dfLabel, strategy="all"):
    """Balance a dataset with SVM-SMOTE (fixed seed 42).

    :param df: feature matrix.
    :param dfLabel: labels aligned with `df`.
    :param strategy: sampling_strategy forwarded to SVMSMOTE.
    :return: the oversampled (features, labels) pair.
    """
    sampler = SVMSMOTE(random_state=42, sampling_strategy=strategy)
    balanced_features, balanced_labels = sampler.fit_sample(df, dfLabel)
    return balanced_features, balanced_labels