def run_model_with_smote(dataset, target, model):
    try:
        # Linear/KNN models get the alternate feature build.
        if isinstance(model, (LogisticRegression, KNeighborsClassifier)):
            X_new = build_dataset(dataset, 1)
        else:
            X_new = build_dataset(dataset)
        sm = SMOTE(random_state=42)
        X_smote, y_smote = sm.fit_resample(X_new, target)
        X_train, X_test, y_train, y_test = train_test_split(
            X_smote, y_smote, random_state=42, test_size=0.2, stratify=y_smote)
        print('Class ratio after applying SMOTE : \n', check_imbalance(y_smote))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print('====' * 20)
        print(type(model))
        print('====' * 20)
        print('Classification Report : \n', metrics.classification_report(y_test, y_pred))
        print('Confusion Matrix : \n', metrics.confusion_matrix(y_test, y_pred))
        print('Accuracy score: \n', metrics.accuracy_score(y_test, y_pred))
        # Prefer probability scores for the ROC curve; hard labels yield a
        # single operating point rather than a curve.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = y_pred
        fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
        plt.plot(fpr, tpr)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title('ROC curve')
        plt.show()
        print('AUC : ', metrics.roc_auc_score(y_test, y_score))
        return model, X_train.columns.tolist()
    except Exception as e:
        # Exceptions have no .message() in Python 3; print the exception itself.
        print('run_model_with_smote failed : \n', str(e))
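# A minimal, self-contained sketch of what the SMOTE step above does to the
# class balance. The synthetic data and names (X_demo, y_demo) are
# illustrative only, not part of the original pipeline.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X_demo, y_demo = make_classification(n_samples=1000, n_features=10,
                                     weights=[0.9, 0.1], random_state=42)
print('Before SMOTE:', Counter(y_demo))   # roughly 900 / 100
X_bal, y_bal = SMOTE(random_state=42).fit_resample(X_demo, y_demo)
print('After SMOTE:', Counter(y_bal))     # both classes at the majority count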
def SMOTE_sampling(self, ds):
    self.report.append('SMOTE_sampling')
    Y = ds["Response"]
    X = ds.drop(columns=["Response"])
    sm = SMOTE(random_state=self.seed)
    X_res, Y_res = sm.fit_resample(X, Y)
    # Rebuild the frame with the original feature names, then re-attach the target.
    sampled_ds = pd.DataFrame(X_res, columns=X.columns)
    sampled_ds['Response'] = Y_res
    # sampled_ds.index = ds.index
    # Restore the original column order; blindly assigning ds.columns would
    # mislabel columns whenever 'Response' is not the last column of ds.
    sampled_ds = sampled_ds[ds.columns]
    return round(sampled_ds, 2)
def SMOTE_NC(self):
    self.report.append('SMOTE_NC_sampling')
    Y = self.training["Response"]
    X = self.training.drop(columns=["Response"])
    x_cols = X.columns
    # Select the categorical columns of X itself; masking with
    # self.training.dtypes would misalign, since X lacks 'Response'.
    cat_cols = X.loc[:, X.dtypes == 'category'].columns
    if len(cat_cols) > 0:
        # SMOTENC expects the positions of the categorical columns within X,
        # not their positions within cat_cols.
        sm = SMOTENC(random_state=self.seed,
                     categorical_features=[x_cols.get_loc(col) for col in cat_cols])
    else:
        # No categorical columns: plain SMOTE suffices.
        sm = SMOTE(random_state=self.seed)
    X_res, Y_res = sm.fit_resample(X.values, Y.values)
    sampled_ds = pd.DataFrame(X_res, columns=x_cols)
    sampled_ds['Response'] = Y_res
    # sampled_ds.index = ds.index
    self.training = sampled_ds
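# A hedged, standalone sketch of the SMOTENC behaviour relied on above; the
# toy data and column names are invented for illustration. SMOTENC
# interpolates the numeric columns but picks categorical values from nearest
# neighbours, so it never fabricates unseen categories.
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'age': rng.integers(18, 70, size=200),
    'income': rng.normal(50_000, 10_000, size=200),
    'region': pd.Categorical(rng.choice(['north', 'south'], size=200)),
})
y_toy = np.array([1] * 30 + [0] * 170)                     # imbalanced target
sm_nc = SMOTENC(categorical_features=[2], random_state=0)  # 'region' is column 2
X_res, y_res = sm_nc.fit_resample(toy, y_toy)
# Recent imbalanced-learn versions return a DataFrame when given one.
print(pd.DataFrame(X_res, columns=toy.columns)['region'].unique())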
def boosting_with_smote(dataset, target):
    '''
    Oversample the minority class with SMOTE, then train and evaluate an
    XGBoost classifier.

    Parameters
    ----------
    dataset : DataFrame
        Feature data, passed through build_dataset.
    target : Series
        Binary class labels.

    Returns
    -------
    None.
    '''
    try:
        xgb = XGBClassifier(random_state=42)
        X_new = build_dataset(dataset)
        sm = SMOTE(sampling_strategy='minority', random_state=10)
        X_smote, y_smote = sm.fit_resample(X_new, target)
        print('Shape after SMOTE : ', X_smote.shape)
        X_train, X_test, y_train, y_test = train_test_split(
            X_smote, y_smote, random_state=42, test_size=0.2, stratify=y_smote)
        xgb.fit(X_train, y_train)
        y_pred = xgb.predict(X_test)
        print('====' * 20)
        print(type(xgb))
        print('====' * 20)
        print('Classification Report : \n', metrics.classification_report(y_test, y_pred))
        print('Confusion Matrix : \n', metrics.confusion_matrix(y_test, y_pred))
        print('Accuracy score: \n', metrics.accuracy_score(y_test, y_pred))
        # Use probabilities so the ROC curve has more than one operating point.
        y_score = xgb.predict_proba(X_test)[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
        plt.plot(fpr, tpr)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title('ROC curve')
        plt.show()
        print('AUC : ', metrics.roc_auc_score(y_test, y_score))
    except Exception as e:
        # The stray '+' and .message() in the original would both raise;
        # print the exception directly.
        print('boosting_with_smote failed : \n', str(e))
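# A caveat worth noting next to boosting_with_smote: resampling before
# cross-validation lets synthetic points leak into validation folds. A
# minimal sketch of the usual remedy with imblearn's Pipeline, which
# re-applies SMOTE inside each training fold only; the data and estimator
# settings here are illustrative assumptions.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

X_cv, y_cv = make_classification(n_samples=500, weights=[0.85, 0.15],
                                 random_state=42)
pipe = Pipeline([('smote', SMOTE(random_state=42)),
                 ('xgb', XGBClassifier(random_state=42))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print(cross_val_score(pipe, X_cv, y_cv, scoring='roc_auc', cv=cv).mean())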
# # Over-Sampled Model

# ## Using SMOTE, we over-sample the minority class (MENTHLTH2 = 1) and take
# care to train/test split *before* proceeding with re-sampling.

# In[100]:

from imblearn.over_sampling import SMOTENC

# setting up testing and training sets
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.3, random_state=0)

sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
             sampling_strategy='minority', random_state=0, k_neighbors=5)
X_train_over, y_train_over = sm.fit_resample(X_train3, y_train3)

# describes info about train and test set
print("Number of rows/columns in X_test3 dataset: ", X_test3.shape)
print("Number of rows/columns in y_test3 dataset: ", y_test3.shape)
print("Number of rows/columns in X_train_over dataset: ", X_train_over.shape)
print("Number of rows/columns in y_train_over dataset: ", y_train_over.shape)

# In[101]:

unique, counts = np.unique(y_train3, return_counts=True)
dict(zip(unique, counts))
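# A quick sanity check on the split-then-resample order above (reusing the
# variables just defined): only the training labels should be balanced after
# SMOTENC, while the test labels keep their natural imbalance.
unique_over, counts_over = np.unique(y_train_over, return_counts=True)
print("Resampled train distribution: ", dict(zip(unique_over, counts_over)))
unique_test, counts_test = np.unique(y_test3, return_counts=True)
print("Untouched test distribution: ", dict(zip(unique_test, counts_test)))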
X = users[features].values
y = users[['sortie_client']].values.flatten()
# NB: test_size=0.001 leaves almost no held-out data; kept as in the original.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001)

df = pd.DataFrame(X_train, columns=[
    'mariee', 'retraite', 'a_charge', 'facture_mensuelle', 'telephone',
    'plusieurs_numeros', 'internet', 'total_factures', 'contrat',
    'facture_par_mail', 'client_depuis_mois'
])
columns = df.columns

# sampling_strategy=1 oversamples the minority class up to a 1:1 ratio.
sm = SMOTE(random_state=42, sampling_strategy=1)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

nn = 0

from sklearn.tree import _tree


def tree_to_code(tree, feature_names):
    """Print a fitted decision tree as nested if/else pseudo-code."""
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    # print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        # Standard completion of this well-known recipe: print a rule per
        # internal node and a value per leaf.
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {:.2f}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # {} > {:.2f}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)
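# A hedged usage sketch for tree_to_code: fit a shallow tree on the resampled
# training data defined above and print it as nested if/else rules. The
# max_depth choice is arbitrary; any fitted DecisionTreeClassifier works.
from sklearn.tree import DecisionTreeClassifier

demo_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
demo_tree.fit(X_train_sm, y_train_sm)
tree_to_code(demo_tree, list(columns))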
# The per-column astype calls collapse into one vectorised cast.
purpose_cols = ['educational', 'home_improvement', 'house', 'major_purchase',
                'medical', 'moving', 'other', 'renewable_energy',
                'small_business', 'vacation', 'wedding', 'w']
X_train[purpose_cols] = X_train[purpose_cols].astype(float)

# In[119]:

X_train_smote, y_train_smote = sm.fit_resample(X_train.astype('float'), y_train)

# In[120]:

from collections import Counter
print("Before SMOTE :", Counter(y_train))
print("After SMOTE :", Counter(y_train_smote))

# # Logistic Regression after Balancing

# In[121]:

logreg = LogisticRegression()
logreg.fit(X_train_smote, y_train_smote)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(
    logreg.score(X_test, y_test)))
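# Accuracy alone can flatter a model on an imbalanced test set. A small
# follow-up check, reusing logreg's y_pred from above, that also inspects
# per-class precision and recall; purely illustrative.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))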
X = telcom.iloc[:, 1:32]        # drop customer ID: it won't be used to predict churn.
X = X.drop(['Churn'], axis=1)   # keep only the independent variables.
X = X.astype(float)             # same dataframe, but every column cast to float.
Z = X.values                    # save X as an array under the name Z.
y = telcom['Churn'].values      # dependent variable, as an array.
# Store the names of all predictor variables.
features = [i for i in telcom.columns if i not in Id_col + target_col]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    Z, y, test_size=0.2, random_state=44)

# Implement SMOTE (categorical_features is assumed to be defined earlier).
sm = SMOTENC(random_state=40, categorical_features=categorical_features)
X_smote, y_smote = sm.fit_resample(X_train, y_train)

# Train the model on the resampled training dataset. No more unbalanced classes.
X_train = X_smote
y_train = y_smote

####### MODEL IMPLEMENTATION

# Fitting tuned XGBoost to the Training set
classifier = XGBClassifier(
    n_estimators=150,
    learning_rate=0.15,
    max_depth=3,
    min_child_weight=0.6,
    colsample_bytree=1,
    subsample=1,
    reg_alpha=0,
    gamma=0,
)
from collections import Counter
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y[train], Y[test]
    num_class1, num_class2, num_class3 = (Counter(y_train)[1],
                                          Counter(y_train)[2],
                                          Counter(y_train)[3])
    # Oversample every class: double class 1, grow classes 2 and 3 by 60%.
    sm = SMOTE(random_state=27, sampling_strategy={
        1: int(2.0 * num_class1),
        2: int(1.6 * num_class2),
        3: int(1.6 * num_class3)
    })
    x_train, y_train = sm.fit_resample(x_train, y_train)
    model = LGBMClassifier(random_state=27, max_depth=6, n_estimators=400)
    model.fit(x_train, y_train, categorical_feature=[1, 2, 4, 5, 11])
    preds = model.predict(x_test)
    score = f1_score(y_test, preds, average="weighted")
    scores.append(score)
    print(score)
print("Average: ", sum(scores) / len(scores))

## Make the final prediction using LightGBM.
# We apply SMOTE to all classes, increasing the sample size of each one;
# this generalizes the decision boundary.
num_class1, num_class2, num_class3 = Counter(Y)[1], Counter(Y)[2], Counter(Y)[3]
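# A self-contained illustration of the dict form of sampling_strategy used in
# the loop above: keys are class labels, values the desired post-resampling
# counts. The three-class data is synthetic and the 2.0x/1.6x multipliers
# simply mirror the scheme above.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X_mc, y_mc = make_classification(n_samples=1200, n_classes=3, n_informative=6,
                                 weights=[0.6, 0.25, 0.15], random_state=27)
before = Counter(y_mc)
targets = {0: int(2.0 * before[0]), 1: int(1.6 * before[1]), 2: int(1.6 * before[2])}
X_up, y_up = SMOTE(random_state=27, sampling_strategy=targets).fit_resample(X_mc, y_mc)
print('Before:', before)
print('After: ', Counter(y_up))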