def train_unbalanced(features, model):
    # Relies on the module-level df_features DataFrame and enc encoder.
    # Get relevant columns: X is features, y is label
    print("Getting relevant columns")
    X = df_features[features]
    y = df_features['90%rejected']

    # One-hot encode the categorical features
    print("One-hot encoding categorical features")
    enc.fit(X)
    X = enc.transform(X)

    # Split 80/20 into training and test data, then hold out 10% of the
    # training set for validation
    print("Splitting data")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=0.10, random_state=12)

    # Oversample the training data with SMOTE to a 1:1 class ratio
    print("Oversampling training data")
    sm = SMOTE(random_state=12, sampling_strategy=1.0)
    X_train_res, y_train_res = sm.fit_resample(X_train2, y_train2)

    # Classify with the given model
    print("Fitting training data to model")
    model.fit(X_train_res, y_train_res)

    # Validate: mean accuracy and recall score
    print('Validation Results')
    print(model.score(X_test2, y_test2), 'out of 1.00 predictions were correct')
    y_pred_2 = model.predict(X_test2)
    print(recall_score(y_test2, y_pred_2), 'out of 1.00 of bad data identified')
    print(y_pred_2.sum())

    print('\nTest Results')
    print(model.score(X_test, y_test), 'out of 1.00 predictions were correct')
    y_pred = model.predict(X_test)
    print(recall_score(y_test, y_pred), 'out of 1.00 of bad data identified')
    print(classification_report(y_test, y_pred))
    print(y_pred.sum())

    print('\nTraining Data Results')
    y_model = model.predict(X_train_res)
    print(classification_report(y_train_res, y_model))
    print(y_model.sum())

    return model
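# A minimal usage sketch for train_unbalanced (not from the original): the toy
# df_features, the OneHotEncoder instance, and the feature names here are
# illustrative assumptions; the function expects df_features and enc to exist
# as module-level names.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

df_features = pd.DataFrame({
    'supplier': ['a', 'b', 'a', 'c'] * 250,
    'category': ['x', 'x', 'y', 'y'] * 250,
    '90%rejected': [0, 0, 0, 1] * 250,
})
enc = OneHotEncoder(handle_unknown='ignore')
trained = train_unbalanced(['supplier', 'category'], LogisticRegression(max_iter=1000))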
def gen_resample(X, y, ratio):
    '''SMOTE-resample (X, y) to the given class ratio, pickle the results
    under pickles/, and return the resampled data.'''
    sm = SMOTE(sampling_strategy=ratio, random_state=42)
    X_rsmpl, y_rsmpl = sm.fit_resample(X, y)
    df_rsmpl = pd.DataFrame(
        np.concatenate((np.asarray(X_rsmpl), np.asarray(y_rsmpl).reshape(-1, 1)), axis=1),
        columns=list(X.columns) + [y.name])
    # Strip the '.' from the ratio so it can be used in a file name, e.g. 1.0 -> '10'
    ratio_name = re.sub(r'[.]', '', str(ratio))
    pickle.dump(X_rsmpl, open('pickles/X_rsmpl_train_' + ratio_name + '.pkl', 'wb'))
    pickle.dump(y_rsmpl, open('pickles/y_rsmpl_train_' + ratio_name + '.pkl', 'wb'))
    pickle.dump(df_rsmpl, open('pickles/df_rsmpl_train_' + ratio_name + '.pkl', 'wb'))
    return (X_rsmpl, y_rsmpl)
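# A usage sketch for gen_resample (assumed, not from the original): toy data,
# and the pickles/ directory it writes into must exist first.
import os
import re
import pickle
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

os.makedirs('pickles', exist_ok=True)
X_toy = pd.DataFrame({'f1': np.arange(100.0), 'f2': np.arange(100.0, 200.0)})
y_toy = pd.Series([0] * 80 + [1] * 20, name='label')
X_bal, y_bal = gen_resample(X_toy, y_toy, 1.0)  # oversample the minority to a 1:1 ratio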
            'facture_par_mail', 'client_depuis_mois']

X = telecom_users[features].values  # predictor variables for the users
y = telecom_users[['sortie_client']].values.flatten()  # target variable to predict
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply naive Bayes and inspect the confusion matrix
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(X_train, y_train)
X_test_eval = clf.predict(X_test)
print(confusion_matrix(y_test, X_test_eval))

# Oversample, since the confusion-matrix results were not conclusive
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
clf = ComplementNB()
clf.fit(X_train_res, y_train_res)
# Sanity check: confusion matrix on the resampled training data itself
X_test_eval = clf.predict(X_train_res)
confusion_matrix(y_train_res, X_test_eval)

# Predict on the evaluation users
X_test_eval = telecom_users_eval[features].values
X_pred = clf.predict(X_test_eval)
X_pred
df = pd.read_csv('framingham.csv')
df = df.dropna()

# Separate the two outcome classes, then recombine them
zeros = df[df['TenYearCHD'] == 0]
ones = df[df['TenYearCHD'] == 1]
df_new = pd.concat([zeros, ones], axis=0)

X = df_new[['age', 'glucose', 'male', 'sysBP', 'totChol', 'cigsPerDay']]
y = df_new['TenYearCHD']

# Oversample the minority (CHD-positive) class with SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
X_new = pd.DataFrame(X_res, columns=list(X.columns))
y_new = pd.Series(y_res)
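# Quick check (assumed, not from the original) that the classes are balanced
# after resampling:
print(y_new.value_counts())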
test_score = test_result.score(X_test.iloc[:, test_selected_features], Y_test)
test_scores.append(test_score)

print(statistics.mean(test_scores))
# For l=4, the accuracy on the train set is 0.88571, while the accuracy on the test set is 0.65

# ### 3.6 Well-separated (dvi)
# The classes are well-separated, and some of the p-values of the parameters
# are higher than expected, which means the model is very unstable.

# ### 3.7 Case-control sampling (dvii)
# Yes: one of the two binary classes has 9 instances while the other has 60,
# which is imbalanced.

# In[10]:

# dvii: rebalance the training data with SMOTE, then redo the feature selection
sm = SMOTE()
X_res, Y_res = sm.fit_resample(best_X_train, best_Y_train)
X_res = pd.DataFrame(X_res)
Y_res = pd.DataFrame(Y_res)

res_model = LogisticRegression(C=1e9)
res_selector = RFECV(res_model, step=1, cv=5)
res_selector = res_selector.fit(X_res, Y_res.values.ravel())
res_selected_features = [x for x in range(len(res_selector.support_))
                         if res_selector.support_[x]]
res_result = res_model.fit(X_res.iloc[:, res_selected_features], Y_res.values.ravel())
res_pred = res_model.predict(X_res.iloc[:, res_selected_features])

# Confusion matrix (predict already returns hard 0/1 labels)
con_matrix = confusion_matrix(Y_res, res_pred)
TP = con_matrix[1][1]
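# A sketch (not from the original) of reading the remaining cells out of the
# 2x2 sklearn confusion matrix, whose layout is [[TN, FP], [FN, TP]], and
# deriving sensitivity/specificity from them.
TN, FP = con_matrix[0][0], con_matrix[0][1]
FN = con_matrix[1][0]
sensitivity = TP / (TP + FN)  # recall on the positive class
specificity = TN / (TN + FP)
print(sensitivity, specificity)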
# Keep only the numeric columns and inspect correlations with the target
df = df.select_dtypes(np.number)
for i in df.columns:
    print(df[i].corr(y))
df.corr()

df = pd.concat([df, subset], sort=False, axis=1)
X = df
m = sm.OLS(y, X).fit()  # statsmodels OLS on the full design matrix

# Visualise the first two principal components, coloured by the target
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
temp = pd.DataFrame(data=principalComponents)
temp['target'] = y
plot = sns.pairplot(temp, hue='target', diag_kind='hist')

# Oversample with SMOTE; use a separate name so it does not shadow
# statsmodels, which is already imported as sm
smote = SMOTE()
X, y = smote.fit_resample(X, y)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Random Forest Classifier (fit on the training split, not the full data,
# so the test score is not inflated)
model = RandomForestClassifier(n_estimators=1000)
model.fit(x_train, y_train)
model.score(x_test, y_test)

# XGBoost Classifier
model = xg_reg = xgb.XGBClassifier(subsample=1.0, min_child_weight=10, learning_rate=0.1,
plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')
plt.show()

#----------------------------------------------------
# SMOTE METHOD
#----------------------------------------------------
np.random.seed(1234)
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_smote, y_smote = sm.fit_resample(X_new, y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=0)

# Instantiate the classifiers and make a list
classifiers = [LogisticRegression(random_state=1234),
               GaussianNB(),
               svm.SVC(),
               RandomForestClassifier(random_state=1234),
               XGBClassifier()]

# Define a result table as a DataFrame
result_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc'])
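# A sketch (assumed, not part of the original) of how result_table might be
# populated: fit each classifier, then collect its test-set ROC curve and AUC,
# falling back to decision_function for models such as SVC that do not expose
# predict_proba by default.
from sklearn.metrics import roc_curve, auc

for cls in classifiers:
    cls.fit(X_train, y_train)
    if hasattr(cls, 'predict_proba'):
        scores = cls.predict_proba(X_test)[:, 1]
    else:
        scores = cls.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    row = {'classifiers': cls.__class__.__name__, 'fpr': fpr, 'tpr': tpr, 'auc': auc(fpr, tpr)}
    result_table = pd.concat([result_table, pd.DataFrame([row])], ignore_index=True)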
Xvars['strategy'] = sub_df2['Strategy']
yvars = sub_df2['categorical']
#yvars.reset_index(inplace=True)

# Split data
np.random.seed(100)

###############sklearn####################
# Drop the rows where age is missing
index_nan = Xvars['ages'][np.isnan(Xvars['ages'])].index
Xvars.drop(index_nan, axis=0, inplace=True)
yvars.drop(index_nan, axis=0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(Xvars, yvars, test_size=0.2)

# Note: SMOTE is applied to the full Xvars/yvars here, not just to X_train
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(Xvars, yvars)
#lm = linear_model.LinearRegression()

######################random forest###################
### Tune hyperparameters
rf = ensemble.RandomForestClassifier(n_estimators=100,
                                     random_state=42,
                                     n_jobs=-1,
                                     min_samples_leaf=1,
                                     criterion='entropy',
                                     #class_weight='balanced',
                                     )
rf_mod = rf.fit(X_sm, y_sm)

# Candidate hyperparameter values for the search
n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
# Number of features to consider at every split ('auto' is a legacy alias of 'sqrt')
max_features = ['auto', 'sqrt']
# RFC with best parameters
clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=2, random_state=1991)
clf.fit(training_regressors, training_target)
predictions_rfc = clf.predict(test_regressors)
accuracy_score(test_target, predictions_rfc)
print(confusion_matrix(test_target, predictions_rfc))

# SMOTE Oversampling
sm = SMOTE(random_state=1991, sampling_strategy="auto")
regressors_train, target_train = sm.fit_resample(training_regressors, training_target)

# RFC with best parameters, refit on the resampled training data
clf = RandomForestClassifier(n_estimators=400, min_samples_leaf=2, random_state=1991)
clf.fit(regressors_train, target_train)
predictions_rfc = clf.predict(test_regressors)
accuracy_score(test_target, predictions_rfc)
print(confusion_matrix(test_target, predictions_rfc))

## Random Forest
def random_forest_func(pred_train, pred_test, tar_train, tar_test):
del raw_data
print(x_train.shape, x_test.shape)

# Standardise each sample (row) to zero mean and unit variance
x_train = ((x_train - np.mean(x_train, axis=1).reshape(-1, 1)) /
           np.std(x_train, axis=1).reshape(-1, 1))
x_test = ((x_test - np.mean(x_test, axis=1).reshape(-1, 1)) /
          np.std(x_test, axis=1).reshape(-1, 1))
print(x_train.shape, x_test.shape)

#x_train = np.concatenate((x_train, x_test), axis=0)
#y_train = np.concatenate((y_train, y_test), axis=0)

seed = 7
np.random.seed(seed)

# Oversample the training set to a 1:1 class ratio
sm = SMOTE(sampling_strategy=1.0)
print(x_train.shape, y_train.shape)
x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
print(len(x_train_sm))
print(x_train_sm.shape, y_train_sm.shape)

# Stack the raw signal with a 200-sample moving average as a second channel
x_train_sm = np.stack([x_train_sm, uniform_filter1d(x_train_sm, axis=1, size=200)], axis=2)
x_test = np.stack([x_test, uniform_filter1d(x_test, axis=1, size=200)], axis=2)

def create_model(init_mode='glorot_uniform', activation='relu',
                 dropout_rate=0.5, neurons=64, optimizer='sgd', filters=8):
print(auc_xg)

# ## Using SMOTE

# In[110]:

from imblearn.over_sampling import SMOTE

# In[111]:

sm = SMOTE(random_state=2)
X_train, y_train = sm.fit_resample(xtrain, ytrain)

# In[112]:

y_train = pd.DataFrame(data=y_train, columns=["Converted_y_N"])

# **Find the important features by Recursive Feature Elimination**

# In[117]:

logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
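# A sketch (assumed continuation, not from the original) of fitting the
# selector and listing the 20 retained features; this assumes xtrain was a
# DataFrame, so the resampled X_train keeps its column names.
rfe = rfe.fit(X_train, y_train.values.ravel())
print(X_train.columns[rfe.support_])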
# #### ROC curves for both the Random Forest Classifier and Logistic Regression were
# very similar: both had an Area Under the Curve (AUC) of .77.

# # Over-Sampled Model
# ## Using SMOTE, we over-sample the minority class (MENTHLTH2 = 1) and take care
# to test/train split before proceeding with re-sampling.

# In[30]:

from imblearn.over_sampling import SMOTENC

# Setting up testing and training sets
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTENC handles the categorical feature columns (indices 1-13) explicitly
sm = SMOTENC(categorical_features=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
             sampling_strategy='minority', random_state=0)
X_train_over, y_train_over = sm.fit_resample(X_train3, y_train3)

# Describes info about the train and test sets
print("Number of rows/columns in X_test3 dataset: ", X_test3.shape)
print("Number of rows/columns in y_test3 dataset: ", y_test3.shape)
print("Number of rows/columns in X_train_over dataset: ", X_train_over.shape)
print("Number of rows/columns in y_train_over dataset: ", y_train_over.shape)

# In[31]:

unique, counts = np.unique(y_train3, return_counts=True)
dict(zip(unique, counts))
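# For comparison (assumed continuation, not from the original): the class
# counts after SMOTENC, which should now be balanced.
unique_over, counts_over = np.unique(y_train_over, return_counts=True)
dict(zip(unique_over, counts_over))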
# Stratify so that the class split of the imbalanced dataset is the same in the
# train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3, stratify=y)

# Scale the X matrix with StandardScaler
ss = StandardScaler()
Xss_train = ss.fit_transform(X_train)
Xss_test = ss.transform(X_test)

# SMOTE the training set, as the data set is skewed towards having more non-defaulters
sm = SMOTE(random_state=1, sampling_strategy='minority')
Xss_sm_train, y_sm_train = sm.fit_resample(Xss_train, y_train)

# Using RandomForestClassifier as an estimator, fit on the resampled training set
rclf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)
rclf.fit(Xss_sm_train, y_sm_train)

gs_params = {
    'criterion': ['gini'],
    'max_depth': [None, 1, 5, 10],
    'max_features': ['sqrt', 3, 7],  # 'sqrt' replaces the removed 'auto' alias
    'n_estimators': [200, 500, 1000],
    'random_state': [1],
}
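# A sketch (assumed, not shown in the original) of how gs_params might be used:
# grid-search the random forest over the resampled training set. The recall
# scoring choice is illustrative.
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(RandomForestClassifier(), gs_params, cv=5, scoring='recall', n_jobs=-1)
gs.fit(Xss_sm_train, y_sm_train)
print(gs.best_params_, gs.best_score_)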
X.shape
X.head()
Y = y
Y.shape
Y.head()

"""SMOTE Analysis"""

print("Before OverSampling, counts of label '1': {}".format(sum(Y == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(Y == 0)))

# Import the SMOTE module from the imblearn library
# pip install imblearn (if you don't have imblearn on your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X, Y.values.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After OverSampling, counts of label '1': {}".format(
    sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(
    sum(y_train_res == 0)))

# Balanced classes after SMOTE analysis
sns.countplot(y_train_res)

"""Train Test Split"""

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train_res,
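# Note (not from the original): because SMOTE was run on all of X above, the
# test split drawn here will contain synthetic samples. A common alternative is
# to split first and resample only the training portion, e.g.:
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# x_train_res, y_train_res = SMOTE(random_state=2).fit_resample(x_train, y_train)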
def oversampling(label):
    # Resample the training data with SMOTE for the given label column;
    # relies on the module-level sm, X_train, and y_train
    X_res, Y_res = sm.fit_resample(X_train, y_train[label])
    X_res = pd.DataFrame(X_res)
    Y_res = pd.DataFrame(Y_res)
    return X_res, Y_res
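# A usage sketch (assumed, not from the original): sm, X_train, and y_train
# must already exist at module level, and 'label_a' is an illustrative column
# name in the y_train DataFrame.
# sm = SMOTE(random_state=42)
# X_bal, Y_bal = oversampling('label_a')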