def train_unbalanced(features, model):
    # Get relevant columns: X is features, y is label
    # (df_features and the one-hot encoder enc are assumed to be defined at module level)
    print("Getting relevant columns")
    X = df_features[features]
    y = df_features['90%rejected']

    # one-hot encode the categorical features
    print("One-hot encoding categorical features")
    enc.fit(X)
    X = enc.transform(X)

    # Split 80/20 training and test data
    print("Splitting data")
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.10,
                                                            random_state=12)

    # oversample training data
    print("Oversampling training data")
    # note: ratio= and fit_sample() are the older imbalanced-learn API
    # (newer releases use sampling_strategy= and fit_resample())
    sm = SMOTE(random_state=12, ratio=1.0)
    X_train_res, y_train_res = sm.fit_sample(X_train2, y_train2)

    # Classify with given model
    print("Fitting training data to model")
    model.fit(X_train_res, y_train_res)

    # Validate: mean accuracy and recall score
    print('Validation Results')
    print(model.score(X_test2, y_test2),
          'out of 1.00 predictions were correct')
    y_pred_2 = model.predict(X_test2)
    #print(np.unique(y_pred_2))
    print(recall_score(y_test2, y_pred_2),
          'out of 1.00 of the bad data identified')
    print(y_pred_2.sum())

    print('\nTest Results')
    print(model.score(X_test, y_test), 'out of 1.00 predictions were correct')
    y_pred = model.predict(X_test)
    #print(np.unique(y_pred))
    print(recall_score(y_test, y_pred), 'out of 1.00 of the bad data identified')
    print(classification_report(y_test, y_pred))
    print(y_pred.sum())

    print('\nTraining Data Results')
    y_model = model.predict(X_train_res)
    print(classification_report(y_train_res, y_model))
    print(y_model.sum())

    return model
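# A minimal usage sketch (not from the original source), assuming the module-level
# df_features DataFrame and one-hot encoder enc exist, and that SMOTE,
# train_test_split, recall_score and classification_report are imported; the
# feature names below are hypothetical.
from sklearn.ensemble import RandomForestClassifier

example_features = ['supplier', 'material_group', 'plant']
fitted_model = train_unbalanced(example_features,
                                RandomForestClassifier(n_estimators=200, random_state=42))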
Example n. 2
def gen_resample(X, y, ratio):
    '''
    Oversample (X, y) with SMOTE at the given ratio, pickle the resampled
    arrays plus a combined DataFrame under pickles/, and return the arrays.
    '''
    sm = SMOTE(ratio=ratio, random_state=42)
    X_rsmpl, y_rsmpl = sm.fit_sample(X, y)
    df_rsmpl = pd.DataFrame(np.concatenate((X_rsmpl, y_rsmpl.reshape(-1, 1)), axis=1),
                            columns=list(X.columns) + [y.name])
    ratio_name = re.sub(r'[.]', '', str(ratio))

    pickle.dump(X_rsmpl, open('pickles/X_rsmpl_train_' + ratio_name + '.pkl', 'wb'))
    pickle.dump(y_rsmpl, open('pickles/y_rsmpl_train_' + ratio_name + '.pkl', 'wb'))
    pickle.dump(df_rsmpl, open('pickles/df_rsmpl_train_' + ratio_name + '.pkl', 'wb'))

    return X_rsmpl, y_rsmpl
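# A minimal usage sketch for gen_resample, assuming X is a pandas DataFrame, y a
# named pandas Series, that re/pickle/np/pd are imported as in the surrounding
# module, and that the hypothetical X_train_df / y_train_sr variables hold the
# training split.
import os

os.makedirs('pickles', exist_ok=True)                     # the pickles are written here
X_bal, y_bal = gen_resample(X_train_df, y_train_sr, 1.0)  # fully balance the classes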
Example n. 3
    'facture_par_mail', 'client_depuis_mois'
]
X = telecom_users[features].values  # predictor variables for the users
y = telecom_users[['sortie_client']].values.flatten()  # target variable to predict

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    random_state=42)  # split the data into train and test sets

# apply Naive Bayes and look at the confusion matrix
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(X_train, y_train)
X_test_eval = clf.predict(X_test)
print(confusion_matrix(y_test, X_test_eval))

# oversample, since the confusion-matrix results were not conclusive
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, ratio=1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

clf = ComplementNB()
clf.fit(X_train_res, y_train_res)
y_train_res_pred = clf.predict(X_train_res)  # note: evaluated on the resampled training data
print(confusion_matrix(y_train_res, y_train_res_pred))
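# For comparison (an addition, not in the original snippet): the oversampled model
# would normally also be evaluated on the untouched hold-out split from above.
print(confusion_matrix(y_test, clf.predict(X_test)))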

# predictions on the evaluation users
X_test_eval = telecom_users_eval[features].values
X_pred = clf.predict(X_test_eval)
X_pred
Example n. 4
df = pd.read_csv('framingham.csv')
df = df.dropna()
zeros = df[df['TenYearCHD'] == 0]
ones = df[df['TenYearCHD'] == 1]
df_new = pd.concat([zeros, ones], axis=0)  # recombine both classes into one frame

X = df_new[['age','glucose','male','sysBP','totChol','cigsPerDay']]
y = df_new['TenYearCHD']

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y.ravel())

X_new = pd.DataFrame(X_res, columns=list(X.columns))
y_new = pd.Series(y_res)

Example n. 5
    test_score = test_result.score(X_test.iloc[:, test_selected_features],
                                   Y_test)
    test_scores.append(test_score)
print(statistics.mean(test_scores))

# For l=4, the accuracy on the train set is 0.88571, while the accuracy on the test set is 0.65
# ### 3.6 Well-separated (dvi)
# The classes are well-separated, and some of the parameters' p-values are higher than expected, which means the model is very unstable.
# ### 3.7 Case-control sampling (dvii)
# Yes, one of the binary classes has 9 instances while the other has 60, which is imbalanced.
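# A quick sketch (not in the original cell) verifying the imbalance described above;
# assumes best_Y_train holds the binary training labels used in the next cell.
import numpy as np
print(np.unique(best_Y_train, return_counts=True))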

# In[10]:

#d vii
sm = SMOTE()
X_res, Y_res = sm.fit_sample(best_X_train, best_Y_train)
X_res = pd.DataFrame(X_res)
Y_res = pd.DataFrame(Y_res)
res_model = LogisticRegression(C=1e9)
res_selector = RFECV(res_model, step=1, cv=5)
res_selector = res_selector.fit(X_res, Y_res)
res_selected_features = [
    x for x in range(len(res_selector.support_)) if res_selector.support_[x]
]
res_result = res_model.fit(X_res.iloc[:, res_selected_features], Y_res)
res_pred = res_model.predict(X_res.iloc[:, res_selected_features])

#confusion matrix
con_matrix = confusion_matrix(Y_res, (res_pred > 0.5).astype(int))
TP = con_matrix[1][1]
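# A sketch (not part of the original) reading out the remaining cells of
# scikit-learn's confusion matrix (rows = true class, columns = predicted class).
TN = con_matrix[0][0]
FP = con_matrix[0][1]
FN = con_matrix[1][0]
print('recall on the resampled training data:', TP / (TP + FN))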
Example n. 6
df = df.select_dtypes(np.number)
for i in df.columns:
    print(df[i].corr(y))

df.corr()

df = pd.concat([df, subset], sort=False, axis=1)
X = df
m = sm.OLS(y, X).fit()
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
temp = pd.DataFrame(data=principalComponents)
temp['target'] = y

plot = sns.pairplot(temp, hue='target', diag_kind='hist')
# note: here `sm` is presumably an imblearn sampler (e.g. SMOTE), shadowing the
# statsmodels alias used for sm.OLS above
X, y = sm.fit_sample(X, y.ravel())
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)


# Random Forest Classifier
model = RandomForestClassifier(n_estimators=1000)
model.fit(X, y)  # note: fit on the full X/y, so the test rows are seen during training
model.score(x_test, y_test)


#XGBoost Classifier
model= xg_reg = xgb.XGBClassifier(subsample= 1.0,
                                 min_child_weight= 10,
                                 learning_rate= 0.1,
plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

#----------------------------------------------------
#                    SMOTE METHOD
#---------------------------------------------------
np.random.seed(1234)
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_smote, y_smote = sm.fit_sample(X_new, y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state = 0)


# Instantiate the classfiers and make a list
classifiers = [LogisticRegression(random_state=1234), 
               GaussianNB(),
               svm.SVC(),  
               RandomForestClassifier(random_state=1234),
               XGBClassifier()]

# Define a result table as a DataFrame
result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])
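# A sketch (not from the original example) of filling the result table above,
# assuming a binary target and that X_train/X_test/y_train/y_test from the split
# above and the classifiers list are in scope.
from sklearn.metrics import roc_curve, roc_auc_score

rows = []
for cls in classifiers:
    model = cls.fit(X_train, y_train)
    try:
        yproba = model.predict_proba(X_test)[:, 1]     # probability of the positive class
    except AttributeError:                             # e.g. svm.SVC() without probability=True
        yproba = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, yproba)
    rows.append({'classifiers': model.__class__.__name__,
                 'fpr': fpr,
                 'tpr': tpr,
                 'auc': roc_auc_score(y_test, yproba)})
result_table = pd.DataFrame(rows, columns=result_table.columns)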
Example n. 8
Xvars['strategy'] = sub_df2['Strategy']

yvars = sub_df2['categorical']
#yvars.reset_index(inplace = True)

#split data
np.random.seed(100)
###############sklearn####################
index_nan = Xvars['ages'][np.isnan(Xvars['ages'])].index
Xvars.drop(index_nan, axis=0, inplace=True)
yvars.drop(index_nan, axis=0, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(Xvars, yvars, test_size=0.2)
sm = SMOTE(random_state=42)
# note: SMOTE is applied to the full Xvars/yvars here rather than to the training split only
X_sm, y_sm = sm.fit_sample(Xvars, yvars)
#lm = linear_model.LinearRegression()

######################random forest###################
###tune hyperparameters
rf = ensemble.RandomForestClassifier(n_estimators=100,
                                     random_state=42,
                                     n_jobs=-1,
                                     min_samples_leaf=1,
                                     criterion='entropy',
                                     #class_weight = 'balanced')
                                    )
rf_mod = rf.fit(X_sm, y_sm)
n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
Example n. 9
# RFC with best parameters
clf = RandomForestClassifier(n_estimators=100,
                             min_samples_leaf=2,
                             random_state=1991)
clf.fit(training_regressors, training_target)

predictions_rfc = clf.predict(test_regressors)

accuracy_score(test_target, predictions_rfc)
print(confusion_matrix(test_target, predictions_rfc))

# SMOTE Oversampling

sm = SMOTE(random_state=1991, ratio="auto")
regressors_train, target_train = sm.fit_sample(training_regressors,
                                               training_target)

# RFC with best parameters
clf = RandomForestClassifier(n_estimators=400,
                             min_samples_leaf=2,
                             random_state=1991)
clf.fit(regressors_train, target_train)

predictions_rfc = clf.predict(test_regressors)

accuracy_score(test_target, predictions_rfc)
print(confusion_matrix(test_target, predictions_rfc))


## Random Forest
def random_forest_func(pred_train, pred_test, tar_train, tar_test):
Example n. 10
del raw_data

print(x_train.shape, x_test.shape)
x_train = ((x_train - np.mean(x_train, axis=1).reshape(-1, 1)) /
           np.std(x_train, axis=1).reshape(-1, 1))
x_test = ((x_test - np.mean(x_test, axis=1).reshape(-1, 1)) /
          np.std(x_test, axis=1).reshape(-1, 1))
print(x_train.shape, x_test.shape)
#x_train = np.concatenate((x_train, x_test), axis=0)
#y_train = np.concatenate((y_train, y_test), axis=0)

seed = 7
np.random.seed(seed)
sm = SMOTE(ratio=1.0)
print(x_train.shape, y_train.shape)
x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

print(len(x_train_sm))
print(x_train_sm.shape, y_train_sm.shape)
x_train_sm = np.stack(
    [x_train_sm, uniform_filter1d(x_train_sm, axis=1, size=200)], axis=2)
x_test = np.stack([x_test, uniform_filter1d(x_test, axis=1, size=200)], axis=2)


def create_model(init_mode='glorot_uniform',
                 activation='relu',
                 dropout_rate=0.5,
                 neurons=64,
                 optimizer='sgd',
                 filters=8):
print(auc_xg)


# ## using smote

# In[110]:


from imblearn.over_sampling import SMOTE


# In[111]:


sm = SMOTE(random_state=2)
X_train, y_train = sm.fit_sample(xtrain, ytrain)


# In[112]:


y_train = pd.DataFrame(data=y_train, columns=["Converted_y_N"])


# **Find the important features by Recursive Feature Elimination**

# In[117]:


logreg = LogisticRegression()
rfe = RFE(logreg, 20)
# #### The ROC curves for the Random Forest Classifier and Logistic Regression were very similar; both had an Area Under the Curve (AUC) of 0.77.

# # Over-Sampled Model

# ## Using SMOTE, we over-sample the minority class (MENTHLTH2 = 1), taking care to do the test/train split before proceeding with re-sampling; a cross-validation-safe variant is sketched after the next cell.

# In[30]:


from imblearn.over_sampling import SMOTENC

# setting up testing and training sets
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.3, random_state=0)

sm = SMOTENC(categorical_features=[1,2,3,4,5,6,7,8,9,10,11,12,13],sampling_strategy='minority', random_state=0)
X_train_over, y_train_over = sm.fit_sample(X_train3, y_train3)

# describes info about train and test set 
print("Number of rows/columns in X_test3 dataset: ", X_test3.shape) 
print("Number of rows/columns in y_test3 dataset: ", y_test3.shape) 
print("Number of rows/columns in X_train_over dataset: ", X_train_over.shape) 
print("Number of rows/columns in y_train_over dataset: ", y_train_over.shape) 


# In[31]:


unique, counts = np.unique(y_train3, return_counts=True)
dict(zip(unique, counts))

Example n. 13
# stratified so that the class proportions of the imbalanced dataset are the same in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=3,
                                                    stratify=y)

# scale X matrix with StandardScaler
ss = StandardScaler()
Xss_train = ss.fit_transform(X_train)

Xss_test = ss.transform(X_test)

# SMOTE the training set as the data set is skewed towards having more non_defaulters
sm = SMOTE(random_state=1, ratio='minority')
Xss_sm_train, y_sm_train = sm.fit_sample(Xss_train, y_train)

# using RandomForestClassifer as an estimator
rclf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)

# fit the classifier on the SMOTE-resampled training set
rclf.fit(Xss_sm_train, y_sm_train)

gs_params = {
    'criterion': ['gini'],
    'max_depth': [None, 1, 5, 10],
    'max_features': ['auto', 3, 7],
    'n_estimators': [200, 500, 1000],
    'random_state': [1]
}
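# A sketch (not from the original) of feeding gs_params into a grid search over
# the SMOTE-resampled training arrays defined above; assumes RandomForestClassifier
# is already imported as in the preceding code.
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(RandomForestClassifier(), param_grid=gs_params, cv=3, n_jobs=-1)
gs.fit(Xss_sm_train, y_sm_train)
print(gs.best_params_, gs.best_score_)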
Example n. 14
X.shape
X.head()

Y = y
Y.shape
Y.head()
"""SMOTE Analysis"""

print("Before OverSampling, counts of label '1': {}".format(sum(Y == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(Y == 0)))

# import the SMOTE module from the imblearn library
# pip install imblearn (if it is not already installed on your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X, Y.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(
    sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(
    sum(y_train_res == 0)))

# BALANCED CLASSES THROUGH SMOTE ANALYSIS
sns.countplot(y_train_res)
"""Train Test Split"""

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train_res,
def oversampling(label):
    # assumes sm (a SMOTE instance), X_train and y_train (a DataFrame of label
    # columns) are defined in the enclosing scope
    X_res, Y_res = sm.fit_sample(X_train, y_train[label])
    X_res = pd.DataFrame(X_res)
    Y_res = pd.DataFrame(Y_res)
    return X_res, Y_res
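# A usage sketch, assuming sm, X_train and y_train (a DataFrame with one column
# per target) are defined as above; 'some_label' is a hypothetical column name.
X_bal, Y_bal = oversampling('some_label')
print(X_bal.shape, Y_bal.shape)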