Esempio n. 1
0
def train_VotingClassifier():
    """Fit a hard-voting ensemble, save it and print cross-validation metrics.

    Reads ``processed_data.csv``, shuffles its rows (no seed is passed, so the
    row order differs between runs), and uses the ``Empathy`` column as the
    target and every remaining column as a feature.

    The ensemble is built from the SGD, SVC, RF and ET models stored in their
    ``*_model.pkl`` files, fitted on the full data set and written to
    ``Voting_model.pkl``.

    Returns:
        The fitted ``VotingClassifier``.
    """
    data = pd.read_csv("processed_data.csv", sep=',')
    data = shuffle(data)
    data_X = data.drop('Empathy', axis=1)
    data_Y = data['Empathy']

    # TRAINING
    # Only the models that actually take part in the vote are loaded; the
    # LR and naive-Bayes models are not members of this ensemble, so their
    # files are not read (and need not exist).
    # NOTE: joblib.load unpickles its input -- only load model files that
    # come from a trusted source.
    clf1 = joblib.load('SGD_model.pkl')
    clf2 = joblib.load('SVC_model.pkl')
    clf3 = joblib.load('RF_model.pkl')
    clf7 = joblib.load('ET_model.pkl')

    eclf1 = VotingClassifier(
        estimators=[('SGD', clf1), ('SVC', clf2), ('RF', clf3), ('ET', clf7)],
        voting='hard',
    )
    eclf1.fit(data_X, data_Y)
    joblib.dump(eclf1, 'Voting_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably it runs a 10-fold
    # cross-validation (k_split=10) -- confirm against its definition.
    accuracy, clf_report = get_cv_metrics(eclf1, data_X, data_Y, k_split=10)
    print("Accuracy: ", accuracy)  # a previous run reported 73.06
    print(clf_report)

    return eclf1
Esempio n. 2
0
def train_LR():
    """Train a logistic-regression model, save it and print CV metrics.

    Loads ``processed_data.csv``, shuffles the rows, takes ``Empathy`` as the
    target and all other columns as features, fits the classifier on the full
    data set and stores it in ``LR_model.pkl``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    # Flattened target values, shared by the fit and the evaluation below.
    targets = frame['Empathy'].values.ravel()

    # --- training ---
    model = LogisticRegression(
        C=2.6,
        dual=False,
        penalty='l2',
        tol=1e-4,
        solver='saga',
        n_jobs=-1,
        multi_class='ovr',
        fit_intercept=True,
        intercept_scaling=1.4,
    )
    model.fit(features, targets)
    joblib.dump(model, 'LR_model.pkl')

    # --- performance evaluation ---
    # get_cv_metrics lives elsewhere; presumably a k-fold evaluation with
    # k_split folds -- confirm against its definition.
    accuracy, clf_report = get_cv_metrics(model, features, targets, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
    return model
Esempio n. 3
0
def train_SVM():
    """Train an RBF-kernel SVC, save it and print CV metrics.

    Loads ``processed_data.csv``, shuffles the rows, takes ``Empathy`` as the
    target and all other columns as features, fits the classifier on the full
    data set and stores it in ``SVC_model.pkl``.

    Returns:
        The fitted ``SVC`` instance.
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    # Flattened target values, shared by the fit and the evaluation below.
    targets = frame['Empathy'].values.ravel()

    # --- training ---
    model = SVC(C=3.1, kernel='rbf', gamma=0.1, tol=1e-1)
    model.fit(features, targets)
    joblib.dump(model, 'SVC_model.pkl')

    # --- performance evaluation ---
    # get_cv_metrics lives elsewhere; presumably a k-fold evaluation with
    # k_split folds -- confirm against its definition.
    accuracy, clf_report = get_cv_metrics(model, features, targets, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
    return model
Esempio n. 4
0
def train_RF():
    """Train a random-forest classifier, save it and print CV metrics.

    Loads ``processed_data.csv``, shuffles the rows, takes ``Empathy`` as the
    target and all other columns as features, fits the forest on the full data
    set and stores it in ``RF_model.pkl``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    data = pd.read_csv("processed_data.csv", sep=',')
    data = shuffle(data)
    data_X = data.drop('Empathy', axis=1)
    data_Y = data['Empathy']

    # TRAINING
    # The forest must be fitted before it is persisted; otherwise
    # 'RF_model.pkl' would contain an untrained estimator. This mirrors
    # what train_LR and train_SVM do.
    clf = RandomForestClassifier(
        n_estimators=700,
        max_depth=300,
        random_state=16,
        n_jobs=-1,
    ).fit(data_X, data_Y.values.ravel())
    joblib.dump(clf, 'RF_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably it runs a 10-fold
    # cross-validation (k_split=10) -- confirm against its definition.
    accuracy, clf_report = get_cv_metrics(clf, data_X, data_Y.values.ravel(), k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
    return clf
Esempio n. 5
0
def train_BernoulliNB():
    """Train a Bernoulli naive-Bayes model inside RFECV, save it, print CV metrics.

    Loads ``processed_data.csv``, shuffles the rows, takes ``Empathy`` as the
    target and all other columns as features. The naive-Bayes estimator is
    wrapped in ``RFECV`` (step=1, cv=5), fitted on the full data set and the
    fitted wrapper is stored in ``Bernoulli_nb_model.pkl``.

    Returns:
        The fitted ``RFECV`` object wrapping the ``BernoulliNB`` estimator
        (not the bare naive-Bayes model).
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    labels = frame['Empathy']

    # --- training ---
    base_nb = BernoulliNB(alpha=1.4, fit_prior=True, class_prior=None)
    # NOTE(review): RFECV ranks features through the estimator's
    # coef_/feature_importances_; confirm BernoulliNB exposes one of them in
    # the installed scikit-learn version.
    selector = RFECV(base_nb, step=1, cv=5, n_jobs=-1)
    selector.fit(features, labels)
    joblib.dump(selector, 'Bernoulli_nb_model.pkl')

    # --- performance evaluation ---
    # get_cv_metrics lives elsewhere; presumably a k-fold evaluation with
    # k_split folds -- confirm against its definition.
    accuracy, clf_report = get_cv_metrics(
        selector, features, labels.values.ravel(), k_split=10
    )
    print("Accuracy: ", accuracy)
    print(clf_report)
    return selector