def train_VotingClassifier():
    """Train, persist and evaluate a hard-voting ensemble of saved models.

    Reads ``processed_data.csv``, shuffles its rows, and uses the ``Empathy``
    column as the target with every other column as a feature. The SGD, SVC,
    random-forest and extra-trees estimators are loaded from their
    ``*_model.pkl`` files, combined in a ``VotingClassifier`` with
    ``voting='hard'``, fitted on the whole data set and written to
    ``Voting_model.pkl``. Metrics from ``get_cv_metrics`` are printed.

    Returns:
        The fitted ``VotingClassifier``.
    """
    data = shuffle(pd.read_csv("processed_data.csv", sep=','))
    data_X = data.drop('Empathy', axis=1)
    data_Y = data['Empathy']

    # TRAINING
    # Only estimators that actually take part in the vote are loaded, so a
    # missing pickle for a model outside the ensemble cannot abort training.
    estimators = [
        ('SGD', joblib.load('SGD_model.pkl')),
        ('SVC', joblib.load('SVC_model.pkl')),
        ('RF', joblib.load('RF_model.pkl')),
        ('ET', joblib.load('ET_model.pkl')),
    ]
    eclf1 = VotingClassifier(estimators=estimators, voting='hard')
    eclf1.fit(data_X, data_Y)
    joblib.dump(eclf1, 'Voting_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably k_split is the number
    # of cross-validation folds -- TODO confirm.
    accuracy, clf_report = get_cv_metrics(eclf1, data_X, data_Y, k_split=10)
    print("Accuracy: ", accuracy)  # previously recorded value: 73.06
    print(clf_report)
    return eclf1
def train_LR():
    """Fit a logistic-regression model on the processed data and save it.

    Reads ``processed_data.csv``, shuffles its rows, and uses the ``Empathy``
    column as the target with every other column as a feature. The fitted
    model is written to ``LR_model.pkl`` and the metrics returned by
    ``get_cv_metrics`` are printed.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    labels = frame['Empathy'].values.ravel()

    # TRAINING
    lr_params = {
        'C': 2.6,
        'dual': False,
        'penalty': 'l2',
        'tol': 1e-4,
        'solver': 'saga',
        'n_jobs': -1,
        'multi_class': 'ovr',
        'fit_intercept': True,
        'intercept_scaling': 1.4,
    }
    model = LogisticRegression(**lr_params)
    model.fit(features, labels)
    # An RFECV wrapper (step=1, cv=5) around the model is left disabled here.
    joblib.dump(model, 'LR_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably k_split is the number
    # of cross-validation folds -- TODO confirm.
    accuracy, report = get_cv_metrics(model, features, labels, k_split=10)
    print("Accuracy: ", accuracy)
    print(report)
    return model
def train_SVM():
    """Fit an RBF-kernel SVC on the processed data and save it.

    Reads ``processed_data.csv``, shuffles its rows, and uses the ``Empathy``
    column as the target with every other column as a feature. The fitted
    model is written to ``SVC_model.pkl`` and the metrics returned by
    ``get_cv_metrics`` are printed.

    Returns:
        The fitted ``SVC`` estimator.
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    labels = frame['Empathy'].values.ravel()

    # TRAINING
    model = SVC(C=3.1, kernel='rbf', gamma=0.1, tol=1e-1)
    model.fit(features, labels)
    joblib.dump(model, 'SVC_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably k_split is the number
    # of cross-validation folds -- TODO confirm.
    accuracy, report = get_cv_metrics(model, features, labels, k_split=10)
    print("Accuracy: ", accuracy)
    print(report)
    return model
def train_RF():
    """Fit a random-forest classifier on the processed data and save it.

    Reads ``processed_data.csv``, shuffles its rows, and uses the ``Empathy``
    column as the target with every other column as a feature. The forest
    (700 trees, ``max_depth=300``, ``random_state=16``) is fitted on the
    whole data set, written to ``RF_model.pkl``, and the metrics returned by
    ``get_cv_metrics`` are printed.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    data = shuffle(pd.read_csv("processed_data.csv", sep=','))
    data_X = data.drop('Empathy', axis=1)
    data_Y = data['Empathy'].values.ravel()

    # TRAINING
    clf = RandomForestClassifier(n_estimators=700, max_depth=300,
                                 random_state=16, n_jobs=-1)
    # Fit before persisting: without this call the pickle and the returned
    # object would be an unfitted estimator, unlike the other train_*
    # functions, which all fit their model before dumping it.
    clf.fit(data_X, data_Y)
    joblib.dump(clf, 'RF_model.pkl')

    # PERFORMANCE EVALUATION
    # get_cv_metrics is defined elsewhere; presumably k_split is the number
    # of cross-validation folds -- TODO confirm.
    accuracy, clf_report = get_cv_metrics(clf, data_X, data_Y, k_split=10)
    print("Accuracy: ", accuracy)
    print(clf_report)
    return clf
def train_BernoulliNB():
    """Fit an RFECV-wrapped Bernoulli naive Bayes model and save it.

    Reads ``processed_data.csv``, shuffles its rows, and uses the ``Empathy``
    column as the target with every other column as a feature. A
    ``BernoulliNB`` estimator is wrapped in ``RFECV`` (``step=1``, ``cv=5``),
    fitted, and written to ``Bernoulli_nb_model.pkl``. The metrics returned
    by ``get_cv_metrics`` are printed.

    Returns:
        The fitted ``RFECV`` selector wrapping the ``BernoulliNB`` estimator.
    """
    frame = shuffle(pd.read_csv("processed_data.csv", sep=','))
    features = frame.drop('Empathy', axis=1)
    target = frame['Empathy']

    # TRAINING
    base_nb = BernoulliNB(alpha=1.4, fit_prior=True, class_prior=None)
    selector = RFECV(base_nb, step=1, cv=5, n_jobs=-1)
    selector.fit(features, target)
    joblib.dump(selector, 'Bernoulli_nb_model.pkl')

    # PERFORMANCE EVALUATION
    # The metrics helper receives the target as a flat array, whereas the
    # fit above is given the pandas Series.
    # get_cv_metrics is defined elsewhere; presumably k_split is the number
    # of cross-validation folds -- TODO confirm.
    accuracy, report = get_cv_metrics(
        selector, features, target.values.ravel(), k_split=10)
    print("Accuracy: ", accuracy)
    print(report)
    return selector