def LogReg(n=5):
    """Cross-validate and fit an L1-penalised logistic regression.

    Loads the data via ``feature_extraction.pre_process_comments()``, keeps
    the 17000 best-scoring features under the chi-squared test, prints the
    mean cross-validation accuracy and the accuracy on the training data
    itself, and returns the classifier fitted on the full data.

    Args:
        n: Value passed as ``cv`` to ``cross_val_score`` (default 5).

    Returns:
        The ``LogisticRegression`` instance fitted on all selected features.
    """
    X, y = feature_extraction.pre_process_comments()  # get the data
    # NOTE(review): the selector is fitted on the full data before
    # cross-validation, so the CV folds share the same selected features.
    X = SelectKBest(chi2, k=17000).fit_transform(X, y)

    # An earlier experiment (previously kept here as a dead string literal)
    # wrapped SelectFromModel(LinearSVC(penalty="l1", ...)) and a saga-based
    # LogisticRegression in a Pipeline; it is not used.

    # The solver is pinned explicitly: scikit-learn >= 0.22 defaults to
    # 'lbfgs', which rejects penalty='l1'. 'liblinear' supports L1 and was
    # the default before that release.
    LogReg_clf = LogisticRegression(penalty='l1', dual=False,
                                    solver='liblinear')
    print("before fitting")
    LogReg_cv = cross_val_score(LogReg_clf, X, y, cv=n)

    # Refit on everything; the accuracy below is measured on the same data
    # the model was trained on.
    LogReg_clf.fit(X, y)
    LogReg_pred = LogReg_clf.predict(X)
    LogReg_acc = accuracy_score(y_pred=LogReg_pred, y_true=y)

    print("cv acc of LogReg is ", LogReg_cv.mean())
    print("acc of logreg is ", LogReg_acc)
    return LogReg_clf
def Bernoulli(n=5):
    """Cross-validate and fit scikit-learn's Bernoulli naive Bayes model.

    The data comes from ``feature_extraction.pre_process_comments()`` and is
    reduced to the 17000 best-scoring features under the chi-squared test.
    Prints the accuracy on the training data followed by the mean
    cross-validation accuracy.

    Args:
        n: Value passed as ``cv`` to ``cross_val_score`` (default 5).

    Returns:
        The ``BernoulliNB`` instance fitted on all selected features.
    """
    features, labels = feature_extraction.pre_process_comments()
    selector = SelectKBest(chi2, k=17000)
    features = selector.fit_transform(features, labels)

    classifier = BernoulliNB()
    fold_scores = cross_val_score(classifier, features, labels, cv=n)

    # Refit on the whole data set and score against the same data.
    classifier.fit(features, labels)
    train_accuracy = accuracy_score(
        y_true=labels,
        y_pred=classifier.predict(features),
    )

    print("acc of BNB(sklearn) is ", train_accuracy)
    print("cv acc of BNB(sklearn) is ", fold_scores.mean())
    return classifier
def MultiNB_Kaggle():
    """Fit multinomial naive Bayes and write test predictions to ``predict.csv``.

    Fits a chi-squared ``SelectKBest`` (k=17000) on the training data,
    applies the same selection to the test data, trains
    ``MultinomialNB(alpha=0.22)`` and saves one ``id,prediction`` row per
    test sample under an ``Id, Category`` header.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # NOTE(review): four values are unpacked here, whereas the other model
    # functions in this file unpack two from the same call -- confirm what
    # pre_process_comments() actually returns.
    X, y, test, test_ID = feature_extraction.pre_process_comments()  # get the data

    k_best = SelectKBest(chi2, k=17000)
    X = k_best.fit_transform(X, y)
    # Reuse the selector fitted on the training data so the test matrix
    # keeps exactly the same columns.
    test_X = k_best.transform(test)

    MultiNB_clf = MultinomialNB(alpha=0.22)
    MultiNB_clf.fit(X, y)
    MultiNB_pred = MultiNB_clf.predict(test_X)

    # comments='' stops np.savetxt from prefixing the header with its default
    # '# ' marker, which would turn the first column name into '# Id'.
    # NOTE(review): the header text still has a space after the comma
    # ('Id, Category'); confirm whether the consumer tolerates it.
    np.savetxt(
        'predict.csv',
        np.array([test_ID, MultiNB_pred]).transpose(),
        delimiter=',',
        fmt='%s',
        header='Id, Category',
        comments='',
    )
    return MultiNB_clf
def MultiNB(n=5):
    """Cross-validate and fit a multinomial naive Bayes model (alpha=0.22).

    The data comes from ``feature_extraction.pre_process_comments()`` and is
    reduced to the 17000 best-scoring features under the chi-squared test.
    Prints the mean cross-validation accuracy followed by the accuracy on the
    training data.

    Args:
        n: Value passed as ``cv`` to ``cross_val_score`` (default 5).

    Returns:
        The ``MultinomialNB`` instance fitted on all selected features.
    """
    features, labels = feature_extraction.pre_process_comments()
    selector = SelectKBest(chi2, k=17000)
    features = selector.fit_transform(features, labels)

    classifier = MultinomialNB(alpha=0.22)
    fold_scores = cross_val_score(classifier, features, labels, cv=n)

    # Refit on the whole data set and score against the same data.
    classifier.fit(features, labels)
    predictions = classifier.predict(features)
    train_accuracy = accuracy_score(y_pred=predictions, y_true=labels)

    print("cv acc of multiNB is ", fold_scores.mean())
    print("acc of multiNB is ", train_accuracy)
    return classifier
def LSVC(n=5):
    """Cross-validate and fit an L1-penalised linear SVM.

    The data comes from ``feature_extraction.pre_process_comments()`` and is
    reduced to 17000 features with ``SelectKBest``. Prints the mean
    cross-validation accuracy followed by the accuracy on the training data.

    Args:
        n: Value passed as ``cv`` to ``cross_val_score`` (default 5).

    Returns:
        The ``LinearSVC`` instance fitted on all selected features.
    """
    features, labels = feature_extraction.pre_process_comments()

    # NOTE(review): no score function is passed, so SelectKBest falls back to
    # its default instead of chi2 as in the other model functions here --
    # confirm this is intentional.
    selector = SelectKBest(k=17000)
    features = selector.fit_transform(features, labels)

    classifier = LinearSVC(penalty='l1', dual=False)
    # Disabled bagging experiment, kept for reference:
    # classifier = BaggingClassifier(base_estimator=classifier, max_samples=0.6, max_features=12000, n_estimators=24, random_state=1)
    print("before fitting")
    fold_scores = cross_val_score(classifier, features, labels, cv=n)

    # Refit on the whole data set and score against the same data.
    classifier.fit(features, labels)
    predictions = classifier.predict(features)
    train_accuracy = accuracy_score(y_pred=predictions, y_true=labels)

    print("cross validation accuracy", fold_scores.mean())
    print("training accuracy", train_accuracy)
    return classifier
def ensemble(n=5):
    """Cross-validate and fit a soft-voting ensemble of LogReg and MultinomialNB.

    The data comes from ``feature_extraction.pre_process_comments()`` and is
    reduced to 17000 features with ``SelectKBest``. The ensemble combines an
    L1-penalised ``LogisticRegression`` and ``MultinomialNB(alpha=0.22)``
    with ``voting='soft'``. Prints the mean cross-validation accuracy and the
    score on the training data.

    Args:
        n: Value passed as ``cv`` to ``cross_val_score`` (default 5).

    Returns:
        The ``VotingClassifier`` fitted on all selected features.
    """
    X, y = feature_extraction.pre_process_comments()  # get the data
    # NOTE(review): no score function is passed, so SelectKBest uses its
    # default rather than chi2 as in the single-model functions -- confirm
    # this is intentional.
    X = SelectKBest(k=17000).fit_transform(X, y)

    # The solver is pinned explicitly: scikit-learn >= 0.22 defaults to
    # 'lbfgs', which rejects penalty='l1'. 'liblinear' supports L1 and was
    # the default before that release.
    LogReg_clf = LogisticRegression(penalty='l1', dual=False,
                                    solver='liblinear')
    MNB_clf = MultinomialNB(alpha=0.22)
    # The previously constructed LinearSVC was never added to the ensemble
    # and has been dropped.

    Ensemble_clf = VotingClassifier(
        estimators=[('lg', LogReg_clf), ('mnb', MNB_clf)],
        voting='soft',
    )
    Ensemble_cv = cross_val_score(Ensemble_clf, X, y, cv=n)

    # Refit on everything; the score below is measured on the training data.
    Ensemble_clf.fit(X, y)
    score = Ensemble_clf.score(X, y)

    print("cv acc of ensemble is ", Ensemble_cv.mean())
    print("acc of ensemble is ", score)
    return Ensemble_clf
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold
# Needed by the script below; no other visible import provides this name.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras import layers

import feature_extraction
import models.models as models

# Script: build a soft-voting ensemble of L1 logistic regression and
# multinomial naive Bayes on chi-squared-selected features, cross-validate it
# with 3 folds, then fit it on all the data.
X, y = feature_extraction.pre_process_comments()  # get the data
X = SelectKBest(chi2, k=17000).fit_transform(X, y)

# The solver is pinned explicitly: scikit-learn >= 0.22 defaults to 'lbfgs',
# which rejects penalty='l1'. 'liblinear' supports L1 and was the default
# before that release.
LogReg_clf = LogisticRegression(penalty='l1', dual=False, solver='liblinear')
MNB_clf = MultinomialNB(alpha=0.3)
# Not part of the ensemble below; kept as a module-level name in case later
# code refers to it.
LinearSVC_clf = LinearSVC(penalty='l1', dual=False)

Ensemble_clf = VotingClassifier(
    estimators=[('lg', LogReg_clf), ('mnb', MNB_clf)],
    voting='soft',
)
Ensemble_cv = cross_val_score(Ensemble_clf, X, y, cv=3)
Ensemble_clf.fit(X, y)