import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
from features import make_Dictionary, extract_features

## Create the dictionary
training_dir = 'Train'
dictionary = make_Dictionary(training_dir)

## Label vectors: 0 stands for HAM, 1 for SPAM
training_labels = np.zeros(139)
training_labels[88:139] = 1
train_matrix = extract_features(training_dir, dictionary)

# NB classifiers
model1 = MultinomialNB()
model2 = GaussianNB()
model1.fit(train_matrix, training_labels)
model2.fit(train_matrix, training_labels)

# Test the unseen mails for SPAM
test_dir = 'TESTING_RESULT'
test_matrix = extract_features(test_dir, dictionary)
test_labels = np.zeros(202)
test_labels[180:202] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
print("MultinomialNB | HAM | SPAM")
print(confusion_matrix(test_labels, result1))
## Calculate the accuracy score
print(accuracy_score(test_labels, result1))
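# The script above fits model2 (GaussianNB) but never reports its results.
# A minimal sketch of the matching evaluation, assuming the same label
# convention (0 = HAM, 1 = SPAM) used for MultinomialNB:
print("GaussianNB | HAM | SPAM")
print(confusion_matrix(test_labels, result2))
print(accuracy_score(test_labels, result2))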
# Split the dataset into a training set and a test set
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit the Naive Bayes model to the training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict the test set results
y_pred = classifier.predict(X_test)

# Build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualise the Naive Bayes results on the training set
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
# the grid step of 0.01 is assumed; the call is cut off in the source
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
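# A minimal sketch of how this template usually finishes the decision-boundary
# plot; the colours, transparency and plotting calls below are assumptions:
import matplotlib.pyplot as plt
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Naive Bayes (Training set)')
plt.legend()
plt.show()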
import logging
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, normalize as sk_normalize
from sklearn.svm import SVC


def evaluate(model, data, alg=None, classifier="lr", fast=False, ratio=None,
             cv=10, normalize=False, random_state=None, return_y=False):
    # `model` holds the feature matrix and `data` the labels
    X = model
    Y = data
    print("len X: ", len(X))
    print("len Y: ", len(Y))
    if normalize:
        X = sk_normalize(X)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
        micros = []
        macros = []
        for i in range(cv):
            clf = LogisticRegression()
            if classifier.lower() == "svm":
                clf = SVC(cache_size=5000)
            elif classifier.lower() == "mlp":
                clf = MLPClassifier()
            elif classifier.lower() == "nb":
                clf = GaussianNB()
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=1 - r, random_state=random_state)
            clf.fit(X_train, Y_train)
            prediction = clf.predict(X_test)
            micros.append(f1_score(Y_test, prediction, average='micro'))
            macros.append(f1_score(Y_test, prediction, average='macro'))
        micros = np.mean(micros)
        macros = np.mean(macros)
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r, micros, macros))
    if fast:
        if return_y:
            return micros, macros, Y_test, prediction
        return micros, macros
    return pd.DataFrame(df)
def self_projection(
    X,
    cell_types,
    classifier="LR",
    penalty="l1",
    sparsity=0.5,
    fraction=0.5,
    solver="liblinear",
    n=0,
    cv=5,
    whole=False,
    n_jobs=None,
):  # n = 100 should be good.
    """
    This is the core function for running self-projection.

    Input
    -----
    X: `numpy.array` or sparse matrix
        the expression matrix, e.g. ad.raw.X.
    cell_types: `list of String/int`
        the cell clustering assignment.
    classifier: `String` optional (default: 'LR')
        a machine learning model among "LR" (logistic regression),
        "RF" (Random Forest), "GNB" (Gaussian Naive Bayes),
        "GPC" (Gaussian Process), "SVM" (Support Vector Machine),
        "SH" (squared-hinge SGD), "PCP" (perceptron SGD) and
        "DT" (Decision Tree).
    penalty: `String` optional (default: 'l1')
        the regularization mode of logistic regression. Use 'l1' or 'l2'.
    sparsity: `float` optional (default: 0.5)
        the sparsity parameter (C in sklearn.linear_model.LogisticRegression)
        for the logistic regression model.
    fraction: `float` optional (default: 0.5)
        fraction of data included in the training set. 0.5 means use half of
        the data for training, if half of the data is fewer than the maximum
        number of cells (n).
    n: `int` optional (default: 0)
        maximum number of cells included in the training set for each cluster
        of cells. Only `fraction` is used to split the dataset if n is 0.
    cv: `int` optional (default: 5)
        fold for cross-validation on the training set. 0 means no
        cross-validation.
    whole: `bool` optional (default: False)
        if True, measure the performance on the whole dataset
        (training and test combined).
    n_jobs: `int` optional (default: None - unlimited)
        number of threads to use with the different classifiers.

    return
    -----
    y_prob, y_pred, y_test, clf, cvsm, accuracy_test
    y_prob: `matrix of float`
        prediction probability.
    y_pred: `list of string/int`
        predicted clustering of the test set.
    y_test: `list of string/int`
        real clustering of the test set.
    clf: the classifier model.
    cvsm: `float`
        mean cross-validation accuracy on the training set.
    accuracy_test: `float`
        accuracy on the hold-out set.
    """
    # split the data into training and testing
    if n > 0:
        X_train, X_test, y_train, y_test = train_test_split_per_type(
            X, cell_types, n=n, frac=(1 - fraction))
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, cell_types, stratify=cell_types,
            test_size=fraction)  # fraction means test size
    # set the classifier
    if classifier == "LR":
        clf = LogisticRegression(
            random_state=1,
            penalty=penalty,
            C=sparsity,
            multi_class="ovr",
            solver=solver,
        )
    elif classifier == "RF":
        clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs)
    elif classifier == "GNB":
        clf = GaussianNB()
    elif classifier == "GPC":
        clf = GaussianProcessClassifier(n_jobs=n_jobs)
    elif classifier == "SVM":
        clf = SVC(probability=True)
    elif classifier == "SH":
        clf = SGDClassifier(loss="squared_hinge", n_jobs=n_jobs)
    elif classifier == "PCP":
        clf = SGDClassifier(loss="perceptron", n_jobs=n_jobs)
    elif classifier == "DT":
        clf = DecisionTreeClassifier()
    # mean cross-validation score
    cvsm = 0
    if cv > 0:
        cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv,
                              scoring="accuracy", n_jobs=n_jobs)
        cvsm = cvs.mean()
        print("Mean CV accuracy: %.4f" % cvsm)
    # accuracy on the training set and on the hold-out set
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_train, y_train)
    print("Accuracy on the training set: %.4f" % accuracy)
    accuracy_test = clf.score(X_test, y_test)
    print("Accuracy on the hold-out set: %.4f" % accuracy_test)
    # accuracy on the whole dataset
    if whole:
        accuracy = clf.score(X, cell_types)
        print("Accuracy on the whole set: %.4f" % accuracy)
    # get predicted probability on the test set
    # (SGD-based classifiers do not expose predict_proba)
    y_prob = None
    if classifier not in ["SH", "PCP"]:
        y_prob = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    return y_prob, y_pred, y_test, clf, cvsm, accuracy_test
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from varname import nameof

sv = SVC()
RFC = RandomForestClassifier()
GaussianN = GaussianNB()
KNC = KNeighborsClassifier(n_neighbors=7)
xgboost = XGBClassifier()
gradientboost = GradientBoostingClassifier()

df = pd.read_csv(r'dataframes/full_csv', index_col=[0])
# with open(r'objects/wektor_lst', 'rb') as f:
#     res_wek = np.load(f)
res_wek = np.load(r'objects/wektors.npy', allow_pickle=True)
res_wek = [wek[0:20] for wek in res_wek]
zzz = np.stack(res_wek)
res_wek = zzz.reshape([7023, 2000])

scoring = ['precision', 'recall', 'f1', 'accuracy']
# the original call is cut off after its first argument; the target column
# name ('label') and cv=5 below are assumptions
sv_score_array = cross_validate(sv, res_wek, df['label'],
                                scoring=scoring, cv=5)
## function to fit a classifier and return its predictions
## (the header and body are inferred from the calls below; the source
## fragment begins at this helper's return statement)
def get_predictions(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    return y_pred, y_pred_prob

## function to get classifier scores
def print_scores(y_test, y_pred, y_pred_prob):
    print('test-set confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print("recall score: ", recall_score(y_test, y_pred))
    print("precision score: ", precision_score(y_test, y_pred))
    print("f1 score: ", f1_score(y_test, y_pred))
    print("accuracy score: ", accuracy_score(y_test, y_pred))
    print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred_prob[:, 1])))

#%%
# training a naive bayes model for classification
y_pred, y_pred_prob = get_predictions(GaussianNB(), X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Accuracy = 96.91 %
# hence we can see that the model has correctly classified all the 135 values
# as frauds / shill bidders

#%%
# training a logistic regression model
# ('liblinear' is required for the l1 penalty in recent scikit-learn)
y_pred, y_pred_prob = get_predictions(
    LogisticRegression(C=0.01, penalty='l1', solver='liblinear'),
    X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Accuracy = 96.28 %
# Import libraries
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# define data, create model and fit data
# (X holds the feature matrix, Y the class labels; pass any
# hyperparameters to GaussianNB() directly)
X = Variables
Y = Classes
Model = GaussianNB().fit(X, Y)

# Score model (mean accuracy on the given data)
Model.score(X, Y)

# Predict classes for new observations
NewY = Model.predict(NewX)
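# A runnable version of the outline above, assuming nothing beyond
# scikit-learn itself; the iris data is only a stand-in for Variables/Classes.
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X, Y = iris.data, iris.target

model = GaussianNB().fit(X, Y)
print(model.score(X, Y))     # training accuracy
print(model.predict(X[:5]))  # predicted classes for "new" observations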
import sklearn.preprocessing

scaled = ['age', 'balance', 'day', 'month', 'duration', 'campaign',
          'pdays', 'previous']
bank[scaled] = sklearn.preprocessing.scale(bank[scaled].astype(float))

# Training set and targets
X = bank.drop(columns='y').values
t = bank['y'].values

# experiment 1
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2,
                                                    shuffle=True)

# experiment 2
from sklearn.naive_bayes import GaussianNB
gaussian_clf = GaussianNB()
gaussian_clf.fit(X_train, t_train)

# experiment 3
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
gaussian_score = gaussian_clf.score(X_test, t_test)
gaussian_pred = gaussian_clf.predict(X_test)
cm = confusion_matrix(t_test, gaussian_pred)
gaussian_proba = gaussian_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(t_test, gaussian_proba)
auc = roc_auc_score(t_test, gaussian_proba)
print("Gaussian CLF Score: " + str(gaussian_score))
print("Confusion Matrix")
def getNaiveBayesWinsconsinAccuracy():
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    # the feature-selected train/test splits are assumed to be defined
    # at module level (see the sketch below)
    gnb = GaussianNB()
    pred_gnb = gnb.fit(data_train_feature_selected,
                       target_train).predict(data_test_feature_selected)
    return accuracy_score(target_test, pred_gnb, normalize=True)
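# A minimal sketch of the setup the function above assumes: the Wisconsin
# breast-cancer data split and reduced with SelectKBest. The dataset loader,
# k=10, and the 70/30 split are assumptions, not part of the original.
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
data_train, data_test, target_train, target_test = train_test_split(
    cancer.data, cancer.target, test_size=0.3, random_state=0)

selector = SelectKBest(f_classif, k=10).fit(data_train, target_train)
data_train_feature_selected = selector.transform(data_train)
data_test_feature_selected = selector.transform(data_test)

print(getNaiveBayesWinsconsinAccuracy())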
sgd.score(X_train, Y_train)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

# Random Forest:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

# Gaussian Naive Bayes:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

# Which is the best model?
results = pd.DataFrame({
    'Model': ['Random Forest', 'Stochastic Gradient Descent',
              'Gaussian Naive Bayes', 'Decision Tree'],
    # the 'Score' column (cut off in the source) pairs each model with the
    # training accuracy computed above
    'Score': [acc_random_forest, acc_sgd, acc_gaussian, acc_decision_tree]})
                   'AST', 'BLK']

# Pandas DataFrame allows you to select columns.
# We use column selection to split the data into features and class.
nba_feature = nba[feature_columns]
nba_class = nba[class_column]
print(nba_feature[0:3])
print(list(nba_class[0:3]))

train_feature, test_feature, train_class, test_class = \
    train_test_split(nba_feature, nba_class, stratify=nba_class,
                     train_size=0.75, test_size=0.25, random_state=0)

training_accuracy = []
test_accuracy = []

nb = GaussianNB().fit(train_feature, train_class)
print("Test set score: {:.3f}".format(nb.score(test_feature, test_class)))

prediction = nb.predict(test_feature)
print("Confusion matrix:")
print(pd.crosstab(test_class, prediction, rownames=['True'],
                  colnames=['Predicted'], margins=True))

scores = cross_val_score(nb, nba_feature, nba_class, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
Y = df.Churn

# PREPARING TRAINING DATASET AND TEST DATASET
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=1)

import time

# GAUSSIAN NAIVE BAYES (GNB)
start = time.time()
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gaussianNB = GaussianNB()
gaussianNB.fit(X_train, Y_train)
Y_pred_gnb = gaussianNB.predict(X_test)
acc_score_gnb = round(metrics.accuracy_score(Y_test, Y_pred_gnb) * 100)
confusion_gnb = metrics.confusion_matrix(Y_test, Y_pred_gnb)
end = time.time()
proc_time_gnb = end - start
# print("Total execution time: {}".format(proc_time_gnb), "seconds")

# K-NEAREST NEIGHBORS (KNN)
start = time.time()
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
acc_score_knn = round(metrics.accuracy_score(Y_test, Y_pred_knn) * 100)
from csm import SEA, StratifiedBagging, REA, LearnppCDS, LearnppNIE, OUSE, KMeanClustering
from strlearn.evaluators import TestThenTrain
from sklearn.naive_bayes import GaussianNB
from strlearn.metrics import (balanced_accuracy_score, f1_score,
                              geometric_mean_score_1, precision, recall,
                              specificity)
import sys
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from skmultiflow.trees import HoeffdingTree

# Select streams and methods
# (`h` is the project's stream-helper module, imported elsewhere)
streams = h.realstreams2()
print(len(streams))

rea = REA(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                            random_state=42),
          number_of_classifiers=5)
cds = LearnppCDS(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                                   random_state=42),
                 number_of_classifiers=5)
nie = LearnppNIE(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                                   random_state=42),
                 number_of_classifiers=5)
ouse = OUSE(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                              random_state=42),
            number_of_classifiers=5)
kmc = KMeanClustering(base_classifier=StratifiedBagging(
    base_estimator=GaussianNB(), random_state=42), number_of_classifiers=5)
# the remaining SEA arguments are cut off in the source; the closing below
# follows the pattern of the other ensembles above
ros_knorau2 = SEA(base_estimator=StratifiedBagging(base_estimator=GaussianNB(),
                                                   random_state=42))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# needs refactoring: the two dictionaries below duplicate the same estimators
method_dict = {
    "LG": LogisticRegression(),
    "KN": KNeighborsClassifier(),
    "SV": SVC(),
    "GB": GradientBoostingClassifier(n_estimators=1000),
    "DT": tree.DecisionTreeClassifier(),
    "RF": RandomForestClassifier(n_estimators=1000),
    "MP": MLPClassifier(alpha=1),
    "NB": GaussianNB(),
}

dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha=1),
    "Naive Bayes": GaussianNB(),
    # "AdaBoost": AdaBoostClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
    # "Gaussian Process": GaussianProcessClassifier()
}
def classify(features_train, labels_train):
    # fit a Gaussian Naive Bayes classifier to the training data
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf
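# A minimal usage sketch for classify(); the import and the toy arrays are
# assumptions added for illustration.
from sklearn.naive_bayes import GaussianNB

features_train = [[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.8]]
labels_train = [0, 1, 1, 0]
clf = classify(features_train, labels_train)
print(clf.predict([[0.2, 0.9]]))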
array = dataset.values
X = array[:, 0:4]
y = array[:, 4]
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=0.20, random_state=1, shuffle=True)

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# Compare Algorithms (see the sketch below)
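# A minimal sketch of the comparison the final comment points to; the
# matplotlib boxplot is an assumption about how the results are shown.
from matplotlib import pyplot
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()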
# collect KNN accuracies in a list first; appending to an empty pd.Series()
# is deprecated, and a_index (used for plotting below) was never defined
accuracies = []
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
a_index = list(range(1, 11))
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    accuracies.append(metrics.accuracy_score(prediction, test_Y))
a = pd.Series(accuracies, index=a_index)
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracies for different values of n are:', a.values,
      'with the max value as ', a.values.max())

model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print('The accuracy of the NaiveBayes is',
      metrics.accuracy_score(prediction6, test_Y))

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(prediction7, test_Y))

from sklearn.model_selection import KFold  # for K-fold cross-validation
from sklearn.model_selection import cross_val_score  # score evaluation
from sklearn.model_selection import cross_val_predict  # prediction
# the remaining KFold arguments are cut off in the source
kfold = KFold(n_splits=10)
n_non_POI = int(len(labels)) - n_POI
file.write('Cleaned dataset has {0:d} POI and {1:d} non POI\n'.format(
    n_POI, n_non_POI))
file.write('\n')

for classifier_name in classifier_list:
    print(classifier_name)
    file.write('Classifier name: {0}\n'.format(classifier_name))
    # hyperparameter setting
    parameters = {}
    if classifier_name == 'NB':
        # Naive Bayes --> no hyperparameter to tune
        classifier = GaussianNB()
    if classifier_name == 'SVC':
        # Support vector machine --> kernel and margin are tuned
        classifier = SVC()
        parameters['classifier__kernel'] = ['linear', 'poly', 'rbf']
        parameters['classifier__C'] = [10, 100, 1000]
        parameters['classifier__gamma'] = [0.01, 0.1, 1, 'scale']
    if classifier_name == 'KNN':
        # KNN --> number of neighbors, weight function and power parameter are tuned
        classifier = KNeighborsClassifier(algorithm='auto')
        parameters['classifier__n_neighbors'] = [5, 10, 15]
        parameters['classifier__weights'] = ['distance', 'uniform']
        parameters['classifier__p'] = [1, 2]
if not pca and estimator_name not in ['GaussianNB', 'NeuralNetwork']:
    process_feature_importances(model, estimator_name, pca, fine_tune)

gridsearch_param = {'scoring': 'roc_auc', 'verbose': 2,
                    'n_jobs': -1, 'cv': 3}
estimators_params_grid = {
    'LogisticRegression': {'C': [10**i for i in range(-3, 4)],
                           'penalty': ['l2', 'l1']},
    'DecisionTreeClassifier': {'min_samples_split': [1600, 1800, 2000, 2200, 2400]},
    'RandomForestClassifier': {'n_estimators': [50, 100, 200, 300, 400],
                               'min_samples_split': [50, 100, 150, 200]},
    'LGBMClassifier': {'num_leaves': [500, 1000, 1500, 2000, 2500],
                       'n_estimators': [200, 400, 600, 800, 1000]},
}

print_info('Start experiments')
experiment(LogisticRegression(random_state=SEED, n_jobs=-1, solver='saga', max_iter=500),
           train_x, train_y, test_x, test_y, pca=False, fine_tune=True)
experiment(DecisionTreeClassifier(random_state=SEED),
           train_x, train_y, test_x, test_y, pca=False, fine_tune=True)
experiment(GaussianNB(), train_x, train_y, test_x, test_y,
           pca=False, fine_tune=False)  # GaussianNB: nothing to fine-tune
experiment(RandomForestClassifier(random_state=SEED, n_jobs=-1),
           train_x, train_y, test_x, test_y, pca=False, fine_tune=True)
lgbm = lgb.LGBMClassifier(objective='binary', random_state=SEED,
                          feature_fraction=0.7, learning_rate=0.05,
                          n_jobs=-1, silent=False)
experiment(lgbm, train_x, train_y, test_x, test_y, pca=False, fine_tune=True)

""" Bagging with Lightgbm (combine boosting and bagging) """
print_info('Start Bagging with Lightgbm')
# the remaining constructor arguments are cut off in the source
lgbm = lgb.LGBMClassifier(objective='binary', random_state=SEED)
additional_features.remove('salary')
additional_features.remove('poi')
additional_features.remove('email_address')
initial_features = ['poi', 'salary']

# initiating automatic feature search
'''
final_features_SVC = auto_feature(SVC(), my_dataset, additional_features,
                                  initial_features, iterate=2)
final_features_NB = auto_feature(GaussianNB(), my_dataset, additional_features,
                                 initial_features, iterate=2)
final_features_FR = auto_feature(RandomForestClassifier(), my_dataset,
                                 additional_features, initial_features,
                                 iterate=1)
'''
final_features_NB = auto_feature(GaussianNB(), my_dataset, additional_features,
                                 initial_features, iterate=2)

# OPTIMIZING SELECTED CLASSIFIER
# ******************************
# optimizing features in classifier using default parameters
clf_def = DecisionTreeClassifier()
optimal_features = auto_feature(clf_def, my_dataset, additional_features,
                                initial_features, iterate=5)
from sklearn import datasets, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data[:, 1:3], iris.target

################## define classifiers #####################
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

################## classification results #####################
for clf, label in zip(
        [clf1, clf2, clf3, sclf],
        ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, x, y, cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
X_train = np.concatenate(
    (X_train, np.array(weight[idx[doc]]).reshape(1, d)),
    axis=0).reshape(cnt + 1, d)
Y_train = np.concatenate(
    (Y_train, np.array([score]).reshape(1, 1)),
    axis=0).reshape(cnt + 1, 1)
cnt += 1
line = next(f)

# X_train_PCA =
print('training...')
train_start_t = time.time()
# lasso.train(X_train, Y_train)  # train; calls sklearn.Lasso()
clfRand = GaussianNB()
clfRand.fit(X_train, Y_train.ravel())  # flatten the (n, 1) label column to 1-D
train_end_t = time.time()
print('training finish, cost time: %d' % (train_end_t - train_start_t))

test_set = dict()
f = line_reader('F:/ECNU/Course/KnowledgeAna/tag_data_set/' + QID + '/test_set')
line = next(f)
print('loading testing set...')
cnt = 0
while line:
    content = re.split(r'\s+', line)
    doc = content[0]
    score = float(content[1])
scoring = 'accuracy'

# Classifiers
names = ["Nearest Neighbors", "Gaussian Process", "Decision Tree",
         "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes",
         "SVM Linear", "SVM RBF", "SVM Sigmoid"]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]

models = zip(names, classifiers)

# evaluate classifiers
results = []
names = []
for name, model in models:
    # shuffle=True is required when passing a random_state to KFold
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, y_train,
                                                 cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
def __init__(self, **kwargs):
    # wrap scikit-learn's GaussianNB in the project's model interface
    super(NaiveBayes, self).__init__()
    super(NaiveBayes, self).SetModel(GaussianNB(**kwargs))
print('Accuracy of LDA classifier on training set: {:.2f}'
      .format(lda.score(scaled_X_train, Y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
      .format(lda.score(scaled_X_test, Y_test)))
pred_lda = lda.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_lda))
print(classification_report(Y_test, pred_lda))

# In[47]:

# fit a naive bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(scaled_X_train, Y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
      .format(gnb.score(scaled_X_train, Y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
      .format(gnb.score(scaled_X_test, Y_test)))
pred_gnb = gnb.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_gnb))
print(classification_report(Y_test, pred_gnb))

# In[48]:

# fit a svm classifier
def __init__(self):
    # register the wrapped scikit-learn estimator under a display name
    super(GaussianNaiveBayes, self).__init__(name="Gaussian Bayes")
    self.model = GaussianNB()
from time import time
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time() - t0, 3), "s")

t0 = time()
pred = clf.predict(features_test)
print("testing time:", round(time() - t0, 3), "s")

accuracy = clf.score(features_test, labels_test)
print("Accuracy:")
print(accuracy)
#########################################################
training_L = []
testing_L = []
for i in range(len(training)):
    training_L.append([training[i]])
for i in range(len(testing)):
    testing_L.append([testing[i]])

classes = []
for i in cmu_sum:
    if i in less_than:
        classes.append(0)
    else:
        classes.append(1)
class_train = classes[0:int(len(classes) * split_percentage)]
class_test = classes[len(class_train):len(classes)]

# Create a Gaussian classifier
model = GaussianNB()
# Train the model using the training sets
model.fit(training_L, class_train)
predicts = model.predict(testing_L)
# Calculate the accuracy rate using accuracy_score()
print("Accuracy Rate: %f" % accuracy_score(class_test, predicts))
def evaluate_multilabel(model, data, alg=None, classifier="lr", fast=False,
                        ratio=None, cv=10, random_state=None, normalize=False):
    # build the feature matrix from the learned embeddings and a
    # multi-label indicator matrix from the label dictionary
    X = []
    Y = []
    for pid in range(len(model.word2id)):
        X.append(model.word_embeddings[pid])
    Y = np.zeros((len(X), len(data.labels)))
    for y, key in enumerate(data.labels.keys()):
        for index, paper in enumerate(data.labels[key]):
            pid = model.word2id[paper]
            Y[pid][y] = 1
    if normalize:
        X = sk_normalize(X)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
        if classifier.lower() == 'lr':
            clf = LogisticRegression()
        elif classifier.lower() == "svm":
            clf = SVC(cache_size=5000)
        elif classifier.lower() == "mlp":
            clf = MLPClassifier()
        elif classifier.lower() == "nb":
            clf = GaussianNB()
        micros = []
        macros = []
        for i in range(cv):
            micro, macro = evaluateNodeClassification(
                X, Y, 1 - r, clf=clf, random_state=random_state)
            micros.append(micro)
            macros.append(macro)
        micros = np.mean(micros)
        macros = np.mean(macros)
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r, micros, macros))
    if fast:
        return micros, macros
    return df
import numpy as np
import sys

sys.path.append("..")
from utile.Processor import processor  # noqa
from utile.Timer import timer  # noqa
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

data_frame = load_iris()
input_data = data_frame.data
data_targets = data_frame.target
X_train, X_test, y_train, y_test = train_test_split(input_data, data_targets,
                                                    test_size=0.2)

model_knc = KNeighborsClassifier(n_neighbors=5)
model_svc = SVC()
model_rfc = RandomForestClassifier(n_estimators=10)
model_gnb = GaussianNB()
model_mnb = MultinomialNB()


def knc_modeler():
    model_knc.fit(X_train, y_train)
    value = model_knc.score(X_test, y_test)
    return value


def svc_modeler():
    model_svc.fit(X_train, y_train)
    value = model_svc.score(X_test, y_test)
    return value
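# The remaining models presumably get the same treatment; a minimal sketch
# for the GaussianNB instance, following the pattern of the modelers above:
def gnb_modeler():
    model_gnb.fit(X_train, y_train)
    value = model_gnb.score(X_test, y_test)
    return value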