def NB_experiment(data_fold, train, test, dumper):
    """Cross-validate Gaussian Naive Bayes and write a ranking for the test set.

    The estimator is scored on the training split with
    ``cross_validation.cross_val_score`` (``cv=data_fold``,
    ``score_func=accus``); the summary line and the per-fold scores are
    printed and/or written to ``dumper``.  A fresh estimator is then fitted on
    the whole training split and the ``clf_test`` predictions for ``test``
    are handed to ``output_ranking`` together with the ``nb.ranking`` file.

    Args:
        data_fold: Passed through unchanged as the ``cv`` argument.
        train: Indexable pair; ``train[0]`` is used as the feature matrix and
            ``train[1]`` as the labels.
        test: Passed unchanged to ``clf_test``.
        dumper: Object with a ``write(str)`` method receiving the report.

    Returns:
        None.
    """
    print("Ready to find the Best Parameters for Naive Bayes")
    print('Gaussian Naive Bayes')
    nb = GNB()
    print("fitting NaiveBayes Experiment")
    dumper.write('Classifier: Naive Bayes\n')
    scores = cross_validation.cross_val_score(nb,
                                              train[0],
                                              train[1],
                                              cv=data_fold,
                                              score_func=accus)
    # The "+/-" figure is half the standard deviation of the fold scores.
    reports = "Accuracy on Train: %0.2f (+/- %0.2f)" % (scores.mean(),
                                                        scores.std() / 2)
    print(reports)
    dumper.write(reports + '\n')
    reports = " ".join('%0.2f' % item for item in scores)
    dumper.write(reports + '\n')

    # Refit from scratch on the full training split before predicting.
    nb = GNB()
    nb.fit(train[0], train[1])
    pred = clf_test(nb, test)
    # The context manager guarantees the ranking file is flushed and closed
    # even if output_ranking raises.
    with codecs.open('nb.ranking', 'w', 'utf-8') as ranking_file:
        output_ranking(pred, ranking_file)
    return None
def get_new_model(self):
    """Instantiate the estimator named by ``self.model_type`` into ``self.model``.

    The estimator class is looked up by name, imported lazily from
    scikit-learn and constructed with ``self.model_args`` as keyword
    arguments.

    Supported ``self.model_type`` values: ``"Linear-Regressor"``,
    ``"Support-Vector-Regressor"``, ``"Decision-Tree-Regressor"``,
    ``"Random-Forest-Regressor"``, ``"Logistic-Regression-Classifier"``,
    ``"KNN-Classifier"``, ``"Support-Vector-Classifier"``,
    ``"Naive-Bayes-Classifier"``, ``"Decision-Tree-Classifier"`` and
    ``"Random-Forest-Classifier"``.

    Any other value leaves ``self.model`` untouched.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    from importlib import import_module

    # model_type -> (module, class).  Names are hyphen-separated, so a
    # direct lookup replaces the old `split("_")` test that could never
    # select the regressor branch.
    registry = {
        "Linear-Regressor": ("sklearn.linear_model", "LinearRegression"),
        "Support-Vector-Regressor": ("sklearn.svm", "SVR"),
        "Decision-Tree-Regressor": ("sklearn.tree", "DecisionTreeRegressor"),
        "Random-Forest-Regressor": ("sklearn.ensemble",
                                    "RandomForestRegressor"),
        "Logistic-Regression-Classifier": ("sklearn.linear_model",
                                           "LogisticRegression"),
        "KNN-Classifier": ("sklearn.neighbors", "KNeighborsClassifier"),
        "Support-Vector-Classifier": ("sklearn.svm", "SVC"),
        "Naive-Bayes-Classifier": ("sklearn.naive_bayes", "GaussianNB"),
        "Decision-Tree-Classifier": ("sklearn.tree",
                                     "DecisionTreeClassifier"),
        "Random-Forest-Classifier": ("sklearn.ensemble",
                                     "RandomForestClassifier"),
    }

    entry = registry.get(self.model_type)
    if entry is None:
        # Unknown model types were silently ignored before; keep that.
        return

    module_name, class_name = entry
    estimator_cls = getattr(import_module(module_name), class_name)
    self.model = estimator_cls(**self.model_args)
def gbn_word2vec():
    """Train Gaussian Naive Bayes on averaged word2vec vectors and preview a submission.

    Loads the ``300features_40minwords_10context`` Word2Vec model, converts
    the train and test data returned by ``get_data()`` into 300-dimensional
    averaged feature vectors, fits the classifier, prints the mean 10-fold
    ROC-AUC and finally prints the first ten rows of the id/sentiment frame
    built from the test predictions.  Nothing is written to disk here.
    """
    classifier = GNB()
    train_data, test_data, label, train, test = get_data()

    w2v_file = "%s/%s" % ("../data/word2vec-nlp",
                          "300features_40minwords_10context")
    w2v = Word2Vec.load(w2v_file)

    train_vecs = word2vec_model.get_avg_feature_vecs(train_data, w2v, 300)
    test_vecs = word2vec_model.get_avg_feature_vecs(test_data, w2v, 300)

    classifier.fit(train_vecs, label)
    fold_scores = cross_val_score(classifier,
                                  train_vecs,
                                  label,
                                  cv=10,
                                  scoring='roc_auc')
    print("高斯贝叶斯分类器10折交叉验证得分: ", np.mean(fold_scores))

    print('保存结果...')
    predicted = classifier.predict(test_vecs)
    submission_df = pd.DataFrame(data={
        'id': test['id'],
        'sentiment': predicted
    })
    print(submission_df.head(10))
def NLMmodelexp1():
    """Run model experiment 1 on the NLM in-sample/out-of-sample data.

    Seven default-configured classifiers are paired with their display names
    and passed to ``modelExperiment`` along with the full feature vector,
    the ``NLMdata/`` directory and the CSV/PNG output names.
    """
    roster = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF()),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    classifiers = [clf for _, clf in roster]
    labels = [label for label, _ in roster]
    modelExperiment(nlmInsampleData, nlmOutsampleData, 'NLMdata/', fullFV,
                    classifiers, labels, 'NLMmodelExperiment1.csv',
                    'NLMclassifier_plot1.png', True)
def train(self):
    """Fit the classifier selected by ``self.model`` and return training accuracy.

    Features are obtained from
    ``self.feature_extractor.extract_feature(self.data_loader.get_trainset())``;
    element 0 of the result is used as the inputs and element 1 as the
    labels.

    Supported ``self.model`` codes:
        ``'GNB'``/``'BNB'``/``'MNB'``: Gaussian / Bernoulli / multinomial
            naive Bayes with default settings.
        ``'LR'``: 5-fold grid search over ``C`` for logistic regression
            (``penalty=self.penalty``, ``max_iter=self.epoch``, liblinear).
        ``'SVM'``: 5-fold grid search over ``C`` for ``SVM``;
            ``self.penalty`` is coerced to ``'l2'`` unless it is ``'l1'``
            or ``'l2'``, and ``dual`` is enabled only for ``'l2'``.
        ``'R'``: stratified dummy classifier (random-guess baseline).

    Returns:
        ``evaluator.accuracy_score`` of the fitted classifier on the
        training data.

    Raises:
        SystemExit: With status 1 when ``self.model`` is not supported.
    """
    logging.info('-' * 20)
    logging.info('Start training the %s model', self.model)
    train_data = self.feature_extractor.extract_feature(
        self.data_loader.get_trainset())

    # Candidate regularisation strengths shared by the LR and SVM searches.
    c_grid = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}

    if self.model == 'GNB':  # Gaussian naive bayes
        self.classifier = GNB()
    elif self.model == 'BNB':  # Bernoulli naive bayes
        self.classifier = BNB()
    elif self.model == 'MNB':  # Multinomial naive bayes
        self.classifier = MNB()
    elif self.model == 'LR':  # Logistic regression
        self.classifier = GS(cv=5,
                             estimator=LR(penalty=self.penalty,
                                          max_iter=self.epoch,
                                          solver='liblinear'),
                             param_grid=c_grid)
    elif self.model == 'SVM':  # Support vector machine
        if self.penalty not in ['l1', 'l2']:
            self.penalty = 'l2'
        dual = self.penalty == 'l2'
        self.classifier = GS(cv=5,
                             estimator=SVM(penalty=self.penalty,
                                           dual=dual,
                                           max_iter=self.epoch),
                             param_grid=c_grid)
    elif self.model == 'R':  # RandomGuess
        self.classifier = DC(strategy='stratified')
    else:
        # An unsupported model is a failure: report it as an error and exit
        # with a non-zero status so callers and shells can detect it.
        logging.error('Unsupported model : %s', self.model)
        raise SystemExit(1)

    features, labels = train_data[0], train_data[1]
    self.classifier.fit(features, labels)
    # Predict once; the earlier duplicate call only doubled inference cost.
    predictions = self.classifier.predict(features)
    return evaluator.accuracy_score(labels, predictions)
def main():
    """Fit a soft-voting ensemble on MNIST and print train/test accuracy.

    The ensemble combines default-configured ``LR``, ``RFC`` and ``GNB``
    estimators; data is read from the ``MNIST_DATA`` directory with integer
    (non one-hot) labels.
    """
    mnist = input_data.read_data_sets('MNIST_DATA', one_hot=False)

    members = [('lr', LR()), ('rf', RFC()), ('gnb', GNB())]
    ensemble = VotingClassifier(estimators=members, voting='soft')

    train_images = mnist.train.images
    train_labels = mnist.train.labels
    print('starting')
    ensemble = ensemble.fit(train_images, train_labels)

    train_accuracy = ensemble.score(train_images, train_labels)
    test_accuracy = ensemble.score(mnist.test.images, mnist.test.labels)
    print(train_accuracy, test_accuracy)
def SOmodelexp1():
    """Run model experiment 1 on the Stack Overflow in-sample/out-of-sample data.

    Seven classifiers (the random forest with 200 trees, the rest with
    default settings) are paired with their display names and passed to
    ``modelExperiment`` along with the full feature vector, the
    ``stackoverflowdata/`` directory and the CSV/PNG output names.
    """
    roster = [
        ('LogisticRegression', LR()),
        ('DTree', DT()),
        ('KNN', KNC()),
        ('RandomForest', RF(n_estimators=200)),
        ('AdaBoosted', ABC()),
        ('GaussianNB', GNB()),
        ('QuadraticDiscriminantAnalysis', QDA()),
    ]
    classifiers = [clf for _, clf in roster]
    labels = [label for label, _ in roster]
    modelExperiment(SOInsampleData, SOOutsampleData, 'stackoverflowdata/',
                    fullFV, classifiers, labels, 'SOmodelExperiment1.csv',
                    'SOclassifier_plot1.png', True)
def create_estimator(self):
    """Build the estimator that matches ``self.estimator_id``.

    The id is tested with the ``mlc`` predicates in the order SVM, random
    forest, naive Bayes; the first match is instantiated and configured
    with the corresponding ``*_params`` dict via ``set_params``.

    Returns:
        The configured estimator, or ``None`` when no predicate matches.
    """
    if mlc.is_SVM_id(self.estimator_id):
        # SVM
        svm = SVC()
        svm.set_params(**self.SVM_params)
        return svm

    if mlc.is_RandomForest_id(self.estimator_id):
        # RF
        forest = RF()
        forest.set_params(**self.RF_params)
        return forest

    if mlc.is_NaiveBayes_id(self.estimator_id):
        # GNB
        bayes_clf = GNB()
        bayes_clf.set_params(**self.GNB_params)
        return bayes_clf

    return None
def __init__(self, **kwargs):
    r"""Initialize GaussianNB instance.

    Installs "ignore" filters for the scikit-learn warning categories listed
    below, creates the wrapped ``GNB`` estimator and then runs the parent
    initializer.  ``kwargs`` is accepted but not used here.
    """
    silenced_categories = (
        ChangedBehaviorWarning,
        ConvergenceWarning,
        DataConversionWarning,
        DataDimensionalityWarning,
        EfficiencyWarning,
        FitFailedWarning,
        NonBLASDotWarning,
        UndefinedMetricWarning,
    )
    for category in silenced_categories:
        warnings.filterwarnings(action='ignore', category=category)

    self.__gaussian_nb = GNB()
    super(GaussianNB, self).__init__()
def main():
    """Compare single models, voting and stacking on noised digits data.

    Uniform noise scaled by 15 is added to the digits features, 80% of the
    samples are held out for testing, and macro precision/recall/F1 are
    printed for each base estimator, for hard and soft voting over them,
    and for a stacking classifier with a 2000-tree random forest on top.
    """
    digits = load_digits()
    noised_data = digits.data + np.random.random(digits.data.shape) * 15
    X_train, X_test, y_train, y_test = train_test_split(noised_data,
                                                        digits.target,
                                                        test_size=0.8)

    def show_scores(predicted, *heading):
        # Heading line, then macro-averaged precision / recall / F1.
        print(*heading)
        print("p:{0:.4f} r:{1:.4f} f1:{2:.4f}".format(
            *precision_recall_fscore_support(
                y_test, predicted, average="macro")))

    estimators = [
        ("svm", SVC(C=5, gamma=0.001, probability=True)),
        ("lr", LogisticRegression()),
        ("knn", KNN(n_jobs=-1)),
        ("nb", GNB()),
        ("rfc", RFC(n_estimators=500, n_jobs=-1)),
        ("bgg", BaggingClassifier(n_estimators=300, n_jobs=-1)),
        ("mlp", MLPClassifier(hidden_layer_sizes=(40, 20), max_iter=1000)),
        ("xgb", XGBClassifier(n_estimators=300, n_jobs=-1)),
    ]

    for label, estimator in estimators:
        estimator.fit(X_train, y_train)
        show_scores(estimator.predict(X_test), label)

    for mode in ["hard", "soft"]:
        voter = VotingClassifier(estimators, voting=mode)
        voter.fit(X_train, y_train)
        show_scores(voter.predict(X_test), mode, "voting")

    stacker = StackingClassifier(estimators,
                                 RFC(n_estimators=2000, n_jobs=-1))
    stacker.fit(X_train, y_train)
    show_scores(stacker.predict(X_test), "stacking")
def learn():
    """Pick between Gaussian NB and a linear SVC by rotating hold-out chunks.

    ``preprocess()`` supplies the chunk list (element 0) and the test data
    (element 1).  On every round the last chunk is rotated to the front and
    used as the dev set, the remaining chunks are concatenated for training,
    and both models are fitted and scored on the dev set while a running
    accuracy is printed.  The model with more total dev hits (SVC on ties)
    is passed to ``testm`` with the chunks and test data.

    Returns:
        Whatever ``testm`` returns for the selected model.
    """
    prep = preprocess()
    chunks, test = prep[0], prep[1]
    nbmodel = GNB()
    svmodel = SVC(gamma='auto', kernel='linear')
    nbnum = svnum = 0

    def dev_hits(model, train, dev):
        # Fit on the training folds; return (correct predictions, dev size).
        model.fit(train[0], train[1])
        data = model.predict(dev[0])
        correct = sum(1 for i in range(len(data)) if data[i] == dev[1][i])
        return correct, len(data)

    for j in range(len(chunks)):
        # Rotate so that a different chunk is held out each round.
        chunks.insert(0, chunks.pop(-1))
        dev = chunks[0]
        train = [[], []]
        for part in chunks[1:]:
            train[0] += part[0]
            train[1] += part[1]

        correct, dev_size = dev_hits(nbmodel, train, dev)
        nbnum += correct
        print('Finished GNB loop {}'.format(j + 1))
        print(nbnum / dev_size / (j + 1))
        print()

        correct, dev_size = dev_hits(svmodel, train, dev)
        svnum += correct
        print('Finished SVM loop {}'.format(j + 1))
        print(svnum / dev_size / (j + 1))
        print()

    if svnum < nbnum:
        return testm(nbmodel, chunks, test)
    return testm(svmodel, chunks, test)
def bayes(self, X, y, valid, test):
    """Fit Gaussian Naive Bayes on ``(X, y)`` and report metrics on ``test``.

    Train- and validation-set evaluation is disabled, so ``valid`` is
    accepted but not used.  The precision-recall and ROC plots are likewise
    disabled.

    Args:
        X: Training features passed to ``fit``.
        y: Training labels passed to ``fit``.
        valid: Unused.
        test: Table supporting ``drop(name, axis=1)`` and ``test["Class"]``;
            the "Class" and "Time" columns are removed before prediction.

    Returns:
        The mapping produced by ``self.compute_metrics`` with an extra
        ``'time'`` entry holding the wall-clock fit duration in seconds.
    """
    # Using data priors worked best
    nb_model = GNB()

    fit_started = time.time()
    nb_model.fit(X, y)
    fit_finished = time.time()

    # TEST DATA
    test_features = test.drop("Class", axis=1).drop("Time", axis=1)
    y_score = nb_model.predict_proba(test_features)[:, 1]
    results = nb_model.predict(test_features)

    # Get metrics
    mets = self.compute_metrics(test["Class"], results, y_score)
    mets['time'] = fit_finished - fit_started

    report_rows = (
        ('AUROC:', 'auroc'),
        ('Accuracy:', 'accuracy'),
        ('Precision:', 'precision'),
        ('Recall:', 'recall'),
        ('F Score:', 'f'),
        ('Average Precision', 'ap'),
    )
    for caption, key in report_rows:
        print(caption, mets[key])
    print(mets['confusion'], '\n')

    return mets
# Hold out a validation split; the models below are compared on the
# training portion only.
X_train, X_validation, Y_train, Y_validation = m_s.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# define 'scoring' parameter as 'accuracy'
scoring = 'accuracy'

# instantiate candidate models as (short name, estimator) pairs
print('\n instantiating candidate models...')
models = [
    ('LR', LR()),
    ('LDA', LDA()),
    ('KNC', KNC()),
    ('DTC', DTC()),
    ('GNB', GNB()),
]

# run test harness
results = []
names = []
print('\n running test harness...')
for name, model in models:
    # 'kfold' var sets up the k-fold cross validation.  shuffle=True is
    # required for random_state to take effect: recent scikit-learn raises
    # ValueError when a seed is given without shuffling.
    kfold = m_s.KFold(n_splits=10, shuffle=True, random_state=seed)
    # 'cv_results' applies cross validation process to each model using the
    # training data i.e. features matrix X_train and results vector Y_train
    cv_results = m_s.cross_val_score(model,
                                     X_train,
                                     Y_train,
                                     cv=kfold,
                                     scoring=scoring)
def make_feature_graph(self,
                       feature_list,
                       labels_filename="trainingSetLabels.dat"):
    '''
    Function to plot 2 graphs:

    1. Decision Boundaries: Only drawn when every sample has 1 or 2
       features. Plots the decision boundaries found by 5 classifiers:
       ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'SVM',
       'AdaBoost'], one subplot each on a 3x2 grid.
    2. Scatter Plot: Plots the values of one feature for every data point,
       split into "true" and "fake" samples, to visualise how separable
       they seem. No classifier is involved; for manual evaluation only.
       When there is more than one feature the index to plot is read from
       standard input.

    Parameters:
        feature_list: A list of lists (or 2-D array) containing the
            features for each sample.
        labels_filename: Path to the file containing one integer label per
            line for the training data; non-zero means "true".
    '''
    # Works on both Python 2 (raw_input) and Python 3 (input).
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    y = []
    x_true_list = []
    x_fake_list = []
    with open(labels_filename) as label_file:
        for idx, label in enumerate(label_file):
            if int(label):
                y.append(1)
                x_true_list.append(feature_list[idx])
            else:
                y.append(0)
                x_fake_list.append(feature_list[idx])
    y = np.array(y)
    # Accept a plain list of lists as well as an array.
    X_plot = np.asarray(feature_list)

    #---------------------------- Decision Boundary Plot -----------------------#
    if len(feature_list[0]) == 1 or len(feature_list[0]) == 2:
        print("Now plotting Decision boundary Plot. (Works best for 2 features)")
        classifiers = [
            LogisticRegression(random_state=1),
            RFC(n_estimators=100, random_state=1),
            GNB(),
            SVC(),
            ABC(),
        ]
        labels = [
            'Logistic Regression', 'Random Forest', 'Naive Bayes', 'SVM',
            'AdaBoost'
        ]
        # A 2x2 grid only has four cells, so zip() silently dropped the
        # fifth classifier (AdaBoost); 3x2 leaves room for all of them.
        gs = gridspec.GridSpec(3, 2)
        fig = plt.figure(figsize=(10, 12))
        cells = itertools.product(range(3), range(2))
        for clf, lab, grd in zip(classifiers, labels, cells):
            clf.fit(X_plot, y)
            plt.subplot(gs[grd[0], grd[1]])
            plot_decision_regions(X=X_plot, y=y, clf=clf, legend=2)
            plt.title(lab)
        plt.show()

    #---------------------------- Individual Scatter Plot -----------------------#
    plot_idx = 0
    if len(feature_list[0]) != 1:
        plot_idx = int(
            read_line("Your list has more than 1 feature. "
                      "Which feature would you like to observe? "
                      "(Insert Index): "))
    print("Now plotting scatter plot of feature:")
    x_true = np.array([feat[plot_idx] for feat in x_true_list])
    x_fake = np.array([feat[plot_idx] for feat in x_fake_list])
    # Shared sample-position axis, long enough for the larger class.
    y_plot = np.arange(max(len(x_true), len(x_fake)))
    trace_true = go.Scatter(y=x_true, x=y_plot, mode='markers', text="True")
    trace_fake = go.Scatter(y=x_fake, x=y_plot, mode='markers', text="Fake")
    data = [trace_true, trace_fake]
    layout = go.Layout(showlegend=False)
    fig = go.Figure(data=data, layout=layout)
    plot_url = offline.plot(fig, filename='text-chart-basic')
def article_classifier(self):
    """Train five classifiers on the article features and print their scores.

    The feature rows come from ``extractFourGram`` (train and dev files)
    with the POS feature from ``self.pos_load_features()`` appended as one
    more row; the matrices are then transposed so each column is a feature.
    For logistic regression, SVC, random forest, AdaBoost and Gaussian NB
    (all default settings except ``SVC(probability=True)``) this prints:

    * the 5-fold cross-validation scores on the training data followed by
      their mean and standard deviation, and
    * the accuracy on the dev set of the model fitted on all training data.

    Tuning grids tried previously:
        SVM: {'C': [0.1, 1.0, 10.0, 100.0],
              'gamma': [1.0, 2.0, 'auto', 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'linear']}
        RandomForest: {'n_estimators': [10, 20, 5, 30],
                       'criterion': ['gini', 'entropy']}
        AdaBoost: {'n_estimators': [10, 20, 5, 30],
                   'learning_rate': [1.0, 0.1, 0.01, 0.001, 0.05]}
    """
    train_pos, dev_pos = self.pos_load_features()
    X = list(extractFourGram('featureFour.txt', 'basic.csv'))
    X_dev = list(extractFourGram('featureFour_dev.txt', 'basic_dev.csv'))
    y_dev = self.get_dev_labels()
    y = self.labels

    X.append(train_pos)
    X_dev.append(dev_pos)
    # Rows are features at this point; transpose to samples x features.
    X = np.array(X).T
    X_dev = np.array(X_dev).T

    classifiers = [
        LogisticRegression(),
        SVC(probability=True),
        RFC(),
        ABC(),
        GNB(),
    ]
    for clf in classifiers:
        clf.fit(X, y)
        predicted = clf.predict(X_dev)
        scores = cross_val_score(clf, X, y, cv=5, n_jobs=5)
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("%s %s %s" % (scores, np.mean(scores), np.std(scores)))
        print(accuracy_score(y_dev, predicted))
df['OCS3'] = replace_non_numeric(df['OCS3'], 'absent')
df['OCS4'] = replace_non_numeric(df['OCS4'], 'absent')
position_of_split = 10
# The last column is the target; everything before it is a feature.
y_data = df.iloc[:, -1]
x_data = df.iloc[:, 0:-1]
# Pre-formatted so the output is identical on Python 2 and Python 3.
print('shape of data is  %s while shape of target is  %s' %
      (x_data.shape, y_data.shape))


#%% Select base learners
def _make_base_learners():
    # One fresh, independent set of base learners per call, so the two
    # lists below cannot drift apart or share fitted state.
    return [
        KNN(n_neighbors=5),
        EXTRA(n_estimators=30, random_state=rs_l),
        RF(n_estimators=30, random_state=rs_l),
        GNB(),
        GraB(random_state=rs_l)
    ]


learnersX = _make_base_learners()
learnersY = _make_base_learners()
for ww in range(0, len(rs1_list)):
    flag = False  # a parameter for writing helpful headings to xls files
    w_count = 1
    sheet1 = book.add_sheet('sheet' + str(ww) + '_' + ff)
# Precision-recall curve and average precision for the SVM scores that are
# already held in `pred_probas` (the decision_function / get_pr variants
# and the step plot of the curve are disabled).
precision_svm, recall_svm, _ = precision_recall_curve(y_test, pred_probas)
auc_svm = average_precision_score(y_test, pred_probas)

# Naive Bayes: classification only; even faster than linear models and
# suited to very large, high-dimensional data sets, but usually less
# accurate than linear models.
gnb = GNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
for caption, features, targets in (('Training Score: ', x_train, y_train),
                                   ('Testing Score: ', x_test, y_test)):
    print(caption, gnb.score(features, targets))
print(len(y_test), len(y_pred))
# Accuracy: share of all samples that are classified correctly.
print('准确率: ', accuracy_score(y_test, y_pred))
# Precision: share of predicted positives that are actually positive.
print('精确率: ', precision_score(y_test, y_pred))
# Recall: share of actual positives that are predicted positive.
print('召回率: ', recall_score(y_test, y_pred))
print("F1值: %.3f" % f1_score(y_test, y_pred))

# Probability of the second class is used as the ranking score.
pred_probas = gnb.predict_proba(x_test)[:, 1]
precision_bays, recall_bays, _ = precision_recall_curve(y_test, pred_probas)
auc_bays = average_precision_score(y_test, pred_probas)
# Logistic regression
if __name__ == "__main__":
    # Compare this module's GaussianNB with scikit-learn's reference
    # implementation on the breast-cancer data set.
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB as GNB
    from sklearn import metrics

    dataloader = load_breast_cancer()
    # 80% of the samples train the models, 20% are held out for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        dataloader.data,
        dataloader.target,
        test_size=0.2,
        random_state=20,
    )

    # Reference model (scikit-learn).
    model0 = GNB()
    model0.fit(X_train, y_train)
    y_pred0 = model0.predict(X_test)
    accu0 = metrics.accuracy_score(y_pred0, y_test)

    # Model under test; its predict() returns a pair whose first element
    # holds the predictions.
    model = GaussianNB(num_classes=2)
    model.fit(X_train, y_train)
    y_pred, _ = model.predict(X_test)
    accu = metrics.accuracy_score(y_pred, y_test)

    print(f"accu0 is {accu0} and accu is {accu}")
def get_model(name):
    """Return a new, unfitted model for the given short name.

    Only the requested model is constructed; the previous version built
    all eight estimators on every call and discarded seven of them.

    Args:
        name: One of ``"mock"``, ``"lda"``, ``"qda"``, ``"gnb"``, ``"knn"``,
            ``"forest"``, ``"logistic"`` or ``"svm"``.

    Returns:
        A freshly constructed estimator instance.

    Raises:
        KeyError: If ``name`` is not one of the supported names.
    """
    # Zero-argument factories: nothing is instantiated until one is chosen.
    factories = {
        "mock": lambda: ClassifierMock(),
        "lda": lambda: LDA(),
        "qda": lambda: QDA(),
        "gnb": lambda: GNB(),
        "knn": lambda: KNN(),
        "forest": lambda: RandomForestClassifier(),
        "logistic": lambda: LogisticRegression(class_weight="balanced"),
        "svm": lambda: SVC(kernel="linear", class_weight="balanced"),
    }
    return factories[name]()
data_all = pd.concat([data_all, Sex_dummies, Pclass_dummies, Embarked_dummies], axis=1) feature = [ 'Age', 'Fare', 'FamilySize', 'Cabin_null', 'Cabin_nnull', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S' ] X = data_all.loc[data.index][feature] y = data.Survived modelDict = { 'DT': DT(), 'SVC': SVC(), 'GNB': GNB(), 'KNN': KNN(n_neighbors=3), 'MLP': MLP(hidden_layer_sizes=(500, )), 'LogR': LogR(C=1.0, penalty='l1', tol=1e-6), 'RF': RF(n_estimators=300), 'GB': GB(n_estimators=500) } for model in modelDict.keys(): clf = modelDict.get(model) scores = cross_val_score(clf, X, y, cv=5) print(model + ' accuracy: ' + '%.3f' % (scores.mean() * 100) + '%') votingC = VotingClassifier(estimators=[('clf_GB', GB(n_estimators=500)), ('clf_RF', RF(n_estimators=300)), ('clf_SVC', SVC(probability=True)),
"bumpiness": bumpy_bkg } } ############################################################################################################ from sklearn.naive_bayes import GaussianNB as GNB x_min = 0.0 x_max = 1.0 y_min = 0.0 y_max = 1.0 # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. clf = GNB() clf.fit(X_train, y_train) pred = clf.predict(X_test) h = .01 # step size in the mesh xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)
def NaiveBayes(x_train, x_test, y_train, y_test):
    """Fit Gaussian Naive Bayes and print its score on the test split.

    ``y_train`` is flattened with ``ravel()`` before fitting.  The printed
    line also includes ``c_index``, which is read from the enclosing
    (module-level) scope rather than passed in.  Nothing is returned.
    """
    fitted = GNB().fit(x_train, y_train.ravel())
    score = fitted.score(x_test, y_test)
    print("c_index = ", c_index, " 精度为", score)
def Bokeh_Decision_Boundaries_2D(X,
                                 Variable1,
                                 Variable2,
                                 y,
                                 Estimators=None,
                                 Notebook_Url="None",
                                 Test_Size=0.3,
                                 Random_State=None,
                                 Scale=True,
                                 Palette="RdBu",
                                 Delta=0.02,
                                 Output=1):
    """Interactive 2-D decision-boundary plots for several classifiers.

    Builds an ipywidgets control panel (via ``interact_manual``) that lets
    the user pick two feature columns, a target column and a subset of the
    estimators.  On each manual update every selected estimator is fitted
    on a train split of the two standardised features and one Bokeh figure
    per estimator is shown with the predicted class over a mesh, the
    observations coloured by class, hover tooltips and the test score.

    Args:
        X: pandas DataFrame holding the feature columns and the target.
        Variable1: Initial column name for the x axis.
        Variable2: Initial column name for the y axis.
        y: Initial column name of the target.
        Estimators: A single estimator or a list/tuple/array of estimators.
            ``None`` (default) uses LDA, QDA, KNN, GNB, TREE,
            ``Random_Forest_Classifier(Rand_Param_Search=False)`` and
            ``SVC(probability=True)``, created fresh on every call.
            Estimators are fitted in place.
        Notebook_Url: ``host:port`` of the running notebook server; the
            placeholder ``"None"`` is rejected.
        Test_Size: ``test_size`` for ``train_test_split``.
        Random_State: ``random_state`` for ``train_test_split``.
        Scale: Currently unused (features are always standardised).
        Palette: Currently unused (the boundary image uses "RdBu11").
        Delta: Step size of the mesh in standardised units.
        Output: Currently unused.

    Raises:
        ValueError: If ``Notebook_Url`` is left at ``"None"``.
    """
    # Fail fast, before any notebook output is initialised.
    if Notebook_Url == "None":
        raise ValueError("Must specify the Notebook_Url i.e. localhost:port_number where port_number is the port on which the notebook is running")

    # A list of estimator instances as a default argument would be shared
    # (and re-fitted) across calls, so the default is built here instead.
    if Estimators is None:
        Estimators = [
            LDA(), QDA(), KNN(), GNB(), TREE(),
            Random_Forest_Classifier(Rand_Param_Search=False),
            SVC(probability=True)
        ]
    elif isinstance(Estimators, (list, tuple, np.ndarray)):
        Estimators = list(Estimators)
    else:
        # A single estimator was passed.
        Estimators = [Estimators]

    Estimator_Names = [str(estimator).split("(")[0] for estimator in Estimators]
    output_notebook()

    # The callback for update
    def update_plots(Variable1, Variable2, y, Active_Estimators):
        print("Starting update")
        active_names = set(Active_Estimators)
        # Plain list selection: wrapping estimators in a numpy array can
        # misbehave for estimators that are themselves sequences.
        estimators = [
            est for est, est_name in zip(Estimators, Estimator_Names)
            if est_name in active_names
        ]
        variable1 = Variable1
        variable2 = Variable2

        n_plots = len(estimators)
        plots = [None] * n_plots
        image_sources = [None] * n_plots
        observation_sources = [None] * n_plots
        hover_tools = [None] * n_plots
        model_score_sources = [None] * n_plots
        glyphs0 = [None] * n_plots
        color_bars = [None] * n_plots
        p_circles = [None] * n_plots
        p_images = [None] * n_plots

        #Iterate over the estimators
        for idx, estimator in enumerate(estimators):
            #Find the title for each plot
            estimator_name = str(estimator).split('(')[0]

            #Extract the needed data
            full_mat = X[[variable1, variable2, y]].dropna(how="any", axis=0)

            #Define a class bijection for class colour mapping
            unique_classes, y_bijection = np.unique(full_mat[y], return_inverse=True)
            full_mat['y_bijection'] = y_bijection

            #Rescale the X Data so that the data fits nicely on the axis/predictions are reliable
            full_mat[variable1 + "_s"] = StandardScaler().fit_transform(full_mat[variable1].values.reshape((-1, 1)))
            full_mat[variable2 + "_s"] = StandardScaler().fit_transform(full_mat[variable2].values.reshape((-1, 1)))

            #Define the Step size in the mesh
            delta = Delta

            #Separate the data into arrays so it is easy to work with
            X1 = full_mat[variable1 + "_s"].values
            X2 = full_mat[variable2 + "_s"].values
            Y = full_mat["y_bijection"].values

            #Define the mesh-grid co-ordinates over which to colour in
            x1_min, x1_max = X1.min() - 0.5, X1.max() + 0.5
            x2_min, x2_max = X2.min() - 0.5, X2.max() + 0.5

            #Create the meshgrid itself
            x1, x2 = np.arange(x1_min, x1_max, delta), np.arange(x2_min, x2_max, delta)
            x1x1, x2x2 = np.meshgrid(x1, x2)
            mesh_points = np.c_[x1x1.ravel(), x2x2.ravel()]

            #Create the train test split
            scaled_columns = [variable1 + "_s", variable2 + "_s"]
            X_train, X_test, y_train, y_test = train_test_split(full_mat[scaled_columns], Y, test_size=Test_Size, random_state=Random_State)

            #Fit and predict/score the model
            model = estimator.fit(X=X_train, y=y_train)
            model_score = model.score(X_test, y_test)
            model_score_text = "Model score: %.2f" % model_score

            # Predicted class for every mesh point.
            if hasattr(model, "decision_function"):
                mesh_scores = model.decision_function(mesh_points)
            elif hasattr(model, "predict_proba"):
                mesh_scores = model.predict_proba(mesh_points)
            else:
                print("This Estimator doesn't have a decision_function attribute and can't predict probabilities")
                mesh_scores = None

            if mesh_scores is None:
                # Fall back to hard predictions instead of failing on an
                # undefined Z.
                Z = np.asarray(model.predict(mesh_points))
            else:
                mesh_scores = np.asarray(mesh_scores)
                if mesh_scores.ndim == 1:
                    # Binary decision_function: positive means second class.
                    columns = (mesh_scores > 0).astype(int)
                else:
                    columns = np.argmax(mesh_scores, axis=1)
                # Map score columns back to class ids when the model exposes
                # them (they differ if a class is missing from the split).
                fitted_classes = getattr(model, "classes_", None)
                Z = columns if fitted_classes is None else np.asarray(fitted_classes)[columns]
            Z = Z.reshape(x1x1.shape)

            #Add in the probabilities and predictions for the tooltips
            if hasattr(model, "predict_proba"):
                full_mat["probability"] = np.amax(model.predict_proba(full_mat[scaled_columns]), axis=1)
            else:
                full_mat["probability"] = np.nan
            bijected_predictions = model.predict(full_mat[scaled_columns])
            full_mat["prediction"] = unique_classes[bijected_predictions]

            #Add an associated color to the predictions
            number_of_colors = len(np.unique(y_bijection))

            #Create the hover tool to be updated
            hover = HoverTool(tooltips=[
                (variable1, "@" + variable1),
                (variable2, "@" + variable2),
                ("Probability", "@probability"),
                ("Prediction", "@prediction"),
                ("Actual", "@" + y)])

            #Create the axes for all the plots
            plots[idx] = figure(x_axis_label=variable1, y_axis_label=variable2, title=estimator_name,
                                x_range=(x1x1.min(), x1x1.max()), y_range=(x2x2.min(), x2x2.max()),
                                plot_height=600, plot_width=600)

            #Create all the image sources
            image_data = dict()
            image_data['x'] = np.array([x1x1.min()])
            image_data["y"] = np.array([x2x2.min()])
            image_data['dw'] = np.array([x1x1.max() - x1x1.min()])
            image_data['dh'] = np.array([x2x2.max() - x2x2.min()])
            image_data['boundaries'] = [Z]
            image_sources[idx] = ColumnDataSource(image_data)

            #Create all the updatable images (boundaries)
            p_images[idx] = plots[idx].image(image='boundaries', x='x', y='y', dw='dw', dh='dh',
                                             palette="RdBu11", source=image_sources[idx])

            #Create the sources to update the observation points
            observation_sources[idx] = ColumnDataSource(data=full_mat)

            #Create all the updatable points
            low = full_mat["y_bijection"].min()
            high = full_mat["y_bijection"].max()
            # The RdBu palettes are only defined for 3 to 11 colours; clamp
            # so binary targets (and more than 11 classes) do not KeyError.
            palette_size = min(max(number_of_colors, 3), 11)
            cbar_mapper = LinearColorMapper(palette=RdBu[palette_size], high=high, low=low)
            p_circles[idx] = plots[idx].circle(x=variable1 + "_s", y=variable2 + "_s",
                                               color=dict(field='y_bijection', transform=cbar_mapper),
                                               source=observation_sources[idx], line_color="black")

            #Create the hovertool for each plot
            hover_tools[idx] = hover

            #Add the hover tools to each plot
            plots[idx].add_tools(hover_tools[idx])

            #Create all the text sources (model scores) for the plots
            model_score_sources[idx] = ColumnDataSource(data=dict(x=[x1x1.min() + 0.3], y=[x2x2.min() + 0.3], text=[model_score_text]))

            #Add the model scores to all the plots
            score_as_text = Text(x="x", y="y", text="text")
            glyphs0[idx] = plots[idx].add_glyph(model_score_sources[idx], score_as_text)

            #Add a colorbar
            color_bars[idx] = ColorBar(color_mapper=cbar_mapper, ticker=BasicTicker(desired_num_ticks=number_of_colors),
                                       label_standoff=12, location=(0, 0), bar_line_color="black")
            plots[idx].add_layout(color_bars[idx], "right")
            plots[idx].add_tools(LassoSelectTool(), WheelZoomTool())

            # configure so that no drag tools are active
            plots[idx].toolbar.tools = plots[idx].toolbar.tools[1:]
            plots[idx].toolbar.tools[0], plots[idx].toolbar.tools[-2] = plots[idx].toolbar.tools[-2], plots[idx].toolbar.tools[0]

        # NOTE(review): this gridplot call shape is kept exactly as it was;
        # confirm it against the installed Bokeh version.
        layout = gridplot([], [row(plot) for plot in plots])
        handle0 = show(layout, notebook_url=Notebook_Url, notebook_handle=True)

        #Finished the callback
        push_notebook(handle=handle0)

    estimators_used_widget = widgets.SelectMultiple(options=Estimator_Names, value=Estimator_Names,
                                                    description="Estimators", disabled=False)
    interact_manual(update_plots,
                    Variable1=[Variable1] + list(X.columns.values),
                    Variable2=[Variable2] + list(X.columns.values),
                    y=[y] + list(X.columns.values),
                    Active_Estimators=estimators_used_widget)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as t_t_s
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture as GM
from sklearn.metrics import accuracy_score as a_s

# Iris: four numeric features, 'species' is the class label.
df = sns.load_dataset('iris')
y = df['species']
x = df.drop('species', axis=1)

# Hold out a quarter of the samples for testing (fixed seed).
split = t_t_s(x, y, test_size=0.25, random_state=0)
xtr, xte, ytr, yte = split
print(xtr.shape, yte.shape)  # (112, 4) (38,)

# Gaussian Naive Bayes on all four features.
model = GNB()
model.fit(xtr, ytr)
ypred = model.predict(xte)
accuracy = a_s(yte, ypred)
print("分类准确率：{0:.2%}".format(accuracy))

# dimensionality reduction: project onto the first two principal
# components, then repeat the same split and model on the reduced data.
pca = PCA(n_components=2)
new_x = pca.fit_transform(x)
split_new = t_t_s(new_x, y, test_size=0.25, random_state=0)
xtr_new, xte_new, ytr_new, yte_new = split_new
print(xtr_new.shape, yte_new.shape)  # (112, 2) (38,)

model1 = GNB()
model1.fit(xtr_new, ytr_new)
ypred1 = model1.predict(xte_new)
cross_val_score(model_lr, train_data, label, cv=10, scoring='roc_auc')) result = model_lr.predict(test_data) output = pd.DataFrame(data={ "PassengerId": test["PassengerId"], "Survived": result }) output.to_csv("lr.csv", index=False, quoting=3) # #### 提交kaggle后准确率:0.78469 # ### 高斯贝叶斯 # In[20]: model_GNB = GNB() model_GNB.fit(train_data, label) print "高斯贝叶斯分类器10折交叉验证得分: ", np.mean( cross_val_score(model_GNB, train_data, label, cv=10, scoring='roc_auc')) result = model_GNB.predict(test_data) output = pd.DataFrame(data={ "PassengerId": test["PassengerId"], "Survived": result }) output.to_csv("gnb.csv", index=False, quoting=3) # #### 提交kaggle后准确率:0.74163 # ### 随机森林
def new_gnb():
    """Return a new ``GNB`` estimator built with no constructor overrides."""
    overrides = dict()
    return GNB(**overrides)
def roc(df, target, model_to_fit, ax):
    """Draw one single-feature ROC curve per predictor column on ``ax``.

    For every column of ``df`` other than ``target`` the model is refitted
    on a fresh (unseeded) train/test split of that column alone, and its
    ROC curve on the test part is drawn on ``ax`` labelled with the column
    name.

    Args:
        df: pandas DataFrame containing the predictors and the target.
        target: Name of the target column.
        model_to_fit: Estimator to fit; it is refitted in place per column.
        ax: Matplotlib axes that receives all curves.
    """
    from sklearn.model_selection import train_test_split as tts
    try:
        # plot_roc_curve was removed from scikit-learn; this is its
        # replacement with the same call shape.
        from sklearn.metrics import RocCurveDisplay
        draw_roc = RocCurveDisplay.from_estimator
    except ImportError:
        # Older scikit-learn without RocCurveDisplay.from_estimator.
        from sklearn.metrics import plot_roc_curve as draw_roc

    y = df[target].to_numpy()
    predictors = df.drop(target, axis=1)
    model = model_to_fit
    for col in predictors.columns:
        # Estimators expect a 2-D feature matrix, hence the single column.
        X = df[col].to_numpy().reshape(-1, 1)
        xtrain, xtest, ytrain, ytest = tts(X, y)
        model.fit(xtrain, ytrain)
        draw_roc(model, xtest, ytest, ax=ax, label=col)


# using the function with LR, KNN, GNB models on the data set to evaluate
# their classification capabilities, one feature at a time
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB as GNB

fig, ax = plt.subplots(nrows=3,
                       ncols=1,
                       figsize=(18, 18),
                       sharex=True,
                       sharey=True)
roc(data, 'Class', LR(), ax[0])
roc(data, 'Class', KNN(), ax[1])
roc(data, 'Class', GNB(), ax[2])
from sklearn.naive_bayes import GaussianNB as GNB
import sklearn.model_selection as skl
from sklearn import preprocessing as ppr
import read_preprocess_data as rpd
from sklearn import metrics as met

# A first, exploratory attempt at Naive Bayes on the music data.
musicdata = pd.read_csv('sample-data.csv')
le = ppr.LabelEncoder()

# The columns are categorical and were meant to be label-encoded first;
# that step (and the local train/test split) is disabled because the
# encoding did not work, so x and y below are not used for training.
y = musicdata.iloc[:, -1].values
x = musicdata.iloc[:, :-1].values

# The train/test data comes from the shared read_preprocess_data module.
train_split, test_split = rpd.getSplitData()
x_train, y_train = train_split
x_test, y_test = test_split

# Fit Gaussian Naive Bayes on the training set and predict the test set.
model = GNB()
model.fit(x_train, y_train)
predicted = model.predict(x_test)

accuracy = met.accuracy_score(y_test, predicted)
print("Model accuracy is : ", accuracy)
def _encode_events(events, event_codes):
    """Translate event labels to integer codes; labels without a code become -1."""
    return [event_codes.get(label, -1) for label in events]


def _date_features(dates):
    """Parse ``YYYY-MM-DD`` values into (timestamps in seconds, weekday numbers)."""
    stamps, weekdays = [], []
    for raw in dates:
        parsed = datetime.strptime(str(raw), '%Y-%m-%d')
        stamps.append(parsed.timestamp())
        weekdays.append(parsed.weekday())
    return stamps, weekdays


def _assemble_features(frame, stamps, weekdays, event_ids):
    """Build one feature row per record of ``frame`` in the fixed column order."""
    columns = zip(stamps, weekdays, frame.calendar_code, frame.site_count,
                  frame.max_temp, frame.min_temp, frame.precipitation, event_ids)
    return [list(row) for row in columns]


def main():
    """Train several models on the request-count data and predict the test set.

    Steps:
      1. Load ``Training_set.csv``, encode the ``events`` column as integer
         codes and turn ``date`` into a timestamp plus a weekday number.
      2. Report which weekday has the highest / lowest mean request count.
      3. Score every candidate model with 10-fold cross-validation
         (negative mean squared error) and keep the one with the smallest
         absolute error.
      4. Load ``Test_set.csv``, build the same features, predict with the
         best model, write the predictions to
         ``predicted_request_count.csv`` and plot the results.
    """
    # ----------------------------
    # Training data
    # ----------------------------
    trainingData = pd.read_csv('Training_set.csv')

    # Unique events in first-seen order; the position of an event in this
    # list is its integer code for both training and test data.
    myEventSet = list(dict.fromkeys(trainingData.events))
    print('Unique events are as follows: \n', myEventSet, '\n')
    event_codes = {label: code for code, label in enumerate(myEventSet)}

    # Every row gets exactly one code (-1 when no code matches), which keeps
    # the encoded list aligned with the rows of the frame.
    newEvents = _encode_events(trainingData.events, event_codes)
    numDateTrainData, day = _date_features(trainingData.date)

    # Mean request count per day of the week.
    dictReqCount = {}
    for weekday, count in zip(day, trainingData.request_count):
        dictReqCount.setdefault(weekday, []).append(count)
    dictAvgReqCount = {key: sum(val) / len(val) for key, val in dictReqCount.items()}

    # max()/min() return the first key on ties, like picking element 0 of the
    # list of all matching keys.
    maxKey = max(dictAvgReqCount, key=dictAvgReqCount.get)
    print('Day #{} of the week has the max mean request count'.format(maxKey))
    minKey = min(dictAvgReqCount, key=dictAvgReqCount.get)
    print('Day #{} of the week has the min mean request count'.format(minKey))

    X = _assemble_features(trainingData, numDateTrainData, day, newEvents)
    Y = trainingData.request_count

    # Models that work on both continuous and discrete data.
    # NOTE(review): GNB looks like a classifier (GaussianNB elsewhere in this
    # file) scored here with a regression metric -- confirm this is intended.
    scoring = 'neg_mean_squared_error'
    models = [DTR(), GNB(), RFR(), KNR()]

    # No random_state: without shuffle=True it never had any effect, and
    # newer scikit-learn releases reject that combination with a ValueError.
    kfold = MS.KFold(n_splits=10)
    mErr = [
        MS.cross_val_score(model, X, Y, cv=kfold, scoring=scoring).mean()
        for model in models
    ]

    # Smallest absolute error wins; the first model is kept on ties.
    best_model_index = min(range(len(mErr)), key=lambda idx: math.fabs(mErr[idx]))
    print('\nModel #%d (i.e. %s) performed best' % (
        best_model_index, str(models[best_model_index]).split('(')[0]))

    # -------------------------------------------------------
    # Test Data
    # -------------------------------------------------------
    testData = pd.read_csv('Test_set.csv')

    # Reuse the training-time codes; events that only occur in the test set
    # are encoded as -1 instead of being dropped (which would shift every
    # following row and end in an IndexError).
    testEvents = _encode_events(testData.events, event_codes)
    numDateTestData, testDay = _date_features(testData.date)
    X_test = _assemble_features(testData, numDateTestData, testDay, testEvents)

    # Fit once and predict twice so that both prediction sets come from the
    # same fitted model.
    bestModel = models[best_model_index]
    bestModel.fit(X, Y)
    Y_pred = bestModel.predict(X_test)
    Y_pred_train = bestModel.predict(X)
    print('\nThe predicted values for request count using the test data is as follows:\n', Y_pred)

    # One prediction per line; the context manager closes the file on errors too.
    with open('predicted_request_count.csv', 'w') as output_file:
        output_file.writelines(str(value) + '\n' for value in Y_pred)

    # Plot the results
    plt.figure(1)
    plt.scatter(numDateTrainData, Y, c="darkorange", label="Training data")
    plt.scatter(numDateTestData, Y_pred, c="cornflowerblue", label="Test data model prediction")
    plt.scatter(numDateTrainData, Y_pred_train, c="red", label="Training data model prediction")
    plt.xlabel("Numerical Date")
    plt.ylabel("Page Count")
    plt.title("Best Model")
    plt.legend()
    plt.show()
" " * 4, _Grid_Result.best_params_, "\n", sep="", end="\n") print() return _Grid_Result ################################################################################ # 分类算法审查 _ORDINARY_MODELS = { "LR": LR(), "LDA": LDA(), "KNC": KNC(), "GNB": GNB(), "DTC": DTC(), "SVC": SVC() } _ORDINARY_ALGORITHM_CMP_RESULTS = _Models_Cmp( _Models=_ORDINARY_MODELS, _Figure_Title="ALGORITHM COMPARISON") # Best: KNC _Model_Run(_Model=_ORDINARY_MODELS["KNC"], _Report_Title="KNC-K近邻算法") ################################################################################ # 数据正态化后重审 _SCALED_MODELS = { "LR": pipeline.Pipeline([("Scaler", preprocessing.StandardScaler()), ("LR", LR())]), "LDA": pipeline.Pipeline([("Scaler", preprocessing.StandardScaler()),