def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    ### calculate and return the accuracy on the test data;
    ### this is slightly different from the example, where we just print
    ### the accuracy. The manual loop below is equivalent to accuracy_score:
    # correct = 0.0
    # for i in range(len(pred)):
    #     if pred[i] == labels_test[i]:
    #         correct += 1
    # accuracy = correct / len(pred)
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    # Replace the numeric targets with string labels.
    y_str = y.copy().astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    ### calculate and return the accuracy on the test data:
    ### count the predictions that match the true labels
    intersect = [i for i, j in zip(pred, labels_test) if i == j]
    matched = len(intersect)
    total = len(labels_test)
    accuracy = float(matched) / float(total)
    return accuracy
def main():
    """ Main entry point """
    # Prepare the dataset
    train_data, test_data = utils.prepare_data()

    # Inspect the dataset
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering: build the train/test matrices
    X_train, X_test = utils.do_feature_engineering(train_data, test_data)
    print('There are {} feature dimensions.'.format(X_train.shape[1]))

    # Label handling
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # Modeling and validation
    print('\n===================== Modeling and validation =====================')
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_pred))
def scikitNBClassfier(self):
    dataMat, labels = self.loadProcessedData()
    bayesian = Bayesian()
    myVocabList = bayesian.createVocabList(dataMat)

    ## Build the bag-of-words matrix
    trainMat = []
    for postinDoc in dataMat:
        trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))

    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    X = array(trainMat)
    y = labels
    testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
    testEntry = self.testEntryProcess(testText)

    bayesian = Bayesian()
    thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))

    ## Fit the model and predict the single test document
    ## (reshape to a 1-sample 2-D array, as sklearn expects)
    y_pred = gnb.fit(X, y).predict(thisDoc.reshape(1, -1))
    clabels = ['军事', '体育']

    ## Predict back on the training set to count mislabeled points
    y_pred = gnb.fit(X, y).predict(X)
    print("Number of mislabeled points : %d" % (labels != y_pred).sum())
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    ### calculate and return the accuracy on the test data;
    ### the manual count below is equivalent to sklearn's accuracy_score
    total = len(labels_test)
    correct = (pred == labels_test).sum()
    accuracy = correct / float(total)
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
def categorize(train_data, test_data, train_class, n_features):
    # Alternative classifiers kept for reference:
    # cf = ExtraTreesClassifier()
    # cf.fit(train_data, train_class)
    # print(cf.feature_importances_)
    # lsvmcf = sklearn.svm.LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=100.0)
    model = LogisticRegression()
    lgr = LogisticRegression(C=100.0, penalty='l1')
    # knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto',
    #                            leaf_size=10, p=2, metric='minkowski', metric_params=None)
    svmlcf = sklearn.svm.SVC(C=1000.0, kernel='linear', degree=1, gamma=0.01, probability=True)  # 2
    svmcf = sklearn.svm.SVC(C=1000.0, kernel='rbf', degree=1, gamma=0.01, probability=True)  # 2
    cf = DecisionTreeClassifier()
    dct = DecisionTreeClassifier(criterion='gini', splitter='best',
                                 min_samples_split=7, min_samples_leaf=4)
    rf = RandomForestClassifier(n_estimators=10, criterion='gini',
                                min_samples_split=7, min_samples_leaf=4,
                                max_features='auto')
    gnb = GaussianNB()  # 1
    adbst = sklearn.ensemble.AdaBoostClassifier(base_estimator=rf, n_estimators=5,
                                                learning_rate=1.0, algorithm='SAMME.R',
                                                random_state=True)
    # Feature-selection experiments kept for reference:
    # ch2 = SelectKBest(chi2, k=n_features)
    # train_data = ch2.fit_transform(train_data, train_class)
    # test_data = ch2.transform(test_data)
    # rfe = RFE(svmlcf, n_features)
    # rfe = rfe.fit(train_data, train_class)
    gnb.fit(train_data, train_class)
    return gnb.predict(test_data)
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour based on its mean color
    in the BGR, HSV, and LAB colorspaces, using a Gaussian classifier for
    these features.

    For more usage info, see class ContourClassifier
    '''
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        mean = cv2.mean(img, mask)
        mean = np.array([[mean[:3]]], dtype=np.uint8)
        mean_hsv = cv2.cvtColor(mean, cv2.COLOR_BGR2HSV)
        mean_lab = cv2.cvtColor(mean, cv2.COLOR_BGR2LAB)
        features = np.hstack((mean.flatten(), mean_hsv.flatten(), mean_lab.flatten()))
        return features

    def classify_features(self, features):
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        self.classifier.fit(features, classes)
def NB_experiment(data_fold, train, test, dumper):
    print "Ready to find the Best Parameters for Naive Bayes"
    print 'Gaussian Naive Bayes'
    nb = GNB()
    print "fitting NaiveBayes Experiment"
    dumper.write('Classifier: Naive Bayes\n')
    scores = cross_validation.cross_val_score(nb, train[0], train[1],
                                              cv=data_fold, score_func=accus)
    reports = "Accuracy on Train: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
    print reports
    dumper.write(reports + '\n')
    reports = " ".join(['%0.2f' % (item) for item in scores])
    dumper.write(reports + '\n')

    nb = GNB()
    nb.fit(train[0], train[1])
    pred = clf_test(nb, test)
    output_ranking(pred, codecs.open('nb.ranking', 'w', 'utf-8'))
    return None
def getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex):
    """
    All input arguments are returns of getTrainTestData()
    :param featureMatrix:
    :param labels:
    :param testSet:
    :param testSet_docIndex:
    :return docIndexPred: dict{docid: [index1, index2, ...], ...}
                          key is docid, value is all cognates' indices
    """
    gnb = GaussianNB()
    gnb.fit(featureMatrix, labels)
    # pred = gnb.predict(featureMatrix)
    pred = gnb.predict(testSet)

    docIndexPred = dict()
    for i, p in enumerate(pred):
        if p:
            docid = testSet_docIndex[i, 0]
            index = testSet_docIndex[i, 1]
            if docid in docIndexPred:
                docIndexPred[docid].append(index)
            else:
                docIndexPred[docid] = [index]
    return docIndexPred
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from time import time

    ### create classifier
    clf = GaussianNB()

    t0 = time()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    print "training time:", round(time() - t0, 3), "s"

    ### use the trained classifier to predict labels for the test features
    t1 = time()
    pred = clf.predict(features_test)
    print "predicting time:", round(time() - t1, 3), "s"

    ### calculate and return the accuracy on the test data
    accuracy = clf.score(features_test, labels_test)
    return accuracy
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    # Import sklearn modules for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    from time import time

    # Create classifier
    classifier = GaussianNB()

    # Time the fit algorithm
    t0 = time()
    # Fit classifier on the training features
    classifier.fit(features_train, labels_train)
    print "Training Time: ", round(time() - t0, 3), "s"

    # Time the prediction algorithm
    t0 = time()
    # Use trained classifier to predict labels for test features
    pred = classifier.predict(features_test)
    print "Prediction Time: ", round(time() - t0, 3), "s"

    # Calculate accuracy of predictions against labels_test
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
def performNB(trainingScores, trainingResults, testScores):
    print "->Gaussian NB"
    X = []
    # Exhaust the iterator so currMark holds some key, then size the matrix
    for currMark in trainingScores:
        pass
    for idx in range(0, len(trainingScores[currMark])):
        X.append([])
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        print currMark,
        for idx in range(0, len(trainingScores[currMark])):
            X[idx].append(trainingScores[currMark][idx])

    X_test = []
    # currMark still holds the last key from the loop above
    for idx in range(0, len(testScores[currMark])):
        X_test.append([])
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        for idx in range(0, len(testScores[currMark])):
            X_test[idx].append(testScores[currMark][idx])

    gnb = GaussianNB()
    gnb.fit(X, np.array(trainingResults))
    y_pred = gnb.predict_proba(X_test)[:, 1]
    print "->Gaussian NB"
    return y_pred
def NB(text):
    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    features_train, features_test, labels_train, labels_test = Preprocess()
    Ifeatures_train, Ifeatures_test, Ilabels_train = preprocess_input([text])

    # classification goes here
    clf = GaussianNB()

    # training
    train_t0 = time()
    clf.fit(features_train, labels_train)
    train_t1 = time()

    # prediction or testing
    test_t0 = time()
    predict = clf.predict(features_test)
    test_t1 = time()

    print "accuracy: ", clf.score(features_test, labels_test)
    print "#################################"
    print "train time: ", round(train_t1 - train_t0, 3), "s"
    print "prediction time: ", round(test_t1 - test_t0, 3), "s"
    print "#################################"

    # refit on the user-supplied text and predict its label
    clf.fit(Ifeatures_train, Ilabels_train)
    print ("prediction of ", str(clf.predict(Ifeatures_test))[1])
    # print "prediction of ", clf.predict(preprocess_input(text))
    return str(clf.predict(Ifeatures_test))[1]
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


class GaussianNBClassifier:

    def __init__(self):
        """ Constructor responsible for initializing the classifier """
        self.outputHeader = "#gnb"
        self.clf = None

    def buildModel(self):
        """ Builds the model of the Gaussian NB classifier """
        self.clf = GaussianNB()

    def trainGaussianNB(self, X, Y):
        """ Trains the Gaussian NB classifier """
        self.clf.fit(X, Y)

    def validateGaussianNB(self, X, Y):
        """ Validates the Gaussian NB classifier """
        YPred = self.clf.predict(X)
        print accuracy_score(Y, YPred)

    def testGaussianNB(self, X, Y):
        """ Tests the Gaussian NB classifier """
        YPred = self.clf.predict(X)
        print accuracy_score(Y, YPred)
def classify(features_train, labels_train):
    ### import the sklearn module for GaussianNB
    ### create classifier
    clf = GaussianNB()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    return clf
def naive_bayes(features, labels):
    classifier = GaussianNB()
    classifier.fit(features, labels)
    scores = cross_validation.cross_val_score(
        classifier, features, labels, cv=10,
        score_func=metrics.precision_recall_fscore_support)
    print_table("Naive Bayes", numpy.around(numpy.mean(scores, axis=0), 2))
def test_gnb_prior():
    # Test whether class priors are properly set.
    clf = GaussianNB().fit(X, y)
    assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)
    clf.fit(X1, y1)
    # Check that the class priors sum to 1.
    assert_array_almost_equal(clf.class_prior_.sum(), 1)
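By default GaussianNB estimates class_prior_ from the class frequencies in the training data, which is what the test above checks. A minimal sketch of overriding that estimate with the priors parameter (real sklearn API; the tiny dataset here is made up for illustration):

import numpy as np
from sklearn.naive_bayes import GaussianNB

X_demo = np.array([[1.0], [1.2], [3.8], [4.1]])
y_demo = np.array([0, 0, 1, 1])

# Fix the class priors instead of estimating them from the data.
clf = GaussianNB(priors=[0.8, 0.2])
clf.fit(X_demo, y_demo)
print(clf.class_prior_)  # [0.8 0.2] -- the supplied priors, not the empirical 0.5/0.5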
def nb_names():
    # generate list of tuple names
    engine = create_engine('sqlite:///names.db')
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    db_names = names.Names.getAllNames(session)
    names_list = [(x, 'name') for x in db_names]
    words_list = generate_words()
    sample_names = [names_list[i]
                    for i in sorted(random.sample(xrange(len(names_list)),
                                                  len(words_list)))]
    data = sample_names + words_list
    shuffled_data = np.random.permutation(data)
    strings = []
    classification = []
    for item in shuffled_data:
        strings.append([item[0]])
        classification.append(str(item[1]))
    X = np.array(strings)
    Y = np.array(classification)
    print X, Y
    # Note: GaussianNB expects numeric features, so the raw strings in X
    # must be converted to numbers before this fit can succeed.
    clf = GaussianNB()
    clf.fit(X, Y)
def trainNB():
    featureVector = []
    classVector = []
    headerLine = True
    # training
    train = open(r'C:\Python34\alchemyapi_python\TrainingDataDummy.csv')
    for line in train:
        if headerLine:
            headerLine = False
        else:
            temp = line.split(",")
            x = [float(temp[i]) for i in activeFeatureIndex]
            # print(x)
            featureVector.append(x)
            # temp = [int(x) for x in line.split(",")[-1].rstrip("\n")]
            classVector.append(int(line.split(",")[-1].rstrip("\n")))
    fVector = np.array(featureVector)
    cVector = np.array(classVector)
    # print(classVector)
    print(fVector.shape)
    print(cVector.shape)
    clf = GaussianNB()
    clf.fit(fVector, cVector)
    train.close()
    return clf
class CruiseAlgorithm(object):
    # Classifies cruise phase vs. non-cruise phase; uses the differential
    # change in the data stream as the input matrix.
    def __init__(self, testing=False):
        self.core = GaussianNB()
        self.scaler = RobustScaler()
        self.X_prev = None
        self.testing = testing

    def fit(self, X, Y):
        # Y should be the label of cruise or not
        X = self.prepare(X)
        self.core.fit(X, Y.ravel())

    def predict(self, X):
        if self.testing:
            X_t = self.prepare(X)
        else:
            # Explicit None check: truth-testing an array raises ValueError
            if self.X_prev is not None:
                X_t = X - self.X_prev
            else:
                X_t = X
            self.X_prev = X
        print repr(X_t)
        prediction_result = self.core.predict(X_t)
        return np.asmatrix(prediction_result)

    def prepare(self, X):
        # Row-wise first difference: a[i+1] = X[i+1] - X[i]; a[0] stays zero
        a = np.zeros((X.shape[0], X.shape[1]))
        for i in xrange(X.shape[0] - 1):
            a[i + 1, :] = X[i + 1] - X[i]
        return a
def selectKBest(previous_result, data):
    # remove 'restricted_stock_deferred' and 'director_fees'
    previous_result.pop(4)
    previous_result.pop(4)

    result = []
    _k = 10

    for k in range(0, _k):
        feature_list = ['poi']
        for n in range(0, k + 1):
            feature_list.append(previous_result[n][0])

        data = featureFormat(my_dataset, feature_list,
                             sort_keys=True, remove_all_zeroes=False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]

        from sklearn.cross_validation import StratifiedShuffleSplit
        cv = StratifiedShuffleSplit(labels, 1000, random_state=42)
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for train_idx, test_idx in cv:
            for ii in train_idx:
                features_train.append(features[ii])
                labels_train.append(labels[ii])
            for jj in test_idx:
                features_test.append(features[jj])
                labels_test.append(labels[jj])

        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((k + 1, score[0], score[1], score[2]))
    return result
class RegularizedGaussianNB:
    """
    Three types of regularization are possible:
    - regularize the variance of a feature within a class toward the
      average variance of all features from that class
    - regularize the variance of a feature within a class toward its
      pooled variance across all classes
    - add some constant amount of variance to each feature

    In practice, the latter seems to work the best, though the
    regularization value should be cross-validated.
    """

    def __init__(self, avg_weight=0, pooled_weight=0, extra_variance=0.1):
        self.pooled_weight = pooled_weight
        self.avg_weight = avg_weight
        self.extra_variance = extra_variance
        self.model = GaussianNB()

    def fit(self, X, Y):
        self.model.fit(X, Y)
        p = self.pooled_weight
        a = self.avg_weight
        ev = self.extra_variance
        original_weight = 1.0 - p - a
        pooled_variances = np.var(X, 0)
        # Blend each class's per-feature variances with the pooled and
        # average variances, then add the constant extra variance.
        for i in xrange(self.model.sigma_.shape[0]):
            class_variances = self.model.sigma_[i, :]
            new_variances = original_weight * class_variances + \
                            p * pooled_variances + \
                            a * np.mean(class_variances) + \
                            ev
            self.model.sigma_[i, :] = new_variances

    def predict(self, X):
        return self.model.predict(X)
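The docstring above says the extra-variance constant should be cross-validated; a minimal hold-out sketch of doing that with the class as defined (made-up random data; assumes the same Python 2 / older-sklearn environment the class's xrange and sigma_ imply):

import numpy as np

# Made-up data purely for illustration.
rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + 0.5 * rng.randn(200) > 0).astype(int)

# Simple hold-out evaluation of several extra_variance values.
X_tr, X_te, y_tr, y_te = X[:150], X[150:], y[:150], y[150:]
for ev in [0.01, 0.1, 1.0]:
    clf = RegularizedGaussianNB(extra_variance=ev)
    clf.fit(X_tr, y_tr)
    acc = np.mean(clf.predict(X_te) == y_te)
    print(ev, acc)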
def test_classification():
    # Map the string targets to numeric codes
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    print classifier.predict(data[0])
    print t[0]

    from sklearn import cross_validation
    train, test, t_train, t_test = cross_validation.train_test_split(
        data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)  # train
    print classifier.score(test, t_test)  # test

    from sklearn.metrics import confusion_matrix
    print confusion_matrix(classifier.predict(test), t_test)

    from sklearn.metrics import classification_report
    print classification_report(classifier.predict(test), t_test,
                                target_names=['setosa', 'versicolor', 'virginica'])

    from sklearn.cross_validation import cross_val_score
    # cross validation with 6 iterations
    scores = cross_val_score(classifier, data, t, cv=6)
    print scores

    from numpy import mean
    print mean(scores)
def simple_svm_train(emotion, training_set):
    song_list = []
    sizes_list = []
    other_emotions = []

    # Setting up the data
    # print 'Start to sample set'
    sampled_dict = create_sample_dict(training_set)
    # print 'Set sampled, extracting features'
    feature_vector, class_vector, test_values, test_class = extract_features(
        sampled_dict, emotion, training_set)

    # Creating the classifier using sklearn
    # print 'Extracted features, training classifier'
    clf = GaussianNB()
    clf.fit(feature_vector, class_vector)
    # clf = svm.SVC(max_iter=10000)
    # clf.fit(feature_vector, class_vector)
    # print 'Finished training classifier'

    # Testing and analyzing results
    results = test_classifier(clf, emotion, test_values)
    return post_process_results(results, emotion)
def MyNaiveBayes():
    pre = PreProcess()
    (training_value, test_value, test_pos_x, test_pos_y,
     training_pos_x, training_pos_y) = pre.split()

    # Initialize one model per coordinate
    clf_x = GaussianNB()
    clf_y = GaussianNB()

    # Train the models
    clf_x.fit(training_value, training_pos_x)
    clf_y.fit(training_value, training_pos_y)

    # Compute the predictions
    result_pos_x = clf_x.predict(test_value)
    result_pos_y = clf_y.predict(test_value)
    '''
    print result_pos_x
    print test_pos_x
    print result_pos_y
    print test_pos_y
    '''
    # Compute the errors
    x_dis = []
    y_dis = []
    d_dis = []
    for i in range(len(result_pos_x)):
        x_dis.append(abs(result_pos_x[i] - test_pos_x[i]))
        y_dis.append(abs(result_pos_y[i] - test_pos_y[i]))
        d_dis.append(math.sqrt((result_pos_x[i] - test_pos_x[i]) ** 2 +
                               (result_pos_y[i] - test_pos_y[i]) ** 2))
    x = (sum(x_dis)) / len(result_pos_x)
    y = (sum(y_dis)) / len(result_pos_y)
    d = (sum(d_dis)) / len(d_dis)
    print x, y, d
    return x, y, d
def myClassifier(X, Y, model, CV=4, scoreType='pure'):
    # X = [[0, 0], [1, 1], [1, 2]]
    # y = [0, 1, 2]
    score = {}
    print "Error Analysis using", scoreType
    if model == "SVM":
        clf = svm.SVC(probability=True, random_state=0, kernel='rbf')
        # clf = svm.SVR(cache_size=7000)
        clf.fit(X, Y)
    elif model == "LR":
        clf = linear_model.LogisticRegression()
        clf.fit(X, Y)
    elif model == "NB":
        clf = GaussianNB()
        clf.fit(X, Y)
    elif model == 'MLP':  # multilayer perceptron
        clf = MLPClassifier(hidden_layer_sizes=[100], algorithm='l-bfgs')
        clf.fit(X, Y)

    if scoreType == 'cv':
        accu = np.mean(cross_validation.cross_val_score(clf, X, Y,
                                                        scoring='accuracy',
                                                        cv=CV))
    elif scoreType == 'pure':
        predictions = clf.predict(X)
        # float() avoids Python 2 integer division, which would always give 0
        accu = sum([int(predictions[q] == Y[q])
                    for q in range(len(Y))]) / float(len(Y))
    return accu, clf
def createNaiveBayesModel(feature_vector_data):
    '''
    Uses the dimensionally reduced feature vectors of each of the
    instance, sense id pairs to create a naive bayes model
    '''
    naive_bayes_model_word_type = {}
    for word_type, instance_sense_dict in feature_vector_data.iteritems():
        vectors = []
        senses = []
        for i in xrange(len(instance_sense_dict)):
            sense = instance_sense_dict.keys()[i][1]
            data_type = instance_sense_dict.keys()[i][2]
            # Need to grab the TSNE vectors and senses of only the training
            # data; thus, we ignore all the validation data
            if data_type == "training":
                vectors.append(instance_sense_dict.values()[i])
                senses.append(sense)
        vectors = np.array(vectors)
        senses = np.array(senses)
        nb = GaussianNB()
        nb.fit(vectors, senses)
        naive_bayes_model_word_type[word_type] = nb
    return naive_bayes_model_word_type
def boundaries():
    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target
    h = .02

    # Per-class feature means, used for the scatter overlay below
    means = np.empty((X.shape[1], len(set(y))))
    for i, lab in enumerate(list(set(y))):
        means[:, i] = X[y == lab].mean(axis=0)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    nb = GaussianNB()
    nb.fit(X, y)
    Z = nb.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
    plt.scatter(means[0, :], means[1, :])
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.savefig("decision_boundary.pdf")
    plt.clf()
def univariateFeatureSelection(f_list, my_dataset):
    result = []
    for feature in f_list:
        # Replace 'NaN' with 0
        for name in my_dataset:
            data_point = my_dataset[name]
            if not data_point[feature]:
                data_point[feature] = 0
            elif data_point[feature] == 'NaN':
                data_point[feature] = 0

        data = featureFormat(my_dataset, ['poi', feature],
                             sort_keys=True, remove_all_zeroes=False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]

        from sklearn.cross_validation import StratifiedShuffleSplit
        cv = StratifiedShuffleSplit(labels, 1000, random_state=42)
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for train_idx, test_idx in cv:
            for ii in train_idx:
                features_train.append(features[ii])
                labels_train.append(labels[ii])
            for jj in test_idx:
                features_test.append(features[jj])
                labels_test.append(labels[jj])

        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((feature, score[0], score[1], score[2]))

    result = sorted(result, reverse=True, key=lambda x: x[3])
    return result
accuracy5 = clf5.score(X_test, y_test)
clf6.fit(X_train, y_train)
accuracy6 = clf6.score(X_test, y_test)
clf7.fit(X_train, y_train)
accuracy7 = clf7.score(X_test, y_test)
clf8.fit(X_train, y_train)
accuracy8 = clf8.score(X_test, y_test)
print(accuracy1, accuracy2, accuracy3, accuracy4,
      accuracy5, accuracy6, accuracy7, accuracy8)

from sklearn.naive_bayes import GaussianNB
clfnb = GaussianNB()
clfnb.fit(X_train, y_train)
accuracyNB = clfnb.score(X_test, y_test)
print("In Gaussian NB")
print(accuracyNB)

## WITH TruncatedSVD + KNN
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.pipeline import Pipeline
trun = TruncatedSVD()
dm_reductions = [trun]
clf_details = [clf]
estimators = [('dm_reduce', trun), ('clf', clf)]
pipeline = Pipeline(estimators)
best_pipe = pipeline.fit(X_train, y_train)
n = [1500, 5000, 7000, 10000, 20000]

# Collapse the original labels into three classes
y_data[y_data == 6] = 0
y_data[y_data == 7] = 1
y_data[y_data == 8] = 1
y_data[y_data == 9] = 2
y_data[y_data == 10] = 2
df = pd.DataFrame(y_data)
fs = []
acc = []
df = df.astype('int')
genes_transpose = np.transpose(x_data)

for i in range(0, 5):
    X_new = SelectKBest(chi2, k=n[i]).fit_transform(genes_transpose, df)
    classifier = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X_new, df, test_size=0.3)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    cnf_matrix.astype(float)

    # Per-class counts derived from the confusion matrix
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    FP = FP.astype('float')
    FN = FN.astype('float')
    TP = TP.astype('float')
    TN = TN.astype('float')
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
Y = df.iloc[:, -1]

# Encoding categorical values
X = pd.get_dummies(X, columns=['Gender'], drop_first=True)

# Train-Test-Split
X_train = X.sample(frac=0.8, random_state=1)
X_test = X.drop(X_train.index)
Y_test = Y.drop(X_train.index)
Y_train = Y.drop(Y_test.index)
X_train = X_train.sort_index()

# Scaling values
from sklearn.preprocessing import StandardScaler
Sc_X = StandardScaler()
X_train = Sc_X.fit_transform(X_train)
X_test = Sc_X.transform(X_test)

# Making a Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(Y_test, Y_pred)
print(c_m)
def domestic_model_initialise():
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='',
                                 db='crickml',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # Read every player's domestic stats
            sql = "SELECT * FROM `domestic_stats`"
            cursor.execute(sql)
            result = cursor.fetchall()
            player_list = []
            for player in result:
                career_score = batsmen_model(player['overall_matches'],
                                             player['overall_innings'],
                                             player['overall_average'],
                                             player['overall_100s'],
                                             player['overall_50s'])
                player_list.append([
                    player['overall_average'] * player['overall_strike_rate'],
                    career_score
                ])
        with connection.cursor() as cursor:
            # Read the same table again for international performance
            sql = "SELECT * FROM `domestic_stats`"
            cursor.execute(sql)
            result = cursor.fetchall()
            intl_performance_list = []
            performance_list = []
            for player in result:
                performance_score = batsmen_performance_model(
                    player['intl_matches'], player['intl_innings'],
                    player['intl_average'], player['intl_100s'],
                    player['intl_50s'])
                intl_performance_list.append([performance_score])
    finally:
        print('done')

    # Binarise the international performance scores around the mean
    np_intl_performances_list = np.array(intl_performance_list)
    mean_performance = sum(
        np_intl_performances_list[:, 0]) / len(np_intl_performances_list)
    for performance in intl_performance_list:
        if performance[0] <= mean_performance:
            performance_list.append(0)
        else:
            performance_list.append(1)

    np_players = np.array(player_list)
    np_players = np_players.astype(float)
    np_performances = np.array(performance_list)

    # Normalise both features to [0, 1]
    max_batting_pos = np.max(np_players[:, 0])
    max_milestone_score = np.max(np_players[:, 1])
    for player in np_players:
        player[0] = player[0] / max_batting_pos
        player[1] = player[1] / max_milestone_score

    # Oversample the minority class, then split
    sm = SMOTE(random_state=42)
    np_players_resampled, np_performances_resampled = sm.fit_resample(
        np_players, np_performances)
    feature_train, feature_test, target_train, target_test = train_test_split(
        np_players_resampled, np_performances_resampled,
        test_size=0.20, random_state=42)
    print("Training Domestic Models")
    print(feature_test)

    svm_clf = SVC(C=1000, kernel='sigmoid', gamma=0.001, probability=True)
    svm_clf.fit(feature_train, target_train)
    svm_pred = svm_clf.predict(feature_test)
    svm_pred_prob = svm_clf.predict_proba(feature_test)

    gnb = GaussianNB()
    gnb.fit(feature_train, target_train)
    nb_pred_prob = gnb.predict_proba(feature_test)
    nb_pred = gnb.predict(feature_test)

    desT = DecisionTreeClassifier()
    desT.fit(feature_train, target_train)
    desc_pred = desT.predict(feature_test)
    desc_pred_prob = desT.predict_proba(feature_test)

    mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                            hidden_layer_sizes=(5, 2), random_state=1)
    mlp_clf.fit(feature_train, target_train)
    mlp_pred_prob = mlp_clf.predict_proba(feature_test)
    mlp_pred = mlp_clf.predict(feature_test)

    # AdaBoost-style "amount of say" for each model, computed from its
    # misclassification rate over the 119 test samples
    miss_nb = 0
    for index, pred in enumerate(nb_pred):
        if pred != target_test[index]:
            miss_nb += 1
    amt_say_nb = 1 / 2 * (math.log((1 - (miss_nb / 119)) / (miss_nb / 119)))

    miss_mlp = 0
    for index, pred in enumerate(mlp_pred):
        if pred != target_test[index]:
            miss_mlp += 1
    amt_say_mlp = 1 / 2 * (math.log((1 - (miss_mlp / 119)) / (miss_mlp / 119)))

    miss_svm = 0
    for index, pred in enumerate(svm_pred):
        if pred != target_test[index]:
            miss_svm += 1
    amt_say_svm = 1 / 2 * (math.log((1 - (miss_svm / 119)) / (miss_svm / 119)))

    miss_desc = 0
    for index, pred in enumerate(desc_pred):
        if pred != target_test[index]:
            miss_desc += 1
    amt_say_desc = 1 / 2 * (math.log((1 - (miss_desc / 119)) / (miss_desc / 119)))

    print('Amount of say NB :', amt_say_nb)
    print('Amount of say MLP :', amt_say_mlp)
    print('Amount of say SVM :', amt_say_svm)
    print('Amount of say Decision Tree :', amt_say_desc)
    return (connection, gnb, mlp_clf, svm_clf, desT, amt_say_desc,
            amt_say_mlp, amt_say_nb, amt_say_svm, max_batting_pos,
            max_milestone_score, feature_train, feature_test,
            target_train, target_test)
X = []
Y = []
i = 0
with open(sys.argv[1], "r") as ins:
    for line in ins:
        line = line.strip()
        line1 = line.split(',')
        if i == 0:  # skip the header row
            i += 1
            continue
        X.append(map(int, line1[:-1]))
        Y.append(int(line1[-1]))

clf = GaussianNB()
clf.fit(X, Y)

already = "../../Suites/Ccausalmarital"
num_atr = [10, 8, 70, 16, 7, 14, 6, 5, 2, 100, 40, 100, 40]
map = {}


def check_ratio(fixed, clf):
    if option == 3 or option == 4:
        fin = open(already, "r")
        requeried = {}
        num = 0
        den = 0
        for line in fin:
            line = line.strip()
            line = line.split(',')
            line = line[:-1]
            i = 0
            pos = 0
Y = ['male', 'female', 'female', 'female', 'male', 'male',
     'male', 'female', 'male', 'female', 'male']

# classifiers
clf_tree = tree.DecisionTreeClassifier()
clf_svc = svm.SVC()
clf_KNN = KNeighborsClassifier()
clf_NB = GaussianNB()

# training the models
clf_tree = clf_tree.fit(X, Y)
clf_svc = clf_svc.fit(X, Y)
clf_KNN = clf_KNN.fit(X, Y)
clf_NB = clf_NB.fit(X, Y)

prediction_tree = clf_tree.predict(X)
prediction_svc = clf_svc.predict(X)
prediction_KNN = clf_KNN.predict(X)
prediction_NB = clf_NB.predict(X)

result = accuracy_score(Y, prediction_tree)
result1 = accuracy_score(Y, prediction_svc)
result2 = accuracy_score(Y, prediction_KNN)
result3 = accuracy_score(Y, prediction_NB)

print(result)
print(result1)
print(result2)
print(result3)
# OUTPUT:-
# MODEL-1: Accuracy of LogisticRegression : 77.09

# MODEL-2) Gaussian Naive Bayes
# ------------------------------------------
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(accuracy_score(y_pred, y_val) * 100, 2)
print("MODEL-2: Accuracy of GaussianNB : ", acc_gaussian)
# OUTPUT:-
# MODEL-2: Accuracy of GaussianNB : 78.68

# MODEL-3) Support Vector Machines
# ------------------------------------------
from sklearn.svm import SVC
svc = SVC()
# Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Assigning predictor and target variables
x = np.array([[-3, 7], [1, 5], [1, 2], [-2, 0], [2, 3], [-4, 0],
              [-1, 1], [1, 1], [-2, 2], [2, 7], [-4, 1], [-2, 7]])
Y = np.array([3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 4, 4])

# Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(x, Y)

# Predict Output
predicted = model.predict([[1, 2], [3, 4]])
print(predicted)
def analysis():
    (train_original, test_original, full_data) = featureextraction(False)

    # 5-fold manual cross-validation over blocks of 178 passengers
    for i in range(5):
        test = train_original.iloc[178 * i:178 * (i + 1), :].copy()
        test = test.drop(labels=["Survived"], axis=1)
        train = train_original.loc[~train_original['PassengerId'].isin(test['PassengerId'])]

        X_train = train.drop("Survived", axis=1)
        X_train = X_train.drop("PassengerId", axis=1).copy()
        Y_train = train["Survived"]
        X_test = test.drop("PassengerId", axis=1).copy()
        X_train.shape, Y_train.shape, X_test.shape

        logreg = LogisticRegression()
        logreg.fit(X_train, Y_train)
        Y_pred = logreg.predict(X_test)
        acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
        print('Logistic regression:', acc_log, '%')

        svc = SVC()
        svc.fit(X_train, Y_train)
        Y_pred = svc.predict(X_test)
        acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
        print('SVC:', acc_svc, '%')

        for k in range(3, 8, 2):
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, Y_train)
            Y_pred = knn.predict(X_test)
            acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
            print('%s KNeighbors:' % k, acc_knn, '%')

        decision_tree = DecisionTreeClassifier()
        decision_tree.fit(X_train, Y_train)
        Y_pred = decision_tree.predict(X_test)
        acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
        print('Decision Tree:', acc_decision_tree, '%')

        random_forest = RandomForestClassifier(n_estimators=100)
        random_forest.fit(X_train, Y_train)
        Y_pred = random_forest.predict(X_test)
        acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
        print('Random Forest:', acc_random_forest, '%')

        Naive_bayes = GaussianNB()
        Naive_bayes.fit(X_train, Y_train)
        Y_pred = Naive_bayes.predict(X_test)
        acc_Naive_bayes = round(Naive_bayes.score(X_train, Y_train) * 100, 2)
        print('Naive Bayes:', acc_Naive_bayes, '%')

        MLP = MLPClassifier(hidden_layer_sizes=(15, 15, 15))
        MLP.fit(X_train, Y_train)
        Y_pred = MLP.predict(X_test)
        acc_MLP = round(MLP.score(X_train, Y_train) * 100, 2)
        print('MLP:', acc_MLP, '%')
        print('\n')

    # Final fit on the full training set; KNN (k=7) produces the submission
    X_train = train_original.drop("Survived", axis=1)
    X_train = X_train.drop("PassengerId", axis=1)
    Y_train = train_original["Survived"]
    X_test = test_original.drop("PassengerId", axis=1).copy()
    X_train.shape, Y_train.shape, X_test.shape

    Submission_classifier = KNeighborsClassifier(n_neighbors=7)
    Submission_classifier.fit(X_train, Y_train)
    Y_pred = Submission_classifier.predict(X_test)
    Submission_classifier_score = round(Submission_classifier.score(X_train, Y_train) * 100, 2)

    submission = pd.DataFrame({
        "PassengerId": test_original["PassengerId"],
        "Survived": Y_pred.astype(int)
    })
    submission.to_csv('submission_KNN_2.csv', index=False)
    print('Submission accuracy:', Submission_classifier_score, '%')
def trainModel(X, results):
    print 'Building model...'
    clf = GaussianNB()
    clf.fit(X, results)
    return clf
my_data = pd.read_csv(r'C:\Projects\ML-BinaryClassification\iris\Iris.csv')

# split the dataset into features and labels
features = my_data.iloc[:, :5]
labels = my_data[my_data.columns[-1]]

# Split our data
train, test, train_labels, test_labels = train_test_split(features, labels,
                                                          test_size=0.20,
                                                          random_state=42)
# print(test_labels)
# print(features)
# print(labels)

# Initialize our classifier
gnb = GaussianNB()

# Train our classifier
model = gnb.fit(train, train_labels)

# Make predictions
print(test)
preds = gnb.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))
# train and test dataset splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
SS_X = StandardScaler()
X_train = SS_X.fit_transform(X_train)
X_test = SS_X.transform(X_test)

# fitting Gaussian Naive Bayes to the training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# predicting the test set results
y_pred = gnb.predict(X_test)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1 = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
X2 = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
X1, X2 = np.meshgrid(X1, X2)
plt.contourf(X1,
######################################### - Fitting Model - ###########################################

# Model 1: Multinomial Naive Bayes
smnb = MultinomialNB()
smnb.fit(X_train_count, y_train)

## Multinomial Model Accuracy
smnb.score(X_train_count, y_train)  # 0.99
smnb.score(X_test_count, y_test)    # 0.98

# Model 2: Gaussian Naive Bayes
sgnb = GaussianNB()
sgnb.fit(X_train_count_array, y_train)

## Gaussian Model Accuracy
sgnb.score(X_train_count_array, y_train)  # 0.90
sgnb.score(X_test_count_array, y_test)    # 0.85

# From the above we can conclude that the Multinomial Naive Bayes model gives
# the best result, so we use it for future prediction.

# Prediction on Train & Test Data
pred_train = smnb.predict(X_train_count)
pred_test = smnb.predict(X_test_count)

# Confusion matrix of Train and Test
## Train
confusion_matrix_train = pd.crosstab(y_train, pred_train,
                                     rownames=['Actual'],
                                     colnames=['Train Predictions'])
sns.heatmap(confusion_matrix_train, annot=True, cmap='Blues', fmt='g')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

dataset = pd.read_csv("../datasheets/projection.csv")
cases = np.array(dataset.india.values.tolist())  # y

days = list(range(1, (len(cases) + 1)))  # X
days = np.array([[day] for day in days])

days_pred = list(range(1, (len(cases) + 5)))
days_pred = np.array([[day] for day in days_pred])

clf = GaussianNB()
clf.fit(days, cases)
print(clf.predict(days_pred))
"""우리는 Feature 중 sepal에 관련된 두 개의 feature만 이용해서 학습할 것이다. 따라서 이외의 feature는 제거해준다. 그리고 target 값은 현재의 string에서 숫자로 변환해준다. 그 후 격자 안의 모든 점을 가우시안 나이브 베이즈 모델을 이용하여 예측하고 해당 예측을 통해서 decision boundary를 visualization해준다. 결과는 아래와 같다. """ import matplotlib.colors as colors from sklearn.naive_bayes import GaussianNB df1 = iris_frame[["sepal length (cm)", "sepal width (cm)", "target"]] X = df1.iloc[:, 0:2] Y = df1.iloc[:, 2].replace({ 'setosa': 0, 'versicolor': 1, 'virginica': 2 }).copy() NB = GaussianNB() NB.fit(X, Y) N = 100 X_ = np.linspace(4, 8, N) Y_ = np.linspace(1.5, 5, N) X_, Y_ = np.meshgrid(X_, Y_) color_list = ['Blues', 'Greens', 'Reds'] my_norm = colors.Normalize(vmin=-1, vmax=1) g = sn.FacetGrid(iris_frame, hue="target", size=10, palette='colorblind').map( plt.scatter, "sepal length (cm)", "sepal width (cm)", ).add_legend() my_ax = g.ax
print('Empirical learning curve for RF generated')

X, Y = ([] for i in range(2))
test_label = [train_label[i] for i in range(len(test_data))]
original_test_data = np.array(test_data)  # same for every iteration
clf = GaussianNB()

for sample_size in range(1, len(train_label) / CIGTOTAL):
    # train with given sample size
    X.append(sample_size)
    train_subset_label = [train_label[i] for i in range(CIGTOTAL * sample_size)]
    train_subset_data = [train_data[i] for i in range(CIGTOTAL * sample_size)]
    train_subset_label = np.array(train_subset_label)
    train_subset_data = np.array(train_subset_data)
    clf.fit(train_subset_data, train_subset_label)

    # test the trained classifier
    predict = clf.predict(original_test_data)
    Y.append(getY(predict, test_label))

fig, ax = plt.subplots(1, figsize=(11, 8))
ax.plot(X, Y)
plt.xticks(np.arange(1, len(train_label) / CIGTOTAL, 1.))
plt.xlabel('sample size')
plt.ylabel('accuracy')
plt.title('Empirical Gaussian NB learning curve for Halo, Juul, Blu, and V2')
fig.savefig('2_ss_lc/nb_lc.png')
plt.show()
class EndgamePredictor():
    def __init__(self):
        data = pd.read_csv('CheckEndgame.csv')
        data["Pieces"] = data.apply(
            lambda row: self.gettotalpieces(chess.Board(row["FEN"])), axis=1)
        data["Material"] = data.apply(
            lambda row: self.gettotalmaterial(chess.Board(row["FEN"])), axis=1)
        data["Major Pieces"] = data.apply(
            lambda row: self.getmajorpieces(chess.Board(row["FEN"])), axis=1)
        x = data[['Pieces', 'Material', 'Major Pieces']]
        y = data.Endgame
        self.model = GaussianNB()
        self.model.fit(x, y)

    def is_endgame(self, fen: str):
        board = chess.Board(fen)
        arr = np.array([
            self.gettotalpieces(board),
            self.gettotalmaterial(board),
            self.getmajorpieces(board)
        ])
        # reshape(1, -1): one sample with three features
        # (reshape(-1, 1) would wrongly make three one-feature samples)
        result = self.model.predict(arr.reshape(1, -1))
        if result.any():
            return True
        else:
            return False

    def gettotalmaterial(self, board: chess.Board):
        i = 0
        valfinder = SquareValue()
        material = 0
        while i < 64:
            piece = board.piece_at(i)
            if piece:
                if (piece.piece_type > 1) and (piece.piece_type < 6):
                    material += abs(
                        valfinder.getpiecevalue(i, chess.WHITE, piece, True))
            i += 1
        return material

    def gettotalpieces(self, board: chess.Board):
        i = 0
        pieces = 0
        while i < 64:
            piece = board.piece_at(i)
            if piece:
                pieces += 1
            i += 1
        return pieces - 2  # exclude the two kings

    def getmajorpieces(self, board: chess.Board):
        i = 0
        pieces = 0
        while i < 64:
            piece = board.piece_at(i)
            if piece:
                if (piece.piece_type > 1) and (piece.piece_type < 6):
                    pieces += 1
            i += 1
        return pieces
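A minimal usage sketch of the class above; the FEN string is just the standard starting position, and a CheckEndgame.csv file with FEN and Endgame columns must exist, as the constructor assumes:

predictor = EndgamePredictor()
start_fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"
print(predictor.is_endgame(start_fen))  # likely False for the opening position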
y_data = y_data.reset_index(drop=True)
print('\nPre-processing Done.')

print('\nCount of different classes in Train set:')
print(X_train['Class'].value_counts())
print('\nCount of different classes in Test set:')
print(X_test['Class'].value_counts())

feats = [c for c in X_train.columns if c != 'Class']

# Train classifier
print('\nImplementing Gaussian Naive Bayes Model.')
gnb = GaussianNB()
gnb.fit(X_train[feats].values, y_train['Class'])
y_pred = gnb.predict(X_test[feats].values)

print("\nNumber of mislabeled points out of a total {} points : {}, Accuracy: {:05.5f}%"
      .format(X_test.shape[0],
              (X_test["Class"] != y_pred).sum(),
              100 * (1 - (X_test["Class"] != y_pred).sum() / X_test.shape[0])))

cv = KFold(n_splits=5)
clf = GaussianNB()
X_data = X_data.values
y_data = y_data.values
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        str1 = request.form['feature']
        str2 = request.form['label']
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        # Strip URLs and non-alphanumeric characters, then lowercase
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        X = X.str.lower()

        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc
                      if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33,
                                                            shuffle=True)
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        # GaussianNB needs a dense array, hence .toarray()
        clf5 = GaussianNB()
        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()
        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)
        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))

        start = time()
        clf10 = RidgeClassifierCV()
        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        # note: printed as XGBC below, but this is a second SGDClassifier
        clf12 = SGDClassifier(n_jobs=-1)
        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Persist the best-scoring model together with the vectorizer
        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        clf_list = [clf1, clf2, clf3, clf4, clf5, clf6,
                    clf7, clf8, clf9, clf10, clf11, clf12]
        max_list = max(acu_list)
        for acc, clf in zip(acu_list, clf_list):
            if max_list == acc:
                pickle.dump(clf, open(filename + '_model', 'wb'))
                break
        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac5=a5, ac6=a6, ac7=a7, ac8=a8, ac9=a9,
                               ac10=a10, ac11=a11, ac12=a12)
X_train = np.empty(shape=(len(Li), len(feature_dict) + 3))
Y_train = np.empty(shape=len(Li))
for i in range(len(Li)):
    # print(Li[i])
    List = tokenize(Li[i][0])
    # score_snippet(List, dal)
    X_train[i] = get_features(List)
    Y_train[i] = Li[i][1]

# for i in range(len(Li)):
#     print(X_train[i], ":", Y_train[i])

normalized_X = normalize(X_train)

clf = GaussianNB()
clf.fit(normalized_X, Y_train)

clf_lr = LogisticRegression()
clf_lr.fit(normalized_X, Y_train)

Li_test = load_corpus("/Users/sravyakurra/Desktop/NLP/HW@2/test.txt")
X_test = np.empty(shape=(len(Li_test), len(feature_dict) + 3))
Y_test = np.empty(shape=len(Li_test))
for i in range(len(Li_test)):
    # print(Li_test[i])
    List = tokenize(Li_test[i][0])
    X_test[i] = get_features(List)
    Y_test[i] = Li_test[i][1]
# print(X_test)
class Model_Finder:
    """
    This class shall be used to find the model with the best accuracy
    and AUC score.
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object
        self.gnb = GaussianNB()
        self.xgb = XGBClassifier(objective='binary:logistic', n_jobs=-1)

    def get_best_params_for_naive_bayes(self, train_x, train_y):
        """
        Method Name: get_best_params_for_naive_bayes
        Description: Get the parameters for the Naive Bayes algorithm which
                     give the best accuracy, using hyperparameter tuning.
        Output: The model with the best parameters
        On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_naive_bayes method of the Model_Finder class')
        try:
            # initializing with different combinations of parameters
            self.param_grid = {
                "var_smoothing": [1e-9, 0.1, 0.001, 0.5, 0.05, 0.01,
                                  1e-8, 1e-7, 1e-6, 1e-10, 1e-11]
            }

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.gnb,
                                     param_grid=self.param_grid,
                                     cv=3, verbose=3)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.var_smoothing = self.grid.best_params_['var_smoothing']

            # creating a new model with the best parameters
            self.gnb = GaussianNB(var_smoothing=self.var_smoothing)
            # training the new model
            self.gnb.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'Naive Bayes best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_naive_bayes method of the Model_Finder class')
            return self.gnb
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in get_best_params_for_naive_bayes method of the Model_Finder class. Exception message: ' + str(e))
            self.logger_object.log(
                self.file_object,
                'Naive Bayes Parameter tuning failed. Exited the get_best_params_for_naive_bayes method of the Model_Finder class')
            raise Exception()

    def get_best_params_for_xgboost(self, train_x, train_y):
        """
        Method Name: get_best_params_for_xgboost
        Description: Get the parameters for the XGBoost algorithm which
                     give the best accuracy, using hyperparameter tuning.
        Output: The model with the best parameters
        On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_xgboost method of the Model_Finder class')
        try:
            # initializing with different combinations of parameters
            self.param_grid_xgboost = {
                "n_estimators": [50, 100, 130],
                "max_depth": range(3, 11, 1),
                "random_state": [0, 50, 100]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(XGBClassifier(objective='binary:logistic'),
                                     self.param_grid_xgboost,
                                     verbose=3, cv=2, n_jobs=-1)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.random_state = self.grid.best_params_['random_state']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBClassifier(random_state=self.random_state,
                                     max_depth=self.max_depth,
                                     n_estimators=self.n_estimators,
                                     n_jobs=-1)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'XGBoost best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_xgboost method of the Model_Finder class')
            return self.xgb
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in get_best_params_for_xgboost method of the Model_Finder class. Exception message: ' + str(e))
            self.logger_object.log(
                self.file_object,
                'XGBoost Parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class')
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
        Method Name: get_best_model
        Description: Find out the model which has the best AUC score.
        Output: The best model name and the model object
        On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        try:
            # create best model for XGBoost
            self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
            self.prediction_xgboost = self.xgboost.predict(test_x)

            # if there is only one label in y, roc_auc_score raises an
            # error; we use accuracy in that case
            if len(test_y.unique()) == 1:
                self.xgboost_score = accuracy_score(test_y, self.prediction_xgboost)
                self.logger_object.log(self.file_object,
                                       'Accuracy for XGBoost:' + str(self.xgboost_score))
            else:
                self.xgboost_score = roc_auc_score(test_y, self.prediction_xgboost)
                self.logger_object.log(self.file_object,
                                       'AUC for XGBoost:' + str(self.xgboost_score))

            # create best model for Naive Bayes
            self.naive_bayes = self.get_best_params_for_naive_bayes(train_x, train_y)
            self.prediction_naive_bayes = self.naive_bayes.predict(test_x)

            if len(test_y.unique()) == 1:
                self.naive_bayes_score = accuracy_score(test_y, self.prediction_naive_bayes)
                self.logger_object.log(self.file_object,
                                       'Accuracy for NB:' + str(self.naive_bayes_score))
            else:
                self.naive_bayes_score = roc_auc_score(test_y, self.prediction_naive_bayes)
                self.logger_object.log(self.file_object,
                                       'AUC for NB:' + str(self.naive_bayes_score))

            # comparing the two models
            if self.naive_bayes_score < self.xgboost_score:
                return 'XGBoost', self.xgboost
            else:
                return 'NaiveBayes', self.naive_bayes
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in get_best_model method of the Model_Finder class. Exception message: ' + str(e))
            self.logger_object.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class')
            raise Exception()
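A minimal sketch of driving the class above; the stub logger, the input file name, and the 'target' column are illustrative assumptions, not part of the original code:

import pandas as pd
from sklearn.model_selection import train_test_split

class ConsoleLogger:
    # Stub satisfying the (file_object, message) logger interface
    # that Model_Finder expects.
    def log(self, file_object, message):
        print(message)

df = pd.read_csv('training_data.csv')           # hypothetical input file
X, y = df.drop('target', axis=1), df['target']  # hypothetical column name
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2)

finder = Model_Finder(file_object=None, logger_object=ConsoleLogger())
name, model = finder.get_best_model(train_x, train_y, test_x, test_y)
print('Best model:', name)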
import itertools
import random
import string

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB


class NaiveBayes:
    def __init__(self, features=None, split=0.8, distribution="Bernoulli",
                 isSummary=False):
        # categorical vocabularies for each sentence-level feature
        self.Tags = ["OTH", "BKG", "CTR", "NA", "AIM", "OWN", "BAS", "TXT",
                     "", "BEGIN"]
        self.Locations = list(string.ascii_uppercase)  # "A" .. "Z"
        self.ParaLocations = ["INITIAL", "MEDIAL", "FINAL"]
        self.Headlines = ["Introduction", "Implementation", "Example",
                          "Conclusion", "Result", "Evaluation", "Solution",
                          "Discussion", "Further Work", "Data", "Related Work",
                          "Experiment", "Problems", "Method",
                          "Problem Statement", "Non-Prototypical"]
        self.YESorNO = ["YES", "NO"]
        self.SecLocations = ["FIRST", "SECOND", "THIRD", "LAST",
                             "SECOND-LAST", "THIRD-LAST", "SOMEWHERE"]
        self.Tenses = ["PRESENT", "PAST", "FUTURE", "NOVERB"]
        self.Modals = ["MODAL", "NOMODAL", "NOVERB"]
        self.Voices = ["Active", "Passive", "NOVERB"]
        self.isSummary = isSummary
        # avoid a mutable default argument for the features dict
        self.features = features if features is not None else {}
        self.transformFeatures()
        self.distribution = distribution
        self.split = split
        self.splitData()

    def reloadDis(self):
        # choose the Naive Bayes variant by its event-model distribution
        if self.distribution == "Bernoulli":
            self.nb = BernoulliNB()
        elif self.distribution == "Multinomial":
            self.nb = MultinomialNB()
        elif self.distribution == "Complement":
            self.nb = ComplementNB()
        else:
            self.nb = GaussianNB()

    def splitData(self):
        if not self.isSummary:
            print("Data split between train and test: " + str(self.split))
        papers = list(self.features.keys())  # list() so the keys can be indexed
        order = np.random.permutation(len(papers))
        cut = int(self.split * len(papers))
        self.train_papers = [papers[order[i]] for i in range(cut)]
        # start at `cut`, not `cut + 1`, so no paper is silently dropped
        self.test_papers = [papers[order[i]] for i in range(cut, len(papers))]
        self.train_X, self.train_y = self.getFeatures(self.train_papers)
        self.test_X, self.test_y = self.getFeatures(self.test_papers)

    def transformFeatures(self):
        # map each feature name to the vocabulary that defines its encoding;
        # insertion order fixes the column order of the feature vectors
        vocabularies = {
            'loc': self.Locations,
            'parloc': self.ParaLocations,
            'val': self.Tags,
            'Title': self.YESorNO,
            'len': self.YESorNO,
            'tfidf': self.YESorNO,
            'secloc': self.SecLocations,
            'Headlines': self.Headlines,
            'history': self.Tags,
            'tense': self.Tenses,
            'voice': self.Voices,
            'modal': self.Modals,
        }
        self.transformed_features = dict()
        for filename in self.features:
            self.transformed_features[filename] = dict()
            for sentId in self.features[filename]:
                self.transformed_features[filename][sentId] = {
                    name: vocab.index(self.features[filename][sentId][name])
                    for name, vocab in vocabularies.items()
                }

    def getFeatures(self, filenames):
        X = []
        y = []
        for filename in filenames:
            for sentId in self.transformed_features[filename]:
                X.append(list(
                    self.transformed_features[filename][sentId].values()))
                y.append(self.transformed_features[filename][sentId]['val'])
        return np.asarray(X), np.asarray(y)

    def getSummary(self, filename):
        summary = []
        for sentId in self.transformed_features[filename]:
            feature = list(self.transformed_features[filename][sentId].values())
            y = self.nb.predict([feature])
            if y[0] in [1, 2, 4, 6]:   # BKG, CTR, AIM, BAS
                summary.append(self.features[filename][sentId]['data'])
            if y[0] in [0, 1, 5]:      # OTH, BKG, OWN: keep a small random sample
                if random.uniform(0, 1) > 0.96:
                    summary.append(self.features[filename][sentId]['data'])
        return "\n".join(summary)

    def train(self):
        if not self.isSummary:
            print("Train dataset: ", len(self.train_papers))
        self.reloadDis()
        y_pred = self.nb.fit(self.train_X, self.train_y).predict(self.train_X)
        if not self.isSummary:
            errors = (self.train_y != y_pred).sum()
            print("Mislabelled sentences: " + str(errors) + " out of " +
                  str(self.train_X.shape[0]))
            print("Train Accuracy: " +
                  str(self.accuracy(errors, self.train_X.shape[0])))

    def test(self, generate_histogram=False):
        print("Test dataset length: ", len(self.test_papers))
        y_pred = self.nb.predict(self.test_X)
        if generate_histogram:
            plt.hist(y_pred, density=True)
            plt.savefig('histogram.png')
        errors = (self.test_y != y_pred).sum()
        print("Mislabelled sentences: " + str(errors) + " out of " +
              str(self.test_X.shape[0]))
        print("Test Accuracy: " +
              str(self.accuracy(errors, self.test_X.shape[0])))
        # return self.getConfusionMatrix(self.test_y, y_pred)

    def accuracy(self, misclassifications, samples):
        return (1 - (misclassifications / (samples * 1.0))) * 100.0

    def plotConfusionMatrix(self, cm, classes, normalize=True,
                            title='Confusion matrix', cmap=plt.cm.Blues):
        """
        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')
        print(cm)
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')

    def getConfusionMatrix(self, y_true, y_pred):
        return confusion_matrix(y_true, y_pred)
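# A minimal usage sketch for the class above. The shape of the `features`
# dict ({filename: {sentId: {feature name: value, 'data': sentence}}}) and
# the 'features.json' file it is loaded from are assumptions for
# illustration, not part of the original code.
if __name__ == '__main__':
    import json

    with open('features.json') as f:   # hypothetical feature dump
        features = json.load(f)
    nb = NaiveBayes(features=features, split=0.8, distribution="Multinomial")
    nb.train()
    nb.test(generate_histogram=True)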
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

df = pd.read_csv('diabetes.csv')
x = df.drop('diabetes', axis=1)
y = df['diabetes']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=42)

model = GaussianNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print(y_pred)

accuracy = accuracy_score(y_test, y_pred) * 100
print(accuracy)

# alternative split with a different seed, kept for inspecting the partitions
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=0)
print(x_train)
print(x_test)
print(y_train)
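# A single 75/25 split can be a noisy estimate of accuracy; a k-fold score is
# more stable. A minimal sketch using the same `x` and `y` as above.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(GaussianNB(), x, y, cv=5, scoring='accuracy')
print("5-fold accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))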
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dt = pd.read_csv('Data.csv')
print(dt)

X = dt.iloc[:, 1:-1].values
y = dt.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

# scale features on the training set only, then apply the same scaling to test
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()  # a classifier, so `clf` rather than `reg`
clf.fit(X_train, y_train)

# show predictions side by side with the true labels
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1),
                      y_test.reshape(len(y_test), 1)), 1))

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
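# The scaler/classifier pair above can be bundled into a pipeline so the
# preprocessing and the model can never get out of sync. A minimal sketch on
# the same X and y, re-split here so the pipeline sees unscaled data.
from sklearn.pipeline import make_pipeline

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=0)
pipe = make_pipeline(StandardScaler(), GaussianNB())
pipe.fit(Xtr, ytr)                 # fits the scaler, then the classifier
print(pipe.score(Xte, yte))        # accuracy on the held-out fold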
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

# fit the imputer on the training data only, then reuse it on the test data
my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
X_test = my_imputer.transform(X_test)

model1 = GaussianNB()
model1.fit(X, y)

model2 = RandomForestClassifier(max_depth=15,
                                n_estimators=100,
                                bootstrap=False,
                                max_features='sqrt',
                                min_samples_leaf=4,
                                min_samples_split=10)
model2.fit(X, y)

model3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2),
                       random_state=1, max_iter=2000)
model3.fit(X, y)

model4 = KNeighborsClassifier(3)
model4.fit(X, y)
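# None of the four models above is actually evaluated. A minimal sketch,
# assuming the labelled data (X, y) is all we have: score each model with
# 5-fold cross-validation rather than on the data it was fit on.
from sklearn.model_selection import cross_val_score

for name, model in [('GaussianNB', model1), ('RandomForest', model2),
                    ('MLP', model3), ('3-NN', model4)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(name, scores.mean())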
plt.show()

# Split the data
X, x, Y, y = train_test_split(features, targets, test_size=0.2,
                              random_state=10)

# Let us try different algorithms to find the best match

# 1. Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Create a GaussianNB object
gnb = GaussianNB()
pred = gnb.fit(X, Y).predict(x)
print("Naive-Bayes accuracy: ", accuracy_score(y, pred, normalize=True))

# 2. Linear Support Vector Classifier
from sklearn.svm import LinearSVC

svc_model = LinearSVC(random_state=0)
pred = svc_model.fit(X, Y).predict(x)
print("Linear SVC accuracy: ", accuracy_score(y, pred, normalize=True))

# 3. k-Nearest-Neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, Y)
pred = neigh.predict(x)
print("k-Nearest-Neighbours score: ", accuracy_score(y, pred))
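# The three blocks above repeat the same fit/predict/score pattern; a sketch
# of the same comparison as a loop, using only the estimators already
# imported and the X/x/Y/y split from above.
for name, clf in [("Naive-Bayes", GaussianNB()),
                  ("Linear SVC", LinearSVC(random_state=0)),
                  ("3-NN", KNeighborsClassifier(n_neighbors=3))]:
    score = accuracy_score(y, clf.fit(X, Y).predict(x))
    print("%s accuracy: %.3f" % (name, score))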
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the small handwritten digit dataset
digitsData = pd.read_csv('digits_small.csv')
y_digits = digitsData['0']
X_digits = digitsData.drop('0', axis=1)

# Split the data into train and test sets, holding out 20% for testing
Xtrain, Xtest, ytrain, ytest = train_test_split(X_digits, y_digits,
                                                random_state=0, test_size=0.2)

# Choose a Gaussian Naive Bayes model
model = GaussianNB()

# Fit the model on the digits training set
model.fit(Xtrain, ytrain)

# Predict the labels for Xtest
y_fitted = model.predict(Xtest)

# Print the accuracy. It is 0.825.
print("The accuracy of GaussianNB is %f" % (accuracy_score(ytest, y_fitted)))

confusionMat = confusion_matrix(ytest, y_fitted)
sns.heatmap(confusionMat, cbar=False, square=True, annot=True)
plt.xlabel('predicted digits')
plt.ylabel('true digits')

# Evaluate five-fold cross-validation scores
cv_score = cross_val_score(model, X_digits, y_digits, cv=5)
print("Cross Validation Scores:", cv_score)
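# Render the heatmap drawn above and condense the fold scores into a single
# number; a small follow-up sketch on the same `cv_score` array.
plt.show()
print("CV mean accuracy: %.3f (+/- %.3f)" % (cv_score.mean(), cv_score.std()))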
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


class Emoji(object):
    def __init__(self):
        # fix the seed so train/test splits are reproducible
        np.random.seed(42)
        self.emojis = pd.read_pickle('../database/df_emojis.pkl')

    def fit(self):
        # ------- this part needs work
        try:
            self.labeled_tweets = pd.read_pickle('../database/labeled.pkl')
            print('it worked')
        except FileNotFoundError:
            from label_tweets import label_tweets
            tweets = np.array(list(pickle.load(
                open('../database/yay_moji.pkl', 'rb'))))
            self.by_emoji, self.labeled_tweets = label_tweets(
                tweets, self.emojis, top=50, save=True)
        self.y = self.labeled_tweets['emoji'].values
        self.X = self.labeled_tweets['tweet'].values

    def model(self, max_df_=.8, min_df_=.001, ngram=(1, 2)):
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y)
        stopwords = set(list(ENGLISH_STOP_WORDS) +
                        ['rt', 'follow', 'dm', 'https', 'ur', 'll', 'amp',
                         'subscribe', 'don', 've', 'retweet', 'im', 'http',
                         'lt'])
        # fit the tfidf or CountVectorizer
        self.tfidf = TfidfVectorizer(max_features=10000, max_df=max_df_,
                                     min_df=min_df_, stop_words=stopwords,
                                     ngram_range=ngram)
        self.tfidf.fit(self.X_train)
        self.vector = self.tfidf.transform(self.X_train)
        # --> add the emoji name to bag of words for each emoji
        self.bag = np.array(self.tfidf.get_feature_names())
        self.nb = GaussianNB()
        # GaussianNB needs a dense ndarray, so densify the sparse tf-idf matrix
        self.nb.fit(self.vector.toarray(), self.y_train)

    def internal_predict(self, print_side_by_side=True):
        test_tfidf = self.tfidf.transform(self.X_test)
        predicted = self.nb.predict(test_tfidf.toarray())
        print('labeled')
        acc = np.mean(self.y_test == predicted)
        print('Test accuracy =', acc)
        print('')
        if print_side_by_side:
            for true, predict in zip(self.y_test, predicted):
                print('-->', true, predict)

    def predict(self, text):
        test_tfidf = self.tfidf.transform([text])
        probs = self.nb.predict_proba(test_tfidf.toarray())
        probs = probs.flatten()
        above_0 = np.argwhere(probs > 0).flatten()
        above_0 = np.sort(above_0)[::-1]
        print('-->', text, '=', end=' ')
        for i in above_0[:5]:
            print(self.nb.classes_[i], end='  ')  # probs.flatten()[i]
        print('')
        return probs

    def print_top_words(self, top_n_words=5):
        # print the highest-weighted words for each emoji class
        print('')
        print('----- Top {} words for each Emoji in Train set'.format(top_n_words))
        print('-' * 60)
        for i in range(len(self.nb.classes_)):
            top = self.bag[self.nb.theta_[i].argsort()[::-1]][:top_n_words]
            print(self.nb.classes_[i], ' -->', top)
        print('')
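# A minimal usage sketch for the class above, assuming the pickled files it
# reads exist at the hard-coded ../database/ paths.
emo = Emoji()
emo.fit()                      # load (or build) the labelled tweets
emo.model()                    # vectorise the tweets and fit the GaussianNB
emo.internal_predict(print_side_by_side=False)
emo.print_top_words(top_n_words=5)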
import time

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder


def naive_bayes(training_file, test_file):
    start = time.time()

    # ----------------------------- DATA PREPARATION -----------------------------
    training_set = pd.read_csv(training_file, header=None)
    test_set = pd.read_csv(test_file, header=None)

    # encode the categorical columns of the training set
    categorical_feature_mask = training_set.dtypes == object
    categorical_cols = training_set.columns[categorical_feature_mask].tolist()
    le = LabelEncoder()
    training_set[categorical_cols] = training_set[categorical_cols].apply(
        lambda col: le.fit_transform(col))

    # encode the categorical columns of the test set
    categorical_feature_mask = test_set.dtypes == object
    categorical_cols = test_set.columns[categorical_feature_mask].tolist()
    le = LabelEncoder()
    test_set[categorical_cols] = test_set[categorical_cols].apply(
        lambda col: le.fit_transform(col))

    # the last column holds the class attribute
    l = len(training_set.columns) - 1
    x = training_set.drop([l], axis=1)
    y = training_set[l]
    x_test = test_set.drop([l], axis=1)
    y_test = test_set[l]

    # ----------------------- MODEL GENERATION AND PREDICTION -----------------------
    # Gaussian Naive Bayes
    gb = GaussianNB()

    # perform training
    gb.fit(x, y)

    # prediction of the test set's class attribute
    pred = gb.predict(x_test)

    # --------------------------- COMPARISON AND OUTPUT ---------------------------
    true_values = []
    y_test_rows = y_test.shape[0]
    for i in range(y_test_rows):
        if np.array_equal(y_test[i], pred[i]):
            true_values.append(1)
        else:
            true_values.append(0)
    for i in range(y_test_rows):
        print("ID = " + str(i) + " predicted = " + str(pred[i]) +
              " true = " + str(y_test[i]) + " accuracy = " +
              str(true_values[i]))
    print("\nClassification report:\n" + classification_report(y_test, pred))
    print("Runtime: ")
    print(time.time() - start)
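# A minimal invocation sketch for the function above; the CSV file names are
# placeholders for whichever header-less train/test files are at hand.
if __name__ == '__main__':
    naive_bayes('train.csv', 'test.csv')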
train_y = np.mat(df_train.iloc[select_idx]['label'].tolist()).reshape((-1, 1))

## build the test-set matrix
test_x = makeDataMat(df_train.iloc[test_select_idx], vocabList)
test_y = np.mat(df_train.iloc[test_select_idx]['label'].tolist()).reshape(
    (-1, 1))

# In[12]:

print(train_x.shape, train_y.shape)

# In[13]:

# train the model; GaussianNB expects a 1-D label vector, so flatten train_y
model = GaussianNB()
model.fit(train_x, np.asarray(train_y).ravel())

# In[14]:

# class priors of the binary classifier
print(model.class_prior_)

# In[15]:

print(sum(train_y))

# In[16]:

# predict the label of a new sentence
# (the sample text reads "This is really cute! I really like it here")
print(model.predict(
    np.array(word2Vect('这个真的好可爱啊!我超喜欢这里的', vocabList)).reshape(1, -1)))
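# class_prior_ above is just the empirical class frequency in train_y; a
# minimal sketch verifying that (assuming the labels are 0/1 integers), plus
# the per-class probabilities for the same sample sentence.
labels = np.asarray(train_y).ravel()
print(np.bincount(labels.astype(int)) / labels.size)  # should match class_prior_
sample = np.array(word2Vect('这个真的好可爱啊!我超喜欢这里的', vocabList)).reshape(1, -1)
print(model.predict_proba(sample))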