class GNB(object):
    """Adapter over sklearn's GaussianNB whose predict() returns the
    positive-class probability as an (n_samples, 1) column vector."""

    def __init__(self):
        self.gnb = GaussianNB()

    def predict(self, X):
        # Keep only the P(class==1) column and promote it to 2-D.
        proba = self.gnb.predict_proba(X)
        return proba[:, 1][:, np.newaxis]

    def fit(self, X, y):
        self.gnb.fit(X, y)
def test_gnb_priors():
    """Test whether the class prior override is properly used"""
    clf = GaussianNB(priors=np.array([0.3, 0.7]))
    clf.fit(X, y)
    expected_proba = np.array([[0.825303662161683, 0.174696337838317]])
    assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]),
                              expected_proba, 8)
    assert_array_equal(clf.class_prior_, np.array([0.3, 0.7]))
def gnbmodel(d, X_2, y_2, X_3, y_3, X_test, y_test):
    """Train a GaussianNB on (X_2, y_2), print 5-fold CV accuracy and
    test-set accuracy, and return the predicted positive-class
    probability for every row of X_3 as a pandas Series.

    d: label prefix used in the printed progress messages.
    y_3: accepted for interface compatibility; not used here.
    """
    # 5-fold cross-validation on the training data.
    scores = cross_val_score(GaussianNB(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean = scores.mean()
    print(d+'5折交互检验:'+str(score_mean))

    gnb = GaussianNB().fit(X_2, y_2)

    # Accuracy on the held-out test set.
    answer_gnb = gnb.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, answer_gnb)
    print(d+'预测:'+str(accuracy))

    # Positive-class probability for X_3.  Vectorized column assignment
    # replaces the original row-by-row iloc write loop (O(n) Python-level
    # writes); result is identical because predict_proba preserves row order.
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance'] = gnb.predict_proba(X_3)[:, 1]
    return X_3_copy['chance']
def performNB(trainingScores, trainingResults, testScores):
    # Build row-major feature matrices from per-mark score columns,
    # train a Gaussian NB, and return P(class==1) for the test rows.
    # trainingScores/testScores: dict mapping mark name -> list of scores.
    print "->Gaussian NB"
    X = []
    # Grab an arbitrary key only to learn the number of training rows.
    for currMark in trainingScores:
        pass
    for idx in range(0, len(trainingScores[currMark])):
        X.append([])
    # One feature column per mark, skipping the "Asym" marks.
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        print currMark,
        for idx in range(0, len(trainingScores[currMark])):
            X[idx].append(trainingScores[currMark][idx])
    X_test = []
    # NOTE(review): currMark here is whatever key the loop above happened to
    # end on; this assumes every mark's test column has the same length —
    # confirm with the callers.
    for idx in range(0, len(testScores[currMark])):
        X_test.append([])
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        for idx in range(0, len(testScores[currMark])):
            X_test[idx].append(testScores[currMark][idx])
    gnb = GaussianNB()
    gnb.fit(X, np.array(trainingResults))
    # Positive-class probability for each test row.
    y_pred = gnb.predict_proba(X_test)[:, 1]
    print "->Gaussian NB"
    return y_pred
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour based on it's mean color
    in BGR, HSV, and LAB colorspaces, using a Gaussian classifier for these
    features.

    For more usage info, see class ContourClassifier
    '''
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        # Mean BGR under the mask, wrapped as a 1x1 uint8 "image" so it
        # can be run through cv2's colorspace conversions.
        bgr_mean = np.array([[cv2.mean(img, mask)[:3]]], dtype=np.uint8)
        hsv_mean = cv2.cvtColor(bgr_mean, cv2.COLOR_BGR2HSV)
        lab_mean = cv2.cvtColor(bgr_mean, cv2.COLOR_BGR2LAB)
        # 9-element vector: B,G,R,H,S,V,L,A,B (matches FEATURES).
        return np.hstack((bgr_mean.flatten(),
                          hsv_mean.flatten(),
                          lab_mean.flatten()))

    def classify_features(self, features):
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        self.classifier.fit(features, classes)
def trainData(username):
    """
    Trains the data based on the users performance so far
    Returns a trained Gaussian Naive Bayes model and updates result collection
    """
    X = getFeatures(username)
    Y = getClassList(username)
    trainX = np.array(X)
    trainY = np.array(Y)
    gnb = GaussianNB()
    gnb.fit(trainX, trainY)
    print "Score with Naive Bayes: ", gnb.score(trainX, trainY)
    # Pull (id, points, diff) for every word in the corpus.
    testData = words.posts.find({}, {'id' : 1, 'points' : 1, 'diff' : 1, '_id' : 0})
    testData = map(lambda x : (x['id'], x['points'], x['diff']), testData)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for data in testData:
            testWord = words.posts.find_one({'id' : data[0]}, {'word' : 1, '_id' : 0})['word']
            # NOTE(review): predict_proba is handed a 1-D tuple here; newer
            # sklearn versions require a 2-D array — confirm pinned version.
            wordClass = setWordClass(list(gnb.predict_proba(data))[0])
            # Upsert the per-word class for this user.
            classWord = result.posts.update({'username' : username}, {'$set' : {testWord : wordClass}}, upsert = True)
def naiveBayesClassifierTraining(compounds_all):
    # Stratified K-fold cross-validated GaussianNB over compound
    # fingerprints, followed by a final fit on the full data set.
    # compounds_all: dict of compound id -> {'fingerprint': ..., 'active': ...}.
    print "Building naive Bayes classifier (" + str(NB_FOLDS) + "-fold cross-validation)..."
    # get the data
    keys = compounds_all.keys()
    fingerprint_data = [compounds_all[cmpnd_id]['fingerprint'] for cmpnd_id in keys]
    fingerprint_data = numpy.asarray(fingerprint_data)
    activity_data = [compounds_all[cmpnd_id]['active'] for cmpnd_id in keys]
    activity_data = numpy.asarray(activity_data)
    # perform K-fold cross-validation
    classifier = GaussianNB()
    # NOTE(review): sklearn.cross_validation and this StratifiedKFold
    # signature are long-deprecated/removed — pinned old sklearn assumed.
    kfold_xv_strat = cross_validation.StratifiedKFold(activity_data, NB_FOLDS, indices=False)
    confusion_matrices = []
    probabilities = []
    scores = []
    models = []
    true_activities = []
    aucs = []
    for train, test in kfold_xv_strat:
        fingerprint_data_train = fingerprint_data[train]
        fingerprint_data_test = fingerprint_data[test]
        activity_data_train = activity_data[train]
        activity_data_test = activity_data[test]
        # model building
        classifier.fit(fingerprint_data_train, activity_data_train)
        # testing
        activity_data_predictions = classifier.predict(fingerprint_data_test)
        # NOTE(review): the SAME classifier object is appended every fold
        # (and refit below), so 'models' ends up holding N references to the
        # final model, not per-fold snapshots — confirm whether intended.
        models.append(classifier)
        probability_estimates = classifier.predict_proba(fingerprint_data_test)
        probabilities.append(probability_estimates)
        scores.append(classifier.score(fingerprint_data_test, activity_data_test))
        activity_confusion_matrix = confusion_matrix(activity_data_test, activity_data_predictions)
        confusion_matrices.append(activity_confusion_matrix)
        true_activities.append(activity_data_test)
        # ROC curves
        fpr, tpr, thresholds = roc_curve(activity_data_test, probability_estimates[:, 1])
        aucs.append(auc(fpr, tpr))
    # Final model trained on the whole data set.
    classifier.fit(fingerprint_data, activity_data)
    print "Done."
    return { 'confusion_matrices' : confusion_matrices
           , 'probabilities' : probabilities
           , 'scores' : scores
           , 'models' : models
           , 'true_activity_data' : true_activities
           , 'AUCs' : aucs
           , 'fingerprint_data' : fingerprint_data
           , 'activity_data' : activity_data
           , 'final_model' : classifier }
def bayseFilter(X, y):
    """Fit a GaussianNB on (X, y) and return, for each training sample,
    the product of its predicted class probabilities.

    Returns a 1-D array of length n_samples.
    """
    clf = GaussianNB()
    clf.fit(X, y)
    bayseX = clf.predict_proba(X)
    # Product across the class axis, vectorized — replaces the original
    # explicit per-column accumulation loop with identical results.
    return np.prod(bayseX, axis=1)
class NaiveBayes:
    # Placeholder class attributes; never read by the methods below.
    __theta = 0
    __sigma = 0

    def __init__(self):
        pass
        #self.__new_data = 0

    def learning(self,x_data,y_data):
        # Load RSSI features and position labels from CSV files, fit a
        # GaussianNB on a train split, and print held-out accuracy.
        self.rssi = np.loadtxt(x_data, delimiter=',')
        print(self.rssi)
        self.position = np.loadtxt(y_data, delimiter=',')
        print(self.position)
        self.gaussian_nb = GaussianNB()
        # NOTE(review): sklearn.cross_validation was removed in sklearn 0.20;
        # modern code imports from sklearn.model_selection.
        from sklearn.cross_validation import train_test_split
        rssi_train, rssi_test, position_train, position_test = train_test_split(self.rssi, self.position, random_state=0)
        self.gaussian_nb.fit(rssi_train,position_train)
        print("theta",self.gaussian_nb.theta_)
        print("sigma",self.gaussian_nb.sigma_)
        predicted = self.gaussian_nb.predict(rssi_test)
        print(metrics.accuracy_score(position_test, predicted))

    # Dead code retained as a string literal by the original author.
    '''
    def set_params(self,theta,sigma):
        __theta = theta
        __sigma = sigma
        print __theta
        print __sigma
    '''

    def inference(self,r_data):
        # Predict the class for r_data and cache float16 posteriors for
        # output(); requires learning() to have been called first.
        self.predicted_class = self.gaussian_nb.predict(r_data)
        post_prob = self.gaussian_nb.predict_proba(r_data)
        log_prob = self.gaussian_nb.predict_log_proba(r_data)
        self.post_prob_float16 = post_prob.astype(np.float16)
        #E = 1*self.post_prob_float16[0][0]+2*self.post_prob_float16[0][1]+3*self.post_prob_float16[0][2]
        #var = (1*self.post_prob_float16[0][0]+4*self.post_prob_float16[0][1]+9*self.post_prob_float16[0][2])-E**2
        #print(self.post_prob_float16)
        #print(self.post_prob_float16[0])
        #print(var)
        print(self.predicted_class)
        #print(self.gaussian_nb.class_prior_)
        #print(log_prob)
        return self.predicted_class

    def output(self):
        # Bar chart of the posteriors cached by inference().
        output = graph.Graph()
        output.bar_graph(self.post_prob_float16[0])
def nbayes(source, target):
    """ Naive Bayes Classifier """
    # Oversample the minority class before fitting.
    source = SMOTE(source)
    clf = GaussianNB()
    feature_cols = source.columns[:-1]
    label_col = source.columns[-1]
    clf.fit(source[feature_cols], source[label_col])
    test_feats = target[target.columns[:-1]]
    preds = clf.predict(test_feats)
    distr = clf.predict_proba(test_feats)
    # Return hard predictions and the positive-class probability.
    return preds, distr[:, 1]
def main(): train = p.read_table('../train.tsv').replace('?',0) # target = np.array(train)[:,-1] train['alchemy_category'] = train.groupby('alchemy_category').grouper.group_info[0] train['alchemy_category_score'] = train['alchemy_category_score'].astype(float) # train = np.array(train)[:,:-1] train = np.array(train)[:,3:] test = p.read_table('../test.tsv').replace('?',0) test['alchemy_category'] = test.groupby('alchemy_category').grouper.group_info[0] test['alchemy_category_score'] = test['alchemy_category_score'].astype(float) valid_index = list(np.array(test)[:,1]) orig_test = np.array(test)[:,3:] test = train test = outlier(test,20) target = test[:,-1] test = test[:,:-1] print len(test) r = [] r.append([0,0.000]) for j in range(1,10): n = int((8.5*len(train))/10) X_train = test[:n] X_test = test[n:] y_train = target[:n] y_test = target[n:] # run the model #classifier = RandomForestClassifier(n_estimators=1000,verbose=0,n_jobs=20,min_samples_split=5,random_state=1034324) classifier = GaussianNB() classifier.fit(X_train, y_train) pred = classifier.predict_proba(X_test) fpr, tpr, thresholds = roc_curve(y_test,pred[:,1]) roc_auc = auc(fpr, tpr) print("%d Area under the ROC curve : %f" %(i,roc_auc)) r.append([j,roc_auc]) plt.grid(True) #print r x = [i[0]*10 for i in r] y = [i[1]*100 for i in r] plt.plot(x,y,linewidth=3) plt.axis([0,100,0,100]) plt.xlabel("training % data") plt.ylabel('Accuracy (CV score k=20)') plt.show() # gnb.fit(X_train, y_train) # pred = gnb.predict(X_test) # fpr, tpr, thresholds = roc_curve(y_test,pred) # roc_auc = auc(fpr, tpr) # print("Area under the ROC curve : %f" % roc_auc) # write writer = csv.writer(open("predictions", "w"), lineterminator="\n") rows = [x for x in zip(valid_index, classifier.predict(orig_test))] writer.writerow(("urlid","label")) writer.writerows(rows)
def train_NB_model(trackset, training_set):
    """Fit a sample-weighted GaussianNB on training_set and score trackset.

    Adds/overwrites a P_accept column on trackset and returns trackset
    sorted by acceptance probability, descending.
    """
    useful_features = ['acousticness','danceability','instrumentalness','energy','speechiness','tempo','valence']
    X = training_set[useful_features]
    Y = training_set.status
    w = training_set.weight
    clf = GaussianNB()
    clf.fit(X, Y, sample_weight=w)
    predicts = pd.DataFrame(clf.predict_proba(trackset[useful_features]))
    predicts.columns = ['P_reject','P_accept']
    # Bug fix: attribute-style assignment (trackset.P_accept = ...) creates
    # a plain instance attribute when the column does not already exist, so
    # the subsequent sort_values('P_accept') would raise KeyError. Item
    # assignment always creates/updates the actual column.
    trackset['P_accept'] = predicts['P_accept'].values
    return trackset.sort_values(by=['P_accept'], ascending=False)
def gNB(train_data, train_labels, test, save_result=False):
    """Gaussian Naive Bayes classification; returns predicted labels and
    optionally pickles labels and probabilities to disk."""
    log_state('Use Gaussian Naive Bayes classifier')
    model = GaussianNB()
    model.fit(train_data, train_labels)
    predict_labels = model.predict(test)
    predict_proba = model.predict_proba(test)
    if save_result == True:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
        logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
def nb(data, yind, xind):
    """Fit a GaussianNB on columns xind (features) / yind (label) of data
    and return [predicted class, class probabilities, class labels] for
    the LAST row of the feature frame.

    data: pandas DataFrame; yind/xind: positional column indices.
    """
    model = NB()
    Y = data.iloc[range(0, data.shape[0]), yind]
    X = data.iloc[range(0, data.shape[0]), xind]
    model.fit(X, Y)
    # Bug fix: X.iloc[n-1, :] yields a 1-D Series, which modern sklearn
    # rejects (and older versions misinterpret) as a single sample; a
    # list-indexed iloc keeps it a one-row 2-D frame.
    last_row = X.iloc[[X.shape[0] - 1], :]
    Z = model.predict(last_row).tolist()
    prob = model.predict_proba(last_row).tolist()
    classes = model.classes_.tolist()
    return [Z, prob, classes]
def trainModel(X_train, Y_train, X_test, Y_test, model="NB"):
    """Fit the requested classifier ("NB", "RF" or "GB") and return the
    fitted model together with its test-set AUC."""
    if model == "NB":
        clf = GaussianNB()
    elif model == "RF":
        clf = ensemble.RandomForestClassifier()
    elif model == "GB":
        clf = ensemble.GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,verbose=1)
    clf.fit(X_train, Y_train)
    # AUC is computed from the positive-class probability column.
    positive_scores = clf.predict_proba(X_test)[:, 1]
    auc = em.get_roc(Y_test, positive_scores)
    return clf, auc
def naiveBayesModel(train_data, test_data, train_Y, test_Y): # Build Naive Bayes Model model = GaussianNB() model.fit(train_data, train_Y) # print(model) # Make predictions predicted = model.predict_proba(test_data) # print predicted[0:,1] print "Naive Bayes :" print 'Log Loss :', metrics.log_loss(test_Y, predicted[0:,1])
def naive_bayes_crossval_network(title):
    # 10-fold cross-validated GaussianNB over the cables dataset; the
    # per-fold probabilities are concatenated in order and handed to
    # plot_ROC_of_graph together with the full label vector.
    csv = pandas.read_csv("data/cables2009WithRefAttributes.csv", sep=";")
    X, Y = get_xy_from_csv2(csv)
    # Integer division (py2): each fold gets len(Y)//10 rows.
    fold_size = len(Y)/10
    for fold in xrange(0, 10):
        if fold == 9:
            # Last fold absorbs the remainder rows left by integer division.
            last = len(Y) - (fold + 1) * fold_size
        else:
            last = 0
        test = range(fold * fold_size, (fold + 1) * fold_size + last)
        train = list(set(range(len(Y))) - set(test))
        clf = GaussianNB()
        clf.fit(X[train, :], Y[train])
        if fold == 0:
            print "Naive bayes 10-fold crossval: 0",
            probs = clf.predict_proba(X[test, :])
        else:
            print fold,
            probs = np.concatenate([probs, clf.predict_proba(X[test, :])])
    print " "
    plot_ROC_of_graph(0, 0, True, Y, probs, title)
def main():
    # End-to-end pipeline: read train/test, drop low-variance features,
    # normalize, tree-based feature selection, GaussianNB fit, then write
    # per-row positive-class probabilities to nbsubmission.csv.
    args = getOptions()
    print "options:"
    print args
    fn = "nbsubmission.csv"
    print fn
    print "train file read"
    train_x, train_y = readfile(args.train,'train')
    print "test file read"
    test_x, test_y = readfile(args.test,'test')
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    indices = [i for i in range(len(train_x[0]))]
    frqIndex = trimfrq(train_x)
    for i in frqIndex:
        indices.remove(i)
    train_x_uniq = indexTodata(train_x, indices)
    test_x_uniq = indexTodata(test_x, indices)
    #normalization
    print "normalization"
    # Test data is normalized with the TRAINING mean/std.
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    #feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
    # importances = ftsel.feature_importances_
    # indices_test = np.argsort(importances)[::-1]
    # indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    #modelsing
    print "modelsing"
    clf = GaussianNB()
    clf.fit(train_x_trans, train_y)
    train_pdt = clf.predict(train_x_trans)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_trans)
    # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print "MCC, Acc_p , Acc_n, Acc_all(test): "
    # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    # Submission: first original test column is the row ID; second proba
    # column is P(class==1).
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(test_x[index][0])),str(test_pdt[index][1])))
    fout.close()
def fit_model_9(self,toWrite=False):
    """Cross-validate a GaussianNB over self.cv_data, printing the
    log-loss of each fold; optionally pickle the last fitted model.

    self.cv_data: iterable of (X_train, X_test, Y_train, Y_test) folds.
    """
    model = GaussianNB()
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train,Y_train)
        pred = model.predict_proba(X_test)[:,1]
        print("Model 9 score %f" % (logloss(Y_test,pred),))
    if toWrite:
        # Bug fix: pickle writes bytes, so the file must be opened in
        # binary mode ('wb', was 'w' — breaks on Python 3); the context
        # manager also guarantees the handle is closed.
        with open('model9/model.pkl','wb') as f2:
            pickle.dump(model,f2)
class GaussianNaiveBayes(AbstractLearner):
    """AbstractLearner adapter around sklearn's GaussianNB."""

    def __init__(self):
        self.learner = GaussianNB()

    def _train(self, x_train, y_train):
        # fit() returns the estimator itself, so rebinding is a no-op alias.
        self.learner = self.learner.fit(x_train, y_train)

    def _predict(self, x):
        return self.learner.predict(x)

    def _predict_proba(self, x):
        return self.learner.predict_proba(x)
def Gaussian_NB_predict(new_train_data, new_train_labels, test_data, test_labels):
    """Fit a Gaussian Naive Bayes classifier on the training data and
    return the class-probability matrix for test_data.

    test_labels is accepted for interface compatibility but is not used
    (the original bound it to a dead local that was never read).
    """
    # Create a classifier: a Gaussian Naive Bayesian
    classifier = GaussianNB()
    classifier.fit(new_train_data, new_train_labels)
    predicted = classifier.predict_proba(test_data)
    return predicted
def train_by_lr(conf,ctype): """ Arguments: - `conf`: """ #read train test y print "load data..." train,test,y,test_label = read_data(conf) train,test,y = np.array(train),np.array(test),np.array(y) print "train shape",train.shape print "test shape",test.shape print "norm" scaler = preprocessing.StandardScaler().fit(train) train = scaler.transform(train) test = scaler.transform(test) print "pca" pca = PCA(n_components=23,whiten=True) pca.fit(train) train = pca.transform(train) test = pca.transform(test) #clf = LogisticRegression(penalty='l2',dual=True,fit_intercept=False,C=2,tol=1e-9,class_weight=None, random_state=None, intercept_scaling=1.0) clf = GaussianNB() #clf = MultinomialNB() #clf = GradientBoostingClassifier(n_estimators=400) #clf = RandomForestClassifier(n_estimators=400) #clf = RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_leaf=4,n_jobs=3) #clf = SGDClassifier(loss="log", penalty="l2",alpha=0.1) #clf = svm.SVC(C = 1.0, kernel = 'rbf', probability = True) if ctype == "cv": print "交叉验证" hehe = cross_validation.cross_val_score(clf,train,y,cv=3,scoring='roc_auc',n_jobs=-1) print hehe print np.mean(hehe) elif ctype =="predict": clf.fit(train,y) predict = clf.predict_proba(test)[:,1] if len(predict)!=len(test_label): print "predict!=test label" sys.exit(1) rf = open(conf["result_dir"],"w") rf.write("id,repeatProbability\n") for i in range(len(predict)): rf.write("%s,%s\n"%(test_label[i],predict[i]))
def main():
    #create the training & test sets, skipping the header row with [1:]
    fnc = loadarff(open('Train/train_FNC_attrSelected.arff','r'))
    sbm = loadarff(open('Train/train_SBM_attrSelected.arff','r'))
    testf = genfromtxt(open('Test/test_FNC.csv','r'), delimiter=',', dtype='f8')[1:]
    # NOTE(review): path says test_SMB, not test_SBM — confirm filename.
    tests = genfromtxt(open('Test/test_SMB.csv','r'), delimiter=',', dtype='f8')[1:]
    gnb = GaussianNB()
    # NOTE(review): this fits/predicts on `iris` and scores `test`, neither
    # of which is defined in this function, while the loaded fnc/sbm/testf/
    # tests are never used — almost certainly leftover sample code; confirm
    # intended data before running.
    y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
    predicted_probs = [[index + 1, x[1]] for index, x in enumerate(gnb.predict_proba(test))]
    savetxt('Data/submission.csv', predicted_probs, delimiter=',', fmt='%d,%f', header='MoleculeId,PredictedProbability', comments = '')
def main(path):
    # Train three random forests (all features / case features / judge
    # features) plus two logistic regressions and a GaussianNB; write the
    # top-20 feature importances and the models' positive-class test
    # probabilities to CSV files.
    X_train, X_test, Y_train, Y_test, X_train_case, X_test_case, X_train_judge, X_test_judge = load_data(path)
    output_imp = pd.DataFrame(columns=['rf_imp','rf_name','rf_yerr','rf_case_imp','rf_case_name', 'rf_case_yerr','rf_judge_imp','rf_judge_name','rf_judge_yerr'])
    col_names = X_train.columns.values
    col_names_case = X_train_case.columns.values
    col_names_judge = X_train_judge.columns.values
    ytest = pd.DataFrame(Y_test)
    ytest.to_csv('y_test.csv',index=False)
    rf = RandomForestClassifier(n_estimators=500, random_state=123, bootstrap=False).fit(X_train, Y_train)
    rf_case = RandomForestClassifier(n_estimators=200, random_state=123, bootstrap=False).fit(X_train_case,Y_train)
    rf_judge = RandomForestClassifier(n_estimators=500, random_state=123, bootstrap=False).fit(X_train_judge,Y_train)
    # Top-20 importances with across-tree std as error bars, per model.
    # (Attribute assignment below is safe only because the columns were
    # pre-created in the DataFrame(columns=...) constructor above.)
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_name = col_names[indices[:20]]
    output_imp.rf_imp = importances[indices[:20]]
    output_imp.rf_yerr = std[indices[:20]]
    importances = rf_case.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf_case.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_case_name = col_names_case[indices[:20]]
    output_imp.rf_case_imp = importances[indices[:20]]
    output_imp.rf_case_yerr = std[indices[:20]]
    importances = rf_judge.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf_judge.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    output_imp.rf_judge_name = col_names_judge[indices[:20]]
    output_imp.rf_judge_imp = importances[indices[:20]]
    output_imp.rf_judge_yerr = std[indices[:20]]
    output_imp.to_csv('importance.csv',index=False)
    lr_l1 = LogisticRegression(penalty='l1', random_state=123).fit(X_train, Y_train)
    lr_l2 = LogisticRegression(penalty='l2', random_state=123).fit(X_train, Y_train)
    nb = GaussianNB().fit(X_train, Y_train)
    # One positive-class probability column per model, in `labels` order.
    pred = [lr_l1.predict_proba(X_test)[:,1], lr_l2.predict_proba(X_test)[:,1], rf.predict_proba(X_test)[:,1], rf_case.predict_proba(X_test_case)[:,1], rf_judge.predict_proba(X_test_judge)[:,1],nb.predict_proba(X_test)[:,1]]
    labels = ['LR_L1','LR_L2','RF','RF_case','RF_judge','NB']
    output_data = pd.DataFrame(np.array(pred).T, columns = labels)
    output_data.to_csv('output_plot_auc.csv',index=False)
def bayes_ROC(features, target):
    """Fit a GaussianNB on (features, target) and plot its ROC curve on
    the training data itself."""
    model = GaussianNB().fit(features,target)
    proba = model.predict_proba(features)
    fpr, tpr, _thresholds = roc_curve(target, proba[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
def nb_xyat_weight1(df_cell_train_feats, y_train, df_cell_test_feats):
    """GaussianNB over (x, y, hour, weekday, log10 accuracy), with sample
    weights equal to time squared; returns test class probabilities."""
    def prepare_feats(df):
        # Select the raw coordinate/time columns and log-compress accuracy.
        out = pd.DataFrame()
        for col in ("x", "y", "hour", "weekday"):
            out[col] = df[col]
        out["accuracy"] = df["accuracy"].apply(np.log10)
        return out
    logging.info("train nb_xyat_weight1 model")
    clf = GaussianNB()
    clf.fit(prepare_feats(df_cell_train_feats), y_train,
            df_cell_train_feats["time"] ** 2)
    return clf.predict_proba(prepare_feats(df_cell_test_feats))
def svm_classify(threshold):
    # Despite the name, trains a GaussianNB on rolling-mean-normalized COT
    # columns to predict whether the futures rate rises between week -1 and
    # week -4; reports confusion matrix / classification report only for
    # predictions whose max class probability exceeds `threshold`.
    # Reads module-level `cot` and `futures`; rebinds module-level `data`.
    global data
    data = pd.DataFrame()
    # i=0
    # xprev=0
    # xprev2=0
    for x in cot.columns[:-1]:
        # NOTE(review): pd.rolling_mean was removed in pandas 0.18+; modern
        # equivalent is cot[x].rolling(5).mean() — confirm pinned version.
        data[x] = cot[x] / pd.rolling_mean(cot[x], 5)
        # data[x+'_polynomial2']=data[x]*data[x]
        # data[x+'_polynomial3']=data[x]*data[x]*data[x]
        # if (xprev!=0):
        # data[x+'_polynomial_x_2']=data[x]*data[xprev]
        # if (xprev2!=0):
        # data[x+'_polynomial_x_3']=data[x]*data[xprev2]*data[xprev]
        # i=i+1
        # xprev=x
        # xprev2=xprev
    data["return"] = ((futures.shift(-4).Rate / futures.shift(-1).Rate) - 1) > 0
    data = data[8:].dropna(1)
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:-1, :-1], data.iloc[:-1, -1], test_size=0.5)
    classifier = GaussianNB()  # SVC (kernel='linear',probability=True,C=1)
    classifier.fit(x_train, y_train)
    # min_max_scaler=MinMaxScaler()
    # mms=min_max_scaler.fit(list(max(a) for a in classifier.predict_proba(x_train)))
    # NOTE(review): `pr` is computed but never used below.
    pr = list(max(a) for a in classifier.predict_proba(x_test))
    Y = pd.DataFrame()
    Y["actual"] = y_test
    Y["predicted"] = classifier.predict(x_test)
    # Confidence of the winning class for each test row.
    Y["P"] = list(max(a) for a in classifier.predict_proba(x_test))
    Y_filtered = Y[Y.P > threshold]
    cm = confusion_matrix(Y_filtered.actual, Y_filtered.predicted)
    # return [cm,'Prediction of UP is %s; P = %s' %(classifier.predict(data.iloc[-1:,:-1])[0],
    # list((max(x)) for x in classifier.predict_proba(data.iloc[-1:,:-1]))[0]
    # ),futures]
    cr = classification_report(Y_filtered.actual, Y_filtered.predicted)
    return [cm, cr]
def decision_surface(first,second):
    """ Draws a scatter plot for two features with decision surface for classifying persons into POI/not POI """
    features_list = ['poi',first,second]
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
    createFraction(data_dict)
    # The TOTAL record is a spreadsheet aggregation row; drop it as an
    # outlier but keep its keys as the list of feature names to report on.
    features = data_dict["TOTAL"]
    data_dict.pop("TOTAL",0)
    for i in features:
        # Per-feature NaN rates for POI (18 people) vs non-POI (127 people).
        poi,notpoi = gather_values(data_dict,i)
        print i, round(poi.count("NaN")/18.0,2), round(notpoi.count("NaN")/127.0,2), poi.count("NaN") > 5
    data = featureFormat(data_dict, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()# Provided to give you a starting point. Try a varity of classifiers.
    clf.fit(features,labels)
    predictions = clf.predict(features)
    from sklearn.metrics import classification_report
    print classification_report(labels,predictions)
    # Scatter the two features colored by POI label, then overlay the
    # P(poi)=0.5 contour as the decision boundary.
    x = data[:,1]
    y = data[:,2]
    color = data[:,0]
    xlim = (int(min(x)*0.9),int(max(x)*1.1))
    ylim = (int(min(y)*0.9),int(max(y)*1.1))
    import numpy as np
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81))
    z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    z = z[:, 1].reshape(xx.shape)
    plt.scatter(x,y,c=color,s=50)
    plt.contour(xx,yy,z,[0.5],colors="k")
    plt.show()
def nbayes(source, target):
    """ Naive Bayes Classifier """
    clf = GaussianNB()
    # Binarize the label column in place: positives -> 1, the rest -> 0.
    label_col = source.columns[-1]
    source.loc[source[label_col] > 0, label_col] = 1
    source.loc[source[label_col] < 1, label_col] = 0
    # set_trace()
    # source = SMOTE(source, k=1)
    # set_trace()
    feature_cols = source.columns[:-1]
    clf.fit(source[feature_cols], source[label_col])
    test_feats = target[target.columns[:-1]]
    preds = clf.predict(test_feats)
    distr = clf.predict_proba(test_feats)
    # Hard predictions plus positive-class probability.
    return preds, distr[:, 1]
def test_gnb():
    """
    Gaussian Naive Bayes classification.

    This checks that GaussianNB implements fit and predict and returns
    correct values for a simple toy dataset.
    """
    clf = GaussianNB()
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), y)
    proba = clf.predict_proba(X)
    log_proba = clf.predict_log_proba(X)
    # predict_log_proba must agree with log(predict_proba).
    assert_array_almost_equal(np.log(proba), log_proba, 8)
class Classifier:
    # Dispatcher facade: __init__ rebinds self.fit / self.predict /
    # self.predict_proba to the implementation chosen by `method`.
    # NOTE(review): the file mixes py3-style print() with py2 print
    # statements below — this class only runs on Python 2 as written.
    def __init__(self, method):
        if method == 'knn':
            self.name = 'knn_classifier'
            self.fit = self._knn_fit
            self.predict = self._knn_predict
            self.predict_proba = self._knn_predict_proba
        elif method == 'random_forest':
            self.name = 'random_forest_classifier'
            self.fit = self._randomf_fit
            self.predict = self._randomf_predict
            self.predict_proba = self._randomf_predict_proba
        elif method == 'bayes':
            self.name = 'naive_bayes_classifier'
            self.fit = self._bayes_fit
            self.predict = self._bayes_predict
            self.predict_proba = self._bayes_predict_proba
        elif method == 'tree':
            self.name = 'decision_tree_classifier'
            self.fit = self._tree_fit
            self.predict = self._tree_predict
            self.predict_proba = self._tree_predict_proba
        elif method == 'svc':
            self.name = 'support_vector_classification'
            self.fit = self._svc_fit
            self.predict = self._svc_predict
            self.predict_proba = self._svc_predict_proba
        elif method == 'linearsvc':
            self.name = 'linear_support_vector_classification'
            self.fit = self._lsvc_fit
            self.predict = self._lsvc_predict
            # NOTE(review): LinearSVC has no predict_proba; calling
            # predict_proba on a 'linearsvc' Classifier will fail at runtime.
            self.predict_proba = self._lsvc_predict_proba
        elif method == 'logisticregression':
            self.name = 'logistic_regression'
            self.fit = self._lr_fit
            self.predict = self._lr_predict
            # NOTE(review): predict_proba is left unbound for this method.
            #self.predict_proba = self._lsvc_predict_proba
        else:
            print('Classifying method not found')
            sys.exit(-1)

    def _knn_fit(self, X, y):
        print('Training the knn classifier...')
        self._classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        self._classifier.fit(X, y)
        print('Done!')

    def _knn_predict(self, X):
        # Majority vote over the per-row predictions.
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _knn_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        return pred_probabilities

    def _lr_fit(self, X, y):
        print('Training the logistic regression...')
        self._classifier = LogisticRegression()
        self._classifier.fit(X, y)
        print('Done!')

    def _lr_predict(self, X):
        # Majority vote over the per-row predictions.
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _randomf_fit(self, X, y):
        print('Training the Random forest classifier...')
        self._classifier = RandomForestClassifier(max_depth=3, random_state=0, n_jobs=-1)
        self._classifier.fit(X, y)
        print('Done!')

    def _randomf_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _randomf_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        return pred_probabilities

    def _bayes_fit(self, X, y):
        print('Training the Gaussian Naive Bayes classifier...')
        self._classifier = GaussianNB()
        self._classifier.fit(X, y)
        print('Done!')

    def _bayes_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _bayes_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        print pred_probabilities
        return pred_probabilities

    def _tree_fit(self, X, y):
        print('Training the Decision tree classifier...')
        self._classifier = tree.DecisionTreeClassifier()
        self._classifier.fit(X, y)
        print('Done!')

    def _tree_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _tree_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        return pred_probabilities

    def _svc_fit(self, X, y):
        print('Training the Support Vector classifier...')
        # ovr = one vs. rest | ovo = one vs. one
        self._classifier = svm.SVC(decision_function_shape='ovr')
        self._classifier.fit(X, y)
        print('Done!')

    def _svc_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _svc_predict_proba(self, X):
        # NOTE(review): SVC was constructed without probability=True, so
        # predict_proba will raise — confirm before use.
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        return pred_probabilities

    def _lsvc_fit(self, X, y):
        print('Training the Linear Support Vector classifier...')
        self._classifier = svm.LinearSVC()
        self._classifier.fit(X, y)
        print('Done!')

    def _lsvc_predict(self, X):
        predictions = self._classifier.predict(X)
        values, counts = np.unique(predictions, return_counts=True)
        return values[np.argmax(counts)]

    def _lsvc_predict_proba(self, X):
        pred_probabilities = self._classifier.predict_proba(X)
        print 'Predicted probabilities'
        return pred_probabilities
# NOTE(review): this reads like a pasted REPL transcript — the bare tuple
# expressions below are the echoed outputs, kept as harmless no-op
# statements. Depends on `data`, `iris`, and sklearn names defined elsewhere.
data.shape, iris.target.shape
((150, 4), (150, ))
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.5, random_state=0)
X_train.shape, y_train.shape
((90, 4), (90, ))
X_test.shape, y_test.shape
((60, 4), (60, ))
classifier = GaussianNB()
model = classifier.fit(X_train, y_train)
# Class probabilities on the training split, hard labels on the test split.
y = classifier.predict_proba(X_train)
print(y)
abc = classifier.predict(X_test)
print(abc)
print(metrics.accuracy_score(y_test, abc))
sl = LabelEncoder()
r_data = np.array(sl.fit_transform(data.Species))
# 5-fold cross-validated predictions over the full iris data.
vpred = cross_val_predict(classifier, iris.data, iris.target, cv=5)
print(vpred)
print(metrics.accuracy_score(iris.target, vpred))
# NOTE(review): mean_absolute_error on class labels is dubious, and x12
# takes sqrt of MAE (not MSE) — confirm what metric was intended.
x1 = metrics.mean_absolute_error(r_data, vpred)
x12 = math.sqrt(metrics.mean_absolute_error(r_data, vpred))
# Compare KNN, GaussianNB and SVM on the same split: micro-averaged ROC-AUC
# (via one-hot labels, classes 1..9) plus plain accuracy, appended into the
# externally defined `accuracy` and `AUC` accumulator lists.
x_train, x_test, y_train, y_test = train_test_split(selected, labels, random_state=0)
y_train = y_train.ravel()
# One-hot encode test labels so ravel() yields a micro-averaged ROC.
y_test_hot = label_binarize(y_test, classes=range(1, 10))
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
knn_score = knn.predict_proba(x_test)
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test_hot.ravel(), knn_score.ravel())
knn_auc = auc(knn_fpr, knn_tpr)
knn_accuracy = knn.score(x_test, y_test)
accuracy[0].append(knn_accuracy)
AUC[0].append(knn_auc)
print("KNN分类精度:", knn_accuracy)
print("AUC值:", knn_auc)
NB = GaussianNB()
NB.fit(x_train, y_train)
NB_score = NB.predict_proba(x_test)
NB_fpr, NB_tpr, NB_thresholds = roc_curve(y_test_hot.ravel(), NB_score.ravel())
NB_auc = auc(NB_fpr, NB_tpr)
NB_accuracy = NB.score(x_test, y_test)
accuracy[1].append(NB_accuracy)
AUC[1].append(NB_auc)
print("NB分类精度:", NB_accuracy)
print("AUC值:", NB_auc)
# probability=True is required for SVC.predict_proba.
clf = svm.SVC(probability=True)
clf.fit(x_train, y_train)
svm_score = clf.predict_proba(x_test)
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test_hot.ravel(), svm_score.ravel())
svm_auc = auc(svm_fpr, svm_tpr)
svm_accuracy = clf.score(x_test, y_test)
accuracy[2].append(svm_accuracy)
AUC[2].append(svm_auc)
# Save the full_test file as a Pandas Dataframe file = response["Body"].read() test = pd.read_csv(io.BytesIO(file), delimiter=",") # Fill Nan values with 0 test = test.fillna(0) # Assign Features Columns from Test dataset to variables 'features' to be used # in predicting the targets features = list(test.values[:, 4:]) # Predict Target Probability for the test target_pred = clf.predict_proba(features) # Create a for loop to predict each row of the final test and save it to # final_pred dataframe final_pred = [] for i in (list(range(len(test)))): # print(i) test_id = str(test.id.iloc[i]) # Because the predict_proba gives as array for the probability for each # class 0 and 1 in our case. We will only use the # Probability of class 1 which is the second element of the array predicted_rating = target_pred[i][1]
# 1/5 (20%) test from sklearn.model_selection import train_test_split x = df.iloc[:, 0:4] # features # ending index is exclusive y = df.iloc[:, 4] # target (index:5) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) # Fit - Train Naive Bayes print('Train Naive Bayes') from sklearn.naive_bayes import GaussianNB clf = GaussianNB() clf.fit(x_train, y_train) # Predict / Test print('Predict') y_pred = clf.predict(x_test) # Predicted classes y_pred_prob = clf.predict_proba(x_test)[:, 1] # Probability # Accurancy Score from sklearn.metrics import accuracy_score accuracy = accuracy_score(y_test, y_pred) print('\nAccuracy Score: ' + str(accuracy)) print('\nConfusion Matrix - Check Accurancy') # Confusion Matrix - Check Accurancy confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Prediction']) print(confusion_matrix) # Precision, recall and f1-score
# Report accuracy/confusion matrix for a W2V GaussianNB run and plot its
# precision-recall curve. Depends on externally defined `t` (true labels),
# `p` (predicted labels), `clf` (fitted model) and `z` (feature matrix).
print("\n")
print("W2V Gaussian Naive Bayes")
# Compute accuracy
accuracy = metrics.accuracy_score(t, p, normalize=False)
print("Accuracy: ", (accuracy / len(t)) * 100)
# Confusion matrix
confusion_matrix = metrics.confusion_matrix(t, p)
print("Confusion Matrix:\n", confusion_matrix)
# Replace 4s with 1s
# (collapses sentiment label 4 onto 1 so the task becomes binary)
t[np.where(t == 4)] = 1
p[np.where(p == 4)] = 1
y_score = clf.predict_proba(z)
# Plot the Precision-Recall curve
precision, recall, _ = metrics.precision_recall_curve(t, y_score[:, 1])
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# NOTE(review): average_precision_score is given hard labels `p`, not the
# probability scores — confirm whether y_score[:, 1] was intended.
average_precision = metrics.average_precision_score(t, p)
plt.title('W2V Gaussian NB Precision-Recall curve: AP={0:0.2f}'.format( average_precision))
plt.savefig('data/w2v_GaussianNB_precisionRecall.png')
plt.show()
#Decision tree from sklearn.tree import DecisionTreeClassifier rf3 = DecisionTreeClassifier() rf3.fit(X_train, y_train) y_val_pred3 = rf3.predict_proba(X_val) y_val_pred_acc3 = rf3.predict(X_val) print(log_loss(y_val, y_val_pred3)) print(accuracy_score(y_val, y_val_pred_acc3)) #Naive Bayes from sklearn.naive_bayes import GaussianNB rf4 = GaussianNB() rf4.fit(X_train, y_train) y_val_pred4 = rf4.predict_proba(X_val) y_val_pred_acc4 = rf4.predict(X_val) print(log_loss(y_val, y_val_pred4)) print(accuracy_score(y_val, y_val_pred_acc4)) #Bagging from sklearn.ensemble import BaggingClassifier rf5 = BaggingClassifier() rf5.fit(X_train, y_train) y_val_pred5 = rf5.predict_proba(X_val) y_val_pred_acc5 = rf5.predict(X_val) print(log_loss(y_val, y_val_pred5)) print(accuracy_score(y_val, y_val_pred_acc5)) #KNN
# Predict "Crime Type" from month + weather features of the Baltimore
# dataset.  (Python 2 syntax: print statements throughout.)
df = pd.read_csv("~/Desktop/My DM/Baltimore/Baltimore.csv", low_memory=False)
features = [
    "Month of the Crime", "Mean Temperature", "Mean Dew Point",
    "Mean Visibility", "Max Humidity", "Mean Wind Speed", "Max Sea Level"
]
x = df[features]
y = df["Crime Type"]
print 'Partial Fit - training classifier'
# partial_fit needs the full label set up front, hence np.unique(y).
clf_pf = GaussianNB()
clf_pf.partial_fit(x, y, np.unique(y))
print '--Cross Validation--'
# NOTE(review): sklearn.cross_validation was removed in 0.20; switch to
# sklearn.model_selection when upgrading.
scores = cross_validation.cross_val_score(clf_pf, x, y, cv=5)
print scores.mean()
print '--Random Split--'
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    x, y, test_size=0.2, random_state=0)
clf1 = GaussianNB().fit(X_train, Y_train)
print clf1.score(X_test, Y_test)
# Test file
df_test = pd.read_csv("~/Desktop/My DM/Baltimore/Test_Baltimore.csv",
                      low_memory=False)
xt = df_test[features]
print 'Partial Fit Predicted - ' + str(clf_pf.predict(xt))
print 'Predict Probability - ' + str(clf_pf.predict_proba(xt))
dropSimilarity = [ p for col, p in zip(namesToPlot, densitySimilarity) if p > th ] #g = sns.FacetGrid(df, hue='Class') X = df[namesToPlot].drop(dropList, axis=1) y = df['Class'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) clf_nb = GaussianNB() clf_nb.fit(X_train, y_train) y_pred = clf_nb.predict(X_test) y_pred_prob = clf_nb.predict_proba(X_test) print(classification_report(y_test, y_pred)) print(confusion_matrix(y_test, y_pred)) print('balanced accuracy score', balanced_accuracy_score(y_test, y_pred)) print(roc_auc_score(1 - y_test, y_pred_prob[:, 0])) print(roc_auc_score(y_test, y_pred_prob[:, 1])) accuracy.append(balanced_accuracy_score(y_test, y_pred)) plt.figure(3, figsize=(6, 6)) plt.plot(thresh, accuracy) plt.xlabel('thresh') plt.ylabel('accuracy score')
# Decision tree: hard labels plus positive-class probabilities for ROC-AUC.
decTreeClassifier.fit(X_train, y_train)
predTree = decTreeClassifier.predict(X_test)
probaTree = decTreeClassifier.predict_proba(X_test)
probaTree = probaTree[:, 1]

# Look to change n_estimators (trees), criterion, max_depth and others
randomForest = RandomForestClassifier(random_state=24)
randomForest.fit(X_train, y_train)
predForest = randomForest.predict(X_test)
probaForest = randomForest.predict_proba(X_test)
probaForest = probaForest[:, 1]

# BUGFIX: GaussianNB takes no `random_state` argument -- passing one raises
# TypeError at construction.  The estimator is deterministic; no seed needed.
gaussNB = GaussianNB()
gaussNB.fit(X_train, y_train)
predGauss = gaussNB.predict(X_test)
probaGauss = gaussNB.predict_proba(X_test)
probaGauss = probaGauss[:, 1]

names = ["Logistic", "Gaussian", "RanForest", "DecTree", "SVC", "KNN"]
# Map each model name to [positive-class probabilities, hard predictions].
predictions = {"Logistic": [probaLog, predLog],
               "Gaussian": [probaGauss, predGauss],
               "RanForest": [probaForest, predForest],
               "DecTree": [probaTree, predTree],
               "SVC": [probaSVC, predSVC],
               "KNN": [probaNeigh, predNeigh]}

auc_results = {}  # store ROC-AUC results
accuracy_results = {}
class_report_results = {}


def metrics_calculator(y_test, predictions, names):
    # Per model: ROC-AUC from probabilities, accuracy from hard labels.
    # (Function body continues beyond this chunk.)
    for name in names:
        auc_score = roc_auc_score(y_test, predictions[name][0])
        accuracy = accuracy_score(y_test, predictions[name][1])
# Average the per-batch losses accumulated over this epoch.
rec_loss_train[epoch] /= N_batch
kl_loss_train[epoch] /= N_batch
total_loss_train[epoch] /= N_batch
# Encode the configurations and reconstruct them with the trained network.
codings_val = sess.run([code], {data: config.reshape([-1, size, size])})[0]
output_config = sess.run([decoder], {data: config.reshape([-1, size, size])})[0]

#-----------------------gaussian naive bayes classification score---------------------
# Score how well the latent codes separate the true labels (train accuracy).
clf = GaussianNB()
clf.fit(codings_val, labels_actual)
MAP_score = np.round(clf.score(codings_val, labels_actual), 4)
print('classification score = ', MAP_score)
predict = clf.predict_proba(codings_val)
# Sort the labels in descending order of probability, then add 1
# (because class indices count from 0).
sorted_predict = np.argsort(-predict) + 1
# Mean Hamming distance between predicted history and actual history.
hamming_dist_all = np.zeros(N)
for i in range(N):
    hamming_dist_all[i] = distance.hamming(field_history[i], sorted_predict[i])
hamming_dist = np.mean(hamming_dist_all)

#-------------------------------------------------plotting code-----------------------------------------------------
idx = np.random.choice(test_index, 12)
#Plot precision rate of each method index = 0 for method in method_list.loc[0, :]: clf = method clf.fit(xtrain, ytrain) buypredicted = clf.predict_proba(xtest) precision, recall, threshold = precision_recall_curve( ytest, buypredicted[:, 1]) plot_precision_recall_vs_threshold(index, stock, method_list, precision, recall, threshold) plt.show() index = index + 1 #%% Naive Bayes clfbuy = GaussianNB(var_smoothing=1) clfbuy.fit(xtrain, ytrain) buypredicted = clfbuy.predict_proba(xshow) dfplot = pd.DataFrame() dfplot.loc[:, 'Close'] = rawdata dfplot.loc[:, 'GoodBuyProb'] = buypredicted[:, 1] plot_buy('Naive Bayes', dfplot, stock, 0.9, 1, 0.03) #%% SVM clfbuy = svm.SVC(C=1, kernel='linear', probability=True) clfbuy.fit(xtrain, ytrain) buypredicted = clfbuy.predict_proba(xshow) dfplot = pd.DataFrame() dfplot.loc[:, 'Close'] = rawdata dfplot.loc[:, 'GoodBuyProb'] = buypredicted[:, 1] plot_buy('SVM Linear', dfplot, stock, 0.9, 0.99, 0.02) #%% SVM clfbuy = svm.SVC(C=1, probability=True) clfbuy.fit(xtrain, ytrain)
clf.partial_fit(iris.data, iris.target,classes=[0,1,2])
'''
# Some learned model attributes, inspected after fitting.
clf.set_params( priors=[0.333, 0.333, 0.333])  # The per-class priors must be set here; otherwise clf.priors returns None (original author was unsure why)
print(clf.priors)  # prior probability of each class label, as supplied
print(clf.class_prior_ )  # same priors; `priors` returns the list given, `class_prior_` an array
print(clf.get_params(deep=True))  # dict of the estimator's parameters, including priors
print(clf.class_count_)  # number of training samples per class label
print(clf.theta_)  # per-class mean of each feature
print(clf.sigma_)  # per-class variance of each feature (NOTE(review): renamed var_ in sklearn 1.0+)
# Test data: a single 4-feature sample reshaped to (1, n_features).
data_test = np.array([6, 4, 6, 2])
data = data_test.reshape(1, -1)
Result_predict = clf.predict(data)
# Weighted accuracy on three labelled samples.
Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1],
                  sample_weight=[0.3, 0.5, 0.2])
Result_predict_proba = clf.predict_proba(data)
Result_predict_log_proba = clf.predict_log_proba(data)
print(Result_predict)  # predicted class
print(Result_predict_proba)  # predicted probability of each class
print(Result_predict_log_proba)  # log of the predicted class probabilities
print(Score)  # weighted accuracy of the test samples against the given labels
#print(testdata.shape) #print(traindata.shape) #X represents model input, Y represents binary labels traindataX = traindata.iloc[:, 4:622] traindataY = traindata.iloc[:, 622] testdataX = testdata.iloc[:, 4:622] testdataY = testdata.iloc[:, 622] #Create Naive Bayes Model gnb = GaussianNB() #train model gnb.fit(traindataX, traindataY) #print("Model started") predictionsproba = gnb.predict_proba(testdataX)[:, 1] #print(roc_auc_score(testdataY, predictionsproba)) AUC.append(roc_auc_score(testdataY, predictionsproba)) globpred += predictionsproba.tolist() globy_test += testdataY.tolist() #print out AUC and AUC graph print "The AUC is" print(roc_auc_score(globy_test, globpred)) #print np.mean(AUC) false_positive_rate, true_positive_rate, thresholds = roc_curve( globy_test, globpred) roc_auc = auc(false_positive_rate, true_positive_rate) plt.title('Receiver Operating Characteristic GNB Custom Split') plt.plot(false_positive_rate, true_positive_rate,
Naivety assumes the independence of all features. """
"""
Probably the easiest naive Bayes classifier to understand is the Gaussian
one.  Its assumption is that the data of every category is drawn from a
simple normal distribution (with no covariance between the dimensions).
"""
# Synthetic two-cluster dataset for the demonstration.
fig, ax = plt.subplots()
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
plt.savefig('images\\gaussian1')
model = GaussianNB()
model.fit(X, y)
# Sample 2000 new points uniformly over the box x in [-6, 8], y in [-14, 4].
rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
ax.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.savefig('images\\gaussian2')
"""
An upside of this Bayesian formalism is that naturally probabilistic
classification is possible.
"""
yprob = model.predict_proba(Xnew)
print(yprob.round(2))
plt.show()
# Normalise likes/replies by their maxima so both land in [0, 1].
# (`lm`, the maximum like count, is computed just before this chunk.)
rm = comments['replies'].max()
if lm != 0:
    comments['likes'] = comments['likes'] / lm
if rm != 0:
    comments['replies'] = comments['replies'] / rm
# Rank comments by combined (normalised) engagement.
comments['sum'] = comments['likes'] + comments['replies']
comments = comments.sort_values('sum', ascending=False)
comments = comments.reset_index(drop=True)
# Keyword list used to mark "verification" comments.
# NOTE(review): the keywords are read from the CSV HEADER row (.columns) --
# confirm the file really stores them as column names.
ver_kw = pd.read_csv('./verification-keywords')
ver_kw = ver_kw.columns
comments['ver'] = 0
for com in range(len(comments)):
    ver = 0
    # Flag the comment if any tokenised word appears in the keyword list.
    for word in comments.loc[com, 's_text']:
        if word in ver_kw:
            ver = 1
    comments.loc[com, 'ver'] = ver
# NOTE(review): this sort result is not assigned back, so it has no effect.
comments.sort_values('ver', ascending=False)
comments['len'] = comments['text'].apply(len)
# The file which contains the data of the comments(panda):
# if file does not exist write header
X_test = comments[['len', 'likes', 'replies']]
Y_test = comments['ver']
print(model.predict(X_test))
print(model.predict_proba(X_test))
pred = model.predict_proba(X_test)
pred = pd.DataFrame(pred)
# NOTE(review): `pred` is a 2-column frame; assigning it to a single column
# relies on pandas taking the first column -- verify on the pandas in use.
comments['result'] = pred
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# Tiny GaussianNB demo: fit two 2-D clusters, then classify two new points.
__author__ = "柯博文老師 Powen Ko, www.powenko.com"
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Training data: the first seven rows belong to class 1, the rest to class 2.
X = np.array([
    [9, 9], [9.2, 9.2], [9.6, 9.2], [9.2, 9.2], [6.7, 7.1], [7, 7.4],
    [7.6, 7.5], [7.2, 10.3], [7.3, 10.5], [7.2, 9.2], [7.3, 10.2],
    [7.2, 9.7], [7.3, 10.1], [7.3, 10.1],
])
Y = np.array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2])

clf = GaussianNB()
clf.fit(X, Y)
print(clf.class_prior_)  # empirical class priors (7/14 each)
print(clf.get_params())  # estimator hyper-parameters

# Predict output for two unseen samples.
samples = np.array([[8, 8], [8.3, 8.3]])
labels = clf.predict(samples)
print(labels)
print(clf.predict_proba(samples))
def single_modality_classification(modality_type):
    """Cross-validate several classifiers on one modality's feature CSV.

    Reads ``<globals.path><modality_type>_features.csv`` (last column is the
    integer label), runs K-fold cross-validation for each classifier in the
    hard-coded list and collects per-classifier accuracy statistics.

    :param modality_type: string. Data's modality, namely 'a' for audio and
        'v' for video
    :return: (mean_acc, std, cm) -- mean accuracy (%) and std (%) per
        classifier, plus a confusion matrix pooled over folds.
        NOTE(review): ``cm`` is computed after the classifier loop, so it
        reflects only the LAST classifier's pooled predictions -- confirm
        this is intended.
    """
    #path = '/home/samuel/Dropbox/Dissertacao/repo/samples/smalldataset/'
    globals.path_init('geometry')
    X = []
    Y = []
    mean_acc = []
    std = []
    # Each CSV row: feature values followed by the integer class label.
    with open(globals.path + modality_type + '_features.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            X.append([float(x) for x in row[:-1]])
            Y.append(int(row[-1]))
    # if modality_type is 'v':# or modality_type is 'av':
    # transformer = TfidfTransformer()
    # X = transformer.fit_transform(X).todense().tolist()
    #X = transformer.toarray()
    #E = np.random.uniform(0, 0.1, size=(len(X), 20))
    # Add the noisy data to the informative features
    # X = np.array(X)
    # X_indices = np.arange(X.shape[-1])
    # selector = SelectPercentile(f_classif, percentile=10)
    # selector.fit(X, Y)
    # scores = -np.log10(selector.pvalues_)
    # scores /= scores.max()
    # pl.clf()
    # pl.bar(X_indices - .45, scores, width=.2, color='g')
    # pl.ylabel(r'Escore univariado ($-Log(p_{value})$)')
    # pl.xlabel('Numero da feature')
    # pl.title('Discriminancia das features - ' + modality_type)
    # pl.show()
    # print '!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    # print len(scores)
    Y_ground_truth = []
    Y_predicted = []
    Y_prob = []
    for classifier in ['NaiveBayes', 'RandomForest']:  #NaiveBayes', 'DecisionTree', 'LogisticRegression', 'LDA', 'Adaboost', 'GradientBoosting', 'RandomForest', 'ANN', 'SVM', 'KNN']:
        #print "Training %s" % modality_type
        # Reset the pooled containers for this classifier.
        Y_ground_truth = []
        Y_predicted = []
        Y_prob = []
        acc = []
        precision = []
        f1 = []
        # Old sklearn KFold API: KFold(n_samples, n_folds).
        kf = KFold(len(X), globals.number_of_folds)
        for train_index, test_index in kf:
            # KFold split: copy rows element-by-element into dense arrays.
            x_train = np.zeros(shape=(len(train_index), len(X[0])))
            y_train = np.zeros(shape=(len(train_index)))
            for i in range(len(train_index)):
                for j in range(len(X[0])):
                    x_train[i][j] = X[train_index[i]][j]
                y_train[i] = Y[train_index[i]]
            x_test = np.zeros(shape=(len(test_index), len(X[0])))
            y_test = np.zeros(shape=(len(test_index)))
            for i in range(len(test_index)):
                for j in range(len(X[0])):
                    x_test[i][j] = X[test_index[i]][j]
                y_test[i] = Y[test_index[i]]
            #subsetSize = 0.6
            #x_train, unusedX, y_train, unusedY = train_test_split(x_train, y_train, train_size=subsetSize, random_state=1)
            # Instantiate-and-fit the classifier selected by name.
            clf = None
            if classifier == 'NaiveBayes':
                clf = GaussianNB().fit(x_train, y_train)
            elif classifier == 'DecisionTree':
                clf = DecisionTreeClassifier().fit(x_train, y_train)
            elif classifier == 'LogisticRegression':
                clf = LogisticRegression().fit(x_train, y_train)
            elif classifier == 'LDA':
                clf = LDA().fit(x_train, y_train)
            elif classifier == 'Adaboost':
                clf = AdaBoostClassifier(n_estimators=100).fit(
                    x_train, y_train)
            elif classifier == 'GradientBoosting':
                clf = GradientBoostingClassifier(n_estimators=100,
                                                 learning_rate=1.0,
                                                 max_depth=1,
                                                 random_state=0).fit(
                                                     x_train, y_train)
            elif classifier == 'RandomForest':
                clf = RandomForestClassifier(n_estimators=100).fit(
                    x_train, y_train)
            elif classifier == 'ANN':
                clf = Perceptron(penalty=None,
                                 alpha=0.0001,
                                 fit_intercept=True,
                                 n_iter=20,
                                 shuffle=False,
                                 verbose=0,
                                 eta0=1.0,
                                 n_jobs=1,
                                 random_state=0,
                                 class_weight=None,
                                 warm_start=False,
                                 seed=None).fit(x_train, y_train)
            elif classifier == 'SVM':
                clf = SVC(C=1.0,
                          cache_size=200,
                          class_weight=None,
                          coef0=0.0,
                          degree=3,
                          gamma=0.0,
                          kernel='linear',
                          max_iter=-1,
                          probability=True,
                          random_state=None,
                          shrinking=True,
                          tol=0.001,
                          verbose=False).fit(x_train, y_train)
            elif classifier == 'KNN':
                clf = KNeighborsClassifier(n_neighbors=20).fit(
                    x_train, y_train)
            else:
                pass
            y_pred = clf.predict(x_test)
            acc.append(accuracy_score(y_test, y_pred))
            # Pool fold results for the confusion matrix below.
            Y_ground_truth.extend(y_test)
            Y_predicted.extend(y_pred)
            Y_prob.extend(clf.predict_proba(x_test))
        # print '#################################'
        # print acc
        # print np.mean(acc)
        # print np.std(acc)
        # # print get_mean_ci(acc)
        # print '----------------------------------'
        # fpr, tpr, thresholds = roc_curve(Y_ground_truth, Y_prob)
        # roc_auc = auc(fpr, tpr)
        # print auc(Y_ground_truth, Y_predicted), f1_score(Y_ground_truth, Y_predicted), accuracy_score(Y_ground_truth, Y_predicted)
        # Mean accuracy and std of each classifier, as percentages.
        mean_acc.append(100 * np.mean(acc))
        std.append(100 * np.std(acc))
    cm = confusion_matrix(Y_ground_truth, Y_predicted)
    return mean_acc, std, cm
# Training the model or loading the trained model----------------------------------------------------------------------------------------------------------------------------------------------------------- from sklearn.preprocessing import LabelEncoder #, OneHotEncoder # Encoding the Dependent Variable labelencoder_query_class = LabelEncoder() query_class = labelencoder_query_class.fit_transform(query_class) # Fitting Naive Bayes to the Dataset from sklearn.naive_bayes import GaussianNB classifier = GaussianNB() classifier.fit(corpus_piazza_classify, query_class) # Predicting the category in which the latest read mail belongs to.... test_input_clean = clean_dataset([test_input_og]) test_input = cv.transform(test_input_clean).toarray() test_prediction = classifier.predict(test_input) test_prediction_proba = classifier.predict_proba( test_input) # Get probability for each category for given test input test_prediction_text = labelencoder_query_class.inverse_transform( test_prediction) print("The query belong to category --> ", test_prediction_text[0]) # Setting the label to the mail using gmail API--------------------------------------------------------------------------------------------------------------------------------------------------------- # Modifying message's label print("Adding label to the latest received message....") msg_labels = CreateMsgLabels() # Create object to update labels msg_labels['addLabelIds'] = [ get_label_id(test_prediction_text[0], service, user_id) ] ModifyMessage(service, user_id, latest_received_msg_id, msg_labels) # Selecting the most appropriate reply by sentence similarity-------------------------------------------------------------------------------------------------------------------------------------------
y_pred3 = classifier3.predict(X_test)
cm3 = confusion_matrix(y_test, y_pred3)

# SVM with Gaussian RBF kernel (probability=True enables predict_proba).
from sklearn.svm import SVC
classifier4 = SVC(kernel = 'rbf', probability = True)
classifier4.fit(X_train, y_train)
y_prob4 = classifier4.predict_proba(X_test)
y_pred4 = classifier4.predict(X_test)
cm4 = confusion_matrix(y_test, y_pred4)

# Gaussian Naive Bayes (Bayes' theorem based algorithm).
from sklearn.naive_bayes import GaussianNB
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)
y_prob5 = classifier5.predict_proba(X_test)
y_pred5 = classifier5.predict(X_test)
cm5 = confusion_matrix(y_test, y_pred5)

# Decision tree (original author's note: prone to overfitting).
from sklearn.tree import DecisionTreeClassifier
classifier6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier6.fit(X_train, y_train)
y_prob6 = classifier6.predict_proba(X_test)
y_pred6 = classifier6.predict(X_test)
cm6 = confusion_matrix(y_test, y_pred6)

# Random forest (original author's note: overfits but more useful).
from sklearn.ensemble import RandomForestClassifier
classifier7 = RandomForestClassifier(n_estimators = 50, criterion ="entropy")
classifier7.fit(X_train, y_train)
W_train = data_train[:, 31][r < p] # Last 10% are validation Y_valid = data_train[:, 32][r >= p] X_valid = data_train[:, 1:31][r >= p] W_valid = data_train[:, 31][r >= p] ############################################################## ########## Training the Classifier and select threshold ###### ############################################################## #Training gaussean naive bayes classifier classifier = GaussianNB() classifier.fit(X_train, Y_train, W_train) #Testing the classifier prob_predict_train = classifier.predict_proba(X_train)[:, 1] prob_predict_valid = classifier.predict_proba(X_valid)[:, 1] # decide the threshold amstrain = [] amsvalid = [] x_axis = [] for i in range(1, 100): pcut = np.percentile(prob_predict_train, i) #print(i) # This are the final signal and background predictions Yhat_train = prob_predict_train > pcut Yhat_valid = prob_predict_valid > pcut # To calculate the AMS data, first get the true positives and true negatives
# (Continuation of the validation F1 call that starts above this chunk.)
                 C11=tmp_v[7][1, 1])
svm_best_f1 = f1(C00=tmp[7][0, 0],
                 C01=tmp[7][0, 1],
                 C10=tmp[7][1, 0],
                 C11=tmp[7][1, 1])
# Record the SVM confusion matrices and ROC-AUC for fold K.
svm_matrix[K] = tmp[7]
svm_matrix_v[K] = tmp_v[7]
fpr, tpr, thresholds = metrics.roc_curve(label_total, df, pos_label=1)
svm_auc[K] = metrics.auc(fpr, tpr)
svm_auc_v[K] = auc_v

# Gaussian naive Bayes on the same split; slot 7 of `tmp` stores its
# confusion matrix, offset by the fixed counts n00/n01/n10/n11.
# NOTE(review): `df` is reused here as the score vector, shadowing the
# value used for the SVM curve above -- consider a clearer name.
gb = GaussianNB()
predict_label = gb.fit(data1, label1).predict(data)
df = gb.predict_proba(data)[:, 1]
df = np.concatenate((df, df_fixed), axis=0)
tmp = np.zeros((10, 2, 2))
tmp[7] = confusion_matrix(label, predict_label)
tmp[7][0, 0] = tmp[7][0, 0] + n00
tmp[7][1, 0] = tmp[7][1, 0] + n10
tmp[7][0, 1] = tmp[7][0, 1] + n01
tmp[7][1, 1] = tmp[7][1, 1] + n11

# Same evaluation on the validation data.
gb_v = GaussianNB()
predict_label_v = gb_v.fit(data1, label1).predict(data_v)
df_v = gb_v.predict_proba(data_v)[:, 1]
df_v = np.concatenate((df_v, df_fixed_v), axis=0)
tmp_v = np.zeros((10, 2, 2))
tmp_v[7] = confusion_matrix(label_v, predict_label_v)
tmp_v[7][0, 0] = tmp_v[7][0, 0] + n00_v
# (Tail of the loop over dataset_train that starts above this chunk:
# convert each training row's values to floats.)
ans = []
for j in dataset_train[i]:
    ans.append(float(j))
dataset_train_float.append(ans)
# Convert each test row to floats as well.
for i in range(len(dataset_test)):
    ans = []
    for j in dataset_test[i]:
        ans.append(float(j))
    dataset_test_float.append(ans)

# Gaussian naive Bayes: fit on train, report ROC-AUC, accuracy and the
# classification report.  Labels are flattened from nested lists.
clf4 = GaussianNB()
clf4 = clf4.fit(dataset_train_float,
                list(chain.from_iterable(datalabels_train)))
predicted4 = clf4.predict(dataset_test_float)
probas_ = clf4.predict_proba(dataset_test_float)
fpr_NB, tpr_NB, thresholds_NB = metrics.roc_curve(datalabels_test,
                                                  probas_[:, 1])
roc_auc_NB = metrics.auc(fpr_NB, tpr_NB)
#plt.plot(fpr_NB, tpr_NB, lw=1, label='naive bayes' )
print("roc_auc_NB", roc_auc_NB)
print('naive bayes accuracy:',
      clf4.score(dataset_test_float, datalabels_test))
print(metrics.classification_report(datalabels_test, predicted4,))
# print(metrics.confusion_matrix(datalabels_test, predicted4))

# xgboost
print("Xgboost")
seed = 2
test_size = 0.2
def ranked_panchayats(request):
    """Django view: rank gram panchayats by predicted development need.

    Trains a GaussianNB on the index columns of the CSV at ``cs`` (label = 1
    when a village is below average on >= 2 of the 4 indices), scores the
    held-out rows, stores the ranking in RankedPanchayat and renders the
    index page.
    """
    df = pd.read_csv(cs)
    district = df.District
    taluk = df.Taluk
    gram_panchayat = df.Grampanchayat
    stdofliving = df.Standardoflivingindex
    health = df.Healthindex
    education = df.Educationindex
    hdi = df.HDI
    n_points = 999  # number of villages in the CSV -- TODO confirm
    village_info = [[districtz, talukz, gram_panchayatz]
                    for districtz, talukz, gram_panchayatz in zip(
                        district, taluk, gram_panchayat)]
    village_number = [[stdoflivingz, healthz, educationz, hdiz]
                      for stdoflivingz, healthz, educationz, hdiz in zip(
                          stdofliving, health, education, hdi)]
    village = [[
        districtz, talukz, gram_panchayatz, stdoflivingz, healthz, educationz,
        hdiz
    ] for districtz, talukz, gram_panchayatz, stdoflivingz, healthz,
               educationz, hdiz in zip(district, taluk, gram_panchayat,
                                       stdofliving, health, education, hdi)]
    # Label: 1 when a village is below average on at least 2 of the 4 indices.
    avg_stdofliving = np.sum(stdofliving) / n_points
    avg_health = np.sum(health) / n_points
    avg_education = np.sum(education) / n_points
    avg_hdi = np.sum(hdi) / n_points
    Y = [0] * n_points
    for i in range(0, n_points):
        count = 0
        if stdofliving[i] < avg_stdofliving:
            count = count + 1
        if health[i] < avg_health:
            count = count + 1
        if education[i] < avg_education:
            count = count + 1
        if hdi[i] < avg_hdi:
            count = count + 1
        if count >= 2:
            Y[i] = 1
    print(np.sum(Y))
    # First 750 rows train, remaining 249 rows test.
    X_train = village_number[:750]
    X_test = village_number[750:]
    Y_train = Y[:750]
    Y_test = Y[750:]
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    from time import time
    t0 = time()
    clf.fit(X_train, Y_train)
    print("Classification training time:", round(time() - t0, 3), "s")
    pred = clf.predict(X_test)
    # print(pred)
    prob = clf.predict_proba(X_test)
    # print(prob)
    from sklearn.metrics import accuracy_score
    print("Accuracy of Program: ", accuracy_score(pred, Y_test) * 100, "%")
    # print (hdi)
    # Development-need score per test row: 100 * (1 - P(class 1)).
    probability = []
    for i in range(0, 249):
        ss = (1 - prob[i][1]) * 100
        probability.append(ss)
    # print(final_list)
    rank_list = []
    for i in range(0, 249):
        rank_list.append(i + 1)
    # NOTE(review): zip pairs the 249 test-row scores with the FIRST 249
    # entries of the full gram_panchayat column, not with the held-out rows
    # -- confirm the name/score alignment is intended.
    final_list = [[
        probab, gram, ran
    ] for probab, gram, ran in zip(probability, gram_panchayat, rank_list)]
    final_list.sort()
    # Replace any previous ranking before saving the new one.
    RankedPanchayat.objects.all().delete()
    for ii in range(0, 249):
        panchayat = RankedPanchayat()
        panchayat.panchayat = final_list[ii][1]
        panchayat.dev_index = final_list[ii][0]
        panchayat.rank = ii + 1
        panchayat.save()
    return render(request, 'gaa/index.html')
# Classification report for the helmet detector (binary labels).
target_names = ['Helmet', 'No Helmet']
print("\n\nClassification Report: \n")
print("Accuracy: %s" % round(accuracy_score(y_test, y_pred), 4))
print("Precision \t: %s" %
      round(precision_score(y_test, y_pred, average='macro'), 4))
print("Recall \t\t: %s" %
      round(recall_score(y_test, y_pred, average='macro'), 4))
print("F1 \t\t: %s" % round(f1_score(y_test, y_pred, average='macro'), 4))
# Percentage of false negatives: y_test - y_pred is +1 exactly where a true
# label 1 was predicted as 0.
y = y_test - y_pred
fn = sum(y[y > 0]) * 100 / len(y_test)
print("There are %s%% False Negatives" % round(fn, 4))
print("\nExecution time: %s ms" % round((end - start) * 1000, 4))
# ROC curve from the positive-class probabilities.
y_prob = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=1)
roc_auc = auc(fpr, tpr)
plt.title('Naive Bayes')
plt.plot(fpr, tpr, 'b', label='AUC = %s' % round(roc_auc, 4))
print("\nAUC \t: %s" % round(roc_auc, 4))
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
def run_training(fold_):
    """Train and evaluate a GaussianNB on one cross-validation fold.

    Reads the pre-folded dataset from HDF5, binarises the target to
    'open'/'closed', standardises the features (scaler fitted on train only),
    fits a GaussianNB and reports timing, confusion matrix and ROC-AUC.

    :param fold_: int. Fold id; rows with kfold == fold_ form the test split.
    :return: (test_df[["id", "target", "kfold", "GNB_pred"]], ROC-AUC)
    """
    total_roc = []
    total_conf = []
    t0 = time.time()
    #df = pd.read_csv("../input/embedded_train_tiny_folds.csv")
    df = pd.read_hdf(
        path_or_buf="../input/tiny_data/full_data_folds.h5",
        key='dataset'
    )
    #print("tg\n",df.target.value_counts())
    #print(" ")
    t1 = time.time()
    total_time = t1 - t0
    print("time to read file", total_time)
    print(f"fold: {fold_}")
    t0 = time.time()
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    # print("train shape\n", train_df.shape)
    # print("test shape\n", test_df.shape)
    # Features: everything except the fold id and the target.
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)
    # Standard scaler fitted on train only (no test leakage).
    sc = StandardScaler()
    sc.fit(xtrain)
    xtrain = sc.transform(xtrain)
    xtest = sc.transform(xtest)
    # Target: binarise to 'open' vs 'closed'.
    train_df.target = train_df.target.apply(
        lambda x: 'open' if x == 'open' else 'closed'
    )
    test_df.target = test_df.target.apply(
        lambda x: 'open' if x == 'open' else 'closed'
    )
    ytrain = train_df.target
    ytest = test_df.target
    # Model
    model = GaussianNB()
    # Fit the model on training data.
    model.fit(xtrain, ytrain)
    # Make predictions; column 1 is the probability of 'open'
    # (sklearn sorts classes_, so index 1 is the lexicographically larger).
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]
    # print('preds shape',preds_proba.shape)
    t1 = time.time()
    total_time = t1 - t0
    print('time to fit model:', total_time)
    # NOTE(review): this local shadows sklearn's accuracy_score if imported.
    accuracy_score = np.sum(preds == ytest) / len(ytest)
    #log_loss= metrics.log_loss(train_df.OpenStatus,preds)
    #print(f"Fold:{fold_}")
    #print(f"Accuracy={accuracy_score}")
    conf_m = confusion_matrix(ytest, preds)
    #print('Confusion matrix\n',conf_m)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    t = [fold_, roc_score]
    total_conf.append(conf_m)
    total_roc.append(t)
    # Attach per-row predicted probabilities to the returned frame.
    test_df.loc[:, "GNB_pred"] = preds_proba
    return test_df[["id", "target", "kfold", "GNB_pred"]], np.mean(
        total_roc, axis=0)[1]
class EnsembleModel:
    """Blend the probability outputs of several wrapped models.

    ``models`` maps names to wrapper objects exposing ``.model`` (the
    estimator), ``.json`` (config dict; key ``use_pca`` selects PCA features)
    and ``.best_params()``.  Blending is controlled by params["blend"]:
    "mean" (default), "median", or "meta" -- a GaussianNB that picks, per
    sample, which base model to trust.
    """

    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [wrapper.model for wrapper in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)  # keep 99% of the variance
        self._clf = None  # meta estimator, set by fit()

    def fit(self, x, y):
        """Fit every base model, then the meta estimator on a held-out split.

        Returns self so calls can be chained.
        """
        train_x, test_x, train_y, test_y = train_test_split(x, y,
                                                            test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        pca_test_x = self._pca.transform(test_x)
        for model, model_func in zip(self.models, self.model_funcs):
            # BUGFIX: the original rebound train_x/test_x to the PCA matrices
            # inside this loop, so every model AFTER the first PCA-enabled
            # one was silently trained on PCA features and the meta
            # estimator's input depended on model order.  Select the feature
            # set into a loop-local name instead.
            if model.json.get("use_pca", False):
                fit_x = pca_train_x
            else:
                fit_x = train_x
            model_func.fit(fit_x, train_y)
        # The meta estimator always sees raw features, matching the raw `x`
        # it is given in predict_proba().
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        """Teach a GaussianNB which base model is closest to the truth per row."""
        predictions = self._predictions(x).T  # (n_samples, n_models)
        y = numpy.atleast_2d(y).T
        # Label each sample with the index of the base model whose predicted
        # probability is nearest the true target.
        labels = numpy.argmin(
            abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        """Return an (n_models, n_samples) array of positive-class probabilities."""
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []
        for model, model_func in zip(self.models, self.model_funcs):
            eval_x = pca_x if model.json.get("use_pca", False) else x
            predictions.append(model_func.predict_proba(eval_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        """Blend base-model probabilities according to params["blend"]."""
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                # Trust the meta estimator only when it is very confident;
                # otherwise fall back to the median of the base models.
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)
        return predictions.mean(0)
def list(request):
    """Django view: paginated list of panchayats ranked by predicted need.

    NOTE(review): the function name shadows the builtin ``list``; consider
    renaming (urlconf permitting).  The body largely duplicates
    ``ranked_panchayats`` -- candidates for a shared helper.
    """
    df = pd.read_csv(cs)
    district = df.District
    taluk = df.Taluk
    gram_panchayat = df.Grampanchayat
    stdofliving = df.Standardoflivingindex
    health = df.Healthindex
    education = df.Educationindex
    hdi = df.HDI
    n_points = 999  # number of villages in the CSV -- TODO confirm
    village_info = [[districtz, talukz, gram_panchayatz]
                    for districtz, talukz, gram_panchayatz in zip(
                        district, taluk, gram_panchayat)]
    village_number = [[stdoflivingz, healthz, educationz, hdiz]
                      for stdoflivingz, healthz, educationz, hdiz in zip(
                          stdofliving, health, education, hdi)]
    # Label: 1 when a village is below average on at least 2 of the 4 indices.
    avg_stdofliving = np.sum(stdofliving) / n_points
    avg_health = np.sum(health) / n_points
    avg_education = np.sum(education) / n_points
    avg_hdi = np.sum(hdi) / n_points
    Y = [0] * n_points
    for i in range(0, n_points):
        count = 0
        if stdofliving[i] < avg_stdofliving:
            count = count + 1
        if health[i] < avg_health:
            count = count + 1
        if education[i] < avg_education:
            count = count + 1
        if hdi[i] < avg_hdi:
            count = count + 1
        if count >= 2:
            Y[i] = 1
    print(np.sum(Y))
    # First 750 rows train, remaining 249 rows test.
    X_train = village_number[:750]
    X_test = village_number[750:]
    Y_train = Y[:750]
    Y_test = Y[750:]
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    from time import time
    t0 = time()
    clf.fit(X_train, Y_train)
    print("Classification training time:", round(time() - t0, 3), "s")
    pred = clf.predict(X_test)
    # print(pred)
    prob = clf.predict_proba(X_test)
    # print(prob)
    from sklearn.metrics import accuracy_score
    print("Accuracy of Program: ", accuracy_score(pred, Y_test) * 100, "%")
    # print (hdi)
    # Development-need score per test row: 100 * (1 - P(class 1)).
    probability = []
    for i in range(0, 249):
        ss = (1 - prob[i][1]) * 100
        probability.append(ss)
    #print(final_list)
    rank_list = []
    for i in range(0, 249):
        rank_list.append(i + 1)
    # NOTE(review): zip pairs the 249 test-row scores with the FIRST 249
    # entries of the full gram_panchayat column -- confirm the alignment.
    final_list = [[
        probab, gram, ran
    ] for probab, gram, ran in zip(probability, gram_panchayat, rank_list)]
    final_list.sort()
    # Re-number ranks after sorting by score.
    for i in range(0, 249):
        final_list[i][2] = i + 1
    #for ii in range(0,249):
    #final_list[ii][0] final_list[ii][1] final_list[ii][2]
    # Paginate 10 entries per page, with the usual fallbacks.
    page = request.GET.get('page', 1)
    paginator = Paginator(final_list, 10)
    try:
        users = paginator.page(page)
    except PageNotAnInteger:
        users = paginator.page(1)
    except EmptyPage:
        users = paginator.page(paginator.num_pages)
    # NOTE(review): `panchayat` is not defined in this function; it must be a
    # module-level name, otherwise this raises NameError at render time.
    return render(request, 'gaa/index.html', {
        'users': users,
        'panchayat': panchayat,
    })
# In[ ]: #Naive Bayes #Fit The Model nb_clf = GaussianNB() nb_clf.fit(X_train, y_train) print("Naive Bayes") print() train_accuracy = nb_clf.score(X_train, y_train) test_accuracy = nb_clf.score(X_test, y_test) #Calculate Out of Sample Predictions y_pred = nb_clf.predict(X_test) y_pred_prob = nb_clf.predict_proba(X_test) #K Fold Validation results = cross_val_score(nb_clf, X, y, cv=kfold, scoring=scoring) print("10-fold cross validation average accuracy: %.3f" % (results.mean())) #Model Train classification_output_report(X_train, y_train, X_test, y_test, y_pred, y_pred_prob, train_accuracy, test_accuracy) print(''' Analyst Comments: The Kfold accuracy is .63 while the train and test accuracy's were 0.66 and 0.67 respectively. The recall from the model is 0.84. ''')
# Write the logistic-regression top-5 cluster predictions in submission
# format: "<id>, p1 p2 p3 p4 p5".
path = 'lr_submission.csv'
out = open(path, "w")
out.write("id,hotel_cluster\n")
for i in range(len(test['id'].values)):
    out.write(str(test['id'].values[i]) + ',' + ' ' + str(lr_preds[i][0]) +
              ' ' + str(lr_preds[i][1]) + ' ' + str(lr_preds[i][2]) + ' ' +
              str(lr_preds[i][3]) + ' ' + str(lr_preds[i][4]))
    out.write("\n")
out.close()

print('Starting Gaussian Naive Bayes')
# BUGFIX: the original immediately overwrote train_gnb/test_gnb with the
# UN-dropped frames (`train.fillna(0)` / `test.fillna(0)`), discarding the
# drops above -- the target `hotel_cluster` leaked into the training
# features and the train/test column sets were inconsistent.  Fill the
# dropped frames instead, and take the target from the original frame.
train_gnb = train.drop("hotel_cluster", axis = 1)
test_gnb = test.drop("id", axis = 1)
train_gnb = train_gnb.fillna(0)
test_gnb = test_gnb.fillna(0)
gnb_clf = GaussianNB()
gnb_clf.fit(train_gnb, train['hotel_cluster'].fillna(0).values)
prediction = gnb_clf.predict_proba(test_gnb)
gnb_preds = []
for i in range(len(prediction)):
    # Top-5 cluster ids by predicted probability, highest first.
    # NOTE(review): these are positional class indices; map them through
    # gnb_clf.classes_ if the cluster ids are not a dense 0..n-1 range.
    gnb_preds.append(prediction[i].argsort()[-5:][::-1])
path = 'gnb_submission.csv'
out = open(path, "w")
out.write("id,hotel_cluster\n")
for i in range(len(test['id'].values)):
    out.write(str(test['id'].values[i]) + ',' + ' ' + str(gnb_preds[i][0]) +
              ' ' + str(gnb_preds[i][1]) + ' ' + str(gnb_preds[i][2]) + ' ' +
              str(gnb_preds[i][3]) + ' ' + str(gnb_preds[i][4]))
    out.write("\n")
out.close()

print('Starting KNN')
train_knn = train.drop("hotel_cluster", axis = 1)
test_knn = test.drop("id", axis = 1)
# Same fix as above: fill the dropped frame rather than re-reading `train`.
train_knn = train_knn.fillna(0)