def test_staged_predict(): """Check staged predictions.""" # AdaBoost classification for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg, n_estimators=10) clf.fit(iris.data, iris.target) predictions = clf.predict(iris.data) staged_predictions = [p for p in clf.staged_predict(iris.data)] proba = clf.predict_proba(iris.data) staged_probas = [p for p in clf.staged_predict_proba(iris.data)] score = clf.score(iris.data, iris.target) staged_scores = [s for s in clf.staged_score(iris.data, iris.target)] assert_equal(len(staged_predictions), 10) assert_array_almost_equal(predictions, staged_predictions[-1]) assert_equal(len(staged_probas), 10) assert_array_almost_equal(proba, staged_probas[-1]) assert_equal(len(staged_scores), 10) assert_array_almost_equal(score, staged_scores[-1]) # AdaBoost regression clf = AdaBoostRegressor(n_estimators=10) clf.fit(boston.data, boston.target) predictions = clf.predict(boston.data) staged_predictions = [p for p in clf.staged_predict(boston.data)] score = clf.score(boston.data, boston.target) staged_scores = [s for s in clf.staged_score(boston.data, boston.target)] assert_equal(len(staged_predictions), 10) assert_array_almost_equal(predictions, staged_predictions[-1]) assert_equal(len(staged_scores), 10) assert_array_almost_equal(score, staged_scores[-1])
def cook(): x, y, weights = load_data() n_components = 200 svd = TruncatedSVD(n_components, random_state=42) x_unweighted = svd.fit_transform(x) x_weighted = svd.fit_transform(weighted(x, weights)) for i in range(9): frac = 1 - (i * 0.01 + 0.01) print(frac) x_train, x_test, y_train, y_test = train_test_split(x_unweighted, y, test_size=frac) classifier = AdaBoostClassifier(n_estimators=100) classifier.fit(x_train, y_train) print("Unweighted: ", classifier.score(x_test, y_test)) x_train, x_test, y_train, y_test = train_test_split(x_weighted, y, test_size=frac) classifier = AdaBoostClassifier(n_estimators=100) classifier.fit(x_train, y_train) print("Weighted: ", classifier.score(x_test, y_test)) print('--------------------------')
class AdaBoost: def __init__(self, data, n_estimators=50, learning_rate=1.0): features, weights, labels = data self.clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate) self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None self.dataset = split_dataset(features, weights, labels) def train(self): """ Train Ada Boost on the higgs dataset """ self.clf = self.clf.fit(self.dataset['training']['features'], self.dataset['training']['labels']) def predict(self): """ Predict label using Ada Boost :return: """ self.predictions = self.clf.predict(self.dataset['test']['features']) def evaluate(self): self.trnaccuracy = self.clf.score(self.dataset['training']['features'], self.dataset['training']['labels'], sample_weight=self.dataset['training']['weights']) self.tstaccuracy = self.clf.score(self.dataset['test']['features'], self.dataset['test']['labels'], sample_weight=self.dataset['test']['weights'])
def cvalidate(): targetset = np.genfromtxt(open('trainLabels.csv','r'), dtype='f16') y = [x for x in targetset] trainset = np.genfromtxt(open('train.csv','r'), delimiter=',', dtype='f16') X = np.array([x for x in trainset]) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0) X_train, X_test = decomposition_pca(X_train, X_test) #SVM c_range = 10.0 ** np.arange(6.5,7.5,.25) gamma_range = 10.0 ** np.arange(-2.5,0.5,.25) parameters = {'kernel':['rbf'], 'C':c_range, 'gamma':gamma_range} svr = SVC() clf = grid_search.GridSearchCV(svr, parameters) clf.fit(X_train, y_train) bdt = AdaBoostClassifier(base_estimator = clf.best_estimator_, algorithm="SAMME", n_estimators=100) #bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=10)) bdt.fit(X_train, y_train) print bdt.score(X_test, y_test)
def test_pickle(): # Check pickability. import pickle # Adaboost classifier for alg in ['SAMME', 'SAMME.R']: obj = AdaBoostClassifier(algorithm=alg) obj.fit(iris.data, iris.target) score = obj.score(iris.data, iris.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert_equal(type(obj2), obj.__class__) score2 = obj2.score(iris.data, iris.target) assert_equal(score, score2) # Adaboost regressor obj = AdaBoostRegressor(random_state=0) obj.fit(boston.data, boston.target) score = obj.score(boston.data, boston.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert_equal(type(obj2), obj.__class__) score2 = obj2.score(boston.data, boston.target) assert_equal(score, score2)
def cvalidate(): from sklearn import cross_validation trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:] X = np.array([x[1:8] for x in trainset]) y = np.array([x[8] for x in trainset]) #print X,y import math for i, x in enumerate(X): for j, xx in enumerate(x): if(math.isnan(xx)): X[i][j] = 26.6 #print X[0:3] #print y[0:3] X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0) X_train, X_test = decomposition_pca(X_train, X_test) bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200) bdt.fit(X_train, y_train) print bdt.score(X_test, y_test)
class Model_Adaboost(object): def __init__(self,model,parameter = {"n_estimators" : 50, "CV_size": 0}): self.train = model.train self.test = model.test self.CVsize = float(parameter["CV_size"].get()) train = np.array(self.train) self.X_train = train[:, :-1] self.y_train = train[:, -1] self.X_train,self.X_CV,self.y_train,self.y_CV = train_test_split(self.X_train, self.y_train, test_size=self.CVsize) if self.CVsize == 0: self.clf = AdaBoostClassifier(n_estimators = int(parameter["n_estimators"].get())) self.model = model def fit(self): self.clf.fit(self.X_train,self.y_train) def score(self): pre = self.clf.predict(self.X_train) truth = self.y_train print ("score: " + str(self.clf.score(self.X_train,truth))) print ("f1: " + str(f1_score(truth,pre, average=None))) print ("AUC score: " + str(roc_auc_score(truth,pre))) def save_results(self): pre = self.model.clf.predict(self.model.test) df = pd.DataFrame({"predict":pre}) fileName = tkFileDialog.asksaveasfilename() df.to_csv(fileName) def crossValidation(self): estimatorList = [3,5,7,10,13,15,20,25,30,50] bestScore = [0,0] #score,n_estimator bestF1ScoreNeg = [0,0] bestF1ScorePos = [0,0] #bestAUCScore = [0,0] for e in estimatorList: self.clf = AdaBoostClassifier(n_estimators = e) self.clf.fit(self.X_train,self.y_train) pre = self.clf.predict(self.X_CV) truth = self.y_CV score = self.clf.score(self.X_CV,truth) if score > bestScore[0]: bestScore[0] = score bestScore[1] = e f1pos = f1_score(truth,pre, average=None)[1] if f1pos > bestF1ScorePos[0]: bestF1ScorePos[0] = f1pos bestF1ScorePos[1] = e f1neg = f1_score(truth,pre, average=None)[0] if f1neg > bestF1ScoreNeg[0]: bestF1ScoreNeg[0] = f1neg bestF1ScoreNeg[1] = e print ("Adaboost:") print ("Best [score,n_estimators] on Cross Validation set: " + str(bestScore)) print ("Best [f1(pos),n_estimators] on Cross Validation set: " + str(bestF1ScorePos)) print ("Best [f1(neg),n_estimators] on Cross Validation set" + str(bestF1ScoreNeg))
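The manual loop over estimatorList in crossValidation() above can also be expressed as a grid search. The following is a rough sketch rather than part of the original class; X_train and y_train stand in for self.X_train and self.y_train, and the candidate counts are copied from the loop above.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Candidate estimator counts taken from crossValidation() above.
param_grid = {"n_estimators": [3, 5, 7, 10, 13, 15, 20, 25, 30, 50]}
search = GridSearchCV(AdaBoostClassifier(), param_grid, scoring="f1", cv=5)
search.fit(X_train, y_train)
print("best n_estimators:", search.best_params_["n_estimators"])
print("best CV f1:", search.best_score_)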
def adaboost(df,label_name,feature_names,features_len,ifeat,n_estimators=100): # TODO: just copied from RF, needs real code from sklearn.ensemble import AdaBoostClassifier print('---------------------------------------------------') print(ifeat,features_len,'Adaboost, features:',feature_names) df_train_Y = df[label_name] train_Y = df_train_Y.values.ravel() # turn from 2D to 1D df_train_X = df[feature_names] train_X = df_train_X.values clf =AdaBoostClassifier(n_estimators=n_estimators) clf = clf.fit(train_X,train_Y) # output = clf.predict(train_X) E_in = round(1.-clf.score(train_X, train_Y),5) # 'in sample' error #print('\tE_in :',E_in) # ----- # Kfold as estimator for 'out of sample' error kf=skl.cross_validation.KFold(n=len(train_X), n_folds=5) cv_scores=skl.cross_validation.cross_val_score(clf, train_X, y=train_Y, cv=kf) E_out = round(1.-np.mean(cv_scores),5) #print("\tE_out:",E_out) return E_in,E_out
def test_iris(): # Check consistency on dataset iris. classes = np.unique(iris.target) clf_samme = prob_samme = None for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg) clf.fit(iris.data, iris.target) assert_array_equal(classes, clf.classes_) proba = clf.predict_proba(iris.data) if alg == "SAMME": clf_samme = clf prob_samme = proba assert_equal(proba.shape[1], len(classes)) assert_equal(clf.decision_function(iris.data).shape[1], len(classes)) score = clf.score(iris.data, iris.target) assert score > 0.9, "Failed with algorithm %s and score = %f" % \ (alg, score) # Somewhat hacky regression test: prior to # ae7adc880d624615a34bafdb1d75ef67051b8200, # predict_proba returned SAMME.R values for SAMME. clf_samme.algorithm = "SAMME.R" assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def prediction(feat,label): x_train, x_test, y_train, y_test = cross_validation.train_test_split(feat, label, test_size = 0.25, random_state = 0) num_leaves = [] accuracy_score = [] auc_score = [] # for depth in range(1,10): # clf = tree.DecisionTreeClassifier(max_depth = depth) # clf.fit(x_train,y_train) # predictions = clf.predict(x_test) # accuracy = clf.score(x_test,y_test) # auc = metrics.roc_auc_score(y_test,predictions) # num_leaves.append(depth) # accuracy_score.append(accuracy) # auc_score.append(auc) for depth in range(1,10): clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth), n_estimators = 100) clf.fit(x_train,y_train) predictions = clf.predict(x_test) accuracy = clf.score(x_test,y_test) auc = metrics.roc_auc_score(y_test,predictions) num_leaves.append(depth) accuracy_score.append(accuracy) auc_score.append(auc) return num_leaves,accuracy_score,auc_score
def ADA_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS): print("***************Starting AdaBoost Classifier***************") t0 = time() clf = AdaBoostClassifier(n_estimators=300) clf.fit(X_train, Y_train) preds = clf.predict(X_cv) score = clf.score(X_cv,Y_cv) print("AdaBoost Classifier - {0:.2f}%".format(100 * score)) Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds), rownames=['actual'], colnames=['preds']) Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100 print(Summary) #Check with log loss function epsilon = 1e-15 #ll_output = log_loss_func(Y_cv, preds, epsilon) preds2 = clf.predict_proba(X_cv) ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True) print(ll_output2) print("done in %0.3fs" % (time() - t0)) preds3 = clf.predict_proba(X_test) #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':])) preds4 = clf.predict_proba(Actual_DS) print("***************Ending AdaBoost Classifier***************") return pd.DataFrame(preds2) , pd.DataFrame(preds3),pd.DataFrame(preds4)
def adaboost_skin(X_train, y_train, X_test, y_test): """Learn the skin data sets with AdaBoost. X_*: Samples. y_*: labels. """ print 'AdaBoost' min_iter = 1 max_iter = 200 steps = 30 diff = (max_iter - min_iter) / steps iterations = [min_iter + diff * step for step in xrange(steps+1)] scores = [] for T in iterations: clf = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=T) clf.fit(X_train.toarray(), y_train) scores.append(100 * clf.score(X_test.toarray(), y_test)) print '\t%d Iterations: %.2f%%' % (T, scores[-1]) return iterations, scores
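A hedged alternative to refitting one model per iteration count T in adaboost_skin above: a single fit at the largest T can report the per-round test accuracy through staged_score. X_train, y_train, X_test, and y_test are assumed to be the same data as above.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME", n_estimators=200)
clf.fit(X_train.toarray(), y_train)
# staged_score yields the test accuracy after 1, 2, ..., n_estimators rounds.
scores = [100 * s for s in clf.staged_score(X_test.toarray(), y_test)]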
def boost_report(): svm_train_features = list() svm_train_classes = list() svm_test_features = list() svm_test_classes = list() for record in mit_records: svm_train_features.append(list(record.features.values())) svm_train_classes.append(record.my_class) for record in mim_records: svm_test_features.append(list(record.features.values())) svm_test_classes.append(record.my_class) svm_classifier = svm.SVC(kernel="linear", C=0.1) svm_classifier.fit(svm_train_features, svm_train_classes) print("linear kernel svm accuracy: " + str(svm_classifier.score(svm_test_features, svm_test_classes))) classifier = AdaBoostClassifier( base_estimator=svm_classifier, n_estimators=100, algorithm='SAMME') classifier.fit(svm_train_features, svm_train_classes) print("adaboost accuracy: " + str(classifier.score(svm_test_features, svm_test_classes)))
class AdaBoostcls(object): """docstring for ClassName""" def __init__(self): self.adaboost_cls = AdaBoostClassifier() self.prediction = None self.train_x = None self.train_y = None def train_model(self, train_x, train_y): try: self.train_x = train_x self.train_y = train_y self.adaboost_cls.fit(train_x, train_y) except: print(traceback.format_exc()) def predict(self, test_x): try: self.test_x = test_x self.prediction = self.adaboost_cls.predict(test_x) return self.prediction except: print(traceback.format_exc()) def accuracy_score(self, test_y): try: # return r2_score(test_y, self.prediction) return self.adaboost_cls.score(self.test_x, test_y) except: print(traceback.format_exc())
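A hedged usage sketch for the AdaBoostcls wrapper above; train_x, train_y, test_x, and test_y are assumed to be NumPy arrays prepared elsewhere. predict must run before accuracy_score, since the latter scores the stored test_x.
model = AdaBoostcls()
model.train_model(train_x, train_y)        # fit the wrapped AdaBoostClassifier
predictions = model.predict(test_x)        # stores test_x and returns the predictions
print("accuracy:", model.accuracy_score(test_y))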
def AB_results(): # AdaBoostClassifier print "--------------AdaBoostClassifier-----------------" rang = [60, 80] # print "--------------With HOG-----------------" # ans = [] # print "n_estimators Accuracy" # for i in rang: # clf = AdaBoostClassifier(n_estimators=i) # clf.fit(X_train_hog, y_train) # mean_accuracy = clf.score(X_test_hog, y_test) # print i, " ", mean_accuracy # ans.append('('+str(i)+", "+str(mean_accuracy)+')') # print ans # plt.plot(rang, ans, linewidth=2.0) # plt.xlabel("n_estimators") # plt.ylabel("mean_accuracy") # plt.savefig("temp_hog.png") print "\n--------------Without HOG-----------------" ans = [] print "n_estimators Accuracy" for i in rang: clf = AdaBoostClassifier(n_estimators=i) clf.fit(X_train, y_train) mean_accuracy = clf.score(X_test, y_test) print i, " ", mean_accuracy ans.append('('+str(i)+", "+str(mean_accuracy)+')') print ans plt.plot(rang, ans, linewidth=2.0) plt.xlabel("n_estimators") plt.ylabel("mean_accuracy") plt.savefig("temp_plain.png")
def boost_report(test_split_size): scd_count = 0 for record in records: if (record.my_class == "SCD"): scd_count += 1 print(scd_count) shuffle(records) split = int(len(records) * (1 / test_split_size)) print(len(records)) train_set = records[:(len(records) - split)] test_set = records[split:] print("split:", test_split_size, "train:", len(train_set), "test:", split) svm_train_features = list() svm_train_classes = list() svm_test_features = list() svm_test_classes = list() for record in train_set: svm_train_features.append(list(record.features.values())) svm_train_classes.append(record.my_class) for record in test_set: svm_test_features.append(list(record.features.values())) svm_test_classes.append(record.my_class) svm_classifier = svm.SVC(kernel="linear", C=0.1) svm_classifier.fit(svm_train_features, svm_train_classes) print("linear kernel svm accuracy: " + str(svm_classifier.score(svm_test_features, svm_test_classes))) classifier = AdaBoostClassifier( base_estimator=svm_classifier, n_estimators=50, algorithm='SAMME' ) classifier.fit(svm_train_features, svm_train_classes) print("adaboost accuracy: " + str(classifier.score(svm_test_features, svm_test_classes))) classifier2 = AdaBoostClassifier( n_estimators=50, algorithm='SAMME' ) classifier2.fit(svm_train_features, svm_train_classes) print("adaboost2 accuracy: " + str(classifier2.score(svm_test_features, svm_test_classes)))
def test_iris(): """Check consistency on dataset iris.""" for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) assert score > 0.9, "Failed with algorithm %s and score = %f" % \ (alg, score)
def run_cv_model(self, max_depth=3, criterion='entropy', learning_rate=1., n_estimators=300, do_plot=True): # use k-fold cross validation # Supported criteria are gini for the Gini impurity and entropy for the information gain. tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=0) clf = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators, learning_rate=learning_rate, random_state=0) # resample the test data without replacement. This means that each data point is part of a test and a # training set only once. (paraphrased from Raschka p.176). In Stratified KFold, the classes are # evenly distributed such that each test and training set is an accurate representation of the whole # this is the 0.17 version #kfold = StratifiedKFold(y=self.y_train, n_folds=self.cv, random_state=0) # this is the 0.18dev version skf = StratifiedKFold(n_splits=self.cv, random_state=0) # do the cross validation train_scores = [] test_scores = [] #for k, (train, test) in enumerate(kfold): for k, (train, test) in enumerate(skf.split(X=self.x_train, y=self.y_train)): # run the learning algorithm clf.fit(self.x_train[train], self.y_train[train]) train_score = clf.score(self.x_train[test], self.y_train[test]) train_scores.append(train_score) test_score = clf.score(self.x_test, self.y_test) test_scores.append(test_score) print('Fold:', k+1, ', Training score:', train_score, ', Test score:', test_score) train_score = np.mean(train_scores) print('Training score is', train_score) test_score = np.mean(test_scores) print('Test score is', test_score) if do_plot: self.__plot_learning_curve(clf) return train_score, test_score
def model_design(run_as_main=False): from skimage.data import imread from skimage.filters import threshold_adaptive from skimage.restoration import denoise_tv_bregman from sklearn.cross_validation import train_test_split, StratifiedKFold from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier labels = pd.read_csv('../data/trainLabels.csv', sep=',') X, y = [], np.array(labels.Class) for ID in labels.ID: original = imread('../data/trainResized/' + str(ID) +'.Bmp', as_grey=True) denoised = denoise_tv_bregman(original, 3) binarilized = threshold_adaptive(denoised, block_size=13, method='gaussian') feature = binarilized.reshape(1,400)[0] X.append(feature) X = np.array(X) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) clf = AdaBoostClassifier(base_estimator= ExtraTreesClassifier( n_estimators=500, criterion='entropy', class_weight='auto', n_jobs=-1 ), n_estimators=50) # clf = AdaBoostClassifier(base_estimator= # RandomForestClassifier( # n_estimators=500, # criterion='entropy', # class_weight='auto', # n_jobs=-1 # ), n_estimators=20) clf.fit(X_train, y_train) print clf.score(X_test, y_test)
def AdaBoost(X, y, tst_size, n_est): X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = tst_size, random_state = 0) clf = AdaBoostClassifier(n_estimators = n_est) score = 0 for i in range(100): clf.fit(X_train, y_train) score += clf.score(X_test, y_test) score = score/100 return score
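Note that the loop in AdaBoost above refits on the same fixed split 100 times, so it mostly measures the estimator's run-to-run randomness rather than sampling variance. A rough sketch of an alternative using repeated shuffled splits (assuming sklearn.model_selection is available):
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score

def AdaBoost_cv(X, y, tst_size, n_est, n_repeats=10):
    # Each repeat draws a fresh train/test split of the requested size.
    cv = ShuffleSplit(n_splits=n_repeats, test_size=tst_size, random_state=0)
    scores = cross_val_score(AdaBoostClassifier(n_estimators=n_est), X, y, cv=cv)
    return scores.mean()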
def test_sample_weight_elm(): """Smoke test - AdaBoostClassifier should work with ELMClassifer.""" X = Xdigits_binary[:50] y = ydigits_binary[:50] elm = ELMClassifier(n_hidden=20) clf = AdaBoostClassifier(n_estimators=3, base_estimator=elm) clf.fit(X, y) assert_greater(clf.score(X, y), 0.9)
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel): """ Ada boosting binary classification """ clf = AdaBoostClassifier() clf.fit(X_train, y_train) accuracy = clf.score(X_test, y_test) return accuracy
def ab_classify(self): print("Adaboost") clf = AdaBoostClassifier() clf.fit(self.descr, self.target) mean = clf.score(self.test_descr, self.test_target) pred = clf.predict(self.test_descr) print("Pred ", pred) print("Mean : %.3f" % mean) print("Feature Importances ", clf.feature_importances_)
def test_staged_predict(): # Check staged predictions. rng = np.random.RandomState(0) iris_weights = rng.randint(10, size=iris.target.shape) boston_weights = rng.randint(10, size=boston.target.shape) # AdaBoost classification for alg in ['SAMME', 'SAMME.R']: clf = AdaBoostClassifier(algorithm=alg, n_estimators=10) clf.fit(iris.data, iris.target, sample_weight=iris_weights) predictions = clf.predict(iris.data) staged_predictions = [p for p in clf.staged_predict(iris.data)] proba = clf.predict_proba(iris.data) staged_probas = [p for p in clf.staged_predict_proba(iris.data)] score = clf.score(iris.data, iris.target, sample_weight=iris_weights) staged_scores = [ s for s in clf.staged_score( iris.data, iris.target, sample_weight=iris_weights)] assert_equal(len(staged_predictions), 10) assert_array_almost_equal(predictions, staged_predictions[-1]) assert_equal(len(staged_probas), 10) assert_array_almost_equal(proba, staged_probas[-1]) assert_equal(len(staged_scores), 10) assert_array_almost_equal(score, staged_scores[-1]) # AdaBoost regression clf = AdaBoostRegressor(n_estimators=10, random_state=0) clf.fit(boston.data, boston.target, sample_weight=boston_weights) predictions = clf.predict(boston.data) staged_predictions = [p for p in clf.staged_predict(boston.data)] score = clf.score(boston.data, boston.target, sample_weight=boston_weights) staged_scores = [ s for s in clf.staged_score( boston.data, boston.target, sample_weight=boston_weights)] assert_equal(len(staged_predictions), 10) assert_array_almost_equal(predictions, staged_predictions[-1]) assert_equal(len(staged_scores), 10) assert_array_almost_equal(score, staged_scores[-1])
def classify(features_train, labels_train, features_test, labels_test): adaBoost = AdaBoostClassifier() adaBoost.fit(features_train, labels_train) aBScore = adaBoost.score(features_test, labels_test) #aBScore = 0 #print("Ada Boost: ", aBScore) #%timeit adaBoost.fit(features_train, labels_train) adaBoostCust = AdaBoostCustom() adaBoostCust.fit(features_train, labels_train) aBCScore = adaBoostCust.score(features_test, labels_test) #aBCScore = 0 #print("AdaBoost Custom: ", aBCScore) #%timeit adaBoostCust.fit(features_train, labels_train) decisionTree = DecisionTreeClassifier(random_state=0) decisionTree.fit(features_train, labels_train) dTScore = decisionTree.score(features_test, labels_test) #dTScore = 0 #print("decision Tree: ", dTScore) #%timeit decisionTree.fit(features_train, labels_train) logReg = LogisticRegression() logReg.fit(features_train, labels_train) logRegScore = logReg.score(features_test, labels_test) #logRegScore = 0 #print("logReg Score: ", logRegScore) #%timeit logReg.fit(features_train, labels_train) logRegCust = LogisticRegressionCustom() logRegCust.fitMulticlassOneVsOne(addOnesCol(features_train), labels_train, alpha = 0.1, nrIt = 800) logRegCustScore = logRegCust.scoreMulticlassOneVsOne(addOnesCol(features_test), labels_test) #logRegCustScore = 0 #print("LogRegCust Score: ", logRegCustScore) #%timeit logRegCust.fitMulticlass(features_train, labels_train) linReg = LinearRegression() linReg.fit(features_train, labels_train) pred = linReg.predict(features_test) linRegScore = scoreForLinReg(pred, labels_test) #linRegScore = linReg.score(features_test, labels_test) #linRegScore = 0 linRegCust = LinearLeastSquares(features_train, number_iteration=800, feature_normalizer=True) linRegCust.fit(labels_train) linRegCustScore = linRegCust.score(features_test, labels_test) #linRegCustScore = 0 locWeigRegCust = LocalWeightedRegressionCustom() locWeigRegCustScore = locWeigRegCust.score(features_train, labels_train, features_test, labels_test, 1) #locWeigRegCustScore = 0 return aBScore, aBCScore, dTScore, logRegScore, logRegCustScore, linRegScore, linRegCustScore, locWeigRegCustScore
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters): """ Ada Boosting binary Classification """ n = parameters[0] l = parameters[1] clf = AdaBoostClassifier(n_estimators = n, learning_rate = l) clf.fit(X_train, y_train) accuracy = clf.score(X_test, y_test) #auc = roc_auc_score(y_test, clf.predict(X_test)) return accuracy
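A hedged call sketch for performAdaBoostClass above; the split variables are assumed to come from an earlier train_test_split, and parameters packs [n_estimators, learning_rate] in the order the function reads them.
params = [100, 0.5]  # n_estimators, learning_rate
accuracy = performAdaBoostClass(X_train, y_train, X_test, y_test, params)
print("AdaBoost accuracy:", accuracy)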
def voting(): X, y = preprocess() X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) clfs = [] clfs.append(RandomForestClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1)) clfs.append(RandomForestClassifier(n_estimators=200, criterion="entropy", class_weight="auto", n_jobs=-1)) clfs.append(RandomForestClassifier(n_estimators=500, criterion="gini", class_weight="auto", n_jobs=-1)) clfs.append(RandomForestClassifier(n_estimators=200, criterion="gini", class_weight="auto", n_jobs=-1)) clfs.append(ExtraTreesClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1)) clfs.append(ExtraTreesClassifier(n_estimators=200, criterion="entropy", class_weight="auto", n_jobs=-1)) clfs.append(ExtraTreesClassifier(n_estimators=500, criterion="gini", class_weight="auto", n_jobs=-1)) clfs.append(ExtraTreesClassifier(n_estimators=200, criterion="gini", class_weight="auto", n_jobs=-1)) ab = AdaBoostClassifier( base_estimator=ExtraTreesClassifier(n_estimators=500, criterion="entropy", class_weight="auto", n_jobs=-1), n_estimators=10, ) ab.fit(X_train, y_train) print ab.score(X_test, y_test) sys.exit() for clf in clfs: clf.fit(X_train, y_train) for clf in clfs: print clf.score(X_train, y_train), clf.score(X_test, y_test) y_pred = [] test_num = len(y_test) / 10 pbar_cnt = 0 widgets = ["Predicting...", Percentage(), " ", Bar(marker=RotatingMarker()), " ", ETA()] pbar = ProgressBar(widgets=widgets, maxval=test_num).start() for i in xrange(test_num): pbar_cnt += 1 pbar.update(pbar_cnt) prediction = [clf.predict(X_test[i])[0] for clf in clfs] y_pred.append(most_freq_term(prediction)) pbar.finish() print accuracy_score(y_test[:test_num], y_pred)
def run_model(self, max_depth=3, criterion='entropy', learning_rate=1., n_estimators=300, do_plot=True): # Supported criteria for tree are gini for the Gini impurity and entropy for the information gain. tree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=0) clf = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators, learning_rate=learning_rate, random_state=0) clf.fit(self.x_train, self.y_train) # check model accuracy train_score = clf.score(self.x_train, self.y_train) print('Training score is', train_score) test_score = clf.score(self.x_test, self.y_test) print('Test score is', test_score) if do_plot: self.__plot_learning_curve(clf) return train_score, test_score
def test_classifiers2(data, ind): from sklearn.ensemble import AdaBoostClassifier clf = AdaBoostClassifier(n_estimators=100) clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1]) print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1])) out = clf.predict(data[ind[1000:], :-1]) print(confusion_matrix(data[ind[1000:], -1], out)) from sklearn.ensemble import GradientBoostingClassifier clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0) clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1]) print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1])) out = clf.predict(data[ind[1000:], :-1]) print(confusion_matrix(data[ind[1000:], -1], out)) from sklearn.neural_network import MLPClassifier clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1) clf.fit(data[ind[:1000], :-1], data[ind[:1000], -1]) print(clf.score(data[ind[1000:], :-1], data[ind[1000:], -1])) out = clf.predict(data[ind[1000:], :-1]) print(confusion_matrix(data[ind[1000:], -1], out)) import xgboost as xgb xgb_model = xgb.XGBClassifier().fit(data[ind[:1000], :-1], data[ind[:1000], -1]) out = xgb_model.predict(data[ind[1000:], :-1]) a = confusion_matrix(data[ind[1000:], -1], out) print(float(a[0, 0] + a[1, 1]) / np.sum(a)) print(a)
def boosting(train_x, train_y, test_x, test_y, n_estimators, iterations): name = ( "Results/adaboost_" + str(n_estimators) + "_results" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv" ) file = open(name, "w") file.write( "AdaBoost w/ n_estimators = " + str(n_estimators) + " Analysis Started on " + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") ) file.write("Iteration, Instances, Train Time, Test Time, Training Accuracy, Testing Accuracy") logging.info("Starting Boosting Analysis") outer_time = datetime.datetime.now() boost = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=19), n_estimators=n_estimators) for i in range(iterations): sample_size = int(random.uniform(0.001, 1.0) * train_y.shape[0]) index = random.sample(xrange(0, train_y.shape[0]), sample_size) start = datetime.datetime.now() boost.fit(train_x[index], train_y[index]) end = datetime.datetime.now() train_time = end - start train_score = boost.score(train_x, train_y) start = datetime.datetime.now() test_score = boost.score(test_x, test_y) test_time = datetime.datetime.now() - start file.write( "%4d, %4d, %s, %s, %2.6f, %2.6f \n" % (i, len(index), train_time, test_time, train_score, test_score) ) logging.info("Analysis completed in %s" % (datetime.datetime.now() - outer_time)) file.close()
bagging_clf3.fit(X, y) # print(bagging_clf3.oob_score_)# 0.856 from sklearn.ensemble import RandomForestClassifier tf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, oob_score=True, random_state=666, n_jobs=-1) tf_clf.fit(X, y) # print('tf',tf_clf.oob_score_) #0.92 from sklearn.ensemble import ExtraTreesClassifier et_clf = ExtraTreesClassifier(n_estimators=500, bootstrap=True, oob_score=True, random_state=666, n_jobs=-1) et_clf.fit(X, y) # print('et',et_clf.oob_score_) #0.892 from sklearn.ensemble import AdaBoostClassifier ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500) ada_clf.fit(X_train, y_train) print(ada_clf.score(X_test, y_test)) # 0.832 from sklearn.ensemble import GradientBoostingClassifier gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30) gb_clf.fit(X_train, y_train) print(gb_clf.score(X_test, y_test)) # 0.912
training_classes = input_data.loc[training_indices, 'class'].values testing_features = input_data.loc[testing_indices].drop('class', axis=1).values testing_classes = input_data.loc[testing_indices, 'class'].values ss = StandardScaler() training_features = ss.fit_transform(training_features.astype(float)) testing_features = ss.transform(testing_features.astype(float)) # Create and fit the model on the training data try: clf = AdaBoostClassifier(learning_rate=learning_rate, n_estimators=n_estimators) clf.fit(training_features, training_classes) testing_score = clf.score(testing_features, testing_classes) except KeyboardInterrupt: sys.exit(1) except: continue param_string = '' param_string += 'learning_rate={},'.format(learning_rate) param_string += 'n_estimators={}'.format(n_estimators) out_text = '\t'.join([ dataset.split('/')[-1][:-7], 'AdaBoostClassifier', param_string, str(testing_score) ]) print(out_text)
if data < 10000.0: data = 1 elif data < 50000.0: data = 2 elif data < 250000.0: data = 3 elif data < 1000000.0: data = 4 else: data = 5 testTarget[x] = data x += 1 viewPredict = AdaBoostClassifier(n_estimators=400, learning_rate=.7) viewPredict.fit(trainData, trainTarget) prediction = viewPredict.predict(testData) sampleTest = [] print(viewPredict.score(testData, testTarget)) disp = plot_confusion_matrix(viewPredict, testData, testTarget, normalize='true') print("Confusion Matrix") print(disp.confusion_matrix) dump(viewPredict, 'Prediction.joblib')
estimators = bags.estimators_ tree.export_graphviz(estimators[0], out_file='tree_balanced_1.dot') tree.export_graphviz(estimators[9], out_file='tree_balanced_10.dot') ## se puede visualizar copiando y pegando el contenido del archivo .dot en http://www.webgraphviz.com/ ## predicciones y probabilidades en la muestra de entrenamiento pred_train = bags.predict(X_train) #predicción probs_train = bags.predict_proba(X_train) #probabilidad #calculemos el ajuste accuracy del modelo en train y en test probs2 = bags.predict_proba(X_test) #una vez que ya tengo las predicciones puedo calcular matriz de confusión, accuracy, recall, precision, auc, etc etc score_train = bags.score(X_train, y_train) score_testing = bags.score(X_test, y_test) #para calcular otras métricas precision, recall, etc necesito también predecir en testing pred_testing = bags.predict(X_test) #predigo en base testing #calculo recall, precision, auc, fpr, tpr, auc, f1 etc en ambas bases fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, pred_train, pos_label=1) fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, pred_testing, pos_label=1) auc_train = metrics.auc(fpr_train, tpr_train)
aggClassEst = np.zeros([m, 1]) for i in range(len(self.weakclass)): classEst = self.stumpClassify(X,self.weakclass[i]['dim'],\ self.weakclass[i]['thresh'],\ self.weakclass[i]['ineq']) aggClassEst += self.weakclass[i]['alpha'] * classEst return np.sign(aggClassEst) def score(self, X_test, y_test): y_pred = self.predict(X_test) m = X_test.shape[0] y_test = y_test.reshape(-1, 1) count = np.ones([m, 1]) count[y_pred != y_test] = 0 accuracy = np.sum(count) / m return accuracy #%% X = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(-1, 1) y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1]) #%% clf1 = AdaBoost(30, 1) clf1.fit(X, y) accuracy1 = clf1.score(X, y) #%% from sklearn.ensemble import AdaBoostClassifier clf2 = AdaBoostClassifier(n_estimators=100, random_state=0) clf2.fit(X, y) accuracy2 = clf2.score(X, y)
y = train["class"] clh = tree.DecisionTreeClassifier(max_depth=7) clf = AdaBoostClassifier(base_estimator=clh, n_estimators=10) clf.fit(x, y) print(np.mean(cross_val_score(clf, x, y, cv=8))) print(confusion_matrix(y, clf.predict(x))) #test_x = test[["meanfreq","sd","median","Q25","Q75","IQR","skew","kurt","sp.ent","sfm","mode", # "centroid","peakf","meanfun","minfun", # "maxfun","meandom","mindom","maxdom","dfrange","modindx"]] test_x = test[[ "meanfreq", "sd", "freq.median", "freq.Q25", "freq.Q75", "freq.IQR", "skew", "kurt", "sp.ent", "sfm", "meanfun", "minfun", "maxfun", "meandom", "mindom", "maxdom", "dfrange", "modindx", "dfslope", "meanpeakf" ]] print(clf.score(test_x, test["class"])) #---------------------------------------------------------------------------# svcfit = SVC(C=0.01, kernel='linear') x = preprocessing.scale(x) svcfit.fit(x, y) print(np.mean(cross_val_score(svcfit, x, y, cv=8))) print(confusion_matrix(y, svcfit.predict(x))) test_x = preprocessing.scale(test_x) print(svcfit.score(test_x, test["class"])) #---------------------------------------------------------------------------#
X_train, X_test = X[:n_split], X[n_split:] y_train, y_test = y[:n_split], y[n_split:] bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1) bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME") bdt_real.fit(X_train, y_train) bdt_discrete.fit(X_train, y_train) print bdt_real.score(X_test, y_test) real_test_errors = [] discrete_test_errors = [] for real_test_predict, discrete_train_predict in zip( bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)): real_test_errors.append(1. - accuracy_score(real_test_predict, y_test)) discrete_test_errors.append(1. - accuracy_score(discrete_train_predict, y_test)) n_trees_discrete = len(bdt_discrete) n_trees_real = len(bdt_real) # Boosting might terminate early, but the following arrays are always # n_estimators long. We crop them to the actual number of trees here:
# df_train = pd.read_csv("data/training_set_VU_DM.csv") df_test = pd.read_csv("data/test_short.csv") data, df_test = prep_data(df_train, df_test) predictors = [ c for c in data.columns if c not in ["booking_bool", "click_bool", "gross_bookings_usd", "position"] ] X = data[predictors] y = data.booking_bool.astype(int) clf = AdaBoostClassifier(n_estimators=100) training = clf.fit(X, y) score = clf.score(X, y) print(score, "score") scores = cross_val_score(clf, X, y, cv=5) print("mean score: ", scores.mean()) print("ada scores:") print(scores) prediction_test_set = clf.predict(df_test) predictions = pd.DataFrame({ 'hotel_id': df_test.prop_id, 'search_id': df_test.srch_id, 'booking_prob': prediction_test_set }) predictions.to_csv('wattt.csv', index=False) clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
pickle.dump(training_labels, f) if not os.path.isfile(DataFile_test): with open(DataFile_test, "wb") as f: pickle.dump(test_data, f) if not os.path.isfile(LabelFile_test): with open(LabelFile_test, "wb") as f: pickle.dump(test_labels, f) else: if os.path.isfile(DataFile) and os.path.isfile(LabelFile): with open(DataFile, "rb") as f: training_data = pickle.load(f) with open(LabelFile, "rb") as f: training_labels = pickle.load(f) with open(DataFile_test, "rb") as f: test_data = pickle.load(f) with open(LabelFile_test, "rb") as f: test_labels = pickle.load(f) X_train, X_val, y_train, y_val = model_selection.train_test_split(training_data, training_labels, train_size=.99, test_size=.01) print("training!") #boosting = GradientBoostingClassifier(n_estimators=250, loss='exponential', learning_rate=0.2) #boosting = AdaBoostClassifier(RandomForestClassifier(n_estimators=600, max_depth=15, n_jobs=7), n_estimators=25, learning_rate=.1) boosting = AdaBoostClassifier(SVC(C=50, gamma=2), algorithm="SAMME", n_estimators=25, learning_rate=.1) boosting.fit(X_train, y_train[:,1]) print("Classifier has a score of %0.4f" % (boosting.score(test_data, test_labels[:,1])))
def main(): df = pd.read_csv("Dataset/winequality-white.csv", delimiter=";") n, bins, patches = plt.hist(x=np.array(df.iloc[:, -1]), bins='auto', color='#0504aa', alpha=0.7) plt.grid(axis='y', alpha=0.75) plt.xlabel('Quality') plt.ylabel('Count') plt.xticks(np.arange(10), ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')) plt.title('Class Distribution') plt.ylim(ymax=2200) plt.savefig('WinePlots/wineClassDistributionOriginal.png') plt.close() lowquality = df.loc[df['quality'] <= 6].index highquality = df.loc[df['quality'] > 6].index df.iloc[lowquality, df.columns.get_loc('quality')] = 0 df.iloc[highquality, df.columns.get_loc('quality')] = 1 seed = 200 np.random.seed(seed) X = np.array(df.iloc[:, 0:-1]) Y = np.array(df.iloc[:, -1]) n, bins, patches = plt.hist(x=Y, bins='auto', color='#0504aa', alpha=0.7) plt.grid(axis='y', alpha=0.75) plt.xlabel('Quality') plt.ylabel('Count') plt.xticks(np.arange(2), ('Low', 'High')) plt.title('Class Distribution') plt.ylim(ymax=4000) plt.savefig('WinePlots/wineClassDistribution.png') plt.close() training_x1, testing_x1, training_y, testing_y = train_test_split( X, Y, test_size=0.3, random_state=seed, shuffle=True, stratify=Y) standardScalerX = StandardScaler() training_x = standardScalerX.fit_transform(training_x1) testing_x = standardScalerX.fit_transform(testing_x1) # DT Max Depth Gini max_depth_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('DT Max Depth Gini') for i in range(1, 50): max_depth_array.append(i) learner = DecisionTreeClassifier(criterion='gini', max_depth=i + 1, random_state=seed) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) testing_depth_array.append(learner.score(testing_x, testing_y)) plot_validation_curve(max_depth_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Max Depth Gini", 'Score', 'Max Depth', [1, 50], 'WinePlots/winemaxdepthGini.png') # DT Max Depth Entropy max_depth_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('DT Max Depth Entropy') for i in range(1, 50): max_depth_array.append(i) learner = DecisionTreeClassifier(criterion='entropy', max_depth=i + 1, random_state=seed) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) testing_depth_array.append(learner.score(testing_x, testing_y)) plot_validation_curve(max_depth_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Max Depth Entropy", 'Score', 'Max Depth', [1, 50], 'WinePlots/winemaxdepthEntropy.png') # DT Random Search & Learning Curve max_depths = np.arange(1, 20, 1) params = {'criterion': ['gini', 'entropy'], 'max_depth': max_depths} learner = DecisionTreeClassifier(random_state=seed) start = time.clock() dt_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=40) dt_cv.fit(training_x, training_y) print(dt_cv.best_params_) dt_train_time = time.clock() - start print('Time to Train: ' + str(dt_train_time)) print('Training Accuracy: ' + str(dt_cv.score(training_x, training_y))) print('Testing Accuracy: ' + str(dt_cv.score(testing_x, testing_y))) print(dt_cv.best_params_) # entropy, max depth 11 start = time.clock() test_y_predicted = dt_cv.predict(testing_x) dt_query_time = time.clock() - 
start print('Time to Query: ' + str(dt_query_time)) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( dt_cv, training_x, training_y, n_jobs=-1, cv=10, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineDTLearningCurve.png', "Learning Curve DT") # Adaboost Max Depth max_depth_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('Adaboost Max Depth') for i in range(1, 20, 2): max_depth_array.append(i) learner2 = DecisionTreeClassifier(max_depth=i, criterion='gini', random_state=seed) boosted_learner2 = AdaBoostClassifier(base_estimator=learner2, random_state=seed, algorithm='SAMME') cross_val_score_array.append( cross_val_score(boosted_learner2, training_x, training_y, cv=10).mean()) boosted_learner2.fit(training_x, training_y) training_depth_array.append( boosted_learner2.score(training_x, training_y)) plot_validation_curve( max_depth_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Max Depth of Base Estimator", 'Score', 'Max Depth', [1, 20], 'WinePlots/wineboostedmaxdepth.png') # Adaboost Estimators estimator_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] n_estimators = range(1, 55, 5) print('Adaboost Estimators') for i in n_estimators: estimator_array.append(i) boosted_learner2 = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(criterion='gini', max_depth=1), algorithm='SAMME', random_state=seed, n_estimators=i) cross_val_score_array.append( cross_val_score(boosted_learner2, training_x, training_y, cv=10).mean()) boosted_learner2.fit(training_x, training_y) training_depth_array.append( boosted_learner2.score(training_x, training_y)) plot_validation_curve(estimator_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Number of Estimators", 'Score', 'Number of Estimators', [1, 50], 'WinePlots/wineboostedestimators.png') # Adaboost Learning Rate learning_rate_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] learning_rates = [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1, 1.5, 2] print('Adaboost Learning Rate') for i in learning_rates: learning_rate_array.append(i) boosted_learner2 = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(criterion='gini', max_depth=1), algorithm='SAMME', random_state=seed, learning_rate=i) cross_val_score_array.append( cross_val_score(boosted_learner2, training_x, training_y, cv=10).mean()) boosted_learner2.fit(training_x, training_y) training_depth_array.append( boosted_learner2.score(training_x, training_y)) plot_validation_curve(learning_rate_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Learning Rates", 'Score', 'Learning Rate', [0, 2], 'WinePlots/wineboostedLearningRate.png') # Adaboost Random Search & Learning Curve max_depths = np.arange(1, 20, 1) params = { 'n_estimators': [10, 15, 20, 25, 30, 40, 50], 'learning_rate': [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1], 'base_estimator__max_depth': max_depths } learner = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(criterion='gini'), random_state=seed) print('starting grid search') boost_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=50) start = time.clock() boost_cv.fit(training_x, 
training_y) dt_train_time = time.clock() - start print('Time to Train: ' + str(dt_train_time)) print('Training Accuracy: ' + str(boost_cv.score(training_x, training_y))) print('Testing Accuracy: ' + str(boost_cv.score(testing_x, testing_y))) print(boost_cv.best_params_) start = time.clock() test_y_predicted = boost_cv.predict(testing_x) dt_query_time = time.clock() - start print('Time to Query: ' + str(dt_query_time)) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( boost_cv, training_x, training_y, n_jobs=-1, cv=10, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineboostedLearningCurve.png') # KNN Number of Neighbors knn_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('KNN Number of Neighbors with Manhattan Distance') for i in range(1, 50, 2): knn_array.append(i) learner = KNeighborsClassifier(n_neighbors=i, p=1) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve(knn_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs k Neighbors Manhattan", 'Score', 'k Neighbors', [1, 50], 'WinePlots/wineManhattanKNN.png') knn_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('KNN Number of Neighbors with Euclidean Distance') for i in range(1, 50, 2): knn_array.append(i) learner = KNeighborsClassifier(n_neighbors=i, p=2) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve(knn_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs k Neighbors Euclidean", 'Score', 'k Neighbors', [1, 50], 'WinePlots/wineEuclideanKNN.png') params = {'p': [1, 2], 'n_neighbors': np.arange(2, 50, 1)} learner = KNeighborsClassifier() print('starting random search') knn_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=100) start = time.clock() knn_cv.fit(training_x, training_y) dt_train_time = time.clock() - start print('Time to Train: ' + str(dt_train_time)) print('Training Accuracy: ' + str(knn_cv.score(training_x, training_y))) print('Testing Accuracy: ' + str(knn_cv.score(testing_x, testing_y))) print(knn_cv.best_params_) start = time.clock() test_y_predicted = knn_cv.predict(testing_x) dt_query_time = time.clock() - start print('Time to Query: ' + str(dt_query_time)) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( knn_cv, training_x, training_y, n_jobs=-1, cv=10, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineKNNLearningCurve.png') # ANN 1 Layer with different number of neurons ann_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('ANN Number of Neurons') for i in [ 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, 90, 100 ]: print('------hey we are on ' + str(i)) 
ann_array.append(i) learner = MLPClassifier(hidden_layer_sizes=([i])) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve( ann_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs Neurons in One Hidden Layer", 'Score', 'Number of Neurons', [1, 100], 'WinePlots/wineANNNeurons.png') # ANN Neurons per Layers ann_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('ANN Number of Layers') for i in [1, 3, 5, 8, 10, 11, 13, 15, 17, 20, 23, 25]: print('------hey we are on ' + str(i)) hidden_layers = [] for x in range(i): hidden_layers.append(22) ann_array.append(i) learner = MLPClassifier(hidden_layer_sizes=(hidden_layers), activation='relu', alpha=0.0051) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve(ann_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs # of Hidden Layers", 'Score', 'Number of Hidden Layers', [1, 25], 'WinePlots/wineANNLayers.png') # ANN Learning Curve params = { 'hidden_layer_sizes': [(11, 11), (5, 5), (11, ), (5, ), (22, ), (22, 22), (5, 5, 5), (11, 11, 11), (22, 22, 22)], 'alpha': np.arange(0.0001, 0.01, 0.005), 'activation': ['relu', 'logistic'] } learner = MLPClassifier(max_iter=500, random_state=seed) ##### best params {'hidden_layer_sizes': (11,11), 'alpha': 0.0001, 'activation': 'relu'} # ann_cv = MLPClassifier(max_iter=3000,hidden_layer_sizes=(22,22,22), alpha=0.0051, activation='relu', random_state=seed) print('starting random search') ann_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=20, verbose=1000) ann_cv.fit(training_x, training_y) print(ann_cv.best_params_) final_ann = MLPClassifier(**ann_cv.best_params_) start = time.clock() final_ann.fit(training_x, training_y) dt_train_time = time.clock() - start # print('refit time: ' + str(final_ann.refit_time_)) # print(final_ann.best_params_) print('Time to Train: ' + str(dt_train_time)) print('Training Accuracy: ' + str(final_ann.score(training_x, training_y))) print('Testing Accuracy: ' + str(final_ann.score(testing_x, testing_y))) # print(final_ann.best_params_) start = time.clock() test_y_predicted = final_ann.predict(testing_x) dt_query_time = time.clock() - start print('Time to Query: ' + str(dt_query_time)) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( final_ann, training_x, training_y, n_jobs=-1, cv=5, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineANNLearningCurve.png') # ANN over epochs ann_array = [] training_depth_array = [] cross_val_score_array = [] testing_depth_array = [] learner = MLPClassifier(hidden_layer_sizes=(22, ), alpha=0.0051, activation='relu', max_iter=1, random_state=seed, verbose=10, warm_start=True) for i in np.arange(3000): ann_array.append(i) learner = learner.fit(training_x, training_y) score = learner.score(training_x, training_y) print(score) training_depth_array.append(score) cross_score = learner.score(testing_x, testing_y) 
cross_val_score_array.append(cross_score) print(cross_score) plot_validation_curve(ann_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs. Epochs", 'Score', 'Epochs', [0, 3000], 'WinePlots/wineANNEpochs.png') # SVM Kernels Sigmoid vs RBF svm_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('SVM Kernels Sigmoid Different Gamma Values') for i in np.arange(0.01, 2, 0.1): print('------hey we are on ' + str(i)) svm_array.append(i) learner = svm.SVC(kernel='sigmoid', gamma=i) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plt.plot(svm_array, training_depth_array, label='Training') plt.plot(svm_array, cross_val_score_array, label='Cross Validation') plt.legend(loc=4, fontsize=8) plt.title("Cross Validation Score vs. Gamma Values - Sigmoid Kernel") plt.ylabel('Score') plt.xlabel('Gamma Values') plt.xlim([0.00, 2.0]) plt.savefig('WinePlots/wineGammaSigmoid.png') plt.close() svm_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('SVM Kernels RBF Different Gamma Values') for i in np.arange(0.01, 2, 0.1): print('------hey we are on ' + str(i)) svm_array.append(i) learner = svm.SVC(kernel='rbf', gamma=i) cross_score = cross_val_score(learner, training_x, training_y, cv=10).mean() print(cross_score) cross_val_score_array.append(cross_score) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plt.plot(svm_array, training_depth_array, label='Training') plt.plot(svm_array, cross_val_score_array, label='Cross Validation') plt.legend(loc=4, fontsize=8) plt.title("Cross Validation Score vs. Gamma Values - RBF Kernel") plt.ylabel('Score') plt.xlabel('Gamma Values') plt.xlim([0.00, 2.0]) plt.savefig('WinePlots/wineGammaRBF.png') plt.close() # SVM C Values svm_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('SVM Kernels Sigmoid Different C Values') for i in np.arange(0.01, 2, 0.1): print('------hey we are on ' + str(i)) svm_array.append(i) learner = svm.SVC(kernel='sigmoid', C=i) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve( svm_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs. C Values - Sigmoid Kernel", 'Score', 'C Values', [0.00, 2.0], 'WinePlots/wineCSigmoid.png') svm_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('SVM Kernels RBF Different C Values') for i in np.arange(0.01, 2, 0.1): print('------hey we are on ' + str(i)) svm_array.append(i) learner = svm.SVC(kernel='rbf', C=i) cross_val_score_array.append( cross_val_score(learner, training_x, training_y, cv=10).mean()) learner.fit(training_x, training_y) training_depth_array.append(learner.score(training_x, training_y)) plot_validation_curve(svm_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs. 
C Values - RBF Kernel", 'Score', 'C Values', [0.00, 2.0], 'WinePlots/wineCRBF.png') #Learning Curve Sigmoid params = {'gamma': np.arange(0.01, 2, 0.1), 'C': np.arange(0.01, 1, 0.1)} learner = svm.SVC(kernel='sigmoid') print('starting grid search') svc_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=50) svc_cv.fit(training_x, training_y) best_params = svc_cv.best_params_ #{'gamma': 0.51, 'C': 0.01} print(best_params) final_svc = svm.SVC(kernel='sigmoid', **best_params) final_svc.fit(training_x, training_y) print(final_svc.score(testing_x, testing_y)) print(final_svc.score(training_x, training_y)) test_y_predicted = final_svc.predict(testing_x) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( final_svc, training_x, training_y, n_jobs=-1, cv=10, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineSVCLearningCurveSigmoid.png') # Learning Curve RBF params = {'gamma': np.arange(0.01, 2, 0.1), 'C': np.arange(0.01, 1, 0.1)} learner = svm.SVC(kernel='rbf') print('starting grid search') svc_cv = RandomizedSearchCV(learner, n_jobs=1, param_distributions=params, refit=True, n_iter=50) svc_cv.fit(training_x, training_y) best_params = svc_cv.best_params_ #{'gamma': 1.31, 'C': 0.91} print(best_params) final_svc = svm.SVC(kernel='rbf', **best_params) final_svc.fit(training_x, training_y) print(final_svc.score(testing_x, testing_y)) print(final_svc.score(training_x, training_y)) test_y_predicted = final_svc.predict(testing_x) y_true = pd.Series(testing_y) y_pred = pd.Series(test_y_predicted) print( pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)) train_sizes, train_scores, test_scores = learning_curve( final_svc, training_x, training_y, n_jobs=-1, cv=10, train_sizes=np.linspace(.1, 1.0, 10), random_state=seed) plot_learning_curve(train_scores, test_scores, train_sizes, 'WinePlots/wineSVCLearningCurveRBF.png') # SVM over Epochs Sigmoid svm_array = [] training_depth_array = [] testing_depth_array = [] cross_val_score_array = [] print('SVM Different Epochs Sigmoid') for i in np.arange(1000): svm_array.append(i) learner = svm.SVC(kernel='sigmoid', verbose=100, max_iter=i) learner = learner.fit(training_x, training_y) score = learner.score(training_x, training_y) print(score) training_depth_array.append(score) cross_score = learner.score(testing_x, testing_y) cross_val_score_array.append(cross_score) plot_validation_curve(svm_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs. Epochs", 'Score', 'Epochs', [0, 1000], 'WinePlots/wineSVMEpochsSigmoid.png') # SVM over Epochs RBF svm_array = [] training_depth_array = [] cross_val_score_array = [] print('SVM Different Epochs RBF') for i in np.arange(1000): svm_array.append(i) learner = svm.SVC(kernel='rbf', verbose=100, max_iter=i) learner = learner.fit(training_x, training_y) score = learner.score(training_x, training_y) print(score) training_depth_array.append(score) cross_score = learner.score(testing_x, testing_y) cross_val_score_array.append(cross_score) plot_validation_curve(svm_array, training_depth_array, cross_val_score_array, "Cross Validation Score vs. 
Epochs", 'Score', 'Epochs', [0, 1000], 'WinePlots/wineSVMEpochsRBF.png') # Timing Wine # Training Time dt_clf = DecisionTreeClassifier(max_depth=19, criterion='gini') ada_clf = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(max_depth=11), n_estimators=40, learning_rate=1) knn_clf = KNeighborsClassifier(p=1, n_neighbors=2) ann_clf = MLPClassifier(hidden_layer_sizes=(22, ), alpha=0.0051, activation='relu') svm_rbf_clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91) svm_sigmoid_clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01) labels = [ "Decision Tree", "Adaboost", "KNN", "ANN", "SVM_RBF", "SVM_Sigmoid" ] count = 0 for clf in [ dt_clf, ada_clf, knn_clf, ann_clf, svm_rbf_clf, svm_sigmoid_clf ]: iteration_array = [] train_array = [] query_array = [] for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]: if count == 3: clf = MLPClassifier(hidden_layer_sizes=(22, ), alpha=0.0051, activation='relu') if count == 4: clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91) if count == 5: clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01) X_train = training_x[:int(training_x.shape[0] * i), :] Y_train = training_y[:int(training_y.shape[0] * i)] iteration_array.append(X_train.shape[0]) st = time.clock() clf.fit(X_train, Y_train) train_time = time.clock() - st train_array.append(train_time) plt.plot(iteration_array, train_array, label=labels[count]) plt.legend(loc=4, fontsize=8) plt.title("Training Times for Learners", fontdict={'size': 16}) plt.ylabel("Time") plt.xlabel("Iteration Size") count = count + 1 plt.savefig("WineTrainingTimes.png") plt.close() # Query Time dt_clf = DecisionTreeClassifier(max_depth=19, criterion='gini') ada_clf = AdaBoostClassifier( base_estimator=DecisionTreeClassifier(max_depth=11), n_estimators=40, learning_rate=1) knn_clf = KNeighborsClassifier(p=1, n_neighbors=2) ann_clf = MLPClassifier(hidden_layer_sizes=(22, ), alpha=0.0051, activation='relu') svm_rbf_clf = svm.SVC(kernel='rbf', gamma=1.31, C=0.91) svm_sigmoid_clf = svm.SVC(kernel='sigmoid', gamma=0.51, C=0.01) labels = [ "Decision Tree", "Adaboost", "KNN", "ANN", "SVM_RBF", "SVM_Sigmoid" ] count = 0 for clf in [ dt_clf, ada_clf, knn_clf, ann_clf, svm_rbf_clf, svm_sigmoid_clf ]: iteration_array = [] query_array = [] clf.fit(training_x, training_y) for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]: X_test = testing_x[:int(testing_x.shape[0] * i), :] iteration_array.append(X_test.shape[0]) st = time.clock() clf.predict(X_test) query_time = time.clock() - st query_array.append(query_time) plt.plot(iteration_array, query_array, label=labels[count]) plt.legend(loc=4, fontsize=8) plt.title("Query Times for Learners", fontdict={'size': 16}) plt.ylabel("Time") plt.xlabel("Iteration Size") count = count + 1 plt.savefig("WineQueryTimes.png") plt.close()
random_forest_list = [] for i in range(len(size_list)): np.random.shuffle(x_y_set) RandomForestClassifierModel.fit(x_y_set[:size_list[i], :2], x_y_set[:size_list[i], 2]) random_forest_list.append( RandomForestClassifierModel.score(X_test, Y_test)) # AdaBoost AdaBoostClassifierModel = AdaBoostClassifier() ada_boost_list = [] for i in range(len(size_list)): np.random.shuffle(x_y_set) AdaBoostClassifierModel.fit(x_y_set[:size_list[i], :2], x_y_set[:size_list[i], 2]) ada_boost_list.append(AdaBoostClassifierModel.score(X_test, Y_test)) # LogisticRegression LogisticRegressionModel = LogisticRegression() logistic_regression_list = [] for i in range(len(size_list)): np.random.shuffle(x_y_set) LogisticRegressionModel.fit(x_y_set[:size_list[i], :2], x_y_set[:size_list[i], 2]) logistic_regression_list.append( LogisticRegressionModel.score(X_test, Y_test)) # MLPClassifier MLPClassifierModel = MLPClassifier() neural_network_list = [] for i in range(len(size_list)):
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): print(mean_score, params) """ 0.10030303030303031 {'n_neighbors': 5, 'p': 1} 0.10242424242424245 {'n_neighbors': 5, 'p': 2} 0.12121212121212119 {'n_neighbors': 15, 'p': 1} 0.11787878787878787 {'n_neighbors': 15, 'p': 2} 0.11424242424242426 {'n_neighbors': 30, 'p': 1} 0.11363636363636362 {'n_neighbors': 30, 'p': 2} """ # %% ada_clf = AdaBoostClassifier(n_estimators=100) ada_clf.fit(X_train, y_train) pred_train, pred_dev = ada_clf.predict(X_train), ada_clf.predict(X_dev) train_acc = ada_clf.score(X_train, y_train) dev_acc = ada_clf.score(X_dev, y_dev) train_uar = recall_score(y_train, pred_train, average='macro') dev_uar = recall_score(y_dev, pred_dev, average='macro') print(f"train_acc = {train_acc:.2f}, dev_acc = {dev_acc:.2f}") print(f"train_uar = {train_uar:.2f}, dev_uar = {dev_uar:.2f}") """train_acc = 0.29, dev_acc = 0.24 train_uar = 0.29, dev_uar = 0.25""" # %% gboost_clf = GradientBoostingClassifier(n_estimators=100) gboost_clf.fit(X_train, y_train) pred_train, pred_dev = gboost_clf.predict(X_train), gboost_clf.predict(X_dev) train_acc = gboost_clf.score(X_train, y_train)
model_gnb = GaussianNB() model_gnb.fit(xtr, ytr) bayes_score = model_gnb.score(xte, yte) model_lr = LogisticRegression() model_lr.fit(xtr, ytr) logreg_score = model_lr.score(xte, yte) model_rc = RidgeClassifier() model_rc.fit(xtr, ytr) ridge_score = model_rc.score(xte, yte) #model_sgd = SGDClassifier();model_sgd.fit(xtr,ytr) #sgd_score = model_sgd.score(xte,yte) #%% xtr, xte, ytr, yte = train_test_split(X, y, train_size=0.75) model_gbc = GradientBoostingClassifier(learning_rate=0.01, n_estimators=256) model_gbc.fit(xtr, ytr) gbc_score = model_gbc.score(xte, yte) model_ada = AdaBoostClassifier(learning_rate=0.01) model_ada.fit(xtr, ytr) ada_score = model_ada.score(xte, yte) model_rf = RandomForestClassifier(n_estimators=256) model_rf.fit(xtr, ytr) rf_score = model_rf.score(xte, yte) avg_guess = (model_gbc.predict(xte) + model_ada.predict(xte) + model_rf.predict(xte)) / 3 avg_score = np.mean( [bayes_score, logreg_score, ridge_score, gbc_score, ada_score, rf_score])
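# avg_guess above averages hard 0/1 predictions, which yields fractional values rather
# than class labels. If a combined prediction is wanted, a soft-voting sketch like the
# following (averaging predicted probabilities, then thresholding) may be closer to the
# intent; it reuses xte/yte from above and assumes binary 0/1 labels:

import numpy as np

proba = np.mean([model_gbc.predict_proba(xte)[:, 1],
                 model_ada.predict_proba(xte)[:, 1],
                 model_rf.predict_proba(xte)[:, 1]], axis=0)
ensemble_pred = (proba >= 0.5).astype(int)      # soft vote
ensemble_score = np.mean(ensemble_pred == yte)  # accuracy of the combined vote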
for i, line in enumerate(data.splitlines()):
    line = line.split(",")
    X[i, :], y[i] = line[:4], line[4]
y -= (y == 0)

# In[13]:

X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print(f"score = {score}")
plt.figure(1)
plt.rcParams["figure.figsize"] = [20, 10]
fig, axs = plt.subplots(1, 2)
axs[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.viridis, marker='o')
axs[0].set_title('Training data', fontsize=14)
axs[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.viridis,
    'VisitTeamstartupDef', 'VisitTeamstartupPer'
]]
trainY = file2016['WoL']
testX = testfile[[
    'HomeTeamBackupDef', 'HomeTeambackupPer', 'HomeTeamstartupDef',
    'HomeTeamstartupPer', 'VisitBackupDef', 'VisitTeambackupPer',
    'VisitTeamstartupDef', 'VisitTeamstartupPer'
]]
testY = testfile['WoL']

##plt.scatter(X[:,0],Y[:,0],marker='o',c=Y)
#bdt = AdaBoostClassifier(base_estimator=linear_model.LogisticRegression(),algorithm='SAMME',n_estimators=2000,learning_rate=0.02)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5,
                                                min_samples_split=5,
                                                min_samples_leaf=5),
                         algorithm='SAMME.R',
                         n_estimators=500,
                         learning_rate=0.2)
bdt.fit(trainX, trainY)
predictions = bdt.predict(testX)
print(bdt.estimator_weights_)
print("score : ", bdt.score(testX, testY))
print("\n", predictions)
'''
for i,row in testfile.iterrows():
    if (row['WoL'] == )
'''
# Use a separate name so the confusion_matrix function is not shadowed.
cm = confusion_matrix(testY, predictions)
print(cm)
print('X type is {Xtype}'.format(Xtype=type(X)))
Xdf = pd.DataFrame(X)
print('Xdf head()---------------------------')
print(Xdf.head())
ydf = pd.DataFrame(y)
print('ydf head()---------------------------')
print(ydf.head())

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)
print(clf.feature_importances_)
print(clf.predict([[0, 0, 0, 0]]))
print(clf.score(X, y))

dfclf = AdaBoostClassifier(n_estimators=100, random_state=0)
dfclf.fit(Xdf, ydf)
print(dfclf.feature_importances_)
# print(dfclf.predict([[0, 0, 0, 0]]))
print(dfclf.predict_proba(Xdf))
print(dfclf.predict(Xdf))
print(dfclf.score(Xdf, ydf))
print(ydf.head(10))
yTest = workARR[test_index]

print("start random forest")
if cnt < 2:
    randForrC.fit(trainX, yTrain)
    tmpSCR = randForrC.score(testX, yTest)
    scores['rand Forest'][label].append(tmpSCR)
else:
    randForrR.fit(trainX, yTrain)
    tmpSCR = randForrR.score(testX, yTest)
    scores['rand Forest'][label].append(tmpSCR)

print("start adaBoost")
if cnt < 2:
    adaBoostC.fit(trainX, yTrain)
    tmpSCR = adaBoostC.score(testX, yTest)
    scores['adaBoost'][label].append(tmpSCR)
else:
    adaBoostR.fit(trainX, yTrain)
    tmpSCR = adaBoostR.score(testX, yTest)
    scores['adaBoost'][label].append(tmpSCR)

print("start bagging withOUT out-of-bag")
if cnt < 2:
    bagCoobN.fit(trainX, yTrain)
    tmpSCR = bagCoobN.score(testX, yTest)
    scores['bagging (NO out of bag)'][label].append(tmpSCR)
else:
    bagRoobN.fit(trainX, yTrain)
    tmpSCR = bagRoobN.score(testX, yTest)
    scores['bagging (NO out of bag)'][label].append(tmpSCR)
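# Once the cross-validation loop above has filled the scores dict, the per-model results
# can be summarised. A sketch, assuming scores[model][label] holds one score per fold as
# in the appends above:

import numpy as np

for model_name, per_label in scores.items():
    for label_name, fold_scores in per_label.items():
        print("%s / %s: %.3f +/- %.3f" % (model_name, label_name,
                                          np.mean(fold_scores), np.std(fold_scores)))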
placeToTakeForTest = int(random() * len(usedData)) x = usedData.pop(placeToTakeForTest) y = usedValue.pop(placeToTakeForTest) testSetX.append(x) testSetY.append(y) for n_estimators in [1800]: usedData = np.array(usedData) usedValue = np.array(usedValue) testSetX = np.array(testSetX) testSetY = np.array(testSetY) clf = AdaBoostClassifier(n_estimators=n_estimators) clf = clf.fit(usedData, usedValue) value1 = clf.score(testSetX, testSetY) if value1 > best: best = value1 nbBest = n_estimators print("best nb : ") print(nbBest) print(best) tot += best if nbBest in result: result[nbBest] += 1 else: result[nbBest] = 1 print("\n")
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

ab = AdaBoostClassifier()
ab.fit(x_train, y_train)
y_pred_train = ab.predict(x_train)
y_pred_test = ab.predict(x_test)

print("classifier", ab)
print("Accuracy on Train Set")
print(ab.score(x_train, y_train))
print("AdaBoost Classifier")
print("Accuracy on Test Set")
print(ab.score(x_test, y_test))
print("Report")
print(classification_report(y_test, ab.predict(x_test)))

param_grid = {
    'n_estimators': [1000, 2000, 3000],
    'learning_rate': [1.0, 5.0, 10.0],
    'algorithm': ['SAMME.R', 'SAMME']
}
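# param_grid is defined above but the snippet stops before it is used. A sketch of how it
# could be plugged into a grid search (cv=5 and n_jobs are assumptions, not part of the
# original code):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(AdaBoostClassifier(), param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(x_train, y_train)
print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)
print("Test accuracy:", grid.score(x_test, y_test))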
if __name__ == '__main__':
    np.set_printoptions(edgeitems=5)
    trainingdata = sio.loadmat('mnist_dataset/mnist_train.mat')
    traininglabeldata = sio.loadmat('mnist_dataset/mnist_train_labels.mat')
    testdata = sio.loadmat('mnist_dataset/mnist_test.mat')
    testlabeldata = sio.loadmat('mnist_dataset/mnist_test_labels.mat')
    trainingImg = trainingdata["mnist_train"]
    trainingLabel = traininglabeldata["mnist_train_labels"]
    testImg = testdata["mnist_test"]
    testLabel = testlabeldata["mnist_test_labels"]

    # Use true division so this also works if the .mat arrays are stored as integers.
    trainingImg = trainingImg / 255.0
    testImg = testImg / 255.0
    num_TestData = testImg.shape[0]

    bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion="entropy"))
    bdt.fit(trainingImg, trainingLabel.ravel())
    predict = bdt.predict(testImg).reshape(num_TestData, 1)
    score = bdt.score(testImg, testLabel)
    wrongCount = num_TestData - np.count_nonzero(predict == testLabel)

    with open('adaboost_result.csv', 'w') as file:
        file.write("Error Rate: %f\n" % (wrongCount / num_TestData))
        file.write("Score: %f\n" % (score))
        for i in range(0, num_TestData):
            file.write("%d %d\n" % (predict[i], testLabel[i]))
Xb_test = X_basic.iloc[train_inds:, :] yb_test = y_basic.iloc[train_inds:, :] # null accuracy rates for basic dataset (dict may not be great for this) null_rates_basic = list(y_basic.mean()) ########################## # AdaBoost on basic data # ########################## ADB_Scores = list(np.zeros(y_basic.shape[1])) for i in range(y_basic.shape[1]): from sklearn.ensemble import AdaBoostClassifier adb = AdaBoostClassifier(n_estimators=100) adb.fit(Xb_train, yb_train.iloc[:, i]) ADB_Scores[i] = adb.score(Xb_test, yb_test.iloc[:, i]) # compare to null accuracy rates; difference in accuracy: for i in range(len(ADB_Scores)): print str(list(y_basic.mean())[i]) + '\t' + str( ADB_Scores[i]) + '\t' + str(ADB_Scores[i] - null_rates_basic[i]) ############################### # Random Forest on basic data # ############################### RF_Scores = list(np.zeros(y_basic.shape[1])) for i in range(y_basic.shape[1]): from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(n_estimators=20) rf.fit(Xb_train, yb_train.iloc[:, i])
data = pd.read_csv("spambase.data", sep=",", header=None) Shuffle_data = shuffle(data) inputs = Shuffle_data.iloc[:, :57] labels = Shuffle_data.iloc[:, -1] x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.3) model_01 = MultinomialNB() model_01.fit(x_train, y_train) score_01 = model_01.score(x_test, y_test) model_02 = AdaBoostClassifier() model_02.fit(x_train, y_train) score_02 = model_02.score(x_test, y_test) print("Using NB : ", score_01, " Using AdaBoost : ", score_02) prediction01 = model_01.predict(x_test) prediction02 = model_02.predict(x_test) for i in range(5): print("NB : ", prediction01[i], " Ada: ", prediction02[i], " Actual", list(y_test)[i])
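# A single accuracy number hides where the two models differ. A short sketch that adds
# per-class precision/recall for both, reusing y_test and the predictions above:

from sklearn.metrics import classification_report

print("Naive Bayes report:\n", classification_report(y_test, prediction01))
print("AdaBoost report:\n", classification_report(y_test, prediction02))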
# In[5]:

help(AdaBoostClassifier)

# In[6]:

# Parameter explanation:
clf = AdaBoostClassifier(
    base_estimator=None,
    n_estimators=1000,
    learning_rate=0.01,
    algorithm='SAMME.R',
    random_state=2020,
)

t1 = time.time()
clf = clf.fit(x_train, y_train)
pred = clf.predict(x_test)
print('predict:', pred)
score = clf.score(x_test, y_test)
print('score:', score)
t2 = time.time()

# # Time cost

# In[7]:

print('time:', t2 - t1)

# In[ ]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, label, test_size=0.7, random_state=75)

classifier_nb = MultinomialNB(class_prior=None, fit_prior=False).fit(X_train, y_train)
filename = 'finalized_model.sav'
joblib.dump(classifier_nb, filename)
score = classifier_nb.score(X_test, y_test)

# Fit on the training split only, so the test score is not measured on seen data.
classifier_en = AdaBoostClassifier(n_estimators=100)
classifier_en.fit(X_train, y_train)
score2 = classifier_en.score(X_test, y_test)

print "Score for Naive--- "
print score
print "For ADAboost is "
print score2

# Likewise, train the SVM on the training split rather than on the test set.
classifier_svm = SVC(probability=True, kernel='sigmoid')
classifier_svm.fit(X_train, y_train)
score3 = classifier_svm.score(X_test, y_test)
print "Score for SVC is--- "
print score3

sentiments = pd.DataFrame(columns=['text', 'class', 'prob'])
i = 0
                                               min_samples_split=20,
                                               min_samples_leaf=5),
                         algorithm="SAMME",
                         n_estimators=21,
                         learning_rate=0.8)
print(Ada)

epoch = 5
scores = []

# Train model
start = datetime.now()
for i in range(epoch):
    Ada.fit(feature_train_scaled, y_train_decode)
    scores.append(Ada.score(feature_test_scaled, y_test_decode))  # accuracy
print("This took ", datetime.now() - start)
print(u'The accuracy of the model is: ')
display_scores(scores)  # accuracy

params = {
    "learning_rate": uniform(0.1, 0.9),  # default 1.0
    "n_estimators": randint(10, 120)  # default 50
    # "max_depth": randint(2, 6)  # default 3
}
search = RandomizedSearchCV(Ada,
                            param_distributions=params,
                            random_state=42,
                            n_iter=30,
from kullback_leibner_divergence_criterion import KullbackLeibnerCriterion kldc = KullbackLeibnerCriterion(1, np.array([2], dtype='int64')) #Create the tree dt = DecisionTreeClassifier(max_depth=2, criterion=kldc) # Create and fit an AdaBoosted decision tree bdt = AdaBoostClassifier(dt, algorithm="SAMME", n_estimators=200) bdt.fit(X, y, w) #from sklearn.ensemble import RandomForestClassifier #bdt = RandomForestClassifier(criterion=kldc, max_depth=2, n_estimators=100) #bdt.fit(X, y) print('distance score: ', bdt.score(X, y)) plot_colors = "br" plot_step = 0.02 class_names = "AB" plt.figure(figsize=(10, 5)) # Plot the decision boundaries plt.subplot(121) x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)) Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
print('Accuracy (a decision tree):', dt.score(test_data, test_labels)) #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html rfc = RandomForestClassifier(n_estimators=1000) rfc.fit(train_data, train_labels) print('Accuracy (a random forest):', rfc.score(test_data, test_labels)) #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=1000, learning_rate=0.1) abc.fit(train_data, train_labels) print('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels)) ###Own implementation of Bagging: np.random.seed(1) B = 1000 n = train_data.shape[0] sn = int(n*2.0/3.0) # nr of training data in subset for each tree nf = train_data.shape[1] all_preds = np.zeros((B,test_data.shape[0])) for b in range(B): bs_sample_index = np.random.choice(range(n), size=sn, replace=True) subt = train_data[bs_sample_index,]
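    # The hand-rolled bagging loop above is cut off after drawing the bootstrap sample.
    # A sketch of how it might continue (the unrestricted tree and the majority-vote
    # aggregation are assumptions, not the original author's code):
    subl = train_labels[bs_sample_index]          # labels of the bootstrap sample
    tree = DecisionTreeClassifier()
    tree.fit(subt, subl)
    all_preds[b, :] = tree.predict(test_data)

# Majority vote over the B trees, then accuracy against the held-out test labels.
from scipy import stats
bagged_pred = stats.mode(all_preds, axis=0)[0].ravel()
print('Accuracy (own bagging):', np.mean(bagged_pred == test_labels))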
dtree.fit(x_train, y_train)
dtree.score(x_test, y_test)   # 0.625
dtree.score(x_train, y_train)  # 100%
# model is overfitting

################ Now With BaggingClassifier ############
bg = BaggingClassifier(DecisionTreeClassifier(),
                       max_samples=0.5,
                       max_features=1.0,
                       n_estimators=20)
bg.fit(x_train, y_train)  # fitting the model
bg.score(x_test, y_test)   # 66.25
bg.score(x_train, y_train)  # 94.375
# overfitting is reduced compared to the single tree

################ Now With BoostingClassifier ###########
ada = AdaBoostClassifier(DecisionTreeClassifier(),
                         n_estimators=10,
                         learning_rate=1)
ada.fit(x_train, y_train)
ada.score(x_test, y_test)
ada.score(x_train, y_train)  # 100%
# model is overfitting

# Looking at all the models above, the bagging model gives the best result
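# A common remedy for the overfitting noted above is to boost shallow trees instead of
# full-depth ones. A sketch, reusing x_train/x_test from above (max_depth=1 and
# n_estimators=100 are assumptions, not tuned values):

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

ada_stumps = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                n_estimators=100,
                                learning_rate=1)
ada_stumps.fit(x_train, y_train)
print(ada_stumps.score(x_train, y_train), ada_stumps.score(x_test, y_test))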
score_time = time() - start  # get the score time
print("{:<15}| score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
    name, score, train_time, score_time))

# Fitting AdaBoost Classification to the Training set
classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=7,
                                                       min_samples_split=20,
                                                       min_samples_leaf=5),
                                algorithm="SAMME",
                                n_estimators=200,
                                learning_rate=0.8)
start = time()
classifier.fit(X_train, y_train)
train_time = time() - start
start = time()
score = classifier.score(X_test, y_test)
score_time = time() - start
print("score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
    score, train_time, score_time))

# Calculating feature importance
feature_name = cv.get_feature_names()
feature_name = np.array(feature_name)
feature_name = np.insert(feature_name, 0, "avg_star_rating", axis=0)
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(100):
    print("%2d) %-*s %f" %
          (f + 1, 30, feature_name[indices[f]], importances[indices[f]]))
# 1) dryer 0.155159
testSetX1 = np.array(testSetX1) testSetY1 = np.array(testSetY1) scaler1 = StandardScaler() scaler1.fit(usedData) usedDataScale2 = scaler1.transform(usedData) testSetXScale2 = scaler1.transform(testSetX) clf1 = AdaBoostClassifier(n_estimators=200) clf1 = clf1.fit(usedDataScale2, usedValue) #print("\n importance 1:") print(clf1.predict_proba(testSetXScale2)) print(clf1.predict(testSetXScale2)) value1 = clf1.predict(testSetXScale2) value1Score = clf1.score(testSetXScale2, testSetY) clf2 = MLPClassifier() clf2 = clf2.fit(usedData, usedValue) #print("\n importance 2:") print(clf2.predict(testSetX)) print(clf2.predict_proba(testSetX)) value2 = clf2.predict(testSetX) value2Score = clf2.score(testSetX, testSetY) clf3 = KNeighborsClassifier(n_neighbors=nbNeighbors, weights=weightValue) clf3 = clf3.fit(usedData1, usedValue1) #print("\n importance 3:") #print(clf3.predict_proba(testSetX1)) print(clf3.predict(testSetX1)) value3 = clf3.predict(testSetX1)
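# The three sets of hard predictions (value1, value2, value3) are collected above. If
# they are to be combined, one plausible approach is a simple majority vote; a sketch,
# assuming binary 0/1 labels and that the rows of the three test sets line up:

import numpy as np

votes = np.vstack([value1, value2, value3])
majority = (votes.sum(axis=0) >= 2).astype(int)   # at least 2 of 3 classifiers agree
print("Majority-vote accuracy:", np.mean(majority == testSetY))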