def baggingknn(trainmat, laber, testmat, tlaber, mat, mlaber):
    clf1 = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=15, max_features=None, min_samples_split=6),
        algorithm="SAMME", n_estimators=30, learning_rate=0.9)
    clf = KNeighborsClassifier(weights='distance', n_neighbors=1,
                               algorithm='ball_tree', metric='minkowski', p=1)
    # clf = svm.SVC(C=500, kernel='rbf', gamma=0.001, decision_function_shape='ovr')
    clfb = BaggingClassifier(base_estimator=clf1, max_samples=1.0,
                             max_features=1.0, n_estimators=20)
    clfb.fit(trainmat, laber)
    score = clfb.score(testmat, tlaber)
    print(score)
    score1c1 = cross_val_score(clf, mat, mlaber, cv=5, scoring='accuracy')
    scorec2 = cross_val_score(clfb, mat, mlaber, cv=5, scoring='accuracy')
    print('knn')
    print(score1c1.mean())
    print('bagging')
    print(scorec2.mean())
    return score
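# A minimal, self-contained sketch of the same KNN-vs-bagged-AdaBoost comparison
# (illustrative only: the digits dataset stands in for the original data, and note
# that base_estimator was renamed estimator in scikit-learn >= 1.2).
from sklearn.datasets import load_digits
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X, y = load_digits(return_X_y=True)
knn = KNeighborsClassifier(weights='distance', n_neighbors=1)
boosted_tree = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=15, min_samples_split=6),
    algorithm="SAMME", n_estimators=30, learning_rate=0.9)
bagged = BaggingClassifier(boosted_tree, n_estimators=20)
print('knn    ', cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean())
print('bagging', cross_val_score(bagged, X, y, cv=5, scoring='accuracy').mean())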
def checkBaggingEffectOnOverFitting_knn():
    # Note: despite the original name mentioning decision trees, the value swept
    # here is the base KNN's n_neighbors, so the axis labels reflect that.
    neighbors_range = np.arange(1, 50)
    train_accuracy = np.empty(len(neighbors_range))
    test_accuracy = np.empty(len(neighbors_range))
    num = 20
    for i, k in enumerate(neighbors_range):
        knn = KNeighborsClassifier(n_neighbors=k)
        bagging = BaggingClassifier(base_estimator=knn, max_samples=0.5,
                                    max_features=0.5, n_estimators=num,
                                    random_state=12)
        bagging.fit(X_train, Y_train)
        train_accuracy[i] = bagging.score(X_train, Y_train)
        test_accuracy[i] = bagging.score(X_test, Y_test)
    plt.plot(neighbors_range, test_accuracy, label='Testing dataset Accuracy - Bagging Overfit')
    plt.plot(neighbors_range, train_accuracy, label='Training dataset Accuracy - Bagging Overfit')
    plt.legend()
    plt.xlabel('n_neighbors')
    plt.ylabel('Accuracy')
    plt.show()
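# Companion sketch (not in the original): plotting the un-bagged KNN curves over
# the same k range makes the bagging comparison explicit. Assumes the same
# X_train/X_test/Y_train/Y_test globals used above.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

def check_plain_knn_baseline():
    ks = np.arange(1, 50)
    train_acc = np.empty(len(ks))
    test_acc = np.empty(len(ks))
    for i, k in enumerate(ks):
        knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
        train_acc[i] = knn.score(X_train, Y_train)
        test_acc[i] = knn.score(X_test, Y_test)
    plt.plot(ks, test_acc, label='Testing dataset Accuracy - plain KNN')
    plt.plot(ks, train_acc, label='Training dataset Accuracy - plain KNN')
    plt.legend()
    plt.xlabel('n_neighbors')
    plt.ylabel('Accuracy')
    plt.show()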
def main():
    # getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(
        egm_matrix, cancer_onehot, test_size=0.20, random_state=0)

    # form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                                max_features=20000, max_samples=1.0,
                                n_estimators=11, random_state=2)
    bagging.fit(scaled_train_X, train_Y)
    pred_Y = bagging.predict(scaled_test_X)
    f1_score_bagging = f1_score(test_Y, pred_Y, average=None)
    print("f1 score is: " + str(f1_score_bagging))
    # max_features=20000 -> f1 score is: [0.77852349 0.47619048]
    mean_train_accuracy = bagging.score(scaled_train_X, train_Y)
    print("mean_train_accuracy is: " + str(mean_train_accuracy))
    mean_test_accuracy = bagging.score(scaled_test_X, test_Y)
    print("mean_test_accuracy is: " + str(mean_test_accuracy))
    print(bagging)
def bagging(x_train, x_test, y_train, y_test, n, classifier=None):
    print("---------------------------------------------\n")
    print("Results for bagging: [" + str(classifier) + " " + str(n) + "] \n")
    beginTime = time.time()
    print("Creating classifier ... \n")
    if classifier == "svc":
        unit = SVC(kernel='poly', gamma='auto')
    elif classifier == "tree":
        unit = tree.DecisionTreeClassifier()
    elif classifier == "knn":
        unit = KNeighborsClassifier(3, algorithm="brute")
    else:
        unit = None  # BaggingClassifier then falls back to its default base estimator
    clf = BaggingClassifier(unit, n_estimators=n)
    clf.fit(x_train, y_train.ravel())
    save_model(clf, "bagging_" + str(classifier) + "_" + str(n))
    print('Training accuracy: {}'.format(clf.score(x_train, y_train)))
    print('Test accuracy: {}'.format(clf.score(x_test, y_test)))
    y_predict_train = clf.predict(x_train)
    y_predict_test = clf.predict(x_test)
    print("Confusion matrix, training values: \n" + str(confusion_matrix(y_train, y_predict_train)))
    print("Confusion matrix, test values: \n" + str(confusion_matrix(y_test, y_predict_test)))
    endTime = time.time()
    elapsedTime = endTime - beginTime
    print(f"Execution time: {elapsedTime:.4f} \n")
def best_first(self, pool, score_index, x_train, y_train, x_validation,
               y_validation, x_test, y_test):
    BagPercepCurrent = BaggingClassifier(
        linear_model.Perceptron(max_iter=5), self.pool_size)
    BagPercepCurrent.fit(x_train, y_train)
    BagPercepCurrent.estimators_ = [pool.estimators_[score_index[0]]]
    best_score = BagPercepCurrent.score(x_validation, y_validation)
    best_score_test = BagPercepCurrent.score(x_test, y_test)
    metrics = (best_score_test, ) + self.calc_metrics(
        BagPercepCurrent.predict(x_test), y_test)
    best_index = 1
    best_score_test = 0
    diversity_kappa = 0
    for i, j in enumerate(list(score_index[1:])):
        BagPercepCurrent.estimators_ += [pool.estimators_[j]]
        score_current = BagPercepCurrent.score(x_validation, y_validation)
        if best_score < score_current:
            best_score = score_current
            best_index = i
            best_score_test = BagPercepCurrent.score(x_test, y_test)
            metrics = (best_score_test, ) + self.calc_metrics(
                BagPercepCurrent.predict(x_test), y_test)
            diversity_kappa = self.pairwise_diversity_measure(
                BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
    best_index += 2
    # print("best index", best_index, best_score, best_score_test)
    return (metrics) + (diversity_kappa, )
def svm():
    training_set_size = [.1, .25, .5, .75, .9]
    kernels = ['rbf', 'poly']
    columns = ['Kernel', 'Training Set Size', 'Training Score', 'Test Score',
               'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)
    for kernel in kernels:
        for tset_size in training_set_size:
            X_train, X_test, y_train, y_test = train_test_split(
                encoded_data[list(set(encoded_data.columns) - set(['Target']))],
                encoded_data['Target'], train_size=tset_size)
            scaler = preprocessing.StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float32')),
                                   columns=X_train.columns)
            X_test = scaler.transform(X_test.astype('float32'))
            start = time.time()
            bagging_svm = BaggingClassifier(SVC(kernel=kernel, cache_size=1000), n_jobs=-1)
            print(bagging_svm)
            bagging_svm.fit(X_train, y_train)
            end_train = time.time() - start
            train_score = bagging_svm.score(X_train, y_train)
            start_test = time.time()
            test_score = bagging_svm.score(X_test, y_test)
            end_test = time.time() - start_test
            values = [kernel, tset_size, train_score, test_score, end_train, end_test]
            df.loc[len(df)] = values
            print(' '.join(str(col) for col in columns))
            print(' '.join(str(val) for val in values))
    df.to_excel('diabetes_svm.xls')
def reduce_error(self, pool, score_index, x_train, y_train, x_validation,
                 y_validation, x_test, y_test):
    BagPercepCurrent = BaggingClassifier(
        linear_model.Perceptron(max_iter=5), self.pool_size)
    BagPercepCurrent.fit(x_train, y_train)
    ensemble_index = set()
    ensemble_index.add(score_index[0])
    ensemble = [pool.estimators_[score_index[0]]]
    BagPercepCurrent.estimators_ = ensemble
    best_score = BagPercepCurrent.score(x_validation, y_validation)
    while True:
        index_best_score = 0
        BagPercepCurrent.estimators_ = ensemble
        best_score_test = BagPercepCurrent.score(x_test, y_test)
        metrics = (best_score_test, ) + self.calc_metrics(
            BagPercepCurrent.predict(x_test), y_test)
        for i in list(score_index):
            if i not in ensemble_index:
                BagPercepCurrent.estimators_ = ensemble + [pool.estimators_[i]]
                score_current = BagPercepCurrent.score(x_validation, y_validation)
                if best_score < score_current:
                    best_score = score_current
                    index_best_score = i
        if index_best_score != 0:
            ensemble_index.add(index_best_score)
            ensemble.append(pool.estimators_[index_best_score])
        else:
            # no candidate improved the validation score: stop and report
            BagPercepCurrent.estimators_ = ensemble
            kappa_diversity = self.pairwise_diversity_measure(
                BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
            disagreement_diversity_ = self.disagreement_diversity_measure(
                BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
            return (metrics) + (kappa_diversity, disagreement_diversity_, )
        if len(ensemble_index) == self.pool_size:
            # pool exhausted: compute the same diversity measures before returning
            # (the original referenced them here without ever computing them)
            BagPercepCurrent.estimators_ = ensemble
            kappa_diversity = self.pairwise_diversity_measure(
                BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
            disagreement_diversity_ = self.disagreement_diversity_measure(
                BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
            return (metrics) + (kappa_diversity, disagreement_diversity_, )
def BaggingFunc(samples):
    bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=samples,
                           max_features=1.0, n_estimators=25, bootstrap=True)
    bg.fit(xtr, ytr)
    score = round(bg.score(xtst, ytst), 4)
    print("Test-set accuracy for max_samples={}: ".format(samples), score)
    return score
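# Possible driver for BaggingFunc (illustrative, not from the original); it
# assumes the xtr/ytr/xtst/ytst globals referenced above are already defined.
scores = {s: BaggingFunc(s) for s in (0.25, 0.5, 0.75, 1.0)}
best_fraction = max(scores, key=scores.get)
print("best max_samples fraction:", best_fraction, "->", scores[best_fraction])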
def Bagging_Using_DT():
    data = pd.read_csv('mnist.csv')
    DF_x = data.iloc[:, 1:]  # Pixels
    DF_y = data.iloc[:, 0]   # Labels
    x_train, x_test, y_train, y_test = train_test_split(
        DF_x, DF_y, test_size=0.3, random_state=5)

    # ===================== Using Decision Tree =====================
    DT = DecisionTreeClassifier()
    DT.fit(x_train, y_train)
    print("\n----------------------------------------------------\n")
    print("Training Accuracy Using Decision Tree : ", DT.score(x_train, y_train) * 100)
    print("Testing Accuracy Using Decision Tree : ", DT.score(x_test, y_test) * 100)
    print("\n----------------------------------------------------\n")

    # ========= Using Random Forest - Ensemble Of Decision Trees =========
    RF = RandomForestClassifier(n_estimators=20)
    RF.fit(x_train, y_train)
    print("Training Accuracy Using Random Forest : ", RF.score(x_train, y_train) * 100)
    print("Testing Accuracy Using Random Forest : ", RF.score(x_test, y_test) * 100)
    print("\n----------------------------------------------------\n")

    # =============== Bagging Using Decision Tree ===============
    BG = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.7,
                           max_features=1.0, n_estimators=25)
    BG.fit(x_train, y_train)
    print("Training Accuracy Using Bagging Classifier : ", BG.score(x_train, y_train) * 100)
    print("Testing Accuracy Using Bagging Classifier : ", BG.score(x_test, y_test) * 100)
    print("\n----------------------------------------------------\n")
def model_fusion(model, X, y):
    """
    Model fusion with bagging: split the data into train / validation / test
    sets, then print the fused model's accuracy on the validation set and on
    the test set.
    """
    X_train_all, X_test, y_train_all, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)
    train_data, ver_data, train_label, ver_label = train_test_split(
        X_train_all, y_train_all, test_size=0.3, random_state=1)
    bag = BaggingClassifier(base_estimator=model, n_estimators=10, random_state=1,
                            max_samples=0.2, max_features=0.9)
    bag.fit(train_data, train_label)
    print('model_fusion', bag.score(ver_data, ver_label))
    print('result-->', bag.score(X_test, y_test))
    return bag
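# Illustrative call (not from the original): a shallow decision tree and the
# iris data stand in for the real model and (X, y).
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
fused = model_fusion(DecisionTreeClassifier(max_depth=3), X_demo, y_demo)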
def task1():
    # 1. Load digit dataset (D).
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target

    # 2. 70% of tuples are used for training, 30% for testing.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, train_size=0.7, random_state=42)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)  # now apply the transformations to the data
    X_test = scaler.transform(X_test)

    # 3. Create an instance of a multi-layer perceptron network
    #    (four hidden layers with 16, 8, 4, and 2 neurons, in order).
    mlp = MLPClassifier(hidden_layer_sizes=(16, 8, 4, 2), max_iter=1001)
    mlp.fit(X_train, y_train)

    # 4. Apply a bagging classifier with eight base classifiers created as in
    #    the previous step.
    clf = BaggingClassifier(mlp, n_estimators=8)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)

    predictions = mlp.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=mlp.classes_)
    accuracy = accuracy_score(y_test, predictions)
    # diagonal of the confusion matrix = correctly classified instances per class
    predicted_instances_per_class = cm[np.eye(len(clf.classes_)).astype("bool")]

    # 5./6. Print findings (per-class correct counts; the 540 test instances
    # are 30% of the 1797 digits). See the per-base-classifier sketch below.
    estimators = clf.estimators_
    for i in predicted_instances_per_class:
        print(i, " instances of this class are correctly classified by the learner")
    print("-------------------------------------------")
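# A hedged sketch for step 5 (not in the original): count correctly classified
# test instances for each bagging member and for the whole ensemble. Pass in
# task1's fitted clf and its test split; estimators_features_ covers the general
# case where members are trained on feature subsets.
import numpy as np

def per_estimator_correct_counts(clf, X_test, y_test):
    for idx, (est, feats) in enumerate(zip(clf.estimators_, clf.estimators_features_)):
        n_correct = int((est.predict(X_test[:, feats]) == y_test).sum())
        print("base classifier {}: {} of {} test instances correct".format(
            idx, n_correct, len(y_test)))
    n_ens = int((clf.predict(X_test) == y_test).sum())
    print("bagging ensemble: {} of {} test instances correct".format(
        n_ens, len(y_test)))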
class ELMensemble(object):
    def __init__(self, n_hidden, C, n_estimators):
        self.n_hidden = n_hidden
        self.C = C
        self.ensemble = BaggingClassifier(
            base_estimator=ELMClassifier(n_hidden=n_hidden, C=C),
            n_jobs=-1, n_estimators=n_estimators,
            max_samples=0.5, max_features=0.5,
            bootstrap=True, bootstrap_features=False, oob_score=False)

    def train(self, X, y):
        self.ensemble.fit(X, y)
        return self.ensemble.score(X, y)

    def score(self, X, y):
        return self.ensemble.score(X, y)
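# Usage sketch (assumption-laden): ELMClassifier is not part of scikit-learn,
# so this only runs where an ELM implementation exposing (n_hidden, C) is
# importable, as the class above presumes; the iris data is a placeholder.
from sklearn.datasets import load_iris

X_elm, y_elm = load_iris(return_X_y=True)
elm_bag = ELMensemble(n_hidden=50, C=1.0, n_estimators=10)
print("train accuracy:", elm_bag.train(X_elm, y_elm))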
def bagging_classifier(X_train, y_train, X_test, y_test):
    bagged_tree = BaggingClassifier(
        DecisionTreeClassifier(criterion=_hyper['criterion'],
                               max_depth=_hyper['max_depth']),
        n_estimators=20)
    # Fit to the training data
    bagged_tree.fit(X_train, y_train)
    # Training & testing accuracy score
    train_accuracy = bagged_tree.score(X_train, y_train)
    test_accuracy = bagged_tree.score(X_test, y_test)
    print(
        f"Bagging classifier - train accuracy: {train_accuracy} test_accuracy: {test_accuracy}"
    )
    return bagged_tree
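# bagging_classifier reads the module-level _hyper dict, which is defined
# elsewhere in the original; hypothetical values and a placeholder iris split
# for illustration only:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

_hyper = {'criterion': 'gini', 'max_depth': 5}  # illustrative, not the original values
X_tr, X_te, y_tr, y_te = train_test_split(*load_iris(return_X_y=True), random_state=0)
model = bagging_classifier(X_tr, y_tr, X_te, y_te)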
def main():
    # load data
    file_lssvm = "hw2_lssvm_all.dat"
    size_train = 400
    with open(file_lssvm, 'r') as fr:
        list_line = fr.readlines()
    x_train, y_train = load_xy(list_line[:size_train])
    x_test, y_test = load_xy(list_line[size_train:])
    list_lambda = [0.05, 0.5, 5, 50, 500]

    # Q9, Q10: run regression
    ein = []
    eout = []
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        rcf.fit(x_train, y_train)
        Ein = 1 - rcf.score(x_train, y_train)
        Eout = 1 - rcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Ridge Classifier:")
    print("minEin = {}, argmin lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("minEout = {}, argmin lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))
    print("==========")

    # Q11, Q12: Bagging
    ein.clear()
    eout.clear()
    num_iter = 250
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        bcf = BaggingClassifier(base_estimator=rcf, n_estimators=num_iter,
                                n_jobs=-1, random_state=0)
        bcf.fit(x_train, y_train)
        Ein = 1 - bcf.score(x_train, y_train)
        Eout = 1 - bcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Bagging Ridge Classifier:")
    print("minEin = {}, argmin lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("minEout = {}, argmin lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))
    return
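# load_xy is not shown above; a plausible implementation for the whitespace-
# separated hw2_lssvm_all.dat format (features first, label in the last column).
# The file layout is an assumption, so this is a sketch rather than the
# original helper:
import numpy as np

def load_xy(list_line):
    data = np.array([[float(v) for v in line.split()] for line in list_line])
    return data[:, :-1], data[:, -1]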
def bagging(x_train, y_train):
    model = BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
def bagging(df, dep_var, features, test):
    print('Bagging')
    best = []
    for sample in [1, 2, 3, 4, 5]:
        for x in range(1, 6):
            for n_est in range(3, 21, 3):
                start_time = time.time()
                clf = BaggingClassifier(max_features=x, max_samples=sample,
                                        n_estimators=n_est)
                clf.fit(df[features], df[dep_var])
                score = clf.score(test[features], test[dep_var])
                print('sample: ', sample, ' Max_F: ', x, 'n estimators: ', n_est, score)
                end_time = time.time()
                tm = end_time - start_time
                print('Time: ', tm)
                best.append([score, (x, sample, n_est, tm), clf])
    best.sort(key=lambda b: b[0], reverse=True)
    print(best[0])
    best = best[0]
    return {
        'score': best[0],
        'max_features': best[1][0],
        'max_sample': best[1][1],
        'n_estimators': best[1][2],
        'time': best[1][3],
        'clf': best[2],  # the best-scoring classifier, not the last one fitted
    }
def baggingClassifier_Model(df):
    # Split into train/test sets
    X_train, X_test, y_train, y_test = train_test_data_split(df)
    # Scale the features
    X_train_norm, X_test_norm = scaler_features(X_train, X_test)

    # Declare multiple base models
    lr = LogisticRegression()
    knn = KNeighborsClassifier()
    dtc = DecisionTreeClassifier()
    gnb = GaussianNB()

    # Build a bagging model around each base model
    models = [lr, knn, gnb, dtc]
    for model in models:
        # Create the Bagging Classifier
        bag = BaggingClassifier(base_estimator=model, n_estimators=10, bootstrap=True)
        # Fit the classifier on the training features and labels
        bag = bag.fit(X_train_norm, y_train)
        # Predict using X_test_norm
        y_pred_bag = bag.predict(X_test_norm)
        # Compute the test accuracy
        result = bag.score(X_test_norm, y_test)
        print("Accuracy: {:.2%}".format(result), [model])
def main():
    """
    Main function.
    """
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # generate predictions
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(QDA())
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
def bagging(self):
    bag = BaggingClassifier(n_estimators=100)
    bag.fit(self.X_train, self.y_train)
    acc = round(bag.score(self.X_train, self.y_train) * 100, 2)
    print("acc with bagging:", acc)
    self.y_pred = bag.predict(self.X_test)
def call_function():
    # prepare data
    try:
        trainingSet = []
        testSet = []
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'comb.csv']), split, trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))

        # generate predictions
        trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1]
        X = np.array(trainData)
        y = np.array(trainingSet)[:, columns]
        clf = BaggingClassifier(KNN(n_neighbors=10, weights='uniform',
                                    algorithm='auto', leaf_size=10, p=1,
                                    metric='minkowski', metric_params=None,
                                    n_jobs=1))
        clf.fit(X, y)
        testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:, columns]
        accuracy = clf.score(X_test, y_test)
        accuracy *= 100
        print("Accuracy %:", accuracy)
    except Exception:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
def call_function():
    # prepare data
    try:
        trainingSet = []
        testSet = []
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'comb.csv']), split, trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))

        # generate predictions
        trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1]
        X = np.array(trainData)
        y = np.array(trainingSet)[:, columns]
        clf = BaggingClassifier(SVC(C=1.0, kernel='linear', degree=5, gamma='auto',
                                    coef0=0.0, shrinking=True, probability=False,
                                    tol=0.001, cache_size=200, class_weight=None,
                                    verbose=False, max_iter=-1, random_state=None))
        clf.fit(X, y)
        testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:, columns]
        accuracy = clf.score(X_test, y_test)
        accuracy *= 100
        print("Accuracy %:", accuracy)
    except Exception:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
def call_function():
    # prepare data
    try:
        trainingSet = []
        testSet = []
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'LDAdata.csv']), split, trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))

        trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1]
        X = np.array(trainData)
        y = np.array(trainingSet)[:, columns]
        clf = BaggingClassifier(LDA())
        clf.fit(X, y)
        testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:, columns]
        accuracy = clf.score(X_test, y_test)
        accuracy *= 100
        print("Accuracy %:", accuracy)
    except Exception:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)
        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        warn_msg = (
            "Some inputs do not have OOB scores. This probably means too few "
            "estimators were used to compute any reliable oob estimates.")
        with pytest.warns(UserWarning, match=warn_msg):
            clf = BaggingClassifier(
                base_estimator=base_estimator,
                n_estimators=1,
                bootstrap=True,
                oob_score=True,
                random_state=rng,
            )
            clf.fit(X_train, y_train)
def main():
    """
    Main function.
    """
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # generate predictions
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(KNN(n_neighbors=10, weights='uniform',
                                algorithm='auto', leaf_size=10, p=1,
                                metric='minkowski', metric_params=None,
                                n_jobs=1))
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
def main():
    """
    Main function.
    """
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # generate predictions
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(SVC(C=1.0, kernel='linear', degree=5, gamma='auto',
                                coef0=0.0, shrinking=True, probability=False,
                                tol=0.001, cache_size=200, class_weight=None,
                                verbose=False, max_iter=-1, random_state=None))
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
def analysis(x_tr, y_tr, x_te=None, y_te=None):
    # Create the classifier
    clf = BaggingClassifier(n_estimators=100)
    # Train the model
    clf.fit(x_tr, y_tr)
    # Compute the training accuracy
    acc = clf.score(x_tr, y_tr)
    # Compute the CV scores
    scores = cross_val_score(clf, x_tr, y_tr, cv=5)
    print("\n")
    print("Bagging Accuracy = %3.4f" % (acc))
    print("CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Classify the data
    test_score = 0
    if x_te is not None:
        yhat = clf.predict(x_te)
        test_score, notneeded = hp.check_accuracy(yhat, y_te)
    else:
        yhat = None
    data_scores = np.array([scores.mean(), scores.std(), acc, test_score])
    return yhat, data_scores
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error (older variant using sklearn's assert_less / assert_warns helpers).
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)
        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(UserWarning,
                     BaggingClassifier(base_estimator=base_estimator,
                                       n_estimators=1,
                                       bootstrap=True,
                                       oob_score=True,
                                       random_state=rng).fit,
                     X_train, y_train)
def do_Bagging():
    x_train, _, y_train, _, x_test, y_test = Rf.read_data()
    bagging = BaggingClassifier()
    bagging.fit(x_train, y_train)
    score = bagging.score(x_test, y_test)
    print(score)
    Rf.save_model("Bagging2", bagging)
def Teste():
    # j48 = tree.DecisionTreeClassifier()
    # j48 = j48.fit(data_training, target_training)
    bagging = BaggingClassifier(tree.DecisionTreeClassifier(),
                                max_samples=1.0, max_features=0.5)
    bagging.fit(data_training, target_training)
    print(bagging.score(data_test, target_test))
def decisionTreee(depth, numberOfbags):
    decisionTre = DecisionTreeClassifier(max_depth=int(depth))
    baggClass = BaggingClassifier(decisionTre, n_estimators=int(numberOfbags),
                                  max_samples=0.5, max_features=1.0)
    baggClass.fit(X_train, Y_train)
    return baggClass.score(X_dev, Y_dev)
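# Possible sweep over both hyperparameters (illustrative, not from the original);
# it relies on the X_train/Y_train/X_dev/Y_dev globals the function closes over.
best = max(((d, b, decisionTreee(d, b))
            for d in range(1, 11) for b in (5, 10, 20)),
           key=lambda t: t[2])
print("best depth={}, bags={}, dev accuracy={:.4f}".format(*best))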
def main():
    '''main function'''
    bagging = BaggingClassifier(DecisionTreeClassifier())
    iris = load_iris()
    x = iris.data
    y = iris.target
    # train, test, train_, test_ = train_test_split(x, y, test_size=0.2, random_state=42)
    bagging.fit(x, y)
    bagging.predict(x[:2])
    print(bagging.score(x[:2], y[:2]))
def train_bagging():
    model = build_model()
    bagging_model = BaggingClassifier(base_estimator=model,
                                      n_estimators=bagging_num_estimator,
                                      max_samples=bagging_sample_fraction,
                                      oob_score=bagging_use_oob)
    # train model
    bagging_model.fit(XC, yc)
    # persist model
    if persist_model:
        models = bagging_model.estimators_
        for m in zip(range(0, len(models)), models):
            model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
            joblib.dump(m[1], model_file)
    score = bagging_model.score(XC, yc)
    print("average error %.3f" % (1.0 - score))
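# Round-trip sketch (assumptions: integer class labels and the same
# model_file_directory/model_file_prefix configuration as above): the persisted
# members can be reloaded with joblib and combined by a hand-rolled majority vote.
import joblib
import numpy as np

def load_bagged_predict(X, n_models):
    preds = []
    for i in range(1, n_models + 1):
        path = model_file_directory + "/" + model_file_prefix + "_" + str(i) + ".mod"
        preds.append(joblib.load(path).predict(X))
    preds = np.asarray(preds, dtype=int)
    # column-wise majority vote across the reloaded members
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, preds)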
# 0.904761904762
export_graphviz(ctree, out_file='ctree_entropy.dot', feature_names=words,
                class_names=author_names, filled=True, rounded=True,
                special_characters=True)
graph_gini = pydot.graph_from_dot_file('ctree_entropy.dot')
graph_gini.write_png('ctree_entropy.png')

# feature evaluation
ind_entropy = np.argsort(ctree.feature_importances_)
features_entropy = np.array(words)[ind_entropy][::-1]

###############################################################################
# Bagging
bagging = BaggingClassifier()
bagging.fit(training_data, training_label)
# score() returns accuracy, so these are accuracies rather than errors
acc_bag_tr = bagging.score(training_data, training_label)
acc_bag_ts = bagging.score(test_data, test_label)
# 0.996604414261
# 0.94444444444

###############################################################################
# Boosting
# AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(training_data, training_label)
acc_ada_tr = adaboost.score(training_data, training_label)
acc_ada_ts = adaboost.score(test_data, test_label)
# 0.9015280135823429
# 0.8134920634920634
ind_adaboost = np.argsort(adaboost.feature_importances_)
br.fit(X, y)
print('Score BaggingRegressor = %s' % (br.score(X, y)))
scores_br = cross_val_score(br, X, y, cv=5)
print('Cross Val Scores of BR = %s' % (np.mean(scores_br)))

if name == 'Iris' or name == 'Digits':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    print('Score RandomForestClassifier = %s' % (rfc.score(X, y)))
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
    print('Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc)))
    bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                           n_estimators=n_estimators)
    bc.fit(X, y)
    print('Score BaggingClassifier = %s' % (bc.score(X, y)))
    scores_bc = cross_val_score(bc, X, y, cv=5)
    print('Cross Val Scores of BaggingClassifier = %s' % (np.mean(scores_bc)))

# *************************************
# Question 15
# *************************************
from utils import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor)
from sklearn.ensemble import (BaggingClassifier, BaggingRegressor)
from sklearn.tree import (DecisionTreeClassifier, DecisionTreeRegressor)
from sklearn.utils import shuffle
proba = pd.DataFrame(rf.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)

# Extra Trees accuracy (not as good as random forest)
et = ExtraTreesClassifier(class_weight='balanced')
et.fit(x_train, y_train)
et.score(x_test, y_test)
proba = pd.DataFrame(et.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)

# Bagging accuracy (competitive for best, depending on features)
bc = BaggingClassifier(dt)
bc.fit(x_train, y_train)
bc.score(x_test, y_test)
proba = pd.DataFrame(bc.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)

# Boosting accuracy (worst); also takes too long to build the model, avoid
ab = AdaBoostClassifier(dt)
ab.fit(x_train, y_train)
ab.score(x_test, y_test)
proba = pd.DataFrame(ab.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test, proba)
auc(false_positive_rate, true_positive_rate)

# Gradient Boosting accuracy (competitive for best, depending on features)
gb = GradientBoostingClassifier()
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets

if __name__ == '__main__':
    data = datasets.load_digits()
    X_train = data.data[:-20]
    y_train = data.target[:-20]
    X_test = data.data[-20:]
    y_test = data.target[-20:]
    for num in range(1, 6):
        clf = BaggingClassifier(n_estimators=num, n_jobs=4)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(num, score)
                           max_samples=0.1)
bagged.fit(x_train, y_train)

# initialize a random forest classifier
print('Training random forest...')
rfc = RandomForestClassifier(n_estimators=200, max_features=40,
                             min_samples_split=2, min_samples_leaf=1)
rfc.fit(x_train, y_train)

# training scores
print("Training scores...")
print(bdt.score(x_train, y_train))
print(bagged.score(x_train, y_train))
print(rfc.score(x_train, y_train))

# score the classifiers on the test set
# print("Scoring...")
# print(bdt.score(x_test, y_test))
# print(bagged.score(x_test, y_test))
# print(rfc.score(x_test, y_test))

# print("Writing predictions...")
predictions1 = bdt.predict(x_test)
predictions2 = bagged.predict(x_test)
predictions3 = rfc.predict(x_test)
predictions = []
for i in range(100):
# this file tests bagging on various algorithms
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from dlinghu_functions import *

x_train, y_train, x_test = read_data()
svm = SVC(C=128.0, gamma=8.0)
svm.fit(x_train, y_train)
print_cv_scores(svm, x_train, y_train)

#########################################################
# test bagging sample ratio, without replacement
for max_sample in np.arange(0.1, 1.0, 0.1):
    print('max_sample ratio = %s' % max_sample)
    svm_bagging = BaggingClassifier(svm, bootstrap=False,
                                    max_samples=max_sample, n_estimators=50)
    svm_bagging.fit(x_train, y_train)
    # test bagging
    print("In-sample score = %s" % svm_bagging.score(x_train, y_train))
    print_cv_scores(svm_bagging, x_train, y_train)

#########################################################
svm_bagging = BaggingClassifier(svm, bootstrap=True, n_estimators=50)
svm_bagging.fit(x_train, y_train)
print_cv_scores(svm_bagging, x_train, y_train)
data = []
tfile = '../exp2_raw_data/train11w.data'
train = pd.read_csv(tfile, sep='\t')

# preprocess: back-fill missing category_id from rows sharing the same creative_id
cateMap = {}
tmp = np.array(train[train['category_id'] > 0][['creative_id', 'category_id']])
for i, j in tmp:
    cateMap[i] = j
train['category_id'] = train['creative_id'].map(cateMap)
train = train.dropna(axis=0)

# init train
x = np.array(train.drop(['qq', 'description', 'imp_time', 'pic_url', 'web_url',
                         'product_id', 'advertiser_id', 'series_id', 'creative_id',
                         'product_type', 'click_num', 'pos_id'], axis=1))
y = np.array(train['click_num'])
xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
    x, y, test_size=0.1, random_state=0)

# some models
if __name__ == '__main__':
    # clf = MultinomialNB(alpha=0.1)
    # clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=20,
    #                                   min_samples_split=100, class_weight='balanced')
    clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    # clf = AdaBoostClassifier(n_estimators=350, learning_rate=0.03)
    # clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350,
    #                         learning_rate=0.03, nthread=4, subsample=0.95,
    #                         colsample_bytree=0.85, seed=4242)
    clf.fit(xTrain, yTrain)
    print(clf.score(xTrain, yTrain))
    print(clf.score(xTest, yTest))
def myclassify(numfiers=5, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest):
    count = 0

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    count += 1
    classifiers = [bagging2.score(xtest, ytest)]

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree2.score(xtest, ytest))

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging1.score(xtest, ytest))

    # A VotingClassifier over SVC/RFC/ETC/KNN/QDA and a standalone SVC were
    # tried here but are disabled; the label list below covers only the
    # classifiers that actually run.

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, qda.score(xtest, ytest))

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree1.score(xtest, ytest))

    if count < numfiers:
        # k-nearest neighbors: classifies by vote of the k nearest training
        # points, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn1.score(xtest, ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, lda.score(xtest, ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree3.score(xtest, ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging3.score(xtest, ytest))

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging4.score(xtest, ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree4.score(xtest, ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree6.score(xtest, ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn2.score(xtest, ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn3.score(xtest, ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn4.score(xtest, ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn5.score(xtest, ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, ncc1.score(xtest, ytest))

    if count < numfiers:
        # nearest shrunken centroid, one fit per shrink threshold
        for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            ncc2 = NearestCentroid(shrink_threshold=shrinkage)
            ncc2.fit(xtrain, ytrain)
            count += 1
            classifiers = np.append(classifiers, ncc2.score(xtest, ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree5.score(xtest, ytest))

    # labels aligned with the classifiers actually scored above (the original
    # list also carried entries for the disabled VotingClassifier and SVC,
    # which shifted every later label by two)
    classifierlabel = ["BaggingETC (bootstrap=False)", "ETC", "BaggingETC",
                       "QDA", "DTC", "KNN (default)", "LDA", "RFC",
                       "BaggingRFC (bootstrap=False)", "BaggingSVC (bootstrap=False)",
                       "RFC (bootstrap=False)", "GBC", "KNN (n_neighbors=10)",
                       "KNN (n_neighbors=3)", "KNN (ball_tree)", "KNN (kd_tree)",
                       "Nearest Centroid"]
    classifierlabel += ["Shrunken Centroid (threshold={})".format(s)
                        for s in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]]
    classifierlabel += ["ABC"]
    classifierlabel = classifierlabel[:len(classifiers)]
    for i in range(len(classifiers)):
        print("{} classifier has percent correct {}".format(classifierlabel[i],
                                                            classifiers[i]))
def SVM(submit):
    labeled_images_data = spio.loadmat("labeled_images.mat")
    unlabeled_images_data = spio.loadmat("unlabeled_images.mat")
    public_test_data = spio.loadmat("public_test_images.mat")
    hidden_test_data = spio.loadmat("hidden_test_images.mat")

    hidden_faces = hidden_test_data.get("hidden_test_images")
    faces_test = public_test_data.get("public_test_images")
    unlabeled_faces = unlabeled_images_data.get("unlabeled_images")
    labels = labeled_images_data.get("tr_labels")
    identities = labeled_images_data.get("tr_identity")
    faces = labeled_images_data.get("tr_images")

    # flatten each image stack to (n_samples, n_pixels)
    faces = faces.transpose(2, 0, 1)
    faces = faces.reshape((faces.shape[0], -1))
    hidden_faces = hidden_faces.transpose(2, 0, 1)
    hidden_faces = hidden_faces.reshape((hidden_faces.shape[0], -1))
    unlabeled_faces = unlabeled_faces.transpose(2, 0, 1)
    unlabeled_faces = unlabeled_faces.reshape((unlabeled_faces.shape[0], -1))
    faces_test = faces_test.transpose(2, 0, 1)
    faces_test = faces_test.reshape((faces_test.shape[0], -1))

    labels_s = labels.squeeze()

    # The labeled set was previously re-ordered so that examples sharing a
    # subject identity are interleaved across the label classes; that array was
    # cached with save_object("master") and is reloaded here.
    master_array = load_object("master")
    master_ident = master_array[:, 0]
    master_array = np.delete(master_array, 0, 1)
    master_labels = master_array[:, 0]
    master_array = np.delete(master_array, 0, 1)
    master_faces = master_array

    n_eigenfaces = 121
    # A RandomizedPCA(n_components=n_eigenfaces, whiten=True) reduction fitted
    # on unlabeled_faces was tried here (cached via save_object/load_object) but
    # is currently disabled, so its variance printout is disabled with it:
    # print('PCA captures {:.2f} percent of the variance in the dataset'.format(
    #     pca.explained_variance_ratio_.sum() * 100))

    # Preprocessing: normalize, then reshape back to 32x32 images
    hidden_faces = preprocessing.normalize(hidden_faces, norm="l2")
    master_faces = preprocessing.normalize(master_faces, norm="l2")
    faces_test = preprocessing.normalize(faces_test, norm="l2")
    hidden_faces = hidden_faces.reshape(len(hidden_faces), 32, 32)
    master_faces = master_faces.reshape(len(master_faces), 32, 32)
    faces_test = faces_test.reshape(len(faces_test), 32, 32)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Normal"), plt.xticks([]), plt.yticks([])

    # Gamma correction
    hidden_faces = all_gamma(hidden_faces)
    master_faces = all_gamma(master_faces)
    faces_test = all_gamma(faces_test)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Gamma correction"), plt.xticks([]), plt.yticks([])

    # (A difference-of-Gaussians filter and intensity rescaling were also tried
    # here but are disabled.)

    # Equalization of variance
    hidden_faces = EQ(hidden_faces)
    master_faces = EQ(master_faces)
    faces_test = EQ(faces_test)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Equalization"), plt.xticks([]), plt.yticks([])

    # Flatten again for the classifier
    master_faces = master_faces.reshape((master_faces.shape[0], -1))
    faces_test = faces_test.reshape((faces_test.shape[0], -1))
    hidden_faces = hidden_faces.reshape((hidden_faces.shape[0], -1))

    tuples = kfold(master_faces, master_labels, master_ident, 13)
    success_rates_train = []
    success_rate_valid = []
    if not submit:
        for tuple in tuples:
            train_data, test_data, train_targets, test_targets, train_ident, test_ident = tuple
            classifier = svm.SVC(gamma=0.5, C=1, kernel="poly")
            model = BaggingClassifier(classifier, n_estimators=10,
                                      bootstrap=True, verbose=1)
            model.fit(train_data, train_targets)
            # Train
            score = model.score(train_data, train_targets)
            valid_score = model.score(test_data, test_targets)
            print("Training :")
            print(score)
            success_rates_train.append(score)
            # Validation
            print("Validation :")
            print(valid_score)
            success_rate_valid.append(valid_score)
        print("Training rates :")
        print(success_rates_train)
        print("Training average :")
        print(np.average(success_rates_train))
        print("Validation rates :")
        print(success_rate_valid)
        print("Validation average :")
        print(np.average(success_rate_valid))
    if submit:
        classification = svm.SVC(gamma=0.5, C=1, kernel="poly")
        model = BaggingClassifier(classification, n_estimators=20,
                                  bootstrap_features=True, bootstrap=True, verbose=1)
        model.fit(master_faces, master_labels)
        test_predictions = model.predict(faces_test)
        hidden_predictions = model.predict(hidden_faces)
        # Test predictions: ids 1..1253, public predictions followed by hidden
        ascending = np.arange(1, 1254).astype(int)
        test_predictions = np.concatenate([test_predictions, hidden_predictions])
        test_predictions = test_predictions.astype(int)
        csv = np.column_stack((ascending, test_predictions))
        np.savetxt("hidden.csv", csv, delimiter=",")
    return
    print('Cross Val : std = %s' % (diabetes[i, 6]))

if name == 'Iris':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
    bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                           n_estimators=n_estimators)
    bc.fit(X, y)
    scores_bc = cross_val_score(bc, X, y, cv=5)
    iris[i, 1] = rfc.score(X, y)
    iris[i, 2] = np.mean(scores_rfc)
    iris[i, 3] = np.std(scores_rfc)
    iris[i, 4] = bc.score(X, y)
    iris[i, 5] = np.mean(scores_bc)
    iris[i, 6] = np.std(scores_bc)
    print('Score RandomForestClassifier = %s' % (iris[i, 1]))
    print('Cross Val : mean = %s' % (iris[i, 2]))
    print('Cross Val : std = %s' % (iris[i, 3]))
    print('Score BaggingClassifier = %s' % (iris[i, 4]))
    print('Cross Val : mean = %s' % (iris[i, 5]))
    print('Cross Val : std = %s' % (iris[i, 6]))

if name == 'Digits':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC
tree2 = ETC()
print(tree2)
tree2.fit(xtrain, ytrain1)
print(tree2.fit(xtrain, ytrain1))
print(tree2.score(xtest, ytest1))

# In[23]:

from sklearn.ensemble import BaggingClassifier
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain, ytrain1)
print(bagging1.score(xtest, ytest1))

# In[24]:

from sklearn.ensemble import BaggingClassifier
bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
bagging2.fit(xtrain, ytrain1)
print(bagging2.score(xtest, ytest1))

# In[25]:

from sklearn.ensemble import RandomForestClassifier as RFC
tree3 = RFC()
tree3.fit(xtrain, ytrain1)