def baggingknn(trainmat, laber, testmat, tlaber, mat, mlaber):
    clf1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=15,
                                                     max_features=None,
                                                     min_samples_split=6),
                              algorithm="SAMME",
                              n_estimators=30,
                              learning_rate=0.9)
    clf = KNeighborsClassifier(weights='distance',
                               n_neighbors=1,
                               algorithm='ball_tree',
                               metric='minkowski',
                               p=1)
    # clf =svm.SVC(C=500, kernel='rbf', gamma=0.001, decision_function_shape='ovr')
    clfb = BaggingClassifier(base_estimator=clf1,
                             max_samples=1.0,
                             max_features=1.0,
                             n_estimators=20)
    clfb.fit(trainmat, laber)

    # predict = clf.predict(trainmat)
    # result= clfb.predict(testmat)

    # print(clf.score(trainmat,laber))
    # print(clf.score(testmat,tlaber))
    # result2=clfb.score(testmat, tlaber)
    score = clfb.score(testmat, tlaber)
    print(clfb.score(testmat, tlaber))
    # print(result)
    score1c1 = cross_val_score(clf, mat, mlaber, cv=5, scoring='accuracy')
    scorec2 = cross_val_score(clfb, mat, mlaber, cv=5, scoring='accuracy')
    print('knn')
    print(score1c1.mean())
    print('bagging')
    print(scorec2.mean())
    return score
Example #2
def checkBaggingEffectOnOverFitting_decisionTree():

    depths = np.arange(1, 50)
    train_accuracy = np.empty(len(depths))
    test_accuracy = np.empty(len(depths))

    num = 20

    for i, k in enumerate(depths):
        # Note: despite the function name, k is used here as the number of
        # neighbours for a bagged KNN, not as a tree depth.
        knn = KNeighborsClassifier(n_neighbors=k)
        bagging = BaggingClassifier(base_estimator=knn,
                                    max_samples=0.5,
                                    max_features=0.5,
                                    n_estimators=num,
                                    random_state=12)
        bagging.fit(X_train, Y_train)
        train_accuracy[i] = bagging.score(X_train, Y_train)
        test_accuracy[i] = bagging.score(X_test, Y_test)

    plt.plot(depths,
             test_accuracy,
             label='Testing dataset Accuracy - Bagging Overfit')
    plt.plot(depths,
             train_accuracy,
             label='Training dataset Accuracy - Bagging Overfit')

    plt.legend()
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Accuracy')
    plt.show()
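The sketch below is a hedged companion to the function above (it is not part of the original example): because the function name mentions decision trees while the body varies k for a bagged KNN, this variant sweeps tree depth instead, comparing a single tree against a bagged ensemble, assuming the same X_train/X_test/Y_train/Y_test globals.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


def check_bagging_effect_on_overfitting_tree_depth():
    depths = np.arange(1, 50)
    single_test = np.empty(len(depths))
    bagged_test = np.empty(len(depths))

    for i, d in enumerate(depths):
        # Plain decision tree: deeper trees tend to overfit.
        tree = DecisionTreeClassifier(max_depth=d)
        tree.fit(X_train, Y_train)
        single_test[i] = tree.score(X_test, Y_test)

        # Bagged trees of the same depth: averaging over bootstrap samples
        # reduces variance, so the test curve should degrade less.
        bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=d),
                                    n_estimators=20,
                                    max_samples=0.5,
                                    random_state=12)
        bagging.fit(X_train, Y_train)
        bagged_test[i] = bagging.score(X_test, Y_test)

    plt.plot(depths, single_test, label='Single tree - Test Accuracy')
    plt.plot(depths, bagged_test, label='Bagged trees - Test Accuracy')
    plt.legend()
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.show()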
Example #3
def main():
    #getting and splitting data
    print("parsing data")
    patients, egm_matrix, cancer_onehot = get_data()
    train_X, test_X, train_Y, test_Y = train_test_split(egm_matrix,
                                                        cancer_onehot,
                                                        test_size=0.20,
                                                        random_state=0)

    #form scaled data
    scaler = preprocessing.StandardScaler().fit(train_X)
    print(scaler)
    scaled_train_X = scaler.transform(train_X)
    scaled_test_X = scaler.transform(test_X)

    bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=3),
                                max_features=20000,
                                max_samples=1.0,
                                n_estimators=11,
                                random_state=2)

    bagging.fit(scaled_train_X, train_Y)

    pred_Y = bagging.predict(scaled_test_X)

    f1_score_bagging = f1_score(test_Y, pred_Y, average=None)
    print("f1 score is: " + str(f1_score_bagging)
          )  #max_features=20000 -> f1 score is: [0.77852349 0.47619048]

    mean_train_accuracy = bagging.score(scaled_train_X, train_Y)
    print("mean_train_accuracy is: " + str(mean_train_accuracy))
    mean_test_accuracy = bagging.score(scaled_test_X, test_Y)
    print("mean_test_accuracy is: " + str(mean_test_accuracy))
    print(bagging)
def bagging(x_train, x_test, y_train, y_test, n, classifier=None):
    print("---------------------------------------------\n")
    print("Rezultati za bagging: [" + str(classifier) + " " + str(n) + "] \n")
    beginTime = time.time()
    print("Kreiranje klasifikatora ... \n")
    if (classifier == "svc"):
        unit = SVC(kernel='poly', gamma='auto')
    elif (classifier == "tree"):
        unit = tree.DecisionTreeClassifier()
    elif (classifier == "knn"):
        unit = KNeighborsClassifier(3, algorithm="brute")
    else:
        unit = None

    clf = BaggingClassifier(unit, n_estimators=n)
    clf.fit(x_train, y_train.ravel())
    save_model(clf, "bagging_" + str(classifier) + "_" + str(n))
    print('Training accuracy: {}'.format(clf.score(x_train, y_train)))
    print('Test accuracy: {}'.format(clf.score(x_test, y_test)))
    y_predict_train = clf.predict(x_train)
    y_predict_test = clf.predict(x_test)
    print("Training confusion matrix: \n" +
          str(confusion_matrix(y_train, y_predict_train)))
    print("Test confusion matrix: \n" +
          str(confusion_matrix(y_test, y_predict_test)))
    endTime = time.time()
    elapsedTime = endTime - beginTime
    print(f"Vreme potrebno za izvrsavanje: {elapsedTime:.4f} \n")
    def best_first(self, pool, score_index, x_train, y_train, x_validation,
                   y_validation, x_test, y_test):
        BagPercepCurrent = BaggingClassifier(
            linear_model.Perceptron(max_iter=5), self.pool_size)
        BagPercepCurrent.fit(x_train, y_train)

        BagPercepCurrent.estimators_ = [pool.estimators_[score_index[0]]]
        best_score = BagPercepCurrent.score(x_validation, y_validation)
        best_score_test = BagPercepCurrent.score(x_test, y_test)
        metrics = (best_score_test, ) + self.calc_metrics(
            BagPercepCurrent.predict(x_test), y_test)
        best_index = 1
        best_score_test = 0
        diversity_kappa = 0
        for i, j in enumerate(list(score_index[1:])):
            BagPercepCurrent.estimators_ += [pool.estimators_[j]]
            score_current = BagPercepCurrent.score(x_validation, y_validation)

            if best_score < score_current:
                best_score = score_current
                best_index = i
                best_score_test = BagPercepCurrent.score(x_test, y_test)
                metrics = (best_score_test, ) + self.calc_metrics(
                    BagPercepCurrent.predict(x_test), y_test)
                diversity_kappa = self.pairwise_diversity_measure(
                    BagPercepCurrent, len(BagPercepCurrent.estimators_),
                    x_test)

        best_index += 2
        # print("best index", best_index, best_score, best_score_test)
        return (metrics) + (diversity_kappa, )
Example #6
def svm():
    training_set_size = [.1,.25,.5,.75,.9]
    kernels = ['rbf', 'poly']

    columns = ['Kernel', 'Training Set Size', 'Training Score', 'Test Score', 'Train Time', 'Test Time']
    df = pd.DataFrame(columns=columns)

    for kernel in kernels:
        for tset_size in training_set_size:
            X_train, X_test, y_train, y_test = train_test_split(
                encoded_data[list(set(encoded_data.columns) - set(['Target']))],
                encoded_data['Target'], train_size=tset_size)
            scaler = preprocessing.StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train.astype('float32')), columns=X_train.columns)
            X_test = scaler.transform(X_test.astype('float32'))

            start = time.time()
            bagging_svm = BaggingClassifier(SVC(kernel=kernel, cache_size=1000), n_jobs=-1)
            print(bagging_svm)
            bagging_svm.fit(X_train, y_train)
            end_train = time.time() - start

            # y_pred = bagging_svm.predict(X_test)
            train_score = bagging_svm.score(X_train, y_train)
            start_test = time.time()
            test_score = bagging_svm.score(X_test, y_test)
            end_test = time.time() - start_test
            values = [kernel, tset_size, train_score, test_score, end_train, end_test]
            df.loc[len(df)] = values
            print(' '.join(str(col) for col in columns))
            print(' '.join(str(val) for val in values))
    df.to_excel('diabetes_svm.xls')
    def reduce_error(self, pool, score_index, x_train, y_train, x_validation,
                     y_validation, x_test, y_test):
        BagPercepCurrent = BaggingClassifier(
            linear_model.Perceptron(max_iter=5), self.pool_size)
        BagPercepCurrent.fit(x_train, y_train)

        ensemble_index = set()
        ensemble_index.add(score_index[0])

        ensemble = []
        ensemble.append(pool.estimators_[score_index[0]])

        BagPercepCurrent.estimators_ = ensemble
        best_score = BagPercepCurrent.score(x_validation, y_validation)
        # metrics = (None, None, None, None)
        while (True):
            index_best_score = 0
            BagPercepCurrent.estimators_ = ensemble
            best_score_test = BagPercepCurrent.score(x_test, y_test)

            metrics = (best_score_test, ) + self.calc_metrics(
                BagPercepCurrent.predict(x_test), y_test)

            for i in list(score_index):
                if i not in ensemble_index:
                    BagPercepCurrent.estimators_ = ensemble + [
                        pool.estimators_[i]
                    ]
                    score_current = BagPercepCurrent.score(
                        x_validation, y_validation)

                    if best_score < score_current:
                        best_score = score_current
                        index_best_score = i
            if index_best_score != 0:
                ensemble_index.add(index_best_score)
                ensemble.append(pool.estimators_[index_best_score])
            else:
                # print("best index", len(ensemble), best_score, best_score_test)
                kappa_diversity = self.pairwise_diversity_measure(
                    BagPercepCurrent, len(BagPercepCurrent.estimators_),
                    x_test)
                disagreement_diversity_ = self.disagreement_diversity_measure(
                    BagPercepCurrent, len(BagPercepCurrent.estimators_),
                    x_test)
                return (metrics) + (
                    kappa_diversity,
                    disagreement_diversity_,
                )
            if len(ensemble_index) == self.pool_size:
                # All pool members were added: compute the diversity measures
                # here, since they are otherwise only set in the branch above.
                kappa_diversity = self.pairwise_diversity_measure(
                    BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
                disagreement_diversity_ = self.disagreement_diversity_measure(
                    BagPercepCurrent, len(BagPercepCurrent.estimators_), x_test)
                return (metrics) + (
                    kappa_diversity,
                    disagreement_diversity_,
                )
Example #8
def BaggingFunc(samples):
    bg = BaggingClassifier(DecisionTreeClassifier(),
                           max_samples=samples,
                           max_features=1.0,
                           n_estimators=25,
                           bootstrap=True)
    bg.fit(xtr, ytr)
    score = round(bg.score(xtst, ytst), 4)
    print("Accuracy on the test set with max_samples={} : ".format(samples),
          score)
    return score
Example #9
def Bagging_Using_DT():


    data = pd.read_csv('mnist.csv')

    DF_x = data.iloc[:,1:]  # Pixels
    DF_y = data.iloc[:,0]   # Labels

    x_train, x_test, y_train, y_test = train_test_split(DF_x, DF_y, test_size = 0.3, random_state = 5)

    #=====================
    #
    # Using Decision Tree
    #
    #=====================
    
    DT = DecisionTreeClassifier()
    DT.fit(x_train, y_train)

    print("\n----------------------------------------------------\n")

    print("Training Accuracy Using Decision Tree : ", DT.score(x_train, y_train) *100)
    print("Testing Accuracy Using Decision Tree : ", DT.score(x_test,y_test) *100)

    print("\n----------------------------------------------------\n")

    #==================================================
    #
    # Using Random Forest - Ensemble Of Decision Trees
    #
    #==================================================

    RF = RandomForestClassifier(n_estimators = 20)
    RF.fit(x_train, y_train)

    print("Training Accuracy Using Random Forest : ", RF.score(x_train, y_train) *100)
    print("Testing Accuracy Using Random Forest : ", RF.score(x_test ,y_test) *100)

    print("\n----------------------------------------------------\n")

    #===============================
    #
    # Bagging Using Decision Tree
    #
    #===============================

    BG = BaggingClassifier(DecisionTreeClassifier(), max_samples = 0.7, max_features = 1.0, n_estimators = 25)
    BG.fit(x_train, y_train)

    print("Training Accuracy Using Bagging Classifier : ", BG.score(x_train, y_train) *100)
    print("Testing Accuracy Using Bagging Classifier : ", BG.score(x_test, y_test) *100)

    print("\n----------------------------------------------------\n")
def model_fusion(model, X, y):
    """
    模型融合,使用Bagging方法进行模型融合,同样也使用训练集,验证集,测试集,
    输出融合只后的准确率,以及用在测试集上面的准确率。
    """
    X_train_all, X_test, y_train_all, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    train_data, ver_data, train_label, ver_label = train_test_split(X_train_all, y_train_all,
                                                                    test_size=0.3, random_state=1)
    bag = BaggingClassifier(base_estimator=model, n_estimators=10, random_state=1, max_samples=0.2, max_features=0.9)
    bag.fit(train_data, train_label)
    print('model_fusion', bag.score(ver_data, ver_label))
    print('result-->', bag.score(X_test, y_test))
    return bag
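A minimal, hypothetical usage sketch for model_fusion above, assuming scikit-learn's iris data and a decision-tree base model (this call is not part of the original project):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris_X, iris_y = load_iris(return_X_y=True)
fused_model = model_fusion(DecisionTreeClassifier(max_depth=3), iris_X, iris_y)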
def task1():
    #1. Load digit dataset (D).
    #X,y = load_digits(return_X_y=True)
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target

    #2. 70% tuples are used for training while 30% tuples are used for testing.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, train_size=0.7, random_state=42)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(
        X_train)  # Now apply the transformations to the data
    X_test = scaler.transform(X_test)

    #3. Create an instance of multi-layer perceptron network
    mlp = MLPClassifier(
        hidden_layer_sizes=(16, 8, 4, 2), max_iter=1001
    )  #(four hidden layers with 16, 8, 4,and 2 neurons in order)
    mlp.fit(X_train, y_train)

    #4. Apply bagging classifier  with eight base classifiers created at the previous step.
    clf = BaggingClassifier(mlp, n_estimators=8)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)

    predictions = mlp.predict(X_test)
    cm = confusion_matrix(y_test, predictions, labels=mlp.classes_)
    #print("Confusion Matrix:\n",cm,"\n") #confusion matrix
    accuracy = accuracy_score(y_test, predictions)
    #print("\nClassification Report:\n",classification_report(y_test,predictions),"\n")
    #print(accuracy)
    predicted_instances_per_class = cm[np.eye(len(
        clf.classes_)).astype("bool")]

    #6. Print your findings
    estimators = clf.estimators_
    #print(len(estimators), type(estimators[0]))
    #pred_list = []
    #5. Calculate number of correctly classified test instance for each base classifier and finally for bagging classifier.

    #print(X_test.shape[1])
    #for base_estimator in estimators:
    #pred_list.append(base_estimator.predict(X_test))
    #print(X_test.shape[base_estimator])
    for i in predicted_instances_per_class:
        print(i, "test instances of this class are correctly classified by the MLP")

    print("-------------------------------------------")
Example #12
class ELMensemble(object):
    def __init__(self, n_hidden, C, n_estimators):
        self.n_hidden = n_hidden
        self.C = C
        self.ensemble = BaggingClassifier(base_estimator=ELMClassifier(n_hidden=n_hidden, C=C), n_jobs=-1,
                                          n_estimators=n_estimators, max_samples=0.5, max_features=0.5,
                                          bootstrap=True, bootstrap_features=False, oob_score=False)

    def train(self, X, y):
        self.ensemble.fit(X, y)
        return self.ensemble.score(X, y)

    def score(self, X, y):
        return self.ensemble.score(X, y)
def bagging_classifier(X_train, y_train, X_test, y_test):
    bagged_tree = BaggingClassifier(DecisionTreeClassifier(
        criterion=_hyper['criterion'], max_depth=_hyper['max_depth']),
                                    n_estimators=20)

    # Fit to the training data
    bagged_tree.fit(X_train, y_train)

    # Training & testing accuracy score
    train_accuracy = bagged_tree.score(X_train, y_train)
    test_accuracy = bagged_tree.score(X_test, y_test)
    print(
        f"Bagging classifier - train accuracy: {train_accuracy}  test_accuracy: {test_accuracy}"
    )
    return bagged_tree
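The _hyper dict used above is defined elsewhere in the original project; a hypothetical example of its expected shape and of calling the helper on the iris data (names below are illustrative, not the author's):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

_hyper = {'criterion': 'gini', 'max_depth': 5}  # hypothetical settings

iris_X, iris_y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(iris_X, iris_y, test_size=0.3, random_state=0)
bagged_tree_model = bagging_classifier(X_tr, y_tr, X_te, y_te)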
Example #14
def main():
    # load data
    file_lssvm = "hw2_lssvm_all.dat"
    size_train = 400
    with open(file_lssvm, 'r') as fr:
        list_line = fr.readlines()
        x_train, y_train = load_xy(list_line[:size_train])
        x_test, y_test = load_xy(list_line[size_train:])
    list_lambda = [0.05, 0.5, 5, 50, 500]

    # Q9, Q10: run regression
    ein = []
    eout = []
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        rcf.fit(x_train, y_train)
        Ein = 1 - rcf.score(x_train, y_train)
        Eout = 1 - rcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Ridge Classifier:")
    print("argminEin = {}, minEin_lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("argminEout = {}, minEout_lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))
    print("==========")
    # Q11, Q12: Bagging
    ein.clear()
    eout.clear()
    num_iter = 250
    for lam in list_lambda:
        rcf = RidgeClassifier(alpha=lam)
        bcf = BaggingClassifier(base_estimator=rcf,
                                n_estimators=num_iter,
                                n_jobs=-1,
                                random_state=0)
        bcf.fit(x_train, y_train)
        Ein = 1 - bcf.score(x_train, y_train)
        Eout = 1 - bcf.score(x_test, y_test)
        ein.append(Ein)
        eout.append(Eout)
    print("Bagging Ridge Classifier:")
    print("argminEin = {}, minEin_lambda = {}".format(
        min(ein), list_lambda[ein.index(min(ein))]))
    print("argminEout = {}, minEout_lambda = {}".format(
        min(eout), list_lambda[eout.index(min(eout))]))

    return
Example #15
def bagging(x_train, y_train):
    model = BaggingClassifier(base_estimator=SVC(),
                              n_estimators=10,
                              random_state=0)
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
Example #16
def bagging(df, dep_var, features, test):
    print 'Bagging'
    best = []
    #best_maxfeautures = []
    #best_n_estimators = []
    for sample in [1, 2, 3, 4, 5]:
        for x in range(1, 6):
            for n_est in range(3, 21, 3):
                start_time = time.time()
                clf = BaggingClassifier(max_features=x,
                                        max_samples=sample,
                                        n_estimators=n_est)
                clf.fit(df[features], df[dep_var])
                score = clf.score(test[features], test[dep_var])
                print 'sample: ', sample, ' Max_F: ', x, 'n estimators: ', n_est, score
                end_time = time.time()
                tm = end_time - start_time
                print 'Time: ', tm
                best.append([score, (x, sample, n_est, tm), clf])
    best.sort(reverse=True)
    print best[0]
    best = best[0]
    return {
        'score': best[0],
        'max_features': best[1][0],
        'max_sample': best[1][1],
        'n_estimators': best[1][2],
        'time': best[1][3],
        'clf': best[2]  # return the best classifier, not the last one trained
    }
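The nested loops above hand-roll the parameter sweep; a hedged alternative sketch (not the original author's code) expressing the same search with scikit-learn's GridSearchCV, assuming X and y carry at least five features:

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

def bagging_grid_search(X, y):
    # Same grid as the loops above: feature counts 1-5, sample counts 1-5,
    # and 3, 6, ..., 18 estimators, scored by cross-validation instead of a
    # fixed held-out set.
    param_grid = {
        'max_features': list(range(1, 6)),
        'max_samples': [1, 2, 3, 4, 5],
        'n_estimators': list(range(3, 21, 3)),
    }
    search = GridSearchCV(BaggingClassifier(), param_grid, cv=3)
    search.fit(X, y)
    return search.best_score_, search.best_params_, search.best_estimator_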
Example #17
def baggingClassifier_Model(df):

    # Calling & passing train_test_data_split method
    X_train, X_test, y_train, y_test = train_test_data_split(df)

    # Calling & passing scaler_features method
    X_train_norm, X_test_norm = scaler_features(X_train, X_test)

    # Declaring multiple models
    lr = LogisticRegression()
    knn = KNeighborsClassifier()
    dtc = DecisionTreeClassifier()
    gnb = GaussianNB()

    # Building the Bagging models
    models = [lr, knn, gnb, dtc]
    for model in models:
        # Creating Bagging Classifier
        bag = BaggingClassifier(base_estimator=model,
                                n_estimators=10,
                                bootstrap=True)

        # Fit the classifier on the training features and labels.
        bag = bag.fit(X_train_norm, y_train)

        # Predicting using X_test_norm
        y_pred_bag = bag.predict(X_test_norm)

        # Computing for the accuracy, precision, & recall
        result = bag.score(X_test_norm, y_test)
        print("Accuracy: {:.2%}".format(result), [model])
Example #18
def main():
    """Main function."""
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print 'Train set: ' + repr(len(trainingSet))
    print 'Test set: ' + repr(len(testSet))
    # generate predictions
    predictions = []
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(QDA())
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
    def bagging(self):
        bag = BaggingClassifier(n_estimators=100)
        bag.fit(self.X_train, self.y_train)

        acc = round(bag.score(self.X_train, self.y_train) * 100, 2)
        print("acc with bagging:", acc)
        self.y_pred = bag.predict(self.X_test)
def call_function():
	# prepare data
    try:
        trainingSet=[]
        testSet=[]
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'comb.csv']), split, trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))
        # generate predictions
        predictions=[]
        trainData = np.array(trainingSet)[:,0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1] 
        X = np.array(trainData)
        y = np.array(trainingSet)[:,columns]
        clf = BaggingClassifier(KNN(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=10, p=1, metric='minkowski', metric_params=None, n_jobs=1))
        clf.fit(X, y)
        testData = np.array(testSet)[:,0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:,columns]
        accuracy = clf.score(X_test,y_test)
        accuracy *= 100
        print("Accuracy %:",accuracy)
    except:
        e = sys.exc_info()[0]
        print( "<p>Error: %s</p>" % e )
def call_function():
	# prepare data
    try:
        trainingSet=[]
        testSet=[]
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'comb.csv']), split, trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))
        # generate predictions
        predictions=[]
        trainData = np.array(trainingSet)[:,0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1] 
        X = np.array(trainData)
        y = np.array(trainingSet)[:,columns]
        clf = BaggingClassifier(SVC(C=1.0, kernel='linear', degree=5, gamma='auto', coef0=0.0, shrinking=True, probability=False,tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))
        clf.fit(X, y)
        testData = np.array(testSet)[:,0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:,columns]
        accuracy = clf.score(X_test,y_test)
        accuracy *= 100
        print("Accuracy %:",accuracy)
    except:
        e = sys.exc_info()[0]
        print( "<p>Error: %s</p>" % e )
def call_function():
    # prepare data
    try:
        trainingSet = []
        testSet = []
        accuracy = 0.0
        split = 0.25
        loadDataset("/".join([DATASET_FOLDER, 'LDAdata.csv']), split,
                    trainingSet, testSet)
        print('Train set: ' + repr(len(trainingSet)))
        print('Test set: ' + repr(len(testSet)))
        trainData = np.array(trainingSet)[:,
                                          0:np.array(trainingSet).shape[1] - 1]
        columns = trainData.shape[1]
        X = np.array(trainData)
        y = np.array(trainingSet)[:, columns]
        clf = BaggingClassifier(LDA())
        clf.fit(X, y)
        testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
        X_test = np.array(testData)
        y_test = np.array(testSet)[:, columns]
        accuracy = clf.score(X_test, y_test)
        accuracy *= 100
        print("Accuracy %:", accuracy)
    except:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
Example #23
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        warn_msg = (
            "Some inputs do not have OOB scores. This probably means too few "
            "estimators were used to compute any reliable oob estimates.")
        with pytest.warns(UserWarning, match=warn_msg):
            clf = BaggingClassifier(
                base_estimator=base_estimator,
                n_estimators=1,
                bootstrap=True,
                oob_score=True,
                random_state=rng,
            )
            clf.fit(X_train, y_train)
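A minimal sketch of the out-of-bag estimate the test above exercises: with bootstrap resampling, oob_score_ approximates held-out accuracy without a separate split (this snippet is illustrative only and assumes the same iris data and imports used by the test):

oob_demo = BaggingClassifier(DecisionTreeClassifier(),
                             n_estimators=100,
                             bootstrap=True,
                             oob_score=True,
                             random_state=0).fit(iris.data, iris.target)
print(oob_demo.oob_score_)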
Example #24
def main():
    """Main function."""
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print 'Train set: ' + repr(len(trainingSet))
    print 'Test set: ' + repr(len(testSet))
    # generate predictions
    predictions = []
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(KNN(n_neighbors=10, weights='uniform', algorithm='auto', leaf_size=10, p=1, metric='minkowski', metric_params=None, n_jobs=1))
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
Example #25
def main():
    """Main function."""
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/combined.csv', split, trainingSet, testSet)
    print 'Train set: ' + repr(len(trainingSet))
    print 'Test set: ' + repr(len(testSet))
    # generate predictions
    predictions = []
    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]
    clf = BaggingClassifier(SVC(C=1.0, kernel='linear', degree=5, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))
    clf.fit(X, y)
    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
def analysis(x_tr, y_tr, x_te=None, y_te=None):
    #print("Performing Bagging Classification!")

    # Create the classifier
    clf = BaggingClassifier(n_estimators=100)

    # Train the model
    clf.fit(x_tr, y_tr)

    # Compute the training accuracy
    acc = clf.score(x_tr, y_tr)

    # Compute the CV scores
    scores = cross_val_score(clf, x_tr, y_tr, cv=5)

    print("\n")
    print("Bagging Accuracy = %3.4f" % (acc))
    print("CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Classify the data
    test_score = 0
    if x_te is not None:
        yhat = clf.predict(x_te)
        test_score, notneeded = hp.check_accuracy(yhat, y_te)
    else:
        yhat = None

    data_scores = np.array([scores.mean(), scores.std(), acc, test_score])
    return yhat, data_scores
Example #27
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(UserWarning,
                     BaggingClassifier(base_estimator=base_estimator,
                                       n_estimators=1,
                                       bootstrap=True,
                                       oob_score=True,
                                       random_state=rng).fit,
                     X_train,
                     y_train)
Example #28
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(
            UserWarning,
            BaggingClassifier(base_estimator=base_estimator,
                              n_estimators=1,
                              bootstrap=True,
                              oob_score=True,
                              random_state=rng).fit, X_train, y_train)
Example #29
def do_Bagging():
    x_train, _, y_train, _, x_test, y_test = Rf.read_data()
    bagging = BaggingClassifier()
    bagging.fit(x_train, y_train)
    score = bagging.score(x_test, y_test)
    print(score)
    Rf.save_model("Bagging2", bagging)
Example #30
def Teste():
    # j48 = tree.DecisionTreeClassifier()
    # j48 = j48.fit(data_training, target_training)
    bagging = BaggingClassifier(tree.DecisionTreeClassifier(),
                                max_samples=1.0,
                                max_features=0.5)
    bagging.fit(data_training, target_training)
    print(bagging.score(data_test, target_test))
Example #31
def decisionTreee(depth, numberOfbags):
    decisionTre = DecisionTreeClassifier(max_depth = int(depth))
    baggClass = BaggingClassifier(decisionTre,
                            n_estimators=int(numberOfbags), 
                            max_samples= 0.5, 
                            max_features = 1.0)
    baggClass.fit(X_train,Y_train)
    return baggClass.score(X_dev,Y_dev)
Example #32
def main():
    '''main function'''
    bagging = BaggingClassifier(DecisionTreeClassifier())
    iris = load_iris()
    x = iris.data
    y = iris.target
    #train, test, train_, test_ = train_test_split(x, y, test_size=0.2, random_state=42)
    bagging.fit(x, y)
    bagging.predict(x[:2])
    print(bagging.score(x[:2], y[:2]))
Example #33
def train_bagging():
	model = build_model()
	bagging_model = BaggingClassifier(base_estimator=model,n_estimators=bagging_num_estimator,
	max_samples=bagging_sample_fraction,oob_score=bagging_use_oob)
	
	#train model
	bagging_model.fit(XC, yc) 
	
	#persist model
	if persist_model:
		models = bagging_model.estimators_
		for m in zip(range(0, len(models)), models):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
			joblib.dump(m[1], model_file) 

	score = bagging_model.score(XC, yc)
	print "average error %.3f" %(1.0 - score)
Example #34
def bagging(df, dep_var, features, test):
    print 'Bagging'
    best = []
    #best_maxfeautures = []
    #best_n_estimators = []
    for sample in [1, 2, 3, 4, 5]:
        for x in range(1, 6):
            for n_est in range(3, 21, 3):
                start_time = time.time()
                clf = BaggingClassifier(max_features=x, max_samples=sample, n_estimators=n_est)
                clf.fit (df[features], df[dep_var])
                score = clf.score(test[features], test[dep_var])
                print 'sample: ', sample, ' Max_F: ', x, 'n estimators: ', n_est, score
                end_time = time.time()
                tm =end_time-start_time
                print 'Time: ', tm
                best.append([score, (x, sample, n_est, tm), clf])
    best.sort(reverse=True)
    print best[0]
    best = best[0]
    return {'score': best[0], 'max_features': best[1][0], 'max_sample': best[1][1], 'n_estimators': best[1][2], 'time': best[1][3], 'clf': best[2]}
#0.904761904762
export_graphviz(ctree, out_file='ctree_entropy.dot',
                feature_names=words, class_names=author_names,
                filled=True, rounded=True,
                special_characters=True)
graph_gini = pydot.graph_from_dot_file('ctree_entropy.dot')
graph_gini.write_png('ctree_entropy.png')
# feature evaluation
ind_entropy = np.argsort(ctree.feature_importances_)
features_entropy = np.array(words)[ind_entropy][::-1]

###############################################################################
# Bagging
bagging = BaggingClassifier()
bagging.fit(training_data, training_label)
err_bag_tr =  bagging.score(training_data, training_label)
err_bag_ts =  bagging.score(test_data,test_label)
#0.996604414261
#0.94444444444


###############################################################################
# Boosting
# AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(training_data, training_label)
err_ada_tr =  adaboost.score(training_data, training_label)
err_ada_ts =  adaboost.score(test_data,test_label)
#0.9015280135823429
#0.8134920634920634
ind_adaboost = np.argsort(adaboost.feature_importances_)
Example #36
        br.fit(X, y)
        print 'Score BaggingRegressor = %s' % (br.score(X, y))
        scores_br = cross_val_score(br, X, y, cv=5)
        print 'Cross Val Scores of BR = %s' %(np.mean(scores_br))
        
    if name=='Iris' or name=='Digits': # Classification problem
    
        rfc = RandomForestClassifier(**params)
        rfc.fit(X, y)
        print 'Score RandomForestClassifier = %s' % (rfc.score(X, y))
        scores_rfc = cross_val_score(rfc, X, y ,cv=5)
        print 'Cross Val Scores of RandomForestClassifier = %s' %(np.mean(scores_rfc))

        bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
        bc.fit(X, y)        
        print 'Score BaggingClassifier = %s' % (bc.score(X, y))
        scores_bc = cross_val_score(bc, X, y, cv=5)
        print 'Cross Val Scores of BaggingClassifier = %s' %(np.mean(scores_bc))

# *************************************
# Question 15
# *************************************

from utils import *
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor)
from sklearn.ensemble import (BaggingClassifier, BaggingRegressor)
from sklearn.tree import (DecisionTreeClassifier, DecisionTreeRegressor)
from sklearn.utils import shuffle
Example #37
proba=pd.DataFrame(rf.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)

#Extra Trees Accuracy (not as good as random forest)
et = ExtraTreesClassifier(class_weight='balanced')
et.fit(x_train,y_train)
et.score(x_test,y_test)
proba=pd.DataFrame(et.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)

#Bagging Accuracy (Competitive for best depending on features)
bc = BaggingClassifier(dt)
bc.fit(x_train,y_train)
bc.score(x_test,y_test)
proba=pd.DataFrame(bc.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)

#Boosting Accuracy (worst)
#also takes too long to build model, avoid
ab = AdaBoostClassifier(dt)
ab.fit(x_train,y_train)
ab.score(x_test,y_test)
proba=pd.DataFrame(ab.predict_proba(x_test))[1]
false_positive_rate, true_positive_rate, thresholds = skrc(y_test,proba)
auc(false_positive_rate, true_positive_rate)

#Gradient Boosting Accuracy (Competitive for best depending on features)
gb = GradientBoostingClassifier()
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets


if __name__ == '__main__':
    data = datasets.load_digits()
    X_train = data.data[:-20]
    y_train = data.target[:-20]
    X_test = data.data[-20:]
    y_test = data.target[-20:]
    for num in range(1,6):
        clf = BaggingClassifier(n_estimators=num, n_jobs=4)
        clf.fit(X_train, y_train)
        #y_pred = clf.predict(X_test)
        score = clf.score(X_test, y_test)
        print(num,score)
Example #39
                           max_samples=0.1)
bagged.fit(x_train, y_train)


# initialize a random forest classifier 
print 'Training random forest...'
rfc = RandomForestClassifier(n_estimators=200,
                             max_features=40,
                             min_samples_split=2,
                             min_samples_leaf=1)
rfc.fit(x_train, y_train)

# training scores
print "Training scores..."
print bdt.score(x_train, y_train)
print bagged.score(x_train, y_train)
print rfc.score(x_train, y_train)

# score the classfier on the test set 
# print "Scoring..."
# print bdt.score(x_test, y_test)
# print bagged.score(x_test, y_test)
# print rfc.score(x_test, y_test)

# print "Writing predictions..."
predictions1 = bdt.predict(x_test)
predictions2 = bagged.predict(x_test)
predictions3 = rfc.predict(x_test)
predictions = []

for i in range(100):
Example #40
# this file tests bagging on various algorithms

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from dlinghu_functions import *


x_train, y_train, x_test = read_data()
svm = SVC(C=128.0, gamma=8.0)
svm.fit(x_train, y_train)
print_cv_scores(svm, x_train, y_train)
#########################################################
# test bagging sample ratio, without replacement
for max_sample in np.arange(0.1, 1.0, 0.1):
    print 'max_sample ratio = %s' % max_sample
    svm_bagging = BaggingClassifier(svm, bootstrap=False, max_samples=max_sample, n_estimators=50)
    svm_bagging.fit(x_train, y_train)
    # test bagging
    print "In-sample score = %s" % svm_bagging.score(x_train, y_train)
    print_cv_scores(svm_bagging, x_train, y_train)
#########################################################
svm_bagging = BaggingClassifier(svm, bootstrap=True, n_estimators=50)
svm_bagging.fit(x_train, y_train)
print_cv_scores(svm_bagging, x_train, y_train)
Example #41
File: code.py Project: haleylu/DIP
data = []
tfile = '../exp2_raw_data/train11w.data'
train = pd.read_csv(tfile,sep = '\t')

#preprocess
cateMap = {}
tmp = np.array(train[train['category_id']>0][['creative_id','category_id']])
for i,j in tmp:
    cateMap[i] = j

train['category_id'] = train['creative_id'].map(cateMap)
train = train.dropna(axis = 0)

#init train
x = np.array(train.drop(['qq','description','imp_time','pic_url','web_url', 'product_id','advertiser_id','series_id','creative_id','product_type','click_num', 'pos_id'], axis = 1))
y = np.array(train['click_num'])

xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)

# some model

if __name__ == '__main__':
    # clf = MultinomialNB(alpha = 0.1)
    # clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth= 20, min_samples_split = 100 , class_weight = 'balanced')
    clf = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
    # clf = AdaBoostClassifier(n_estimators=350, learning_rate=0.03)
    #clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, learning_rate=0.03, nthread=4, subsample=0.95, colsample_bytree=0.85, seed=4242)

    clf.fit(xTrain, yTrain)
    print clf.score(xTrain, yTrain)
    print clf.score(xTest, yTest)
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    count = 0



    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    count += 1
    classifiers = [bagging2.score(xtest,ytest)]

    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        #print tree2.fit(xtrain,ytrain)
        #print tree2.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree2.score(xtest,ytest))
        print "1"
        print tree2.score(xtest,ytest)

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging1.score(xtest,ytest))
        print "2"
        print bagging1.score(xtest,ytest)

#     if count < numfiers:
#         # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#         clff1 = SVC()
#         clff2 = RFC(bootstrap=False)
#         clff3 = ETC()
#         clff4 = neighbors.KNeighborsClassifier()
#         clff5 = quadda()
#         print"3"


#         eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
#         eclf = eclf.fit(xtrain,ytrain)
#         #print(eclf.score(xtest,ytest))
#         # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#         #     cla
#         #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#         #     print ()
#         count+=1
#         classifiers = np.append(classifiers,eclf.score(xtest,ytest))


#     if count < numfiers:
#         svc1 = SVC()
#         svc1.fit(xtrain,ytrain)
#         dec = svc1.score(xtest,ytest)
#         count+=1
#         classifiers = np.append(classifiers,svc1.score(xtest,ytest))
#         print "3"

    if count < numfiers:
        # Quadradic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,qda.score(xtest,ytest))
        print "4"


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        #print tree1.fit(xtrain,ytrain)
        #print tree1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree1.score(xtest,ytest))

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the #k nearest neighbors, where k is definted by the user.
        knn1.fit(xtrain,ytrain)
        #print(knn1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn1.score(xtest,ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        #print(lda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,lda.score(xtest,ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        #print tree3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree3.score(xtest,ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        #print bagging3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging3.score(xtest,ytest))


    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        #print bagging4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging4.score(xtest,ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        #print tree4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree4.score(xtest,ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        #print(tree6.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree6.score(xtest,ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        #print(knn2.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn2.score(xtest,ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        #print(knn3.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn3.score(xtest,ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        #print(knn4.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn4.score(xtest,ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        #print(knn5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn5.score(xtest,ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        #print (ncc1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,ncc1.score(xtest,ytest))

    if count < numfiers:
    # Nearest shrunken Centroid
        for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
            ncc2 = NearestCentroid(shrink_threshold = shrinkage)
            ncc2.fit(xtrain,ytrain)
            #print(ncc2.score(xtest,ytest))

        count+=1
        classifiers = np.append(classifiers,ncc2.score(xtest,ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        #print(tree5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree5.score(xtest,ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)","ETC","BaggingETC","Voting Classifier","svm","QDA","DTC","KNN (default)","LDA","RFC",
                       "BaggingRFC (with bootstraps set to false)","BaggingSVC (with bootstraps set to false)","RFC (bootstrap false)","GBC",
                        "knn (n_neighbors = 10)","knn (n_neighbors = 3)","knn (ball tree algorithm)","knn (kd_tree algorithm)",
                       "Nearest Centroid","Shrunken Centroid?","ABC"]


    classifierlabel = classifierlabel[:len(classifiers)]
    #print len(classifiers)
    #print classifiers
    for i in range(len(classifiers)):


        print ("{} classifier has percent correct {}".format(classifierlabel[i],classifiers[i]))
Example #43
def SVM(submit):
    labeled_images_data = spio.loadmat("labeled_images.mat")
    unlabeled_images_data = spio.loadmat("unlabeled_images.mat")
    public_test_data = spio.loadmat("public_test_images.mat")
    hidden_test_data = spio.loadmat("hidden_test_images.mat")
    hidden_faces = hidden_test_data.get("hidden_test_images")
    faces_test = public_test_data.get("public_test_images")
    unlabeled_faces = unlabeled_images_data.get("unlabeled_images")
    labels = labeled_images_data.get("tr_labels")
    identities = labeled_images_data.get("tr_identity")
    faces = labeled_images_data.get("tr_images")
    faces = faces.transpose(2, 0, 1)
    faces = faces.reshape((faces.shape[0], -1))
    hidden_faces = hidden_faces.transpose(2, 0, 1)
    hidden_faces = hidden_faces.reshape((hidden_faces.shape[0], -1))

    unlabeled_faces = unlabeled_faces.transpose(2, 0, 1)
    unlabeled_faces = unlabeled_faces.reshape((unlabeled_faces.shape[0], -1))
    faces_test = faces_test.transpose(2, 0, 1)
    faces_test = faces_test.reshape((faces_test.shape[0], -1))
    # train_data, test_data, train_targets, test_targets, train_ident, target_ident = splitSet(faces, labels, identities, 0.2)
    labels_s = labels.squeeze()

    # train_data, test_data, train_targets, test_targets, train_ident, test_ident = train_test_split(faces, labels_s, identities, train_size=0.9)
    # test = np.intersect1d(train_ident, test_ident)

    # small_faces = faces
    # small_identities = identities
    # small_labels = labels_s
    # aug = np.column_stack((small_identities, small_labels,small_faces))
    #
    # one_array = np.array(filter(lambda row: row[1]==1, aug))
    # two_array = np.array(filter(lambda row: row[1]==2, aug))
    # three_array = np.array(filter(lambda row: row[1]==3, aug))
    # four_array = np.array(filter(lambda row: row[1]==4, aug))
    # five_array = np.array(filter(lambda row: row[1]==5, aug))
    # six_array = np.array(filter(lambda row: row[1]==6, aug))
    # seven_array = np.array(filter(lambda row: row[1]==7, aug))
    #
    # label_arrays = [one_array, two_array, three_array, four_array, five_array, six_array, seven_array]
    #
    # for j in range(len(label_arrays)):
    #     label_arrays[j] = label_arrays[j][label_arrays[j][:,0].argsort()[::-1]]
    #
    #
    # master_array = aug.copy()
    #
    # #save_object(label_arrays, "label_arrays")
    # # label_arrays = load_object("label_arrays")
    #
    # i = 0
    # while i < len(faces):
    #     for j in range(len(label_arrays)):
    #         if i < len(faces) and len(label_arrays[j]>0):
    #             if(j==6):
    #                  master_array[i] = label_arrays[j][0]
    #                  label_arrays[j] = np.delete(label_arrays[j] , 0, axis=0)
    #                  i = i+1
    #             master_array[i] = label_arrays[j][0]
    #             label_arrays[j] = np.delete(label_arrays[j] , 0, axis=0)
    #             #label_arrays[j] = np.zeros(3)
    #             i = i+1
    # #save_object(master_array, "master_canny_100-201")

    master_array = load_object("master")

    master_ident = master_array[:, 0]
    master_array = np.delete(master_array, 0, 1)
    master_labels = master_array[:, 0]
    master_array = np.delete(master_array, 0, 1)
    master_faces = master_array
    # train_data, test_data, train_targets, test_targets, train_ident, test_ident = splitSet(master_faces, master_labels, master_ident, 0.1)
    # train_data, test_data, train_targets, test_targets, train_ident, test_ident = splitSet(faces, labels_s, identities, 0.3)

    # common_idents_array = np.intersect1d(train_ident, test_ident)

    n_eigenfaces = 121

    # print("-   Performing PCA reduction    -")
    # pca = RandomizedPCA(n_components=n_eigenfaces, whiten=True).fit(unlabeled_faces)
    # save_object(pca, "pca")
    # pca = load_object("pca")
    # #train_data = pca.transform(train_data)
    # #test_data = pca.transform(test_data)
    # print("-   Finished PCA reduction    -")
    #
    # print('PCA captures {:.2f} percent of the variance in the dataset'.format(pca.explained_variance_ratio_.sum() * 100))

    # PUT YOUR PROCESSING HERE
    # Reshape
    hidden_faces = preprocessing.normalize(hidden_faces, norm="l2")
    master_faces = preprocessing.normalize(master_faces, norm="l2")
    faces_test = preprocessing.normalize(faces_test, norm="l2")
    hidden_faces = hidden_faces.reshape(len(hidden_faces), 32, 32)

    master_faces = master_faces.reshape(len(master_faces), 32, 32)
    faces_test = faces_test.reshape(len(faces_test), 32, 32)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Normal"), plt.xticks([]), plt.yticks([])
    # plt.show()

    # Gamma correction
    hidden_faces = all_gamma(hidden_faces)

    master_faces = all_gamma(master_faces)
    faces_test = all_gamma(faces_test)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Gamma correction"), plt.xticks([]), plt.yticks([])
    # plt.show()

    # #Dog filter
    # master_faces -= cv2.GaussianBlur(master_faces, (3, 3),1)
    # faces_test -= cv2.GaussianBlur(faces_test, (3, 3),1)
    # plt.subplot(122),plt.imshow(faces_test[1], cmap='gray')
    # plt.title('Dog Filter'), plt.xticks([]), plt.yticks([])
    # plt.show()

    # #Rescale intensity
    # master_faces = testing(master_faces)
    # faces_test = testing(faces_test)
    #
    # plt.subplot(122),plt.imshow(faces_test[15], cmap='gray')
    # plt.title('Rescale'), plt.xticks([]), plt.yticks([])
    # plt.show()

    # Equalization of variance TODO
    hidden_faces = EQ(hidden_faces)
    master_faces = EQ(master_faces)

    faces_test = EQ(faces_test)
    plt.subplot(122), plt.imshow(faces_test[3], cmap="gray")
    plt.title("Equalization"), plt.xticks([]), plt.yticks([])
    # plt.show()

    # Reshape
    master_faces = master_faces.reshape((master_faces.shape[0], -1))
    faces_test = faces_test.reshape((faces_test.shape[0], -1))
    hidden_faces = hidden_faces.reshape((hidden_faces.shape[0], -1))

    tuples = kfold(master_faces, master_labels, master_ident, 13)
    success_rates_train = []
    success_rate_valid = []
    if not submit:
        for tuple in tuples:

            train_data, test_data, train_targets, test_targets, train_ident, test_ident = tuple
            # train_data = pca.transform(train_data)
            # test_data = pca.transform(test_data)

            classifier = svm.SVC(gamma=0.5, C=1, kernel="poly")
            model = BaggingClassifier(classifier, n_estimators=10, bootstrap=True, verbose=1)
            model.fit(train_data, train_targets)

            # Train
            score = model.score(train_data, train_targets)
            valid_score = model.score(test_data, test_targets)

            print("Training :")
            print(score)
            success_rates_train.append(score)

            # Validation
            print("Validation :")
            print(valid_score)
            success_rate_valid.append(valid_score)

        print("Training rates :")
        print(success_rates_train)
        print("Training average :")
        print(np.average(success_rates_train))

        print("Validation rates :")
        print(success_rate_valid)
        print("Validation average :")
        print(np.average(success_rate_valid))
    if submit:
        classification = svm.SVC(gamma=0.5, C=1, kernel="poly")
        model = BaggingClassifier(classification, n_estimators=20, bootstrap_features=True, bootstrap=True, verbose=1)
        model.fit(master_faces, master_labels)
        test_predictions = model.predict(faces_test)
        hidden_predictions = model.predict(hidden_faces)

        # Test predictions

        ascending = np.zeros(1253)

        for i in range(len(ascending)):
            ascending[i] = i + 1
        ascending = ascending.astype(int)
        hidden_guesses = hidden_predictions
        test_predictions = np.concatenate([test_predictions, hidden_guesses])
        test_predictions = test_predictions.astype(int)
        csv = np.column_stack((ascending, test_predictions))
        np.savetxt("hidden.csv", csv, delimiter=",")
    return
            print 'Cross Val : std = %s' %(diabetes[i,6])
            
        if name=='Iris': # Classification problem
        
            rfc = RandomForestClassifier(**params)
            rfc.fit(X, y)
            scores_rfc = cross_val_score(rfc, X, y ,cv=5)

            bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
            bc.fit(X, y)        
            scores_bc = cross_val_score(bc, X, y, cv=5)

            iris[i,1] = rfc.score(X, y)
            iris[i,2] = np.mean(scores_rfc)
            iris[i,3] = np.std(scores_rfc)
            iris[i,4] = bc.score(X, y)
            iris[i,5] = np.mean(scores_bc)
            iris[i,6] = np.std(scores_bc)

            print 'Score RandomForestClassifier = %s' % (iris[i,1])
            print 'Cross Val : mean = %s' %(iris[i,2])
            print 'Cross Val : std = %s' %(iris[i,3])
            print 'Score BaggingClassifier = %s' % (iris[i,4])
            print 'Cross Val : mean = %s' %(iris[i,5])
            print 'Cross Val : std = %s' %(iris[i,6])
            
        if name=='Digits': # Classification problem
        
            rfc = RandomForestClassifier(**params)
            rfc.fit(X, y)
            scores_rfc = cross_val_score(rfc, X, y ,cv=5)
# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC
tree2 = ETC()
print tree2
tree2.fit(xtrain,ytrain1)
print tree2.fit(xtrain,ytrain1)
print tree2.score(xtest,ytest1)


# In[23]:

from sklearn.ensemble import BaggingClassifier
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain,ytrain1)
print bagging1.score(xtest,ytest1)


# In[24]:

from sklearn.ensemble import BaggingClassifier
bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
bagging2.fit(xtrain,ytrain1)
print bagging2.score(xtest,ytest1)


# In[25]:

from sklearn.ensemble import RandomForestClassifier as RFC
tree3 = RFC()
tree3.fit(xtrain,ytrain1)