def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)
    text.insert(
        tk.END, "\n\nExtra Tree Classifier report \n" +
        classification_report(Y_test, Y_pred), "bold")
    text.insert(
        tk.END,
        "*****roc_auc_score: %0.3f*****\n" % roc_auc_score(Y_test, Y_pred),
        "bold")
    text.insert(
        tk.END, "Extra Tree Classifier confusion matrix \n" +
        str(confusion_matrix(Y_test, Y_pred)), "bold")
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score= " + str(score), "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
Example #2
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    # Label-encode every column; note the encoder is refit per column, and the
    # test frame is encoded independently of the training frame.
    for datum in data:
        newData[datum] = le.fit_transform(data[datum])
    for testItem in test:
        newTest[testItem] = le.fit_transform(test[testItem])
    tree1 = DecisionTreeClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity,
                                max_depth=mdepth,
                                random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity,
                                   max_depth=mdepth,
                                   random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)
    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))
    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
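# The `voting` helper called above is not defined in this snippet; a minimal
# sketch of a weighted majority vote, with the signature assumed from the
# call site, might look like this.
import numpy as np
from collections import Counter

def voting(predictions, weights):
    """Weighted majority vote over equal-length 1-D label predictions."""
    combined = []
    for labels in zip(*predictions):
        tally = Counter()
        for label, weight in zip(labels, weights):
            tally[label] += weight
        combined.append(tally.most_common(1)[0][0])
    return np.asarray(combined)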
Example #3
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """
    Applies decision tree algorithm on the dataset, by tuning various parameters

    Args:
        dataframe: The input trainData, testData and class label on which the decision tree algorithm has to be applied

    """
    # fit a CART model to the data
    etc = ExtraTreeClassifier(class_weight=None,
                              criterion='gini',
                              max_depth=None,
                              max_features='auto',
                              max_leaf_nodes=None,
                              min_samples_leaf=1,
                              min_samples_split=2,
                              min_weight_fraction_leaf=0.0,
                              random_state=None,
                              splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
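# The parameters above are spelled out but kept at their defaults; a hedged
# sketch of actually tuning a few of them with GridSearchCV (the grid values
# are illustrative, and trainData/targetTrain are the arguments of the
# function above).
from sklearn.model_selection import GridSearchCV
from sklearn.tree import ExtraTreeClassifier

param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy'],
}
search = GridSearchCV(ExtraTreeClassifier(random_state=0), param_grid,
                      cv=5, scoring='accuracy')
search.fit(trainData, targetTrain)
print(search.best_params_, search.best_score_)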
Example #4
def extra_tree_classifier(self):
    self.log.writeToLog('Running Extra Tree Classifier Model...')
    X_train, X_test, y_train, y_test = self.train_test_split()
    et = ExtraTreeClassifier()
    trained_model = et.fit(X_train, y_train)
    self.save_pickle(trained_model)
    y_pred = et.predict(X_test)
    self.model_auc_roc(y_test, y_pred, "Extra Tree Classifier Model")
    self.model_evaluation(y_test, y_pred, "Extra Tree Classifier Model")
class ExtraTreeClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        output = self._etc.predict(
            self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
    def fit(self, X, y):
        """Build a random decision tree based classifier from the training set (X, y)."""

        # Remove protected features
        X_protect = np.delete(X, [self.prot_class], axis=1)

        num_tr = len(y)
        num_prot_1 = sum(X[:, self.prot_class])
        num_prot_0 = num_tr - num_prot_1

        #X_protect = X
        i = 0
        fair_trees = []
        predictions = []

        # Pick up fair trees
        while i < self.num_fair_trees:
            new_tree = ExtraTreeClassifier(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                max_features=1)
            new_tree.fit(X_protect, y)
            new_prediction = new_tree.predict(X_protect)

            # Calculate the probability we predict someone will dropout between groups (Statistical Parity)
            num_pred_1 = len([
                e for e in range(0, num_tr)
                if new_prediction[e] == 0 and X[e, self.prot_class] == 1
            ])
            num_pred_0 = len([
                e for e in range(0, num_tr)
                if new_prediction[e] == 0 and X[e, self.prot_class] == 0
            ])
            stat_parity = abs(num_pred_1 / num_prot_1 -
                              num_pred_0 / num_prot_0)

            if stat_parity < self.rho:
                i += 1
                fair_trees.append(new_tree)
                predictions.append(new_prediction)

        self.ridge_model.fit(np.transpose(np.asarray(predictions)), y)
        self.decision_trees = fair_trees
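# The loop above keeps a tree only when its statistical-parity gap is below
# self.rho; the same quantity as a standalone helper, for clarity (a sketch
# assuming a binary protected column and the "predicted 0" convention used
# above).
import numpy as np

def statistical_parity_gap(y_pred, protected):
    """|P(y_pred = 0 | protected = 1) - P(y_pred = 0 | protected = 0)|."""
    y_pred = np.asarray(y_pred)
    protected = np.asarray(protected)
    rate_1 = np.mean(y_pred[protected == 1] == 0)
    rate_0 = np.mean(y_pred[protected == 0] == 0)
    return abs(rate_1 - rate_0)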
Example #8
def train_extratree_model():
    results_extratree_model = {}
    results_extratree_model['acc'] = []
    results_extratree_model['p_r_f1_s'] = []
    for i in range(30):
        train_features, train_labels = get_train_data()
        test_features, test_labels = get_test_data()

        clf = ExtraTreeClassifier()
        clf.fit(train_features, train_labels)
        predictions = clf.predict(test_features)
        p_r_f1_s = precision_recall_fscore_support(test_labels, predictions)
        acc = accuracy_score(test_labels, predictions)
        print("ExtraTree Model Classifier : ", acc)
        print(
            "ExtraTree Model Classifier Precision, Recall, F1-Score, Support: ",
            p_r_f1_s)
        results_extratree_model['acc'].append(acc)
        results_extratree_model['p_r_f1_s'].append(p_r_f1_s)
        time.sleep(10)
    pickle.dump(results_extratree_model,
                open('results_extratree_model.pkl', 'wb'))
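# A short follow-up sketch, assuming the pickle written above: reload the
# results and report the mean accuracy over the 30 runs.
import pickle
import numpy as np

with open('results_extratree_model.pkl', 'rb') as f:
    results = pickle.load(f)
print("mean accuracy over runs:", np.mean(results['acc']))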
Example #9
clf = ExtraTreeClassifier(random_state=103, splitter='random', max_features=9)

##Get dataset
X = np.array(traindata.iloc[:, :10])
y = np.array(traindata.iloc[:, 10])

##build decision tree
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=11)
clf.fit(X_train, y_train)

print('Finish Extra tree training')

predicttest = clf.predict(X_test)

##count click (0 or 1)
countClick = [0, 0]
for i in predicttest:
    if i == 0:
        countClick[0] += 1
    else:
        countClick[1] += 1
print(countClick)

##get accuracy, precision, recall, f_measure
tn, fp, fn, tp = confusion_matrix(y_test, predicttest).ravel()
print('tp: {0}, tn: {1}, fp: {2}, fn: {3}'.format(tp, tn, fp, fn))
acc = float((tp + tn) / (tp + tn + fp + fn))
precision = float(tp / (tp + fp))
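# The comment above promises accuracy, precision, recall and f_measure, but
# the fragment stops after precision; the remaining two follow from the same
# confusion-matrix counts (a sketch using the tp/fp/fn computed above).
recall = tp / (tp + fn)
f_measure = 2 * precision * recall / (precision + recall)
print('acc: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, f1: {3:.4f}'.format(
    acc, precision, recall, f_measure))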
sc = MinMaxScaler(feature_range=(0,1))
X_train = sc.fit_transform(X_train)

features2 = features2.replace('mod', 0)
features2 = features2.replace('unm', 1)
features2 = features2.replace(np.nan, 0, regex=True)

# print(features)
X_test = features2[['q1', 'q2', 'q3', 'q4', 'q5', 'mis1', 'mis2', 'mis3', 'mis4', 'mis5']].astype(float)
y_test = features2['sample'].astype(int)


sc = MinMaxScaler(feature_range=(0,1))
X_test = sc.fit_transform(X_test)

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

#Classifier
from sklearn.ensemble import BaggingClassifier
clf = ExtraTreeClassifier()
# clf = BaggingClassifier(clf, random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
plt.show()
    print "Cross validation"
    scores = cross_val_score(RandomForestClassifier(), training, classes,
                             cv=KFold(n=len(training), n_folds=5, random_state=42),
                             scoring="accuracy")
    print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores)))
    print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test)))
    print("Precision =", precision_score(y_test, tlf.predict(X_test)))
    print("Recall =", recall_score(y_test, tlf.predict(X_test)))
    print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1))
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

    print "Extra Tree classifier"
    rlf = ExtraTreeClassifier()
    rlf.fit(training, classes)

    print("Training error =", zero_one_loss(classes, rlf.predict(training)))

    X_train, X_test, y_train, y_test = train_test_split(training, classes)
    rlf = ExtraTreeClassifier()
    rlf.fit(X_train, y_train)
    print("Training error =", zero_one_loss(y_train, rlf.predict(X_train)))
    print("Test error =", zero_one_loss(y_test, rlf.predict(X_test)))

    scores = []
    print "K-fold cross validation"
    for train, test in KFold(n=len(training), n_folds=5, random_state=42):
        X_train, y_train = training[train], classes[train]
        X_test, y_test = training[test], classes[test]
        rlf = ExtraTreeClassifier().fit(X_train, y_train)
        scores.append(zero_one_loss(y_test, rlf.predict(X_test)))
    #
Example #12
# ExtraTree
brk = ExtraTreeClassifier(criterion='gini',
                          splitter='random',
                          max_depth=None,
                          min_samples_split=2,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features='auto',
                          random_state=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          class_weight=None,
                          ccp_alpha=0.0).fit(X_train_counts,
                                             y_train['prdtypecode'])
pred_ETC = brk.predict(X_test_counts)

# In[16]:

# Adding extratree predictions to dataframe
df2['ExtraTree'] = pred_ETC

# In[18]:

from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting
gb_clf_test = GradientBoostingClassifier(n_estimators=10).fit(
    X_train_counts, y_train['prdtypecode'])
pred_gb_test = gb_clf_test.predict(X_test_counts)
Example #13
# Function header and list initialisation reconstructed from the call below.
def readData(fname):
    res, labels = [], []
#    data = ''
    with open(fname) as f:
        for s in f:
            tmp = list(map(int, s.split()))
            labels.append(tmp[-1])
            res.append(tmp[:-1])
#            data += (str(tmp)[1:-1]).replace(',', '')+'\n'
#    with open('out.txt', 'w') as o:
#        o.write(str(data)[1:-1])
    return res, labels

X, Y = readData('german.data-numeric.txt')
Xt = X[:-200] ; Yt = Y[:-200]
XT = X[-200:] ; YT = Y[-200:]
print(len(Xt))
clf = ExtraTreeClassifier(max_depth=None, random_state=0)
clf = clf.fit(Xt, Yt)

#proba = clf.predict_proba(XT)
#print len(proba)
#print proba

err = 0
for i, x in enumerate(XT):
    if clf.predict([x])[0] != YT[i]:
        prob = clf.predict_proba([x])
#        print(prob)
        err += 1

print(err)
Example #14
# repeated stratified 5-fold cross-validation (5 splits x 2 repeats = 10 folds)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
scores = np.zeros((len(preprocs), 5 * 2, len(metrics)))

for fold_id, (train, test) in enumerate(rskf.split(X, y)):
    for preproc_id, preproc in enumerate(preprocs):
        clf = clone(clf)

        if preprocs[preproc] is None:
            X_train, y_train = X[train], y[train]
        else:
            X_train, y_train = preprocs[preproc].fit_resample(
                X[train], y[train])

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X[test])

        for metric_id, metric in enumerate(metrics):
            scores[preproc_id, fold_id, metric_id] = metrics[metric](y[test],
                                                                     y_pred)

# Save scores to a file
writeResToFile(scores)

# Load scores from file
scores = loadResFromFile()

# Results table
table = getResultsFromFileAsArray(scores)
print(table)
# Save table to a file
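# The loop above relies on `clf`, `preprocs`, `metrics` and the file helpers
# defined elsewhere in the original script; a hedged sketch of how the
# preprocessing and metric dictionaries might be set up (the imblearn
# samplers and sklearn metrics here are assumptions, not the author's code).
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.tree import ExtraTreeClassifier

clf = ExtraTreeClassifier(random_state=42)
preprocs = {
    'none': None,
    'ros': RandomOverSampler(random_state=42),
    'smote': SMOTE(random_state=42),
    'rus': RandomUnderSampler(random_state=42),
}
metrics = {
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score,
}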
Example #15
Gboost = GradientBoostingClassifier()
Xgboost = XGBClassifier()  #reg_lambda=2

#i=0
for train, test in sfk.split(X, Y):
    #    print(i)
    x_train = X.iloc[train, :]
    x_test = X.iloc[test, :]
    y_train = Y.iloc[train]
    y_test = Y.iloc[test]

    LogReg.fit(x_train, y_train)
    all_prediction.iloc[0, test] = LogReg.predict(x_test)
    #
    Extr_tree.fit(x_train, y_train)
    all_prediction.iloc[1, test] = Extr_tree.predict(x_test)
    #
    D_tree.fit(x_train, y_train)
    all_prediction.iloc[2, test] = D_tree.predict(x_test)
    #
    Rnd_frst.fit(x_train, y_train)
    all_prediction.iloc[3, test] = Rnd_frst.predict(x_test)
    #
    Gboost.fit(x_train, y_train)
    all_prediction.iloc[4, test] = Gboost.predict(x_test)

    Xgboost.fit(x_train, y_train)
    all_prediction.iloc[5, test] = Xgboost.predict(x_test)
#    i += 1

print(accuracy_score(Y, all_prediction.iloc[5].values) * 100)
Example #16
clf_entropy.fit(X_train, y_train)  # Training entropy tree

# Creating SVM with polynomial kernel
clf_svc = svm.SVC(random_state=100, kernel='poly')
clf_svc.fit(X_train, y_train)  # Training SVM

# Extra tree classifier
clf_ext = ExtraTreeClassifier(random_state=100,
                              max_depth=3,
                              min_samples_leaf=5)
clf_ext.fit(X_train, y_train)  # Training extra tree

clf_Lin = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

y_pred_gi = clf_gini.predict(X_test)  # gini tree prediction test
y_pred_en = clf_entropy.predict(X_test)  # entropy tree prediction test
y_pred_sv = clf_svc.predict(X_test)  # SVM prediction test
y_pred_et = clf_ext.predict(X_test)  # extra tree prediction test

clf_Lin.fit(X_train, y_train)
y_pred_L = clf_Lin.predict(X_test)

# Print accuracy scores
print("Gini accuracy score: ", accuracy_score(y_test, y_pred_gi) * 100)
print("Entropy accuracy score: ", accuracy_score(y_test, y_pred_en) * 100)
print("SVM accuracy score: ", accuracy_score(y_test, y_pred_sv) * 100)
print("Extra tree accuracy score: ", accuracy_score(y_test, y_pred_et) * 100)
print("LinearDiscriminant accuracy score: ",
      accuracy_score(y_test, y_pred_L) * 100)
print(y_test)
print(y_pred_sv)
Example #17
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    text.insert(tk.END,
                "\n\n Number of Features for Training : " + str(len(X_train)),
                "bold")
    text.update_idletasks()
    text.insert(tk.END,
                "\n\n Number of Labels for Training : " + str(len(Y_train)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n *** Training ExtraTree using the above Features and Labels ***",
        "bold")
    text.update_idletasks()
    ETC.fit(X_train, Y_train)
    text.insert(tk.END, "\n\n Number of Test Features : " + str(len(X_test)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Predicting the Test Labels for the above Test Features using ExtraTree ",
        "bold")
    text.update_idletasks()
    Y_pred = ETC.predict(X_test)
    text.insert(tk.END, "\n\n Number of Actual Labels : " + str(len(Y_test)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END, "\n\n Number of Test Labels Predicted by ExtraTree --> " +
        str(len(Y_pred)), "bold")
    text.insert(tk.END,
                "\n\n ---------------------------------------------------")
    text.update_idletasks()
    text.insert(
        tk.END, "\n\n Number of LABELS MATCHED : " +
        str(accuracy_score(Y_test, Y_pred, normalize=False)), "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Calculating Accuracy of ExtraTree = Label Matched/Actual Labels ",
        "bold")
    text.update_idletasks()
    text.insert(
        tk.END, "\n\n Accuracy Score : " +
        str(accuracy_score(Y_test, Y_pred, normalize=True)), "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n ExtraTree report \n" + classification_report(Y_test, Y_pred),
        "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
Example #18
for col in cols:
    hotlab[col] = data[col]

promoted = data[["is_promoted"]]

#%%
x_train, x_test, y_train, y_test = train_test_split(hotlab, promoted)

sm = SMOTE(random_state=20)

train_input_new, train_output_new = sm.fit_resample(x_train, y_train)

#%%
class1 = ExtraTreeClassifier()
class1.fit(x_train, y_train)
pred1 = class1.predict(x_test)
score = f1_score(y_test, pred1)
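# The SMOTE output above (train_input_new, train_output_new) is never used;
# a sketch of fitting the same classifier on the balanced data instead, so
# the two F1 scores can be compared.
class_bal = ExtraTreeClassifier()
class_bal.fit(train_input_new, train_output_new)
pred_bal = class_bal.predict(x_test)
score_bal = f1_score(y_test, pred_bal)
print("F1 on original vs. SMOTE-balanced training data:", score, score_bal)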

#%%

confussion = confusion_matrix(y_test, pred1)

#%%
#For submission
submission_data = pd.read_csv("D:\\Hackathons\\Promotion\\test_2umaH9m.csv")

#%%
submission_data["education"] = submission_data["education"].fillna("Unknown")
submission_data["previous_year_rating"] = submission_data["previous_year_rating"].fillna(np.mean(submission_data["previous_year_rating"]))

submission_data["education"] = np.where(submission_data["age"] > 35, "Does not matter", submission_data["education"])
Example #19
class ExtraTreeClass:
    """
    Name      : ExtraTreeClassifier
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'extratree'
        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/classifier/resource/classifier_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split features and label (target) columns
        self._x = data.drop("quality", axis=1)
        self._y = data["quality"]

        # Split into training and test sets
        self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
            self._x, self._y, test_size=0.2, shuffle=True, random_state=42)
        # Declare the model
        self._model = ExtraTreeClassifier()

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Plain prediction
    def predict(self):
        # Predict on the held-out test set
        y_pred = self._model.predict(self._x_test)

        # Print the classification report
        print(classification_report(self._y_test, y_pred))

        score = accuracy_score(self._y_test, y_pred)

        # Check the score
        print(f'Score = {score}')
        # Return the score
        return score

    #  Cross-validation prediction
    def predict_by_cv(self):
        cv = KFold(n_splits=5, shuffle=True)
        # Check whether the model supports scoring (needed for CV)
        if hasattr(self._model, "score"):
            cv_score = cross_val_score(self._model, self._x, self._y, cv=cv)
            # Check the score
            print(f'Score = {cv_score}')
            # Return the score
            return cv_score
        else:
            raise Exception('Not Support CrossValidation')

    #  GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}.pkl', self._f_path +
                    f'/model/{str(self._name) + str(time.time())}.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
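# The predict_by_gs method above is left as a stub; a hedged sketch of a
# GridSearchCV-based version that could replace it (the parameter grid is
# illustrative).
from sklearn.model_selection import GridSearchCV

def predict_by_gs(self):
    param_grid = {'max_depth': [None, 5, 10],
                  'min_samples_split': [2, 5, 10]}
    gs = GridSearchCV(self._model, param_grid, cv=5)
    gs.fit(self._x_train, self._y_train)
    print(f'Best params = {gs.best_params_}')
    # Evaluate the tuned estimator on the held-out test split
    score = gs.best_estimator_.score(self._x_test, self._y_test)
    print(f'Score = {score}')
    return score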
Example #20
plot_step = 0.02

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3],
                                [2, 3]]):
    print(pair, pairidx)
    X = iris.data[:, pair]
    Y = iris.target
    clf = ExtraTreeClassifier(max_depth=3).fit(X, Y)

    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(Y == i)
        print(i)
        # print (idx)
        # plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
        # 			cmap=plt.cm.Paired)
    plt.axis("tight")
plt.suptitle("Tree classifier examples")
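# The plotting loop above uses names defined earlier in the original script;
# a hedged reconstruction of that setup (the class count and colour list
# follow the usual sklearn iris demo, so treat them as assumptions).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import ExtraTreeClassifier

iris = load_iris()
n_classes = 3        # iris has three classes
plot_colors = "ryb"  # one colour per class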
Example #21
sgd.score(x_test_3, y_test_3)
sgd = SGDClassifier(loss='log', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)
submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)
#Extra tree classifier is a tree based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)
from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)
from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
Example #22
# In[ ]:

DTree = DecisionTreeClassifier(max_depth=3)
DTree.fit(x_train, y_train)
yhat = DTree.predict(x_test)
print("DecisionTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, DTree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

ETree = ExtraTreeClassifier(max_depth=3)
ETree.fit(x_train, y_train)
yhat = ETree.predict(x_test)
print("ExtraTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, ETree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

Ada = AdaBoostClassifier()
Ada.fit(x_train, y_train)
yhat = Ada.predict(x_test)
print("AdaBoostClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, Ada.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
Example #23
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plotting feature importance
plt.figure(figsize=(10,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="g", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), features,rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()    
# Importing, initializing and fitting the extra tree classifier
from sklearn.tree import ExtraTreeClassifier
extree = ExtraTreeClassifier(max_features=11,min_samples_split=21,
                             random_state=101,max_depth =28)
extree.fit(X_train_sm1,y_train_sm1)
extree_predict=extree.predict(X_test)
#checking performance of the extra tree classifier
print(confusion_matrix(y_test,extree_predict))
print(classification_report(y_test,extree_predict))
#Importing test data
test=pd.read_csv('FIA_predictions.csv')
# getting columns same as training data 
test=test.iloc[:,0:33]
#converting data type for categorical variables
test['NAICS2']=test['NAICS2'].astype('category')
test['NAICS4']=test['NAICS4'].astype('category')
test['NAICS_CD']=test['NAICS_CD'].astype('category')
test['Restricted_Vertical']=test['Restricted_Vertical'].astype('category')
test['LCTN_TYP_VAL']=test['LCTN_TYP_VAL'].astype('category')
test['srvc_five_dgt_zip']=test['srvc_five_dgt_zip'].astype('category')
test['data_srvc_rnge_6_flg']=test['data_srvc_rnge_6_flg'].astype('category')
Example #24
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_cart.predict(rnd_test_X)))
roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s"%roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = 'CART-2')

## randomized tree with default setting
clf_rnd_tree = ExtraTreeClassifier()
clf_rnd_tree.fit(rnd_training_X, rnd_training_y)
export_graphviz(clf_rnd_tree, out_file = 'default_rnd_tree.dot',
                feature_names = attribute_names,
                class_names = bi_class_target_attrs,
                filled = True, rounded = True,
                special_characters = True)
print(check_output('dot -Tpdf default_rnd_tree.dot -o default_rnd_tree.pdf', shell = True))
print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Precision = %s"%precision_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Recall = %s"%recall_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("F = %s"%fbeta_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label = "Randomized tree-1")
axes_roc.set_title("ROC of CART and a randomized tree")
axes_roc.set_xlabel("FPR")
axes_roc.set_ylabel("TPR")
axes_roc.set_ylim(0, 1.1)
axes_roc.legend(loc = 'best', fontsize = 'medium')
roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s"%roc_auc_scorer(clf_rnd_tree, rnd_test_X, rnd_test_y))

# randomized tree with max_depth = 4, min_samples_leaf = 5
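# The comment above announces a constrained randomized tree, but the code
# that followed is cut off; a hedged sketch mirroring the default-settings
# block earlier in this example (the variable name is an assumption).
clf_rnd_tree_small = ExtraTreeClassifier(max_depth=4, min_samples_leaf=5)
clf_rnd_tree_small.fit(rnd_training_X, rnd_training_y)
print("Accuracy = %s" % accuracy_score(rnd_test_y,
                                       clf_rnd_tree_small.predict(rnd_test_X)))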
def myclassify_AudPow(numfiers,xtrain_1,xtrain_2,ytrain_1,ytrain_2,xtest):

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1),:]
    xtest = xtest[~np.isinf(xtest).any(axis=1),:]

    xtrain = np.append(xtrain_1,xtrain_2,0)
    ytrain = np.append(ytrain_1,ytrain_2)
    ytrain = np.ravel(ytrain)
    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]



    #if xtest is an NxM matrix, returns an N x numfiers matrix where each column corresponds to a classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []

    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    ytest = bagging2.predict(xtest)
    predictionMat[:,count] = ytest
    count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        modeCol = predWindowVecModeFinder(tempCol,xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += map(int,modeCol)

    return predictionStringMat,finalPredMat
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    #NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain is the indices of corresponding recordings in xtrain and ytrain. these will always be present
    #xtest is your testing set. xtltest is the corresponding indices of the recording. for the practice set xtltest = xtrunclength
    # ytest is optional and depends on if you are using a testing set or the practice set

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)


    #if xtest is an NxM matrix, returns an N x numfiers matrix where each column corresponds to a classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1
    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:

        bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
        bagging2.fit(xtrain,ytrain)
        #print bagging2.score(xtest,ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    # print xtltest
    # print len(ytest)
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol,xtltest,4,grids,isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol,xtltest,4,isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget,xtltest,1,isPrint=0)
        if testing:
             modeStr = temppredVec2Str(modeCol,grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int,modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)
        if testing == False:
            if ytarget is not None:
                #print targets1
                #print ""
                #print predictions1
                confusionme = confusion_matrix(targets1[0],predictions1[0])
                #print "Confusion Matrix is: "
                #print confusionme


    return predictionStringMat, targetStringMat, finalPredMat
Example #27
class stacked_generalization():
    def __init__(self, data, target):
        self.data = data
        if len(target.shape) == 2:
            # Convert 2-dim target array into 1-dim target array
            self.target = target.reshape(target.shape[0])
        else:
            self.target = target

        self.training_data = None
        self.training_target = None
        self.test_data = None
        self.test_target = None

        # Construct 3 Tier-1 (base) classifiers
        self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
        self.Tier1_classifier2 = MultinomialNB()
        self.Tier1_classifier3 = LinearSVC(penalty="l2")
        self.Tier1_classifier4 = ExtraTreeClassifier()
        # self.Tier1_classifier5 = SGDClassifier(max_iter=1000, tol=1e-3)

        # Construct Tier-2 (meta) classifier
        # self.meta_classifier = LogisticRegression(solver="lbfgs")
        # self.meta_classifier = MultinomialNB()
        # self.meta_classifier = LinearSVC(penalty = "l2")
        self.meta_classifier = ExtraTreeClassifier()
        # self.meta_classifier = XGBClassifier()
        # self.meta_classifier = RandomForestClassifier(n_estimators=100)

    # Divide training data into different n_split training blocks and evaluation blocks
    # Create T Tier-1 classifiers, C1,..,CT, based on a cross-validation partition of the training data. To do so,
    # the entire training dataset is divided into B blocks, and each Tier-1 classifier is first trained on (a different set of)
    # B-1 blocks of the training data. Each classifier is then evaluated on the Bth (pseudo-test) block
    def TrainingData_Stratified_KFold_split(self, n_split=5, shuffle=False):
        # Blocks of training data Partition. n_splits cannot be greater than the number of members in each class
        skf_blocks = StratifiedKFold(n_splits=n_split, shuffle=shuffle)

        # Create the indexes of blocks of training data. The number of blocks is n_split
        training_blocks_index = []
        evaluation_blocks_index = []

        for trainingBlock_index, evaluationBlock_index in skf_blocks.split(
                self.training_data, self.training_target):
            training_blocks_index.append(trainingBlock_index)
            evaluation_blocks_index.append(evaluationBlock_index)

        training_blocks_data = [
            self.training_data[index, :] for index in training_blocks_index
        ]
        training_blocks_target = [
            self.training_target[index] for index in training_blocks_index
        ]

        evaluation_blocks_data = [
            self.training_data[index, :] for index in evaluation_blocks_index
        ]
        evaluation_blocks_target = [
            self.training_target[index] for index in evaluation_blocks_index
        ]

        return training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target

    def train_meta_classifier(self):
        training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target = self.TrainingData_Stratified_KFold_split(
        )

        # The classification outputs of all Tier-1 classifiers on each training data block (5 blocks now) are saved in list Tier1_outputs
        Tier1_outputs = []

        for block in range(len(training_blocks_data)):
            # all Tier-1 base classifiers fit n-1 training data blocks (n blocks totally)
            self.Tier1_classifier1.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier2.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier3.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            self.Tier1_classifier4.fit(training_blocks_data[block],
                                       training_blocks_target[block])
            # self.Tier1_classifier5.fit(training_blocks_data[block],training_blocks_target[block])

            # All Tier-1 base classifiers predict the held-out nth training data block (n blocks totally). The outputs of all Tier-1 base
            # classifiers on each training data block (5 blocks now) are saved in list Tier1_outputs
            output_C1 = self.Tier1_classifier1.predict(
                evaluation_blocks_data[block])
            output_C1 = output_C1.reshape(output_C1.shape[0], 1)

            output_C2 = self.Tier1_classifier2.predict(
                evaluation_blocks_data[block])
            output_C2 = output_C2.reshape(output_C2.shape[0], 1)

            output_C3 = self.Tier1_classifier3.predict(
                evaluation_blocks_data[block])
            output_C3 = output_C3.reshape(output_C3.shape[0], 1)

            output_C4 = self.Tier1_classifier4.predict(
                evaluation_blocks_data[block])
            output_C4 = output_C4.reshape(output_C4.shape[0], 1)

            # output_C5 = self.Tier1_classifier5.predict(evaluation_blocks_data[block])
            # output_C5 = output_C5.reshape(output_C5.shape[0],1)

            # The classification outputs of all Tier-1 classifiers on each training data block (5 blocks now) are saved in list Tier1_outputs
            block_outputs = np.hstack((output_C1, output_C2, output_C3,
                                       output_C4))  # horizontally combined
            Tier1_outputs.append(block_outputs)

        # Vertically combine all training data blocks' classification outputs of all Tier-1 classifiers.
        # The function np.vstack() can be given a list
        Tier1_outputs = np.vstack(Tier1_outputs)
        # Combine all training data blocks' real labels
        evaluation_blocks_target = np.concatenate([
            eva_block_target for eva_block_target in evaluation_blocks_target
        ])

        # Using all training data blocks' classification outputs of all Tier-1 classifiers and all training data blocks'
        # real labels to train the meta classifier
        self.meta_classifier.fit(Tier1_outputs, evaluation_blocks_target)

        print("The training of meta classifier is finished")
        # return accuracy, recall and precision of test data

    # Train stacked generalization by cross-validation partition.
    def train_stacked_generalization_CV(self, n_split=5, shuffle=False):
        # Cross-validation Partition.  n_splits cannot be greater than the number of members in each class
        skf_cv = StratifiedKFold(n_splits=n_split, shuffle=shuffle)

        # Create the indexes of training data and test data
        training_sets_index = []
        test_sets_index = []

        for training_index, test_index in skf_cv.split(self.data, self.target):
            training_sets_index.append(training_index)
            test_sets_index.append(test_index)

        training_sets_data = [
            self.data[index, :] for index in training_sets_index
        ]
        training_sets_target = [
            self.target[index] for index in training_sets_index
        ]

        test_sets_data = [self.data[index, :] for index in test_sets_index]
        test_sets_target = [self.target[index] for index in test_sets_index]

        # Store all metrics of cross-validation in different lists
        test_cv_accuracy = []
        test_cv_recall = []
        test_cv_precision = []

        time_start = time.time()  # start time

        for cv_time in range(n_split):
            self.training_data = training_sets_data[cv_time]
            self.training_target = training_sets_target[cv_time]
            self.test_data = test_sets_data[cv_time]
            self.test_target = test_sets_target[cv_time]

            # train the meta classifier
            self.train_meta_classifier()

            # Use all training data to retrain all the Tier-1 base classifiers
            self.Tier1_classifier1.fit(self.training_data,
                                       self.training_target)
            self.Tier1_classifier2.fit(self.training_data,
                                       self.training_target)
            self.Tier1_classifier3.fit(self.training_data,
                                       self.training_target)
            self.Tier1_classifier4.fit(self.training_data,
                                       self.training_target)
            # self.Tier1_classifier5.fit(self.training_data,self.training_target)

            # All retrained Tier-1 base classifiers are utilized to predict the test data
            testset_output_C1 = self.Tier1_classifier1.predict(self.test_data)
            testset_output_C1 = testset_output_C1.reshape(
                testset_output_C1.shape[0], 1)

            testset_output_C2 = self.Tier1_classifier2.predict(self.test_data)
            testset_output_C2 = testset_output_C2.reshape(
                testset_output_C2.shape[0], 1)

            testset_output_C3 = self.Tier1_classifier3.predict(self.test_data)
            testset_output_C3 = testset_output_C3.reshape(
                testset_output_C3.shape[0], 1)

            testset_output_C4 = self.Tier1_classifier4.predict(self.test_data)
            testset_output_C4 = testset_output_C4.reshape(
                testset_output_C4.shape[0], 1)

            # testset_output_C5 = self.Tier1_classifier5.predict(self.test_data)
            # testset_output_C5 = testset_output_C5.reshape(testset_output_C5.shape[0],1)

            # Horizontally combine all Tier-1 base classifiers' predictions on test data
            testset_outputs_Tier1 = np.hstack(
                (testset_output_C1, testset_output_C2, testset_output_C3,
                 testset_output_C4))

            # Based on the Tier-1 base classifiers' predictions on the test data, the meta classifier predicts the test labels
            testset_outputs_meta = self.meta_classifier.predict(
                testset_outputs_Tier1)
            # Round all predictions of meta classifier xgboost
            testset_outputs_meta = np.round(testset_outputs_meta)

            # Store all metrics of cross-validation in different lists
            test_cv_accuracy.append(
                accuracy_score(self.test_target, testset_outputs_meta))
            test_cv_recall.append(
                recall_score(self.test_target, testset_outputs_meta))
            test_cv_precision.append(
                precision_score(self.test_target, testset_outputs_meta))

        # Convert lists into numpy arrays, since only numpy arrays can be used to calculate mean values, min values, max values and std values
        test_cv_accuracy = np.array(test_cv_accuracy)
        test_cv_recall = np.array(test_cv_recall)
        test_cv_precision = np.array(test_cv_precision)

        time_end = time.time()  # end time
        print("\nTime cost: ", time_end - time_start, "seconds")

        cv_scores = {
            "test_accuracy": test_cv_accuracy,
            "test_precision_weighted": test_cv_precision,
            "test_recall_weighted": test_cv_recall
        }
        return cv_scores
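# A short usage sketch for the class above, assuming `X` is a 2-D feature
# array and `y` a binary label vector (both names are placeholders).
sg = stacked_generalization(X, y)
cv_scores = sg.train_stacked_generalization_CV(n_split=5, shuffle=True)
print("mean CV accuracy:", cv_scores["test_accuracy"].mean())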