Example #1
    def get_AdaMachine(self,
                       k,
                       branch_num=4,
                       impurity_fun=mylib.entropy,
                       sub_space_fun=DT.sub_p(method=2),
                       seed=20):
        """
        Based on pseudocode : Algorithm 4.6 AdaBoost algorithm
            <Introduction to Data Mining 2nd Edition> Pang-ning Tang
        Purpose:
            get random RF_machine object.
        Input:
            k: int, the number of trees in forest.
            branch_num: int, the numbers of branch for continuous features in decision tree.
            impurity_fun: function, the function to measure impurity, 
                          including error, entropy, gini. 
            sub_space_fun: function, the function to determine the percentage of features used 
                           for build tree. 
            seed: int, use to get random object.
        Output:
            a AdaMachine object.
        """
        rand = np.random.RandomState(seed)
        n, d = self.training_data.shape
        true_label = mylib.convert_label(self.training_label)
        vec = np.arange(n)
        weights = np.full(n, 1 / n)  # start with uniform example weights
        classifiers = []
        importances = []
        i = 0
        while i < k:
            index = rand.choice(vec, n, replace=True, p=weights)
            data = self.training_data[index, :]
            label = self.training_label[index]
            factory = DT.TreeFactory(data, label)
            dtree = factory.get_DT_machine(branch_num, impurity_fun,
                                           sub_space_fun, seed)
            # evaluate the new classifier on the full training set
            res_label = dtree.predict(self.training_data)
            error_vect = res_label != true_label
            # weighted training error of this classifier
            error = np.sum(weights[error_vect])
            if error > 0.5:
                print("error greater than 0.5")
                # reset the weights to uniform and go back to the head of
                # the loop to resample (as in Algorithm 4.6)
                weights = np.full(n, 1 / n)
                continue
            # guard against a zero error, which would make the log blow up
            error = max(error, 1e-10)

            # importance (alpha) of this classifier
            ai = 1 / 2 * np.log((1 - error) / error)

            # update the example weights: misclassified examples gain weight,
            # correctly classified ones lose weight, then renormalize
            X1 = weights * np.exp(-ai * true_label * res_label)
            norm = np.sum(X1)
            weights = X1 / norm

            classifiers.append(dtree)
            importances.append(ai)
            i += 1

        return AdaMachine(classifiers, np.asarray(importances))
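
The weight-update arithmetic in get_AdaMachine can be traced by hand. The standalone sketch below (not part of the factory class) reproduces the same formulas on a toy weight vector, assuming that mylib.convert_label and the tree's predict method encode labels as +1/-1, which is what the update term weights * exp(-ai * y_true * y_pred) requires.

import numpy as np

# toy setup: 5 training examples, uniform starting weights
weights = np.full(5, 1 / 5)
true_label = np.array([1, 1, -1, -1, 1])   # labels encoded as +/-1
res_label = np.array([1, -1, -1, -1, 1])   # the 2nd example is misclassified

error = np.sum(weights[res_label != true_label])   # weighted error = 0.2
ai = 1 / 2 * np.log((1 - error) / error)           # importance ~= 0.693

X1 = weights * np.exp(-ai * true_label * res_label)
weights = X1 / np.sum(X1)
print(weights)   # [0.125, 0.5, 0.125, 0.125, 0.125]: the mistake now dominates
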
    def get_DT_machine(self, gate_num, impurity_fun, sub_space_fun, seed):
        """
        Purpose:
            To build DT_machine object. 
        Input:
            You can see parameters description in tree_growth function.
        Output:
            A DT_machine object.
        """
        data, gates, nominal_features = self.scale_features(gate_num)
        label = mylib.convert_label(self.training_label)
        rand = np.random.RandomState(seed)
        # index sets of all training examples (rows) and all features (columns)
        Es = set(range(data.shape[0]))
        Fs = set(range(data.shape[1]))
        root = self.tree_growth(data, label, Es, Fs, gate_num, impurity_fun,
                                sub_space_fun, rand)

        return DT_machine(root, gates, nominal_features)
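
The sub_space_fun argument is only described as the function that determines the fraction of features considered when growing a tree, and the real signature of DT.sub_p(method=2) is not shown here. The snippet below is a purely hypothetical stand-in that illustrates the random-subspace idea behind such a selector: given the feature index set Fs and a random state, keep a random subset of roughly sqrt(|Fs|) features, a common random-forest heuristic.

import numpy as np

def sub_sqrt(Fs, rand):
    # hypothetical selector, NOT the real DT.sub_p: keep about sqrt(|Fs|)
    # randomly chosen feature indices
    size = max(1, int(np.sqrt(len(Fs))))
    return set(int(f) for f in rand.choice(sorted(Fs), size, replace=False))

rand = np.random.RandomState(20)
print(sub_sqrt(set(range(10)), rand))   # e.g. a 3-element subset such as {0, 3, 7}
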
def show_res(raw_set, n, k, branch_num, impurity_fun, sub_space_fun, seed):
    """
    Run n-fold cross-validation: train a random forest on each training
    split and print the confusion matrix, accuracy, precision, recall,
    and F1 score for the corresponding test split.
    """
    for i in range(n):
        training_set, test_set = mylib.n_fold(n, i, raw_set)
        training_data, training_label = mylib.get_data_label(training_set)
        factory = ForestFactory(training_data, training_label)
        randForest = factory.get_RF(k, branch_num, impurity_fun, sub_space_fun,
                                    seed)
        test_data, test_label = mylib.get_data_label(test_set)
        true_label = mylib.convert_label(test_label)
        res_label = randForest.predict(test_data)
        confusion = mylib.confusion_matrix(true_label, res_label)
        accuracy = mylib.get_accuracy(confusion)
        precision = mylib.get_precision(confusion)
        recall = mylib.get_recall(confusion)
        f1_score = mylib.get_f1_score(confusion)
        print("**************itr: ", i, " **************")
        print("confusion matrix:")
        print(confusion)
        print("accuracy: ", accuracy)
        print("precision: ", precision)
        print("recall: ", recall)
        print("f1_score: ", f1_score)
def show_res(raw_set, n, gate_num, sub_space_fun, seed):
    """
    Run n-fold cross-validation: train a single decision tree (with entropy
    as the impurity measure) on each training split and print the confusion
    matrix, accuracy, precision, recall, and F1 score for the corresponding
    test split.
    """
    for i in range(n):
        training_set, test_set = mylib.n_fold(n, i, raw_set)
        training_data, training_label = mylib.get_data_label(training_set)
        factory = TreeFactory(training_data, training_label)

        dtree = factory.get_DT_machine(gate_num, mylib.entropy, sub_space_fun,
                                       seed)
        test_data, test_label = mylib.get_data_label(test_set)
        true_label = mylib.convert_label(test_label)
        res_label = dtree.predict(test_data)
        confusion = mylib.confusion_matrix(true_label, res_label)
        accuracy = mylib.get_accuracy(confusion)
        precision = mylib.get_precision(confusion)
        recall = mylib.get_recall(confusion)
        f1_score = mylib.get_f1_score(confusion)
        print("**************itr: ", i, " **************")
        print("confusion matrix:")
        print(confusion)
        print("accuracy: ", accuracy)
        print("precision: ", precision)
        print("recall: ", recall)
        print("f1_score: ", f1_score)