Code Example #1
def gen_tree(attribute, X, y, weights=None):
    """
    Generate the four trees of height 1 (decision stumps) that split on
    the given attribute, one per (l1, l2) leaf-label combination:

             attribute
             /       \
           l1         l2       (l1, l2 in {-1, +1})
    """
    trees = []
    for l1 in [-1, 1]:
        for l2 in [-1, 1]:
            # create a decision tree
            tree = DecisionTree()
            tree.labels = set(y)
            root = Branch()
            tree.tree = root
            # split on the given attribute
            root.split_feature = attribute
            # left branch of root
            left = Branch()
            left.predict = l1
            root.children[0] = left
            # right branch of root
            right = Branch()
            right.predict = l2
            root.children[1] = right
            # append tree to the list
            trees.append(tree)
    return trees
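
The four stumps are typically scored against the data and the best one kept. A minimal sketch of that selection step, assuming each DecisionTree exposes the per-sample predict(x) used elsewhere on this page (best_stump itself is a hypothetical helper):

import numpy as np

def best_stump(attribute, X, y, weights=None):
    # Hypothetical helper: score the four enumerated stumps by
    # (optionally weighted) 0/1 training error and keep the best one.
    w = np.ones(len(y)) if weights is None else np.asarray(weights)
    best, best_err = None, float('inf')
    for tree in gen_tree(attribute, X, y, weights):
        preds = np.array([tree.predict(x) for x in np.asarray(X)])
        err = np.sum(w * (preds != np.asarray(y))) / np.sum(w)
        if err < best_err:
            best, best_err = tree, err
    return best, best_err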
Code Example #2
    def train(self, data, col_y, iteration=50, max_height=2, print_flag=False):
        s = np.zeros((data.shape[0]))  # initialize s vector
        self.max_height = max_height
        X = data.drop(col_y, axis=1)
        y = data[col_y]

        if print_flag:
            print('Start training')

        for i in range(iteration):
            if print_flag:
                if i % 10 == 9:
                    print(' ... %i-th iteration' % (i + 1))

            data[col_y] = y - s  # fit the next tree to the current residuals
            Tree = DecisionTree()
            Tree.construct_tree(data, col_y, max_height=self.max_height)
            self.trees[i] = Tree

            # One variable linear regression (fit residual)
            g_t = np.array([Tree.predict(x)
                            for x in np.array(X)])  # prediction
            if np.sum(g_t**2) == 0:
                alpha = 0
            else:
                alpha = np.sum(g_t * (y - s)) / np.sum(
                    g_t**2)  # compute regression coefficient
            self.coeff[i] = alpha
            s += alpha * g_t  # update s

        if print_flag:
            print('-----  END  -----')
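
The loop above stores each fitted tree in self.trees[i] and its regression coefficient in self.coeff[i], but no prediction method is shown. A minimal sketch of the counterpart, assuming those two dicts are the only state it needs:

    def predict(self, X):
        # Hypothetical counterpart to train: accumulate the
        # coefficient-weighted tree outputs, mirroring the update of s.
        X = np.array(X)
        s = np.zeros(X.shape[0])
        for i in self.trees:
            g_t = np.array([self.trees[i].predict(x) for x in X])
            s += self.coeff[i] * g_t
        return s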
Code Example #3
 def __init__(self, max_depth=2, min_size=2, cost='mse'):
     DecisionTree.__init__(self, max_depth, min_size)
     self.cost_function = None
     if cost == 'mse':
         self.cost_function = cost
     else:
         raise ValueError('Invalid cost function: %s' % cost)
Code Example #4
File: MDP.py  Project: prashanth818/SummerWork
 def simulation(self, errorstate, stackstate, attributes):
     training_set = []
     print "Simulating"
     for state in stackstate:
         self.rewardmat[state.getLabel()] = -3.5
         print "stack state"
     self.rewardmat[errorstate.getLabel()] = 7.5
     for i in range(0,10):
         currentstate = r.choice(self.statelist)
         while currentstate != self.statelist[errorstate.getLabel()]:
             #choose State action must change.
             #the probabilities will be different because the q_values are different
             action_chosen = currentstate.chooseStateAction(self.probmat[currentstate.getLabel()])
             nextstate = self.statelist[action_chosen.getNextStateAddr()]
             #bellmanFordFunction function will be different
             self.bellmanFordFunction(currentstate, action_chosen)
             #updating the probabilities will need to change
             self.updateProbabilityMatrix(self.probmat[currentstate.getLabel()], self.qmat[currentstate.getLabel()])
             currentstate = nextstate
         # Update the examples using the bellman ford functions
         # Create the tree here.
         training_set = self.generateTrainingSet()
         ldt = DecisionTree(attributes, training_set)
         rules = ldt.getRules()
         # For every state in
         for state in self.statelist:
             for action in state.getActions():
                 action.setVisited(False)
     """END"""
Code Example #5
 def __init__(self, *args, **kwargs ):
     if kwargs and args:
         raise ValueError(
                '''BoostedDecisionTree constructor can only be called with keyword arguments for
                   the following keywords: training_datafile, entropy_threshold,
                   max_depth_desired, csv_class_column_index,
                   symbolic_to_numeric_cardinality_threshold,
                   number_of_histogram_bins, csv_columns_for_features,
                   how_many_stages, debug1''')
     allowed_keys = 'training_datafile','entropy_threshold','max_depth_desired','csv_class_column_index',\
                    'symbolic_to_numeric_cardinality_threshold','csv_columns_for_features',\
                    'number_of_histogram_bins', 'how_many_stages','debug1','stagedebug'
     keywords_used = kwargs.keys()
     for keyword in keywords_used:
         if keyword not in allowed_keys:
             raise ValueError(keyword + ":  Wrong keyword used --- check spelling") 
     training_datafile = entropy_threshold = max_depth_desired = csv_class_column_index = None
     number_of_histogram_bins = symbolic_to_numeric_cardinality_threshold = None
     csv_columns_for_features = how_many_stages = stagedebug = None
     if kwargs and not args:
         if 'how_many_stages' in kwargs      :      how_many_stages = kwargs.pop('how_many_stages')
     DecisionTree.__init__(self, **kwargs)    
     if how_many_stages is not None: 
         self._how_many_stages               =      how_many_stages
     else:
         self._how_many_stages               =      4
     self._all_trees                         =      {i:DecisionTree(**kwargs) for i in range(self._how_many_stages)}
     self._training_samples                  =      {i:[] for i in range(self._how_many_stages)}
     self._root_nodes                        =      {i:None for i in range(self._how_many_stages)}
     self._sample_selection_probs            =      {i:{} for i in range(self._how_many_stages)}
     self._trust_factors                     =      {i:None for i in range(self._how_many_stages)}
     self._misclassified_samples             =      {i:[] for i in range(self._how_many_stages)}
     self._classifications                   =      None
     self._trust_weighted_decision_classes   =      None
     self._stagedebug                        =      0
Code Example #6
    def train(self, records, attributes):
        """
        This function will train the random forest, the basic idea of training a
        Random Forest is as follows:
        1. Draw n bootstrap samples using bootstrap() function
        2. For each of the bootstrap samples, grow a tree with a subset of
            original attributes, which is of size m (m << # of total attributes)
        """
        for _ in range(self.tree_num):

            # creating a tree
            tree = DecisionTree()

            # randomly selecting 50% attributes for the tree
            tree_attributes = random.sample(attributes,
                                            int(len(attributes) * 0.5))

            # selecting bootstrap samples for the tree by calling the bootstrap method
            bootstrap_samples = self.bootstrap(records)

            # training the tree
            tree.train(bootstrap_samples, tree_attributes)

            # adding the tree to the forest list
            self.forest.append(tree)
Code Example #7
    def train(self, records, attributes):
        """
        This function will train the random forest, the basic idea of training a
        Random Forest is as follows:
        1. Draw n bootstrap samples using bootstrap() function
        2. For each of the bootstrap samples, grow a tree with a subset of
            original attributes, which is of size m (m << # of total attributes)
        """

        for count in range(0, int(self.tree_num)):
            # Step 1 :Finding out the samples using bootstrap() for every treenum
            sample_rec = self.bootstrap(records)

            # Step 2 : For every treenum selecting 50% of the sample attributes
            #           at random (without replacement) to be used for tree construction
            sample_attr = []
            while len(sample_attr) < ceil(0.5 * len(attributes)):
                rand = random.choice(attributes)
                if rand not in sample_attr:
                    sample_attr.append(rand)

            # Creating a new Tree instance, training it based on the records and
            # attributes bootstrapped above, and adding it to the forest
            Tree = DecisionTree()
            Tree.train(sample_rec, sample_attr)
            self.forest.append(Tree)
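
Both train methods above rely on a bootstrap() helper that is not shown. A minimal sketch of what it might look like, assuming records is a list of training rows and that random is imported as in the surrounding code (the body is an assumption; only the call signature comes from the examples):

    def bootstrap(self, records):
        # Draw len(records) rows with replacement: the standard
        # bootstrap sample used by random forests.
        return [random.choice(records) for _ in range(len(records))]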
Code Example #8
def spam():

    #load all the spam data
    spam_data = scipy.io.loadmat('spam-dataset/spam_data.mat')
    test_data = spam_data['test_data']
    training_labels = spam_data['training_labels']
    training_data = spam_data['training_data']
    print('number of features used:', training_data.shape[1])
    training_data, training_labels = sklearn.utils.shuffle(
        training_data, training_labels)

    #split training data
    #learn_set, learn_labels = training_data[:4000], training_labels[:4000]
    learn_set, learn_labels = training_data, training_labels
    valid_set, valid_labels = training_data[4000:], training_labels[4000:]

    #train and predict on a single tree
    # spamTree = DecisionTree(learn_set, learn_labels)
    # spamTree.train(learn_set, learn_labels, spamTree.root)
    # pred_labels = spamTree.predict(test_data)
    #print(benchmark(pred_labels, valid_labels)[0])

    #make random forest
    NUM_TREES = 100
    forest = []
    # pred_labels = np.zeros((valid_set.shape[0], 1))
    # sumOfPred = np.zeros((valid_set.shape[0], 1))
    pred_labels = np.zeros((test_data.shape[0], 1))
    sumOfPred = np.zeros((test_data.shape[0], 1))
    for i in range(0, NUM_TREES):
        print('Now at tree #', i)
        nPrime = np.random.choice(learn_set.shape[0], learn_set.shape[0], True)
        x = learn_set[nPrime]
        y = learn_labels[nPrime]
        tree = DecisionTree(x, y)
        tree.train(x, y, tree.root, True)
        forest.append(tree)
    for tree in forest:
        #	sumOfPred += tree.predict(valid_set)
        sumOfPred += tree.predict(test_data)

    for i in range(0, test_data.shape[0]):
        if sumOfPred[i] / NUM_TREES > .5:
            pred_labels[i] = 1
        elif sumOfPred[i] / NUM_TREES < .5:
            pred_labels[i] = 0
        else:
            pred_labels[i] = random.randint(0, 1)
    #print(benchmark(pred_labels, valid_labels)[0])

    #make csv
    csvList = [['Id', 'Category']]  # two fields, so csv.writer emits Id,Category unquoted
    for i in range(1, test_data.shape[0] + 1):
        csvList.append([i, int(pred_labels[i - 1][0])])
    with open('spamForest.csv', 'w', newline='') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerows(csvList)

    return 0
Code Example #9
File: proj3.py  Project: dangbert/AI
def train(data, labels):
    # data is an array of attribute vectors
    # e.g. [[0, 7, 5, 2, 3, 4, 0, 18], [1, 3, 0, 4, 2, 0, 1, 0], ...]
    # labels is an array of class labels (as integers)
    # e.g. [0, 1, ...]
    model = DecisionTree()
    model.train(data, labels)
    return model
Code Example #10
 def testSmoke(self):
     from DecisionTree import DecisionTree
     dt = DecisionTree()
     feature = np.array([[0, 1], [1, 0], [1, 2], [0, 0], [1, 1]])
     label = np.array([0, 1, 0, 0, 1])
     dt.fit(feature, label)
     y_pred = dt.predict(feature)
     assert (y_pred == label).all()
Code Example #11
    def test_decision_tree(self):
        tree = DecisionTree()
        X = np.asarray([[1, 1],[0, 2], [3, 2]])
        y = np.asarray([0, 1, 1])

        tree.fit(X_=X, y_=y)

        self.assertTrue(tree.predict(np.asarray([[1,1]]))[0] == 0)
Code Example #12
 def test_DT(self):
     records, attributes = load_data("data/mushrooms_train.data")
     test_records = load_data("data/mushrooms_train.data")[0]  # note: the source reuses the training file here
     #print(records, attributes)
     dt = DecisionTree()
     best_index, best_index_dict = dt.find_best_split(records,
                                                      attributes,
                                                      class_index=0)
     dt.shuffle_dataset(best_index_dict)
Code Example #13
File: RandomForest.py  Project: sambaths/sandbox
 def fit(self, X, y):
     self.trees = []
     for _ in range(self.estimators):
         tree = DecisionTree(min_samples_split=self.min_samples_split,
                             max_depth=self.max_depth,
                             n_features=self.n_features)
         X_sample, y_sample = bootstrap_sample(X, y)
         tree.fit(X_sample, y_sample)
         self.trees.append(tree)
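
The bootstrap_sample(X, y) helper used above is external to this snippet. A minimal numpy sketch under the assumption that X and y are arrays with matching first dimensions (it mirrors the inline index sampling in Code Example #22 below):

import numpy as np

def bootstrap_sample(X, y):
    # Sample n rows with replacement from the paired arrays.
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]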
Code Example #14
File: main.py  Project: newsatit/CS760-DecisionTree
def train_and_plot(X, Y, X_test, Y_test):
    decision_tree = DecisionTree().fit(X, Y)
    # decision_tree.print_tree()
    plot_decision_boundary(decision_tree, X, Y)
    Y_predict = decision_tree.predict(X_test.to_numpy())
    test_error = 1 - metrics.accuracy_score(Y_test, Y_predict)
    num_nodes = decision_tree.count_nodes()
    print('test error:', test_error)
    return (num_nodes, test_error)
Code Example #15
 def fit(self, X, Y):
     self.trees = []
     for _ in range(self.n_trees):
         tree = DecisionTree(min_samples_split=self.min_samples_split,
                             max_depth=self.max_depth,
                             n_feats=self.n_feats)
         X_sample, Y_sample = BootstrapSample(X, Y)
         tree.fit(X_sample, Y_sample)
         self.trees.append(tree)
Code Example #16
    def fit(self, X, y):

        self._initial_approximation(X, y)

        for i in range(self._n_estimators):
            anti_grad = self.calculate_antigradient(X, y)
            estimator = DecisionTree(max_depth=self._max_depth,
                                     is_classification=False,
                                     impurity=self._impurity,
                                     min_samples_leaf=self._min_samples_leaf,
                                     max_features=self._max_features,
                                     min_features=self._min_features,
                                     max_steps=self._max_steps,
                                     rsm=self._rsm)
            estimator.fit(X, anti_grad)
            self._estimators.append(estimator)
Code Example #17
class RandomForest:
    def __init__(self, n_subsets=1, n_replacement=5):
        self.innermodel = DecisionTree()
        self.n_subsets = n_subsets
        self.n_replacement = n_replacement

    def generate_subset(self, group_number, data):
        subset = []
        i = 0
        while i < len(data):
            if i % self.n_subsets == group_number:
                subset.append(data[i])
            i += 1

        return subset

    def fit(self, data):
        ## save the original data
        self.dataset = data
        forest = []  ## use to save the tree of each subset

        i = 0
        while i < self.n_replacement:
            random.shuffle(data)
            ## split data into subsets and save to a list
            subsets = []
            j = 0
            while j < self.n_subsets:
                subset = self.generate_subset(j, data)
                subsets.append(subset)
                j += 1

            for subset in subsets:
                tree = self.innermodel.fit(subset)
                forest.append(tree)
            i += 1

        return forest

    def classify(self, row, forest):
        results = []
        for node in forest:
            result = self.innermodel.classify(row, node)
            # append this tree's majority class from its class-count dict
            results.append(max(result, key=result.get))

        classification = max(set(results), key=results.count)
        confidence = round(
            results.count(classification) / len(results) * 100, 2)
        return [classification, confidence]
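
A hedged usage sketch for the class above, assuming the training data is a list of rows in whatever format the underlying DecisionTree.fit and classify expect (the variable names here are illustrative):

rf = RandomForest(n_subsets=3, n_replacement=5)
forest = rf.fit(training_rows)
label, confidence = rf.classify(test_row, forest)
print(label, confidence)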
Code Example #18
def nbcTest(trainSet, trainLab, testSet, testLab):
    # despite the nbc name, this benchmark wraps a DecisionTree
    nbc = DecisionTree()
    nbc.fit(trainSet, trainLab)
    predLab = nbc.predict(testSet)
    print("nbc errors:")
    print(sum(testLab != predLab))
    print(1 - sum(testLab != predLab) / len(testLab))
    print("nbc confusion matrix:")
    print(confusion_matrix(testLab, predLab))
    print()
Code Example #19
 def train_decision_tree(self, train):
     sample_indices = [
         random.choice(range(len(train.data)))
         for _ in range(int(self.bagging_data_fraction * len(train.data)))
     ]
     sample = Dataset([train.data[i] for i in sample_indices],
                      [train.labels[i] for i in sample_indices])
     tree = DecisionTree(self.max_depth, self.num_features)
     tree.train(sample)
     self.trees.append(tree)
Code Example #20
    def train(self, dataset: pd.DataFrame, targetClass: str, n: int, m: int, verbose: bool):
        self.dataset = dataset
        self.targetClass = targetClass
        self.trees = []

        for i in range(n):
            treeData, _ = bootstrap(self.dataset, self.dataset.shape[0])
            tree = DecisionTree()
            tree.train(treeData, targetClass, m, verbose)
            self.trees.append(tree)
Code Example #21
 def build_tree(self, X, y):
     tree = DecisionTree(
         self.n_features,
         self.max_depth,
         self.min_samples_leaf
     )
     tree.fit(X, y)
     self.trees.append(tree)
Code Example #22
File: RandomForest.py  Project: KOPFYF/MLFromScratch
	def fit(self, X, y):
		self.trees = []
		n_samples, n_features = X.shape[0], X.shape[1]
		for _ in range(self.n_trees):
			dt = DecisionTree(min_samples_split=self.min_samples_split,
			                  max_depth=self.max_depth, n_features=self.n_features)

			idxs = np.random.choice(n_samples, n_samples, replace=True)
			dt.fit(X[idxs], y[idxs])
			self.trees.append(dt)
Code Example #23
def decision_tree_classification(X, y, test_dat):
    classifier = DecisionTree(45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)

    with open("census_predictions_decision_tree.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
Code Example #24
def generate_tree(tup):
    '''@parameters:
        tup: (trainX, trainY, testX)
    @return: (DecisionTree, array, array, array): tree, prediction, Ein, out_prediction'''
    trainX, trainY = tup[0], tup[1]
    bagX, bagY = bagging(trainX, trainY, 0.8)
    testX = tup[2]

    tree = DecisionTree().fit(bagX, bagY)
    prediction = tree.predict(trainX)
    return tree, prediction, np.mean(prediction != trainY), tree.predict(testX)
Code Example #25
 def fit(self, X, y):
     m, n = np.shape(X)
     self._tree_list = []
     for i in range(self._max_depth):  # one bagged tree per pass (the loop count reuses _max_depth)
         tree_tmp = DecisionTree(type=self._type, criterion=self._criterion, splitter=self._splitter,
                                 min_impurity_decrease=self._min_impurity_decrease,
                                 min_impurity_split=self._min_impurity_split,
                                 min_samples_split=self._min_samples_split, max_depth=self._max_depth)
         X_train, y_train, row, column = self.random_sample(X, y, self._bagging_fraction * m,
                                                            self._feature_fraction * n)
         tree_tmp.fit(X_train, y_train)
         self._tree_list.append([column, tree_tmp])
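
Each _tree_list entry pairs the sampled feature columns with the fitted tree, but the matching predict is not shown. A minimal sketch, assuming column is an index array usable for numpy column selection and that per-tree outputs are simply averaged (both assumptions):

 def predict(self, X):
     # Hypothetical counterpart to fit: restrict X to the columns each
     # tree was trained on, then average the per-tree predictions.
     X = np.asarray(X)
     preds = [tree.predict(X[:, column]) for column, tree in self._tree_list]
     return np.mean(preds, axis=0)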
Code Example #26
def plotDecisionTreewithPieceNum(dataset):
    dt = DecisionTree(dataset)
    size = len(dataset)
    x = [i for i in range(2, size // 2 + 1) if size % i == 0]
    y = []
    for t in x:
        y.append(dt.crossValidation(t))
    plt.scatter(x, y, edgecolors="blue")
    plt.xlabel("Cross validation piece")
    plt.ylabel("Error rate")
    plt.title("Error rate vs piece number")
    plt.show()
Code Example #27
def PrintTable(train, test, rangeEnd, ChangeUnknown=False):
    print('\t\t\tEntropy\t\tME\t\tGini')
    for maxDepth in range(1, rangeEnd + 1):
        EntropyTree = DecisionTree(train, maxDepth, 0, ChangeUnknown)
        METree = DecisionTree(train, maxDepth, 1, ChangeUnknown)
        GiniTree = DecisionTree(train, maxDepth, 2, ChangeUnknown)
        print("%2d & %5.4f & %5.4f & %5.4f \\\\ \\hline" %
              (maxDepth, EntropyTree.GetAccuracyLevel(test),
               METree.GetAccuracyLevel(test), GiniTree.GetAccuracyLevel(test)))
Code Example #28
 def create_classifier(self, inputs, outputs, weights):
     new_tree = DecisionTree(3, use_weights=True)
     new_tree.fit(inputs, outputs, weights)
     terror = np.empty(len(outputs), dtype=float)
     for indx, (data, truth) in enumerate(zip(inputs, outputs)):
         predict = new_tree.predict(data)
         terror[indx] = 1.0 - float(predict == truth)
     self.classifiers.append(new_tree)
     error = np.sum(weights * terror) / np.sum(weights)
     stage = np.log((1.0 - error) / error)
     self.weights.append(stage)
     weights = weights * np.exp(stage * terror)
     return weights
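
create_classifier above performs one AdaBoost round: terror flags the misclassified points, stage is the classifier's log-odds weight, and the returned weights are rescaled for the next round. The final decision step is not shown; a minimal sketch, assuming self.classifiers and self.weights are the lists populated above (the method name is hypothetical):

 def predict(self, data):
     # Weighted vote over the boosted classifiers.
     votes = {}
     for clf, stage in zip(self.classifiers, self.weights):
         label = clf.predict(data)
         votes[label] = votes.get(label, 0.0) + stage
     return max(votes, key=votes.get)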
Code Example #29
    def train(self, dataset, m, n):
        # m = number of attributes
        # n = number of trees
        self.trees = []

        for _ in range(n):
            bootstrap = dataset.bootstrap()
            t = DecisionTree()
            t.n_attr = m
            t.train(bootstrap)
            self.trees.append(t)

        self.trained = True
Code Example #30
def best_params():
    acc_max = 0
    depth_max = 0
    depth_list = [i * 10 for i in range(1, 21)]
    for depth in depth_list:
        clf = DecisionTree(max_depth=depth)
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_test)
        acc = accuracy(Y_test, predictions)
        if acc > acc_max:
            acc_max = acc
            depth_max = depth
    return (depth_max, acc_max)
Code Example #31
def main():
    attributes_train, data_train = read_from_file("train.txt")

    # DTL
    dtl = DecisionTree()
    tree = dtl.build(data_train, attributes_train)
    with open("output_tree.txt", "w") as file:
        tree_string = dtl.write_tree_to_file(tree, attributes_train, 0)
        file.write(tree_string[:len(tree_string) - 1])
    # KNN
    knn = KNearestNeighbors(attributes_train, data_train)
    # NAIVE BAYES
    naive_bayes = NaiveBayes(attributes_train, data_train)
    attribute_text, data_test = read_from_file("test.txt")
    knn_result = []
    naive_bayes_result = []
    dtl_result = []
    real_classify = []

    for line in data_test:
        real_classify.append(line[-1])
        entry = line[:-1]
        knn_result.append(knn.predict(entry, 5))
        naive_bayes_result.append(naive_bayes.predict(entry))
        dtl_result.append(dtl.predict(tree, entry, attribute_text))
    acc_knn = 0
    acc_nb = 0
    acc_dtl = 0
    # get accuracy
    for (dtl, knn, nb, real) in zip(dtl_result, knn_result, naive_bayes_result,
                                    real_classify):
        if dtl == real:
            acc_dtl += 1
        if knn == real:
            acc_knn += 1
        if nb == real:
            acc_nb += 1
    acc_knn /= len(real_classify)
    acc_nb /= len(real_classify)
    acc_dtl /= len(real_classify)
    acc_knn = float(math.ceil(acc_knn * 100)) / float(100)
    acc_nb = float(math.ceil(acc_nb * 100)) / float(100)
    acc_dtl = float(math.ceil(acc_dtl * 100)) / float(100)

    with open('output.txt', 'w') as output:
        output.write("Num\tDT\tKNN\tnaiveBase\n")
        for i, (a, b, c) in (enumerate(
                zip(dtl_result, knn_result, naive_bayes_result))):
            output.write(str(i + 1) + "\t" + a + "\t" + b + "\t" + c + "\n")
        output.write("\t" + str(acc_dtl) + "\t" + str(acc_knn) + "\t" +
                     str(acc_nb) + "\n")
Code Example #32
class Decider(object):
    def __init__(self, trainingData='data/training.dat'):
        self.dt = DecisionTree(training_datafile=trainingData,
                               debug1=0,
                               debug2=0)
        self.dt.get_training_data()
        self.rootNode = self.dt.construct_decision_tree_classifier()

    def play(self, agentList, agent, map):
        from random import random
        troops = agent.aliveList()
        if random() > 0.5: troops.reverse()
        for troop in troops:
            bestValue = float('-inf')
            actions, teammateList, enemyList = map.legalActions(troop)
            if random() > 0.5: actions.reverse()
            for action in actions:
                self.makeDecision(agentList, agent, troop, action, map)

    def makeDecision(self, agentList, agent, troop, action, map):
        s1 = 'general=>' + generalSituation(agent, troop,
                                            action['target'])  # situation 1
        s2 = 'situation=>' + situation(troop, action['target'])
        s3 = 'injury=>' + maxInjury(troop, action['target'])
        s4 = 'attackGeneral=>' + str(
            maxAttackOnGeneral(troop, action['target']))

        testSample = [s1, s2, s3, s4]
        try:
            classification = self.dt.classify(self.rootNode, testSample)
        except Exception:
            print('something wrong with dt!', testSample)
            classification = {'positive': 0, 'negative': 1}
            print(classification)

        if classification['positive'] > classification['negative']:
            troop.move(action['target'])
            attackList = map.legalAttacks(troop, troop.posX, troop.posY)
            bestValue = float('inf')
            for enemy in attackList:
                if enemy['targetTroopId'] == 1:
                    print('enemy general!!!!')
                    target = enemy
                    break
                elif enemy['targetLife'] < bestValue:
                    bestValue = enemy['targetLife']
                    target = enemy
            try:
                troop.doAttack(agentList, target['targetTroopId'])
            except UnboundLocalError:
                pass
Code Example #33
    def test_is_categorial(self):
        tree = DecisionTree()

        y = np.asarray([1,1,1,1,0,0,0])
        self.assertTrue(tree._is_categorical(y))

        y = np.asarray([1,1,2,4,1,2,4,4,4,4,4])
        self.assertTrue(tree._is_categorical(y))

        y = np.asarray([1.1,0.8,2.1,4,1,2.5,4,4,4,4.8,4])
        self.assertFalse(tree._is_categorical(y))

        y = np.asarray([100000002131, 12, 12])
        self.assertTrue(tree._is_categorical(y))
Code Example #34
 def build_forest(self, X, y, num_trees, num_samples, num_features):
     '''
     Return a list of num_trees DecisionTrees.
     '''
     forest = []
     for i in range(num_trees):
         sample_indices = np.random.choice(X.shape[0], num_samples,
                                           replace=True)
         sample_X = np.array(X[sample_indices])
         sample_y = np.array(y[sample_indices])
         dt = DecisionTree(self.impurity_criterion)
         dt.fit(sample_X, sample_y)
         forest.append(dt)
     return forest
Code Example #35
 def build_forest(self, X, y, num_trees, num_samples, num_features):
     '''
     Return a list of num_trees DecisionTrees.
     '''
     size = len(y)
     index = range(size)
     trees = []
     for tree in range(num_trees):
         random_sample_index = np.random.choice(index, size, replace=True)
         X_random = X[random_sample_index]
         y_random = y[random_sample_index]
         dt = DecisionTree(num_features)
         dt.fit(X_random, y_random)
         trees.append(dt)
     return trees
Code Example #37
def test_make_split():
    X, y, X1, y1, X2, y2 = fake_data()
    split_index, split_value = 1, 'bat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._make_split(X, y, split_index, split_value)
    try:
        X1_result, y1_result, X2_result, y2_result = result
    except ValueError:
        n.assert_true(False, 'result not in correct form: (X1, y1, X2, y2)')
    actual = (X1, y1, X2, y2)
    message = '_make_split got results\n%r\nShould be\n%r' % (result, actual)
    n.ok_(np.array_equal(X1, X1_result), message)
    n.ok_(np.array_equal(y1, y1_result), message)
    n.ok_(np.array_equal(X2, X2_result), message)
    n.ok_(np.array_equal(y2, y2_result), message)
Code Example #38
def test_tree(filename):
    df = pd.read_csv(filename)
    y = df.pop("Result").values
    X = df.values
    print(X)

    tree = DecisionTree()
    tree.fit(X, y, df.columns)
    print(tree)
    print()

    y_predict = tree.predict(X)
    print("%26s   %10s   %10s" % ("FEATURES", "ACTUAL", "PREDICTED"))
    print("%26s   %10s   %10s" % ("----------", "----------", "----------"))
    for features, true, predicted in zip(X, y, y_predict):
        print("%26s   %10s   %10s" % (str(features), str(true), str(predicted)))
Code Example #39
def test_choose_split_index():
    X, y, X1, y1, X2, y2 = fake_data()
    index, value = 1, 'cat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._choose_split_index(X, y)
    try:
        split_index, split_value, splits = result
    except ValueError:
        message = 'result not in correct form. Should be:\n' \
                  '    split_index, split_value, splits'
        n.assert_true(False, message)
    message = 'choose split for data:\n%r\n%r\n' \
              'split index, split value should be: %r, %r\n' \
              'not: %r, %r' \
              % (X, y, index, value, split_index, split_value)
    n.eq_(split_index, index, message)
    n.eq_(split_value, value, message)
Code Example #40
	def train(self,data,labels):
		"""Trains the random forest using a bunch of decision trees.

		* training_data: n x d numpy matrix of data, where row = sample point, column = feature
		* training_labels: flat nparray of labels, where item i is the label for point i """
		num_points = data.shape[0]
		for i in range(self.num_trees):
			sample_indices = np.random.choice(num_points,size=self.data_bagging_size,replace=True)
			sample_data = data[sample_indices]
			sample_labels = labels[sample_indices]
			tree = DecisionTree(feature_bagging_criteria=self.feature_bagging_criteria,
				impurity_measure=self.impurity_measure,
				min_impurity_decrease=self.min_impurity_decrease,
				min_impurity=self.min_impurity,
				max_percentage_in_class=self.max_percentage_in_class,
				max_height=self.max_height,
				min_points_per_node=self.min_points_per_node,
				feature_name_map=self.feature_name_map)
			tree.train(sample_data,sample_labels)
			self.trees.append(tree)
Code Example #41
    def build_forest(self, X, y, num_trees, num_samples, num_features):
        # Return a list of num_trees DecisionTrees.
        forest = []

        for i in range(num_trees):
            # draw a random selection of row indices; r.sample picks a
            # third of them without replacement
            indices = [j for j in range(num_samples)]
            indices_sample = r.sample(indices, len(indices) // 3)
            X_sample = X[indices_sample]
            y_sample = y[indices_sample]

            # use the sampled indices to select a subset of X and y,
            # build a new tree on that subset, and add it to the forest
            tree = DecisionTree()
            tree.fit(X_sample, y_sample, self.features)
            forest.append(tree)
        return forest
Code Example #42
File: testTree.py  Project: jinyyu/machine-learning
    def test_predict(self):
        data, label = creatDataLabel()
        tree = DecisionTree(maxDeep=5)
        tree.buildTree(data, label)
        X = numpy.array([1,1,1,1])
        self.assertTrue(tree.predict(X) == 1)

        X = numpy.array([1,0,1,2])
        self.assertTrue(tree.predict(X) == 1)

        X = numpy.array([2,0,1,1])
        self.assertTrue(tree.predict(X) == 1)

        X = numpy.array([2,1,0,1])
        self.assertTrue(tree.predict(X) == 1)
Code Example #43
File: main.py  Project: fmars/Differential-privacy
def main():

    print('#############################################')
    print('##   C4.5 based Decision Tree              ##')
    print('## To see usage by main.py --help          ##')
    print('##   Author: mumuhr                        ##')
    print('##   06.01.2015                            ##')
    print('#############################################')

    parser = OptionParser()
    parser.add_option("-i", dest="inc", default='dataset', help="dataset directory")
    parser.add_option("-f", "--file", dest="fileName", default='adult2', help="file name of training dataset")
    parser.add_option("-c", "--class-attr", dest="classAttr", default='class', help="classification attribute")
    parser.add_option("-d", "--depth", dest="depth", default=6, help="max recursion depth of decision trees")
    parser.add_option("-t", "--tree-num", dest="treeNum", default=5, help="num of decision trees")
    parser.add_option("-e", "--epsilon", dest="epsilon", default=1, help="total privacy budget")
    parser.add_option("-v", "--verbose", dest="verbose", default=True, help="open verbose mode")
    parser.add_option("--mode", dest='mode', default='All', help='choose build mode: DecisionTree/RandomForest/ALL')
    (options, args) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    else:
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    logging.info('init Decision Tree building data')
    trainFileName = options.inc + '/' + options.fileName + 'Training.csv'
    testFileName = options.inc + '/' + options.fileName + '.csv'
    
    dataRaw = DataLoader.getData(trainFileName)
    attributes = dataRaw[0]
    attributesType = dataRaw[1]
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for index in range(0, len(attributesType)):
            logging.debug('attr: %s; attrType: %s', attributes[index], attributesType[index])
        logging.debug('class attr: %s', attributes[-1])

    dataRaw.remove(attributes)
    dataRaw.remove(attributesType)
    dataTrain = DataLoader.toFloat(dataRaw, attributesType)

    dataTestRaw = DataLoader.getData(testFileName)
    dataTest = DataLoader.toFloat(dataTestRaw, attributesType)
    classStd = []
    for row in dataTest:
        classStd.append(row[-1])

    target = options.classAttr
    depth = options.depth
    treeNum = options.treeNum
    epsilon = options.epsilon
    logging.debug('target: %s', target)
    logging.debug('depth: %s', str(depth))

    if config.config.MakeTree == 'DecisionTree' or config.config.MakeTree == 'All':

        # Run C4.5
        logging.info('Run C4.5 to generate Decision Tree')
        tree = DecisionTree.makeTree(dataTrain, attributes, attributesType, target, depth, depth, epsilon)
        # Classify testing data
        logging.debug('Classify testing data by generated Decision Tree')
        classResult = DecisionTree.classify(tree, attributes, attributesType, dataTest)
        # Output Classification Accuracy
        acc = ResultParser.classAccDecisionTree(classStd, classResult)
        logging.info('Classification Accuracy: ' + str(acc))

    if config.config.MakeTree == 'RandomForest' or config.config.MakeTree == 'All':

        # Run Random Forest
        logging.info('Run RandomForest to generate Decision Tree')
        trees = RandomForest.randomForest(dataTrain, attributes, attributesType, target, depth, depth, treeNum, epsilon)
        classResults = []
        for tree in trees:
            classResult = DecisionTree.classify(tree, attributes, attributesType, dataTest)
            classResults.append(classResult)
        acc = ResultParser.classAccRandomForest(classStd, classResults)
        logging.info('Classification Accuracy: ' + str(acc))
Code Example #44
File: dt.py  Project: belfazt/DecisionTreeExample
from DecisionTree import DecisionTree

dt = DecisionTree(
  training_datafile = "data/dataset.csv",
  csv_class_column_index = 2,
  csv_columns_for_features = [1,2,3,4,5],
  entropy_threshold = 0.01,
  max_depth_desired = 8,
  symbolic_to_numeric_cardinality_threshold = 10,
)

dt.get_training_data()
dt.calculate_first_order_probabilities()
dt.calculate_class_priors()
dt.show_training_data()
root_node = dt.construct_decision_tree_classifier()
root_node.display_decision_tree("   ")

test_sample  = ['ColorDeCabello = Negro',
                'Altura = Alto',
                'Peso = Alto',
                'Proteccion = No',
                'Quemadura = Si']
classification = dt.classify(root_node, test_sample)
print "Classification: ", classification
Code Example #45
File: temp.py  Project: druid985/decision_tree
    ["s", "China", "no", 18, "Premium"],
    ["t", "China", "no", 17, "None"],
]

my_data2 = [
    ["a", "USA", "yes", "18", "None"],
    ["b", "France", "yes", "23", "Premium"],
    ["c", "USA", "yes", "24", "Basic"],
    ["d", "France", "yes", "23", "Basic"],
]


train_flowers = data.read_filedata("..//data//train_data.txt", "ALL", ",", [0, 1, 2, 3])
test_flowers = data.read_filedata("..//data//test_data.txt", "ALL", ",", [0, 1, 2, 3])

tree = DecisionTree(train_flowers)
treepredict.buildtree(tree)
tree.printTree()

right = 0
wrong = 0
for flower in test_flowers:
    result = treepredict.predic(tree, flower)
    if flower[-1] in result:
        right += 1
    else:
        wrong += 1

print "正确预测:" + str(right) + "个"
Code Example #46
    def test_is_stop_criterion(self):
        tree = DecisionTree()

        self.assertTrue(tree._is_stop_criterion(np.asarray([1])))
        self.assertTrue(tree._is_stop_criterion(np.asarray([1, 1, 1, 1, 1])))
        self.assertFalse(tree._is_stop_criterion(np.asarray([1, 1, 0, 0, 1])))
Code Example #47
    def construct_cascade_of_trees(self):
        self._training_samples[0] = self._all_sample_names
        self._misclassified_samples[0] = self.evaluate_one_stage_of_cascade(self._all_trees[0], self._root_nodes[0])
        if self._stagedebug:
            self.show_class_labels_for_misclassified_samples_in_stage(0)
            print("\nSamples misclassified by base classifier: %s" % str(self._misclassified_samples[0]))
            print("\nNumber of misclassified samples: %d" % len(self._misclassified_samples[0]))
        misclassification_error_rate = sum([self._sample_selection_probs[0][x] for x in self._misclassified_samples[0]])
        if self._stagedebug:
            print("\nMisclassification_error_rate for base classifier: %g" % misclassification_error_rate)
        self._trust_factors[0] = 0.5 * math.log((1-misclassification_error_rate)/misclassification_error_rate)
        if self._stagedebug:
            print("\nBase class trust factor: %s" % str(self._trust_factors[0]))
        for stage_index in range(1,self._how_many_stages):
            if self._stagedebug:
                print("\n\n==========================Constructing stage indexed %d=========================\n" % stage_index)
            self._sample_selection_probs[stage_index] =  \
                {sample : self._sample_selection_probs[stage_index - 1][sample] * math.exp(-1.0 * self._trust_factors[stage_index - 1] * (-1.0 if sample in self._misclassified_samples[stage_index - 1] else 1.0)) for sample in self._all_sample_names} 
            normalizer = sum(self._sample_selection_probs[stage_index].values())
            if self._stagedebug:
                print("\nThe normalizer is: ", normalizer)
            self._sample_selection_probs[stage_index].update((sample,prob/normalizer) for sample,prob in
                                                              self._sample_selection_probs[stage_index].items())
            prob_distribution = sorted(self._sample_selection_probs[stage_index].items(), key=lambda x: x[1], reverse=True)
            if self._stagedebug:
                print("\nProbability distribution: %s" % str([(sample_index(x), "%.3f"%y) for x, y in prob_distribution]))
            training_samples_this_stage = []
            sum_of_probs = 0.0
            for sample in [x[0] for x in prob_distribution]:
                sum_of_probs += self._sample_selection_probs[stage_index][sample]
                if sum_of_probs > 0.5:
                    break
                else:
                    training_samples_this_stage.append(sample)
            self._training_samples[stage_index] = sorted(training_samples_this_stage, key=lambda x: sample_index(x))
            if self._stagedebug:
                print("\nTraining samples this stage: %s" % str(self._training_samples[stage_index]))
                print("\nNumber of training samples this stage %d" % len(self._training_samples[stage_index]))
            training_samples_selection_check = set(self._misclassified_samples[stage_index-1]).intersection(set(self._training_samples[stage_index]))
            if self._stagedebug:            
                print("\nTraining samples in the misclassified set: %s" %
                                  str(sorted(training_samples_selection_check, key=lambda x: sample_index(x))))
                print("\nNumber_of_miscalssified_samples_in_training_set: %d" % len(training_samples_selection_check))
            dt_this_stage = DecisionTree('boostingmode')            
            training_data_this_stage = { x : self._all_training_data[x] for x in self._training_samples[stage_index]}
            dt_this_stage._training_data_dict = training_data_this_stage
            dt_this_stage._class_names = self._all_trees[0]._class_names
            dt_this_stage._feature_names = self._all_trees[0]._feature_names
            dt_this_stage._entropy_threshold = self._all_trees[0]._entropy_threshold
            dt_this_stage._max_depth_desired = self._all_trees[0]._max_depth_desired
            dt_this_stage._symbolic_to_numeric_cardinality_threshold =   \
                                                self._all_trees[0]._symbolic_to_numeric_cardinality_threshold
            dt_this_stage._samples_class_label_dict = \
                     {sample_name : self._all_trees[0]._samples_class_label_dict[sample_name] 
                                                     for sample_name in dt_this_stage._training_data_dict.keys()}
            dt_this_stage._features_and_values_dict = \
                                 {feature : [] for feature in self._all_trees[0]._features_and_values_dict}
            pattern = r'(\S+)\s*=\s*(\S+)'        
            for item in sorted(dt_this_stage._training_data_dict.items(), key = lambda x: sample_index(x[0])):
                for feature_and_value in item[1]:
                    m = re.search(pattern, feature_and_value)
                    feature,value = m.group(1),m.group(2)
                    if value != 'NA':
                        dt_this_stage._features_and_values_dict[feature].append(convert(value))
            dt_this_stage._features_and_unique_values_dict = {feature : 
                                      sorted(list(set(dt_this_stage._features_and_values_dict[feature]))) for 
                                                            feature in dt_this_stage._features_and_values_dict}
            dt_this_stage._numeric_features_valuerange_dict = {feature : [] 
                                         for feature in self._all_trees[0]._numeric_features_valuerange_dict}
            dt_this_stage._numeric_features_valuerange_dict = {feature : 
                                   [min(dt_this_stage._features_and_unique_values_dict[feature]), 
                                       max(dt_this_stage._features_and_unique_values_dict[feature])] 
                                           for feature in self._all_trees[0]._numeric_features_valuerange_dict}
            if self._stagedebug:
                print("\n\nPrinting features and their values in the training set:\n")
                for item in sorted(dt_this_stage._features_and_values_dict.items()):
                    print(item[0]  + "  =>  "  + str(item[1]))
                print("\n\nPrinting unique values for features:\n")
                for item in sorted(dt_this_stage._features_and_unique_values_dict.items()):
                    print(item[0]  + "  =>  "  + str(item[1]))
                print("\n\nPrinting unique value ranges for features:\n")
                for item in sorted(dt_this_stage._numeric_features_valuerange_dict.items()):
                    print(item[0]  + "  =>  "  + str(item[1]))
            dt_this_stage._feature_values_how_many_uniques_dict = {feature : [] 
                                       for  feature in self._all_trees[0]._features_and_unique_values_dict}
            dt_this_stage._feature_values_how_many_uniques_dict = {feature :
                 len(dt_this_stage._features_and_unique_values_dict[feature]) 
                                        for  feature in self._all_trees[0]._features_and_unique_values_dict}
#            if stagedebug: dt_this_stage._debug2 = 1
            dt_this_stage.calculate_first_order_probabilities()
            dt_this_stage.calculate_class_priors()
            if self._stagedebug:
                print("\n\n>>>>>>>Done with the initialization of the tree for this stage<<<<<<<<<<\n")
            root_node_this_stage = dt_this_stage.construct_decision_tree_classifier()
            if self._stagedebug:
                root_node_this_stage.display_decision_tree("     ")
            self._all_trees[stage_index] = dt_this_stage
            self._root_nodes[stage_index] = root_node_this_stage
            self._misclassified_samples[stage_index] = \
              self.evaluate_one_stage_of_cascade(self._all_trees[stage_index], self._root_nodes[stage_index])
            if self._stagedebug:           
                print("\nSamples misclassified by this stage classifier: %s" %
                                                           str(self._misclassified_samples[stage_index]))
                print("\nNumber of misclassified samples: %d" % len(self._misclassified_samples[stage_index]))
                self.show_class_labels_for_misclassified_samples_in_stage(stage_index)
            misclassification_error_rate = sum( [self._sample_selection_probs[stage_index][x]
                                                              for x in self._misclassified_samples[stage_index]] )
            if self._stagedebug:           
                print("\nMisclassification_error_rate: %g" % misclassification_error_rate)
            self._trust_factors[stage_index] = \
                                   0.5 * math.log((1-misclassification_error_rate)/misclassification_error_rate)
            if self._stagedebug:
                print("\nThis stage trust factor: %g" % self._trust_factors[stage_index])
Code Example #48
File: p5.py  Project: derrickmar/CS189_hw5
import pdb  # pdb.set_trace()
import scipy.io  # used by loadmat below; missing from the original imports
import matplotlib.pyplot as plt
from DecisionTree import DecisionTree
import csv
import sklearn
import sklearn.utils

# ['Xvalidate', '__globals__', '__header__', 'Ytrain', 'Xtrain', '__version__', 'Yvalidate']]
data = scipy.io.loadmat("../spam-dataset/spam_data.mat")
t_data = sklearn.utils.shuffle(data["training_data"], random_state=0) # (5172, 32)
t_labels = sklearn.utils.shuffle(data["training_labels"].ravel(), random_state=0)  # (1, 5172)
training_data = t_data[0:4137]
training_labels = t_labels[0:4137]
validation_data = t_data[4137:5172]
validation_labels = t_labels[4137:5172]
classifier = DecisionTree()
classifier.train(training_data, training_labels)

error_rate = classifier.test(validation_data, validation_labels)
print(error_rate)

# TESTING CODE

# predictions = classifier.predict(test_data)
# test_data = data["test_data"] # (5857, 32) last one was 0.46755
Code Example #49
File: main.py  Project: peixinchen/DecisionTree
# coding: UTF-8
'''
Created on 2013-3-21

@author: peixinchen
'''
from DecisionTree import DecisionTree

data = DecisionTree(training_datafile = "decision.dat")
data.get_training_data()

rootNode = data.construct_decision_tree_classifier()
test_case = [
    "outlook=>sunny",
    "temperature=>hot",
    "humidity=>high",
    "wind=>strong",
]
classification = data.classify(rootNode, test_case)

print(classification)

if __name__ == '__main__':
    pass
Code Example #50
File: testTree.py  Project: jinyyu/machine-learning
 def test_build(self):
     return  # the source disables this test; remove this line to run it
     tree = DecisionTree(0, maxDeep=5)
     data, label = creatDataLabel()
     tree.buildTree(data, label)
     self.assertTrue(tree._maxLabel() == 1)
Code Example #51
 def __init__(self, trainingData='data/training.dat'):
     self.dt = DecisionTree(training_datafile=trainingData, debug1=0, debug2=0)
     self.dt.get_training_data()
     self.rootNode = self.dt.construct_decision_tree_classifier()
Code Example #52
    def predict(self, X):
        '''
        Return a numpy array of the labels predicted for the given test data.
        '''
        answers = np.array([tree.predict(X) for tree in self.forest]).T
        return np.array([Counter(row).most_common(1)[0][0] for row in answers])

    def score(self, X, y):
        '''
        Return the accuracy of the Random Forest for the given test data and
        labels.
        '''
        return sum(self.predict(X) == y) / float(len(y))

if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    import pandas as pd

    df = pd.read_csv('data/congressional_voting.csv', names=['Party'] + list(range(1, 17)))
    y = df.pop('Party').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    rf = RandomForest(num_trees=10, num_features=5)
    rf.fit(X_train, y_train)
    print("Random Forest score:", rf.score(X_test, y_test))

    dt = DecisionTree()
    dt.fit(X_train, y_train)
    print("Decision Tree score:", dt.score(X_test, y_test))
Code Example #53
File: part2.py  Project: rick-wolf/ID3
def main(argv):

	# Handle User input
	trainFile = ''
	testFile  = ''
	m = 4

	if len(sys.argv) == 4:
		trainFile = sys.argv[1]
		testFile  = sys.argv[2]
		outname = sys.argv[3]
	else:
		sys.exit("Bad input: Please provide a test file, train file, and outfile name")


	# Ingest the datasets
	trainset = readFile(trainFile)
	testset = readFile(testFile)

	# test decision tree constructor
	#a = DecisionTree(trainset, m)

	
	# prep a file for graphing data
	f = open(outname, 'w+')
	f.write('samplePercentage,Accuracy,Min,Max\n')


	# train using various sample sizes
	samplePercs = [0.05, 0.1, 0.2, 0.5]
	for samplePerc in samplePercs:
		# get the number of instances I'll be using
		sampleSize = int(len(trainset.instances)*samplePerc)
		
		# populate the samples
		samples = []
		for i in range(10):
			samples.append(random.sample(trainset.instances, sampleSize))

		accuracies = []
		for sample in samples:
			# train using this sample
			tmpTrain = copy.deepcopy(trainset)
			tmpTrain.overrideInstances(sample)
			tmpTree = DecisionTree(tmpTrain, m)

			scores = []
			for instance in testset.instances:
				scores.append(1 if tmpTree.classify(instance, tmpTree.root) == instance[-1] else 0)
			accuracies.append(float(sum(scores))/len(scores))

		# write the data to a file
		avg = str((float(sum(accuracies))/len(accuracies))*100)
		mi  = str((min(accuracies))*100)
		ma  = str((max(accuracies))*100)
		f.write(str(samplePerc*100) + ',' + avg + ',' + mi + ',' + ma + '\n')

	# do one more classification accuracy using the whole training set
	scores = []
	a = DecisionTree(trainset, m)
	for instance in testset.instances:
		scores.append(1 if a.classify(instance, a.root) == instance[-1] else 0)
	avg = str((float(sum(scores))/len(scores))*100)
	f.write('100,' + avg + ',' + avg + ',' + avg + '\n')
Code Example #54
File: jueceshu.py  Project: Choes/codesnip
# -*- coding: utf-8 -*_
from DecisionTree import DecisionTree

dt = DecisionTree(training_datafile ="./jueceshu.data")
dt.get_training_data()
dt.show_training_data()
root_node = dt.construct_decision_tree_classifier()
root_node.display_decision_tree("   ")
test_sample = ['exercising=>never', 'smoking=>heavy', 'fatIntake=>heavy', 'videoAddiction=>heavy']
classification = dt.classify(root_node, test_sample)
print(classification)
Code Example #55
 def __init__(self, impurity_criterion, num_features=None, prune=False):
     # note: the parent is always initialized with 'entropy', so the
     # impurity_criterion argument is effectively ignored here
     DecisionTree.__init__(self, impurity_criterion='entropy')
     self.k = num_features
     self.pruning = prune