Example #1
def make_forest(data, n_bootstraps, scoref, min_gain=0.01):
    """Grow a random forest from the given training data."""
    trees = []
    for _ in range(n_bootstraps):
        # each tree is trained on a bootstrap resample of the full dataset
        data_boot = make_boot(data, data.shape[0])
        trees.append(dt.buildtree(data_boot, scoref, min_gain))

    return Forest(trees)
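The helpers `make_boot`, `dt.buildtree`, and `Forest` belong to the surrounding repository and are not shown here. As a self-contained sketch of the same bootstrap-aggregation idea, here is an illustration using NumPy and scikit-learn (assuming `X` and `y` are NumPy arrays with non-negative integer class labels; this is not the repository's own implementation):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def make_forest_sketch(X, y, n_bootstraps=25, random_state=0):
    # grow one tree per bootstrap resample of the training rows
    rng = np.random.default_rng(random_state)
    trees = []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, len(X), size=len(X))  # sample row indices with replacement
        trees.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return trees

def forest_predict(trees, X):
    # majority vote across the individual trees
    votes = np.stack([t.predict(X) for t in trees]).astype(int)
    return np.array([np.bincount(votes[:, j]).argmax() for j in range(votes.shape[1])])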
    start = stepSize * i
    training = dataset[start:start + stepSize]
    trainingLabels = labelInt[start:start + stepSize]

    ############# Feature Extraction ##############

    my_model = PCA(n_components=pca_comps, svd_solver='full')
    newSet = my_model.fit_transform(training).tolist()
    newTestSet = my_model.transform(test).tolist()
    newTrainSet = my_model.transform(training).tolist()

    ############# Model Building ##############
    for k in range(len(newSet)):
        newSet[k].append(trainingLabels[k])
    passingData = newSet[:]
    models.append(dt.buildtree(passingData))
    #    dt.prune(b,0.1)

    ############# Classification of Test Records ##############

    for j in range(len(newTestSet)):
        if j not in test_classify:
            test_classify[j] = []
        test_classify[j].append(dt.classify(newTestSet[j], models[i]))

    ############# Accuracy Calculations ##############

d = []
f = []
flat = []
for l in test_classify.values():
tot_count = 0
tot_correct = 0

train_data = parse_file(train_file_path)
test_data = parse_file(test_file_path)

# Calculating the accuracy at every level
correct = 0
total = 0
TP = 0
TN = 0
FP = 0
FN = 0
depth = 0
for i in range(1, 7):
    tree = dtree.buildtree(train_data, 0, i)
    for data in test_data:
        predicted = list(dtree.decision(tree, data).keys())[0]
        actual = data[-1]
        total += 1
        if predicted == 1.0 and actual == 1.0:
            correct += 1
            TP += 1
        elif predicted == 0.0 and actual == 0.0:
            correct += 1
            TN += 1
        elif predicted == 1.0 and actual == 0.0:
            FP += 1
        elif predicted == 0.0 and actual == 1.0:
            FN += 1
    tot_correct += correct
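From the four counts accumulated above, the usual derived metrics follow directly. A small helper using the standard formulas (not part of the original script):

def summarize(TP, TN, FP, FN):
    # accuracy, precision, recall and F1 from a binary confusion matrix
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, precision, recall, f1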
def build_BNN(data,
              output_condition,
              cd=98,
              mss=1,
              md=10,
              relevant_neuron_dictionary={},
              with_data=0,
              discretization=0,
              cluster_means=None):
    '''
    Starting from the target condition and working back to conditions over the
    first hidden layer, extract a DNF that explains each condition in terms of
    conditions on the next shallower layer.

    param data: instance of DataSet
    param output_condition: condition of interest
    param cd: class dominance
    param mss: minimum dataset size
    param md: maximum tree depth
    param with_data: avoid == 0; if == 1, the regular simplification
        operations are performed; if == 2, post-pruning is performed
    param discretization: method used to determine the thresholds that split
        the activation range of each neuron
    '''
    BNN = {}
    deep_layer = data.output_layer
    target_class = [output_condition]
    while deep_layer > 0:
        target_split_values = set((l, n, t) for (l, n, t, u) in target_class)
        if not target_split_values:
            warnings.warn(
                'Warning: no split points, returning current dictionary at layer: '
                + str(deep_layer))
        print('Target split values', target_split_values)
        used_shallow_conditions = set([])
        current_data = temp_data(data, deep_layer - 1, target_class)
        if discretization == 0:
            split_points = dis.all_features_trivial_mid_points(current_data)
        elif discretization == 1 or discretization == 3:
            split_points = dis.one_time_discretization(
                current_data,
                discretization,
                rnd=relevant_neuron_dictionary,
                tsv=list(target_split_values))
        elif discretization == 2 or discretization == 4:
            split_points = cluster_means[deep_layer - 1]
        elif discretization == 6:
            colum = [[d[c] for d in current_data]
                     for c in range(len(current_data[0]) - 1)]
            split_points = [[sum(vq.kmeans(v, 2)[0]) / 2] for v in colum]
        elif discretization == 5:
            if deep_layer == 1:
                split_points = [[0.5] for l in range(len(current_data[0]) - 1)]
            else:
                split_points = [[0] for l in range(len(current_data[0]) - 1)]
        print('Split points', [len(l) for l in split_points])
        print(split_points)

        print('')
        for i in target_split_values:
            print('')
            print('i: ', i)
            t = time.time()
            i_data = temp_data(data, deep_layer - 1, i)
            tree = None
            if relevant_neuron_dictionary and discretization == 0:
                pruned_split_points = [
                    _sp(j, i, split_points, relevant_neuron_dictionary)
                    for j in range(len(split_points))
                ]
                print(pruned_split_points)
                tree = dt.buildtree(i_data,
                                    pruned_split_points,
                                    class_dominance=cd,
                                    min_set_size=mss,
                                    max_depth=md,
                                    root=True)
            else:
                tree = dt.buildtree(i_data,
                                    split_points,
                                    class_dominance=cd,
                                    min_set_size=mss,
                                    max_depth=md,
                                    root=True)
            if not tree:
                cero_class = sum(1 for x in i_data if x[-1] == 0)
                one_class = sum(1 for x in i_data if x[-1] == 1)
                if cero_class > one_class:
                    BNN[(i[0], i[1], i[2], True)] = False
                    BNN[(i[0], i[1], i[2], False)] = True
                else:
                    BNN[(i[0], i[1], i[2], False)] = True
                    BNN[(i[0], i[1], i[2], True)] = False
                break
            print('Tree is formed')
            print('Time: ', time.time() - t)
            dnfs = dt.get_dnfs(deep_layer - 1, tree)
            if (i[0], i[1], i[2], False) in target_class:
                print('False case')
                pruned = None
                if isinstance(dnfs[0], list):
                    # print('Fidelity pre-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    # print('Precision pre-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    # print('Recall pre-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    data.update_dictionary([(l, n, t) for conj in dnfs[0]
                                            for (l, n, t, u) in conj])
                    if with_data == 0:
                        pruned = s.boolean_simplify_basic(dnfs[0])
                    elif with_data >= 1:
                        pruned = s.boolean_simplify_complex(dnfs[0])
                    if with_data == 2:
                        pruned = p.post_prune(pruned,
                                              (i[0], i[1], i[2], False),
                                              data.example_cond_dict,
                                              data.dict_indexes,
                                              data=None)
                    used_shallow_conditions.update(
                        set(c for conj in pruned for c in conj))
                else:
                    pruned = dnfs[0]
                # print('Fidelity post-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                # print('Precision post-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                # print('Recall post-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                BNN[(i[0], i[1], i[2], False)] = pruned
                print((i[0], i[1], i[2], False), pruned)
            if (i[0], i[1], i[2], True) in target_class:
                print('True case')
                pruned = None
                if isinstance(dnfs[1], list):
                    # print('Fidelity pre-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    # print('Precision pre-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    # print('Recall pre-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    data.update_dictionary([(l, n, t) for conj in dnfs[1]
                                            for (l, n, t, u) in conj])
                    if with_data == 0:
                        pruned = s.boolean_simplify_basic(dnfs[1])
                    elif with_data >= 1:
                        pruned = s.boolean_simplify_complex(dnfs[1])
                    if with_data == 2:
                        pruned = p.post_prune(pruned, (i[0], i[1], i[2], True),
                                              data.example_cond_dict,
                                              data.dict_indexes,
                                              data=None)
                    used_shallow_conditions.update(
                        set(c for conj in pruned for c in conj))
                else:
                    pruned = dnfs[1]
                # print('Fidelity post-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                # print('Precision post-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                # print('Recall post-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                BNN[(i[0], i[1], i[2], True)] = pruned
                print((i[0], i[1], i[2], True), pruned)
        deep_layer -= 1
        target_class = list(used_shallow_conditions)
    return BNN
Example #5
    trainingLabels = [labelInt[i] for i in training_idx]
    testLabels = [labelInt[i] for i in test_idx]

    ############# Feature Extraction ##############

    my_model = PCA(n_components=pca_comps, svd_solver='full')
    newSet = my_model.fit_transform(training).tolist()
    newTestSet = my_model.transform(test).tolist()
    newTrainSet = my_model.transform(training).tolist()

    ############# Model Building ##############
    for i in range(len(newSet)):
        newSet[i].append(trainingLabels[i])
    passingData = newSet[:]
    b = dt.buildtree(passingData)
    dt.prune(b, 0.1)

    ############# Classification of Train Records ##############
    count = 0
    for i in range(len(newTrainSet)):
        a = dt.classify(newTrainSet[i], b)
        for key in a.keys():
            if (key == trainingLabels[i]):
                count = count + 1

    ############# Accuracy Calculations for Training DataSet ##############
    accuracy = (count / len(newTrainSet)) * 100
    final_train_acc += accuracy
    print('Train accuracy:', accuracy)
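The extract-then-fit pattern used here (PCA for feature extraction followed by a decision tree) can also be written compactly with scikit-learn's `Pipeline`; a self-contained sketch for comparison, not the code of this example:

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

def pca_tree_accuracy(X_train, y_train, X_eval, y_eval, pca_comps=30):
    # fit PCA on the training rows only, then train a tree on the projected data
    model = make_pipeline(PCA(n_components=pca_comps, svd_solver='full'),
                          DecisionTreeClassifier())
    model.fit(X_train, y_train)
    return 100.0 * model.score(X_eval, y_eval)   # accuracy in percent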
Example #6
# remove index column
features_train = features_train[:,1:]

labels_train = np.genfromtxt('../census-dataset/census-train-labels.csv', delimiter=' ', skip_header=1)
# remove index column
labels_train = labels_train[:,1:][:,0]

# split to obtain train and test set
x_train, x_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33)

# concatenate features and labels
data_train = np.column_stack((x_train, y_train))
data_test = np.column_stack((x_test, y_test))

# build decision tree using entropy
decision_tree = dt.buildtree(data_train, dt.entropy, 0.01)

min_gain_error = {}
# test minimal gain values for pruning
for min_gain_value in np.arange(0, 1, 0.01):
    # deep-copy so pruning one candidate does not mutate the shared subtrees
    dt_temp = copy.deepcopy(decision_tree)
    dt.prune(dt_temp, min_gain_value)
    # classify test data
    y_hat = [dt.convertToLabel(dt.classify(obs, dt_temp)) for obs in x_test]
    y_hat = np.array(y_hat)
    error = (y_hat != y_test).sum() / float(y_test.shape[0])
    min_gain_error[min_gain_value] = error

# prune tree with optimal min_gain value
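One plausible continuation for this final step, selecting the gain value with the lowest error from `min_gain_error` and pruning a fresh copy (an illustrative sketch in the spirit of the comment above, not the original code):

best_gain = min(min_gain_error, key=min_gain_error.get)   # gain with lowest test error
pruned_tree = copy.deepcopy(decision_tree)
dt.prune(pruned_tree, best_gain)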
Example #7
        label, pixels = dataset[test_idx[i]]
        record = (pixels.flatten()).tolist()
        testing_labels.append(label)
        rows_test_total.append(record)

    ############# Feature Extraction ##############
    FinalTrain = []
    my_model = PCA(n_components=pca_comps, svd_solver='full')
    newSet = my_model.fit_transform(rows_total).tolist()
    newtestSet = my_model.transform(rows_test_total).tolist()

    ############# Model Building ##############

    for i in range(len(rows_total)):
        newSet[i].append(training_labels[i])
    b = dt.buildtree(newSet)
    dt.prune(b, 0.1)

    ############# Classification of Test Records ##############
    number = 0
    accuracy = 0
    for i in range(testSize):
        a = dt.classify(newtestSet[i], b)
        for key in a.keys():
            if (key == testing_labels[i]):
                number = number + 1

    ############# Accuracy Calculations ##############

    accuracy = (number / testSize) * 100
    final_test_acc += accuracy
    def buildtree(depth, test_data, train_data, current_index):

        tot_count = 0
        tot_correct = 0

        # Calculating the accuracy at every level
        correct = 0
        total = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        #print ("Depth Entered is :" ,depth)

        predicted_list = []
        predicted_list_1 = []
        for i in range(depth):

            tree = dtree.buildtree(train_data, 0, i)
            for data in train_data:
                predicted = list(dtree.decision(tree, data).keys())[-1]
                predicted_list.append(predicted)

            for data in test_data:
                predicted = list(dtree.decision(tree, data).keys())[0]
                predicted_list_1.append(predicted)
                one_count_testdata = predicted_list_1.count(1)
                zero_count_testdata = predicted_list_1.count(0)
                actual = data[-1]
                total += 1
                if predicted == 1.0 and actual == 1.0:
                    correct += 1
                    TP += 1
                elif predicted == 0.0 and actual == 0.0:
                    correct += 1
                    TN += 1
                elif predicted == 1.0 and actual == 0.0:
                    FP += 1
                elif predicted == 0.0 and actual == 1.0:
                    FN += 1

            tot_correct += correct
            tot_count += total
            Accuracy = round(100 * correct / total, 2)
            Depth_list.append(depth)
            Accuracy_list.append(Accuracy)
            depth = depth + 1
            #print (Accuracy_list)
            #print (Depth_list)
            #printing the confusion matrix
        print("Accuracy::", str(Accuracy) + '%')
        print("False Negatives ", str(FN))
        print("False positives ", str(FP))
        print("True Negatives ", str(TN))
        print("True Positives ", str(TP))
        print("Confusion Matrix for bagging")
        print("------")
        print("| ", TP, "|", FN, "|")
        print("------")
        print("| ", FP, "|", TN, "|")
        print("------")
Example #9
    def setUp(self):
        my_data = np.genfromtxt('decision_tree_example.txt', dtype=None)
        self.rows = my_data.tolist()
        self.tree = decision_tree.buildtree(self.rows)
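A natural companion check in the same TestCase, assuming only that `buildtree` returns a non-None root for non-empty training rows (the exact node attributes are repository-specific):

    def test_buildtree_returns_root(self):
        # the tree built in setUp should exist when the example file has rows
        self.assertTrue(len(self.rows) > 0)
        self.assertIsNotNone(self.tree)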