Example #1
def Q4():  # decision trees

    syn_data = get_syn_data()
    X_test, X_train, X_val, y_test, y_train, y_val = syn_data[0], syn_data[1], \
                                                     syn_data[2], syn_data[3], \
                                                     syn_data[4], syn_data[5]

    D = [3, 6, 8, 10, 12]
    training_error, validation_error = [], []
    learned_classifiers = [None] * len(D)

    for d in D:
        dt = decision_tree.DecisionTree(d)
        dt.train(X_train, y_train)

        learned_classifiers[D.index(d)] = dt

        training_error.append(dt.error(X_train, y_train))
        validation_error.append(dt.error(X_val, y_val))

    plot_decisions(D, learned_classifiers, X_train, y_train,
                   "CART DT on SynData")
    plt.plot(D, training_error, label='training error', color='magenta')
    plt.plot(D,
             validation_error,
             label='validation error',
             color='deepskyblue')
    plt.title('CART DT error on SynData as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('Max Depth')
    plt.ylabel('Error')
    plt.show()
Example #2
    def fit(self, X, y, max_depth=15):
        '''
        Fit the data to all trees
        '''
        # save labels
        self.labels = np.unique(y)
        # determine n_samples
        if (self.n_samples == 'all' or self.n_samples > len(X)):
            self.n_samples = len(X)
        # determine n_features
        if (self.n_features == 'auto'):
            self.n_features = int(math.sqrt(X.shape[1]))
        elif (self.n_features == 'all' or self.n_features > X.shape[1]):
            self.n_features = X.shape[1]
        # QA
        if (self.n_samples <= 0 or self.n_features <= 0 or self.n_trees < 2):
            raise ValueError('There is an error in your input values')

        # generate n trees and fit them
        self.trees = []
        for i in range(self.n_trees):
            # generate a bootstrap sub-sample (with replacement) for the tree
            mask = np.random.choice(np.arange(len(X)),
                                    self.n_samples,
                                    replace=True)
            # fit tree
            tree = decision_tree.DecisionTree(max_depth,
                                              max_features=self.n_features)
            tree.fit(X[mask], y[mask])
            # add to ensemble
            self.trees.append(tree)
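
The snippet above only shows fit; its predict counterpart is not included. A minimal majority-vote sketch, assuming each fitted tree in self.trees returns a 1-D array of integer class labels from predict:

    def predict(self, X):
        import numpy as np
        # collect one row of predictions per tree: shape (n_trees, n_samples)
        votes = np.array([tree.predict(X) for tree in self.trees])
        # majority vote per column; assumes integer-encoded class labels
        return np.apply_along_axis(lambda col: np.bincount(col).argmax(),
                                   axis=0, arr=votes)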
Example #3
def check_decision_tree():
    try:
        with open('decision_tree.json', 'r') as f:
            results = json.load(f)
        if results['test_accu'] >= 0.8 and results['train_accu'] >= 0.8:
            score_results = 0.5
        else:
            score_results = 0
    except Exception:
        return 0

    test_features = [[0, 0], [0, 0], [0, 1], [0, 0], [1, 0], [1, 0], [1, 1],
                     [1, 1], [1, 1]]
    test_labels = [0, 0, 0, 1, 1, 1, 0, 0, 1]

    try:
        import decision_tree
        test_tree = decision_tree.DecisionTree()
        test_tree.train(test_features, test_labels)
        predictions = test_tree.predict(test_features)
        if predictions == [0, 0, 0, 0, 1, 1, 0, 0, 0]:
            score_tree = 0.5
        else:
            score_tree = 0
    except Exception:
        return 0

    return round(score_results + score_tree, 1)
Example #4
def run_k_folds_custom_dt(corpus, ys, k):
	x_folds, y_folds = get_folds(corpus, ys, k)
	classifier = decision_tree.DecisionTree()
	overall_accuracy = 0
	for i in xrange(0, k):
		train_xs, test_xs, train_ys, test_ys = get_train_and_test(x_folds, y_folds, i, k)
		train_xs, svd, transform = generate_ngrams(train_xs, 1, 2, 50000, True)
		matrix = transform.transform(test_xs)
		matrix = svd.transform(matrix)
		z = 0
		
		classifier.fit(train_xs[0:10000,:], np.array(train_ys[0:10000]))
		num_correct = 0
		predict = np.zeros(matrix.shape[0])
		for entry in matrix:
			predict[z] = classifier.predict(entry)
			if  predict[z] == test_ys[z]:
				num_correct += 1
			z += 1
			
		
		cm = confusion_matrix(test_ys, predict)
		plt.matshow(cm)
		plt.colorbar()
		plt.ylabel('True label')
		plt.xlabel('Predicted label')
		plt.show()
			
		current_accuracy = float(num_correct)/len(test_ys)
		print i, ": ", current_accuracy
		overall_accuracy += current_accuracy
	overall_accuracy /= float(k)
	print "Overall: %f" % (rep, overall_accuracy)
Example #5
def test_same_class_tree_default_params():
    t = tree.DecisionTree()
    X_train, _, y_train, _ = _trivial_split()
    t.fit(X_train, y_train)
    raw_decision_tree = t.root

    assert raw_decision_tree.is_leaf
    assert raw_decision_tree.prediction == 1
Example #6
def Q4():  # decision trees
    val_error = []
    train_error = []
    sample = [3, 6, 8, 10, 12]
    for samp in sample:
        dt = dta.DecisionTree(samp)
        dt.train(X_train, y_train)
        train_error.append(dt.error(X_train, y_train))
        val_error.append(dt.error(X_val, y_val))
    plot(sample, train_error)
    plot(sample, val_error)
    xlabel("samp")
    ylabel("error rate")
    legend(["train error", "validation error"], loc=5)
    show()
    # figure(1)
    # ion()
    for index, samp in enumerate(sample):
        dt = dta.DecisionTree(samp)
        dt.train(X_train, y_train)
        subplot(2, 3, index + 1)
        decision_boundaries(dt, X_train, y_train, "max depth = " + str(samp))
    pause(8)
    best_d = sample[val_error.index(np.min(val_error))]
    print(best_d)
    dt = dta.DecisionTree(best_d)
    dt.train(X_train, y_train)
    print(dt.error(X_test, y_test))
    # Bagging:
    val_error = []
    for B in range(5, 105, 5):
        print("B: " + str(B))
        bag = bagging.Bagging(dta.DecisionTree, B, best_d)
        bag.train(X_train, y_train)
        val_error.append(bag.error(X_val, y_val))

    plot(range(5, 105, 5), val_error)
    xlabel("B")
    ylabel("validation error rate")
    show()
    best_b = list(range(5, 105, 5))[val_error.index(np.min(val_error))]
    print("best b: ", best_b)
    bag = bagging.Bagging(dta.DecisionTree, best_b, best_d)
    bag.train(X_train, y_train)
    print(bag.error(X_test, y_test))
Example #7
 def predict(self, data, roots):
     predictions = [0, 0]
     test = decision_tree.DecisionTree(self.maxDepth)
     for x in roots:
         predictions[test.predict(data, x)] += 1
     if predictions[0] > predictions[1]:
         return 0
     else:
         return 1
Example #8
 def train(self, data, labels):
     decision_trees = []
     test = decision_tree.DecisionTree(self.maxDepth)
     segments = len(data) // 2
     m = np.sqrt(len(data[0]))
     for x in range(self.trees):
         data, labels = shuffle(data, labels)
         decision_trees.append(
             test.train(data[:segments], labels[:segments], m=int(m)))
     return decision_trees
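
Examples #7 and #8 read like two methods of the same ensemble wrapper: train returns the list of per-tree roots that predict then majority-votes over, so the presumable call pattern is roots = forest.train(data, labels) followed by forest.predict(sample, roots) (names hypothetical).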
Example #9
def main():
    x, y = get_data()
    model = dt.DecisionTree([0, 1, 2, 3, 4, 5])
    model.fit(x, y)
    count = 0
    _sum = 0
    x, y = get_test()
    for i, e in enumerate(x):
        p = model.predict(e)
        # print(p, y[i])
        if p == y[i]:
            count += 1
        _sum += 1
    print(count / _sum)
Example #10
 def setUp(self):
     """
     Loads dataset config.
     """
     self.criterion = criteria.GiniGain
     self.config = dataset.load_config(
         os.path.join('.', 'data', 'train_dataset2'))
     self.data = dataset.Dataset(self.config["filepath"],
                                 self.config["key attrib index"],
                                 self.config["class attrib index"],
                                 self.config["split char"],
                                 self.config["missing value string"],
                                 load_numeric=True)
     self.decision_tree = decision_tree.DecisionTree(self.criterion)
Example #11
def dt():
    start_time = time.time()
    data_frame, data_discrete_info, data_continuous_info = preprocess.read_data(train_filename, discrete_keys,
                                                                                continuous_keys)
    test_frame, _, __ = preprocess.read_data(test_filename, discrete_keys, continuous_keys)
    # attributes = discrete_keys + continuous_keys
    tree = decision_tree.DecisionTree(data_frame, discrete_keys + continuous_keys, data_discrete_info,
                                      data_continuous_info, 'y')
    tree.build()
    # tree.show_tree()
    error_rate = tree.inference(test_frame)
    end_time = time.time()
    print("Time cost:", end_time - start_time)
    return error_rate
Example #12
    def fit(self, data, targets):
        """ fits the data to n decision trees

        Keyword Arguments
        data - the arrays which describe the pacman scene
        target - the move associated with each array
        """
        self.trees = []

        # Create n decision trees from random sample 
        # of train data
        for _ in range(self.dec_num):
            train, target = self.generate_train(data, targets)
            dt = decision_tree.DecisionTree(train, target)
            self.trees.append(dt)
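
generate_train is not part of the snippet; given the docstring it is presumably a paired bootstrap sampler. A sketch under that assumption (the name and call shape come from the code above, the body is hypothetical):

    def generate_train(self, data, targets):
        import random
        # draw len(data) indices with replacement and return the matching
        # data/target pairs -- a standard bagging subsample
        idx = [random.randrange(len(data)) for _ in range(len(data))]
        return [data[i] for i in idx], [targets[i] for i in idx]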
Example #13
def test_predict_proba_on_digits_dataset():
    digits_dataset = load_digits()
    RANDOM_STATE = 17
    X_train, X_test, y_train, y_test = train_test_split(
        digits_dataset['data'],
        digits_dataset['target'],
        test_size=0.2,
        random_state=RANDOM_STATE)

    t = tree.DecisionTree(criterion='gini', max_depth=3)
    t.fit(X_train, y_train)

    proba = t.predict_proba(X_train[0:1])

    np.testing.assert_almost_equal(proba[0].sum(), 1.0, 3)
Example #14
    def __init__(self, input_data, number_trees):
        """
        Creates a new random forest as a list of decision trees.
        Number of trees must be an odd positive integer.

        :param input_data: pandas data frame
        :param number_trees: int
        """
        if number_trees % 2 == 0 or number_trees <= 0:
            raise ValueError("Number of trees must be an odd positive integer")
        self.trees = []
        for i in range(0, number_trees):
            bootstrapped_data = input_data.sample(max(input_data.count()),
                                                  replace=True)
            self.trees.append(
                dt.DecisionTree(bootstrapped_data, random_subset=True))
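
The odd-tree constraint exists so that a two-class majority vote can never tie. A sketch of the corresponding vote, assuming each tree exposes a classify(row) method returning a label (hypothetical name):

    def classify(self, row):
        from collections import Counter
        # one vote per tree; with an odd number of trees and two classes,
        # most_common(1) always yields a strict majority
        votes = Counter(tree.classify(row) for tree in self.trees)
        return votes.most_common(1)[0][0]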
Example #15
def main():
    # ******************************** Part I using k-fold cross validation on the data set  **************************
    # ******************************** Read data from files ***********************************************************
    # Get the required data from the file.
    data_set, features, num_of_features = read_data.read_from_train_file('dataset.txt')
    # ******************************** K cross validation *************************************************************
    # *****************************************************************************************************************
    # *****************************************************************************************************************
    # K cross validation of the data.
    # The data is shuffled and split into k chunks.
    # One chunk is set to be the test set and the rest are mixed to be the training set.
    train, test = k_cross_validation.data_cross_validation(5, data_set)
    # Initialize the features.
    # Cross validation - send the training set.
    utility.create_feature_dictionaries(features, train)
    # ******************************** Decision Tree ******************************************************************
    # Create the model
    tree_model = decision_tree.DecisionTree(num_of_features, utility.all_feature_types)
    # Create the root.
    # Cross validation - send the training set.
    tree_root = tree_model.create_tree_root(train, list(utility.all_feature_types.keys()),
                                            tree_model.majority_classification(train), 0)
    # Create the tree.
    tree = decision_tree.Tree(tree_root)
    # Run the algorithm on the data set.
    # Cross validation - send testing set.
    tree_results = tree_model.classify(test, tree)
    # Create the tree string.
    tree_string = tree.create_tree_string(tree_root)
    # Write it to a file.
    with open("tree.txt", 'w') as f:
        f.write(tree_string)
    # ******************************** KNN ****************************************************************************
    # Create the model.
    knn_model = k_nearest_neightbors.KNearestNeighbors(5, num_of_features)
    # Run the algorithm and get the results.
    # Cross validation - send training and test set.
    knn_results = knn_model.classify(train, test)
    # ******************************** Naive Bayes  *******************************************************************
    # Create the model.
    bayes_model = naive_bayes.NaiveBayes(num_of_features)
    # Run the algorithm and get the results.
    # Cross validation - send training and test set.
    bayes_results = bayes_model.classify(train, test)
    # ******************************** Accuracy  **********************************************************************
    # Call the accuracy function, send the test set and algorithm results to compare and write results to a file.
    accuracy.accuracy(test, tree_results, knn_results, bayes_results)
Example #16
def main():
    # ******************************** Part II Hardcoded train.txt and test.txt files *********************************
    # *****************************************************************************************************************
    # *****************************************************************************************************************
    # ******************************** Read data from files ***********************************************************
    # Get the required data from the file.
    train, features, num_of_features = read_data.read_from_train_file(
        'train.txt')
    test = read_data.read_from_test_file('test.txt')
    # Initialize the features.
    utility.create_feature_dictionaries(features, train)
    # ******************************** Decision Tree ******************************************************************
    # Create the model
    tree_model = decision_tree.DecisionTree(num_of_features,
                                            utility.all_feature_types)
    # Create the root.
    tree_root = tree_model.create_tree_root(
        train, list(utility.all_feature_types.keys()),
        tree_model.majority_classification(train), 0)
    # Create the tree.
    tree = decision_tree.Tree(tree_root)
    # Run the algorithm on the data set.
    tree_results = tree_model.classify(test, tree)
    # Create the tree string.
    tree_string = tree.create_tree_string(tree_root)
    # Write it to the output.txt file and add a newline.
    with open("output.txt", 'w') as f:
        f.write(tree_string)
        # Separate by newline for accuracy results later.
        f.write('\n')
    # ******************************** KNN ****************************************************************************
    # Create the model.
    knn_model = k_nearest_neightbors.KNearestNeighbors(5, num_of_features)
    # Run the algorithm and get the results.
    knn_results = knn_model.classify(train, test)
    # ******************************** Naive Bayes  *******************************************************************
    # Create the model.
    bayes_model = naive_bayes.NaiveBayes(num_of_features)
    # Run the algorithm and get the results.
    bayes_results = bayes_model.classify(train, test)
    # ******************************** Accuracy  **********************************************************************
    # Call the accuracy output function, send the test set and algorithm results and write results
    # to the output.txt file
    accuracy.accuracy_output(test, tree_results, knn_results, bayes_results,
                             "output.txt")
Example #17
 def setUp(self):
     """
     Loads dataset config and Dataset without numeric attributes, trains the tree.
     """
     import criteria
     self.criterion = criteria.GiniGain
     self.config = dataset.load_config(
         os.path.join('.', 'data', 'train_dataset1'))
     self.data = dataset.Dataset(self.config["filepath"],
                                 self.config["key attrib index"],
                                 self.config["class attrib index"],
                                 self.config["split char"],
                                 self.config["missing value string"],
                                 load_numeric=False)
     self.decision_tree = decision_tree.DecisionTree(self.criterion)
     self.decision_tree.train(self.data,
                              list(range(self.data.num_samples)),
                              max_depth=1,
                              min_samples_per_node=1,
                              use_stop_conditions=False,
                              max_p_value_chi_sq=None)
Example #18
def main(args):
    filename = args.file
    method = args.method

    print("-STARTING-\n")
    print("Using")
    print("Ab cutoff: {}".format(args.ab_count_cutoff))
    print("Culture cutoff: {}".format(args.culture_cutoff))
    print("Method: {}".format(method))
    print("Analysis type: {}".format(args.analysis_type))
    print()

    if args.average:
        classifier = average.Average(filename,
                                     CULTURE_SIZE_CUTOFF=args.culture_cutoff,
                                     AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
                                     ESBL_AB_RESISTENCE_LIST=ESBL_AB_RESISTANCE_LIST,
                                     RELEVANT_MO_LIST=RELEVANT_MO_LIST)
    # elif args.svm:
    #     SVM.run(filename, args.culture_cutoff, args.ab_count_cutoff, ESBL_AB_RESISTANCE_LIST)
    elif args.tree:
        classifier = decision_tree.DecisionTree(filename,
                                                CULTURE_SIZE_CUTOFF=args.culture_cutoff,
                                                AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
                                                ESBL_AB_RESISTANCE_LIST=ESBL_AB_RESISTANCE_LIST,
                                                RELEVANT_MO_LIST=RELEVANT_MO_LIST,
                                                testmode=method,
                                                analysis_type=args.analysis_type,
                                                medication_file=args.medication_file)
    elif args.perceptron:
        classifier = perceptron.Perceptron(filename,
                                           CULTURE_SIZE_CUTOFF=args.culture_cutoff,
                                           AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
                                           ESBL_AB_RESISTANCE_LIST=ESBL_AB_RESISTANCE_LIST,
                                           RELEVANT_MO_LIST=RELEVANT_MO_LIST,
                                           testmode=method,
                                           analysis_type=args.analysis_type,
                                           medication_file=args.medication_file)
    else:
        # none of the classifier flags was set; fail early instead of
        # hitting a NameError on the next line
        raise ValueError('no classifier selected')

    classifier.run()
Example #19
 def fit(self, x, y):
     """
     Fit the model on the data
     :param x: (Dataframe) Feature data
     :param y: (array) Dependent variable
     """
     if not self.n_bootstrap:
         # at most 250 bootstrap samples for small datasets (<= 1000 rows),
         # otherwise 500
         self.n_bootstrap = min(250, len(x)) if len(x) <= 1000 else 500
     features = x.columns
     if not self.max_features:
         self.max_features = m.ceil(len(features)**0.5)
     data = x
     data['dependent'] = y
     self.forest = []
     for i in range(self.n_trees):
         data_bs = bootstrap(data, self.n_bootstrap)
         # build and grow the tree, then append it (assigning to
         # self.forest[i] on an empty list would raise IndexError)
         tree = decision_tree.DecisionTree(
             data_bs[features],
             data_bs.dependent,
             max_features=self.max_features,
             max_depth=self.max_depth,
             min_samples=self.min_samples)
         tree.grow(True)
         self.forest.append(tree)
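
bootstrap is not shown in this snippet; since the data is a pandas DataFrame, it is presumably a sample-with-replacement helper along these lines (the name and arguments come from the call above, the body is an assumption):

def bootstrap(data, n_bootstrap):
    # draw n_bootstrap rows with replacement, reindexed from 0
    return data.sample(n=n_bootstrap, replace=True).reset_index(drop=True)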
Example #20
def test_wrong_value_of_max_depth():
    with pytest.raises(ValueError):
        tree.DecisionTree(max_depth=0)
Example #21
import decision_tree
import csv
import random
data = []
labels = []
with open("hw5_titanic_dist/cleaned_data.csv") as census_file:
    censusreader = csv.reader(census_file)
    for x in censusreader:
        data.append(list(map(lambda y: int(y), x)))
with open("hw5_titanic_dist/cleaned_data_labels.csv") as census_file:
    censusreader = csv.reader(census_file)
    for x in censusreader:
        labels.append(int(x[0]))

test = decision_tree.DecisionTree(2)
root = test.train(data, labels)
print(root.split_rule)
print(root.left.split_rule)
print(root.right.split_rule)
print(root.left.left.label)
print(root.left.right.label)
print(root.right.left.label)
print(root.right.right.label)
Example #22
def Q5():  # spam data

    T = [5, 50, 100, 200, 500, 1000]
    D = [5, 8, 10, 12, 15, 18]
    # get spam data
    spam_data = np.loadtxt('SpamData/spam.data')

    # change values of 0 to -1
    spam_data[:, -1][spam_data[:, -1] == 0] = -1

    # get vault data and train data
    np.random.shuffle(spam_data)
    vault_index = np.random.choice(len(spam_data), 1536, replace=False)
    train_index = np.array(
        [i for i in range(len(spam_data)) if i not in vault_index])
    train_data = spam_data[train_index]

    vault_data = spam_data[vault_index]

    # Use 5-fold cross validation to pick T and d
    data_size = len(train_data)
    split = int(data_size / 5)
    folds = np.split(train_data, [split, 2 * split, 3 * split, 4 * split])
    data_sets = split_data_to_folds(folds)

    DT_error = [0] * len(D)
    adaboost_error = [0] * len(T)

    for i in range(5):

        arr1 = data_sets[i][0]
        arr2 = data_sets[i][1]
        fold_size1 = arr1.shape[1]

        X_train, y_train = arr1[:, 0:fold_size1 - 1],\
                           arr1[:, fold_size1 - 1:fold_size1]

        X_validation = arr2[:, 0:(fold_size1 - 1)]
        y_validation = arr2[:, (fold_size1 - 1):fold_size1]

        y_train = y_train.reshape((-1, ))
        y_validation = y_validation.reshape((-1, ))

        for t in T:
            ada_boost = adaboost.AdaBoost(tools.DecisionStump, t)
            ada_boost.train(X_train, y_train)
            # accumulate the validation error per value of T, not per fold
            adaboost_error[T.index(t)] += ada_boost.error(X_validation,
                                                          y_validation)

        for d in D:
            dt = decision_tree.DecisionTree(d)
            dt.train(X_train, y_train)
            # accumulate the validation error per value of d, not per fold
            DT_error[D.index(d)] += dt.error(X_validation, y_validation)

    # get mean error over the 5 folds and pick the parameters that minimize it
    adaboost_error = np.array([x / 5 for x in adaboost_error])
    DT_error = np.array([x / 5 for x in DT_error])
    best_T = T[int(np.argmin(adaboost_error))]
    best_d = D[int(np.argmin(DT_error))]

    plt.plot(T, adaboost_error, label='validation error', color='magenta')
    plt.title('validation error on SpamData for AdaBoost as function of T')
    plt.legend(loc='best')
    plt.xlabel('T')
    plt.ylabel('Error')
    plt.show()

    plt.plot(D, DT_error, label='validation error', color='magenta')
    plt.title('validation error on SpamData for DT as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('max depth')
    plt.ylabel('Error')
    plt.show()

    # Train classifiers on the complete training set using the chosen parameter values.

    X_train, y_train = train_data[:, 0:57], train_data[:, 57]
    X_vault, y_vault = vault_data[:, 0:57], vault_data[:, 57]

    ada_boost = adaboost.AdaBoost(tools.DecisionStump, best_T)
    ada_boost.train(X_train, y_train)
    vault_adaboost_error = ada_boost.error(X_vault, y_vault)

    dt = decision_tree.DecisionTree(best_d)
    dt.train(X_train, y_train)
    vault_dt_error = dt.error(X_vault, y_vault)

    print("vault_adaboost_error= " + vault_adaboost_error)
    print("vault_dt_error= " + vault_dt_error)
Example #23
def run(dataset_name, train_dataset, criterion, min_num_samples_allowed, max_depth, num_trials,
        starting_seed, num_folds, is_stratified, use_numeric_attributes, use_chi_sq_test,
        max_p_value_chi_sq, output_file_descriptor, output_split_char=',', seed=None):
    """Runs `num_trials` experiments, each one doing a stratified cross-validation in `num_folds`
    folds. Saves the training and classification information in the `output_file_descriptor` file.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    for trial_number in range(num_trials):
        print('*'*80)
        print('STARTING TRIAL #{} USING SEED #{}'.format(
            trial_number + 1, starting_seed + trial_number))
        print()

        if seed is None:
            random.seed(RANDOM_SEEDS[trial_number + starting_seed - 1])
            np.random.seed(RANDOM_SEEDS[trial_number + starting_seed - 1])

        tree = decision_tree.DecisionTree(criterion=criterion)

        start_time = timeit.default_timer()
        (_,
         num_correct_classifications_w_unkown,
         num_correct_classifications_wo_unkown,
         _,
         _,
         _,
         num_unkown,
         _,
         _,
         num_nodes_prunned_per_fold,
         max_depth_per_fold,
         num_nodes_per_fold,
         num_valid_attributes_in_root,
         num_valid_nominal_attributes_in_root,
         num_valid_numeric_attributes_in_root,
         num_values_root_attribute_list,
         num_trivial_splits,
         trivial_accuracy_percentage) = tree.cross_validate(
             curr_dataset=train_dataset,
             num_folds=num_folds,
             max_depth=max_depth,
             min_samples_per_node=min_num_samples_allowed,
             is_stratified=is_stratified,
             print_tree=False,
             print_samples=False,
             use_stop_conditions=use_chi_sq_test,
             max_p_value_chi_sq=max_p_value_chi_sq)
        total_time_taken = timeit.default_timer() - start_time
        accuracy_with_missing_values = (100.0 * num_correct_classifications_w_unkown
                                        / train_dataset.num_samples)
        try:
            accuracy_without_missing_values = (100.0 * num_correct_classifications_wo_unkown
                                               / (train_dataset.num_samples - num_unkown))
        except ZeroDivisionError:
            accuracy_without_missing_values = None

        percentage_unkown = 100.0 * num_unkown / train_dataset.num_samples

        if num_values_root_attribute_list:
            (avg_num_values_root_attribute,
             max_num_values_root_attribute,
             min_num_values_root_attribute) = (np.mean(num_values_root_attribute_list),
                                               np.amax(num_values_root_attribute_list),
                                               np.amin(num_values_root_attribute_list))
        else:
            (avg_num_values_root_attribute,
             max_num_values_root_attribute,
             min_num_values_root_attribute) = (None, None, None)

        save_trial_info(dataset_name, train_dataset.num_samples, trial_number + starting_seed,
                        criterion.name, max_depth, num_folds, is_stratified, use_numeric_attributes,
                        min_num_samples_allowed, decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS,
                        decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS,
                        use_chi_sq_test, max_p_value_chi_sq,
                        decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE,
                        np.mean(num_valid_attributes_in_root),
                        np.mean(num_valid_nominal_attributes_in_root),
                        np.mean(num_valid_numeric_attributes_in_root), total_time_taken,
                        trivial_accuracy_percentage, accuracy_with_missing_values,
                        accuracy_without_missing_values, num_unkown, percentage_unkown,
                        avg_num_values_root_attribute, max_num_values_root_attribute,
                        min_num_values_root_attribute, num_trivial_splits,
                        np.mean(num_nodes_per_fold), np.amax(num_nodes_per_fold),
                        np.amin(num_nodes_per_fold), np.mean(max_depth_per_fold),
                        np.amax(max_depth_per_fold), np.amin(max_depth_per_fold),
                        np.mean(num_nodes_prunned_per_fold), output_split_char,
                        output_file_descriptor)
Example #24
def main():
    data_set = [['ACD', 0.0231, 1.157, 0.919, 93.061, 0.0917],
                ['ACD', 0.0296, 1.1183, 0.9356, 80.9492, 0.0681],
                ['ACD', 0.0471, 1.3537, 1.0208, 108.7305, 0.091],
                ['ACD', 0.0165, 1.2621, 1.1879, 116.3081, 0.1154],
                ['ACD', 0.0236, 1.117, 0.8673, 77.9446, 0.066],
                ['ACD', 0.008, 1.413, 1.0474, 102.6556, 0.07],
                ['ACD', 0.0267, 1.4068, 1.1244, 107.5716, 0.0734],
                ['ACD', 0.0838, 1.1258, 1.0406, 100.2574, 0.0474],
                ['ACD', 0.0225, 1.2126, 0.9824, 98.885, 0.0928],
                ['ACD', 0.0639, 2.1101, 1.2162, 137.5727, 0.159],
                ['ACD', 0.0021, 0.8333, 0.7004, 68.5042, 0.0464],
                ['ACD', 0.0208, 1.5963, 1.0204, 142.5501, 0.1329],
                ['HM', 0.461, 2.1225, 1.5204, 133.2334, 0.0623],
                ['HM', 0.2118, 1.5373, 1.2326, 99.011, 0.0808],
                ['HM', 0.2308, 2.3465, 1.3419, 106.459, 0.0548],
                ['HM', 0.5372, 2.171, 1.8759, 135.6919, 0.0602],
                ['HM', 0.318, 2.1527, 1.1671, 130.0122, 0.0651],
                ['HM', 0.2434, 2.3092, 1.6817, 179.5259, 0.1192],
                ['HM', 0.4191, 1.5634, 0.8894, 117.2704, 0.0265],
                ['HM', 0.5952, 2.6538, 1.5957, 152.4041, 0.0752],
                ['HM', 0.3963, 2.0715, 1.2956, 124.8764, 0.094],
                ['HM', 0.1638, 1.8827, 1.0938, 105.0277, 0.0384],
                ['HM', 0.2752, 3.0803, 1.6789, 146.2936, 0.0803],
                ['HM', 0.4227, 1.6529, 0.8303, 84.3475, 0.0399]]

    if len(sys.argv) > 1:
        if sys.argv[1] != "train" and sys.argv[1] != "predict":
            print("Unknown argument, please enter 'predict' or 'train'")
            sys.exit(1)

        elif sys.argv[1] == "train":
            # Train your model
            model = input(
                "Which model would you like to train ? Perceptron(p) or Decision Tree(d): "
            )
            if model != "p" and model != "d":
                print("Sorry! Wrong argument")

            elif model == "p":
                perceptron_data = copy.deepcopy(data_set)
                for data_point in perceptron_data:
                    if "ACD" in data_point[0]:
                        data_point[0] = 1
                    elif "HM" in data_point[0]:
                        data_point[0] = 0
                shuffle(perceptron_data)
                weights = pt.train_perceptron(perceptron_data, 0.01, 20000)
                predict = input(
                    "A perceptron has been trained. Would you like to make a prediction?(y/n) "
                )
                if predict == "y":
                    filename = input(
                        "Please enter the name of the file containing text for author identification: "
                    )
                    data_value = fp.process(filename, "NA")
                    prediction = pt.predict(data_value, weights)
                    if int(prediction) == 1:
                        print("Author is Arthur Conan Doyle.")
                    elif int(prediction) == 0:
                        print("Author is Herman Melville.")
            elif model == "d":
                max_depth = int(
                    input(
                        "Please enter the maximum depth of the decision tree: "
                    ))
                entropy_cutoff = float(
                    input(
                        "Please enter the entropy cutoff of the decision tree(ideal is 0.0): "
                    ))
                print("Training a decision tree on training data...")
                # random.shuffle shuffles in place and returns None, so
                # shuffle first and then pass the shuffled list
                shuffle(data_set)
                tree = dt.DecisionTree(data_set, ["ACD", "HM"], max_depth,
                                       entropy_cutoff)

                predict = input(
                    "The decision tree has been trained. Would you like to make a prediction?(y/n) "
                )
                if predict == "y":
                    filename = input(
                        "Please enter the name of the file containing text for author identification: "
                    )
                    data_value = fp.process(filename, "NA")

                    node = tree
                    while node.FINAL_LABEL == "":
                        if data_value[node.att_index] <= node.threshold:
                            node = node.left
                        elif data_value[node.att_index] > node.threshold:
                            node = node.right
                    if node.FINAL_LABEL == "ACD":
                        print("The author is Arthur Conan Doyle")
                    else:
                        print("The author is Herman Melville")

        elif sys.argv[1] == "predict":
            filename = sys.argv[2]
            print("Predicting using an existing model: ")
            model_file = open("model_perceptron.txt", "r")
            line = model_file.readline().split(",")
            weights = []
            for weight in line:
                weights.append(float(weight))
            data_value = fp.process(filename, "NA")
            prediction = pt.predict(data_value, weights)
            if int(prediction) == 1:
                print("Author is Arthur Conan Doyle.")
            elif int(prediction) == 0:
                print("Author is Herman Melville")

    else:
        print("Please enter argument 'train' or 'predict'. ")
        sys.exit(1)
Example #25
def test_default():
    t = tree.DecisionTree()
    assert t.max_depth == np.inf
    assert t.min_samples_split == 2
    assert t.criterion == 'gini'
Example #26
import numpy as np
from sklearn.metrics import accuracy_score
import json

import data_loader
import decision_tree

# load data
X_train, X_test, y_train, y_test = data_loader.discrete_2D_iris_dataset()

# set classifier
dTree = decision_tree.DecisionTree()

# training
dTree.train(X_train, y_train)
y_est_train = dTree.predict(X_train)
train_accu = accuracy_score(y_est_train, y_train)
print('train_accu', train_accu)

# testing
y_est_test = dTree.predict(X_test)
test_accu = accuracy_score(y_est_test, y_test)
print('test_accu', test_accu)

# print
dTree.print_tree()

# save
with open('decision_tree.json', 'w') as f:
    json.dump({'train_accu': train_accu, 'test_accu': test_accu}, f)
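
This script is evidently the producer of the decision_tree.json file that the grader in Example #3 reads back: it stores train_accu and test_accu, and the grader awards half of the score when both reach 0.8.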
Example #27
import decision_tree as dt

# header is not necessary
header = ['color', 'size', 'shape', 'label']
train_data = [
    ['Green', 3, 'round', 'Apple'],
    ['Red', 3, 'round', 'Apple'],
    ['Purple', 1, 'round', 'Grape'],
    ['Purple', 1, 'round', 'Grape'],
    ['Yellow', 3, 'round', 'Lemon'],
    ['Yellow', 3, 'long', 'Banana'],
]
# define a decision tree
myTree = dt.DecisionTree()
# train the decision tree with the training data
myTree.fit(train_data)

# test data; should have 'Grape' as its label
test_data = ['purple', 1, 'round']
result = myTree.predict(test_data)
# output predicted result
print(result)
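
Note that the test row uses lower-case 'purple' while the training data uses 'Purple'; whether the two match depends on how dt.DecisionTree compares attribute values.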
Example #28
    def _run_fold(dataset_name,
                  curr_dataset,
                  criterion,
                  trial_number,
                  fold_number,
                  min_num_samples_allowed,
                  max_depth,
                  num_folds,
                  is_stratified,
                  use_numeric_attributes,
                  use_chi_sq_test,
                  max_p_value_chi_sq,
                  num_samples,
                  original_valid_nominal_attributes,
                  original_valid_numeric_attributes,
                  training_samples_indices,
                  validation_sample_indices,
                  output_file_descriptor,
                  output_split_char=','):
        print('\nFold #{}'.format(fold_number + 1))
        print_information_per_attrib = {}  # ...[attrib_index] = print_information
        accuracy_criterion_value = []  # ...[...] = (accuracy_with_missing_values, criterion_value)
        tree = decision_tree.DecisionTree(criterion)

        num_attributes = len(original_valid_nominal_attributes)
        for (attrib_index, (is_valid_nominal_attrib,
                            is_valid_numeric_attrib)) in enumerate(
                                zip(original_valid_nominal_attributes,
                                    original_valid_numeric_attributes)):
            if not is_valid_nominal_attrib and not is_valid_numeric_attrib:
                continue

            # Let's pretend only the current attribute is valid.
            print()
            print('Current attribute: {} ({})'.format(
                curr_dataset.attrib_names[attrib_index], attrib_index))
            curr_dataset.valid_nominal_attribute = [False] * num_attributes
            curr_dataset.valid_nominal_attribute[
                attrib_index] = is_valid_nominal_attrib
            curr_dataset.valid_numeric_attribute = [False] * num_attributes
            curr_dataset.valid_numeric_attribute[
                attrib_index] = is_valid_numeric_attrib

            num_values = len(curr_dataset.attrib_int_to_value[attrib_index])
            if not num_values:
                continue

            if max_depth is None:
                curr_max_depth_allowed = 1 + math.ceil(
                    math.log2(curr_dataset.num_classes))
            else:
                curr_max_depth_allowed = max_depth

            start_time = timeit.default_timer()
            ((_, num_correct_classifications_w_unkown,
              num_correct_classifications_wo_unkown, _, _, _, num_unkown, _),
             curr_max_depth_found, _,
             curr_num_nodes_prunned) = tree.train_and_test(
                 curr_dataset,
                 training_samples_indices,
                 validation_sample_indices,
                 max_depth=curr_max_depth_allowed,
                 min_samples_per_node=min_num_samples_allowed,
                 use_stop_conditions=use_chi_sq_test,
                 max_p_value_chi_sq=max_p_value_chi_sq)
            total_time_taken = timeit.default_timer() - start_time
            if (not tree.get_root_node().valid_nominal_attribute[attrib_index]
                    and not tree.get_root_node(
                    ).valid_numeric_attribute[attrib_index]):
                continue
            try:
                curr_criterion_value = tree.get_root_node(
                ).node_split.criterion_value
            except AttributeError:
                continue

            trivial_accuracy = tree.get_trivial_accuracy(
                validation_sample_indices)
            accuracy_with_missing_values = (
                100.0 * num_correct_classifications_w_unkown /
                len(validation_sample_indices))
            try:
                accuracy_without_missing_values = (
                    100.0 * num_correct_classifications_wo_unkown /
                    (len(validation_sample_indices) - num_unkown))
            except ZeroDivisionError:
                accuracy_without_missing_values = None

            percentage_unkown = 100.0 * num_unkown / len(
                validation_sample_indices)
            curr_num_nodes = tree.get_root_node().get_num_nodes()

            print_information_per_attrib[attrib_index] = [
                curr_criterion_value, curr_max_depth_allowed, num_values,
                total_time_taken, trivial_accuracy,
                accuracy_with_missing_values, accuracy_without_missing_values,
                num_unkown, percentage_unkown, curr_num_nodes,
                curr_max_depth_found, curr_num_nodes_prunned
            ]
            accuracy_criterion_value.append(
                (accuracy_with_missing_values, curr_criterion_value))

        (num_inversions, num_ties,
         num_correct) = _count_inversions_and_ties(accuracy_criterion_value)

        num_valid_attributes = len(print_information_per_attrib)
        num_valid_numeric_attributes = sum(
            original_valid_numeric_attributes[attrib_index]
            for attrib_index in print_information_per_attrib)
        num_valid_nominal_attributes = num_valid_attributes - num_valid_numeric_attributes

        for attrib_index in sorted(print_information_per_attrib):
            save_info(dataset_name, use_numeric_attributes,
                      curr_dataset.attrib_names[attrib_index],
                      original_valid_numeric_attributes[attrib_index],
                      num_samples, trial_number + 1, criterion.name, num_folds,
                      fold_number + 1, is_stratified, min_num_samples_allowed,
                      use_chi_sq_test, max_p_value_chi_sq, num_attributes,
                      num_valid_attributes, num_valid_nominal_attributes,
                      num_valid_numeric_attributes, num_inversions, num_ties,
                      num_correct, *print_information_per_attrib[attrib_index],
                      output_file_descriptor, output_split_char)
Example #29
def run(dataset_name,
        train_dataset,
        num_training_samples,
        criterion,
        min_num_samples_allowed,
        max_depth,
        num_trials,
        starting_seed,
        use_numeric_attributes,
        use_chi_sq_test,
        max_p_value_chi_sq,
        output_file_descriptor,
        output_split_char=',',
        seed=None):
    """Runs `num_trials` experiments, each one randomly selecting `num_training_samples` valid
    samples to use for training and testing the tree in the rest of the dataset. Saves the training
    and classification information in the `output_file_descriptor` file.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    training_samples_indices = list(range(train_dataset.num_samples))
    for trial_number in range(num_trials):
        print('*' * 80)
        print('STARTING TRIAL #{} USING SEED #{}'.format(
            trial_number + 1, starting_seed + trial_number))
        print()

        if seed is None:
            random.seed(RANDOM_SEEDS[trial_number + starting_seed - 1])
            np.random.seed(RANDOM_SEEDS[trial_number + starting_seed - 1])
        random.shuffle(training_samples_indices)
        curr_training_samples_indices = training_samples_indices[:
                                                                 num_training_samples]
        curr_test_samples_indices = training_samples_indices[
            num_training_samples:]

        tree = decision_tree.DecisionTree(criterion=criterion)
        # First let's train the tree and save the training information
        start_time = timeit.default_timer()
        (time_taken_prunning, num_nodes_prunned) = tree.train(
            curr_dataset=train_dataset,
            training_samples_indices=curr_training_samples_indices,
            max_depth=max_depth,
            min_samples_per_node=min_num_samples_allowed,
            use_stop_conditions=use_chi_sq_test,
            max_p_value_chi_sq=max_p_value_chi_sq)
        total_time_taken = timeit.default_timer() - start_time

        num_random_tries = 1
        while (sorted(tree.get_root_node().class_index_num_samples)[-2] == 0
               or sum(tree.get_root_node().valid_nominal_attribute) == 0):
            num_random_tries += 1
            if num_random_tries == MAX_RANDOM_TRIES:
                print(
                    'Already did {} random generations, none worked (only one class or no valid'
                    ' attribute).'.format(MAX_RANDOM_TRIES))
                print('Will skip to the next test.')
                return None

            random.shuffle(training_samples_indices)
            curr_training_samples_indices = training_samples_indices[:
                                                                     num_training_samples]
            curr_test_samples_indices = training_samples_indices[
                num_training_samples:2 * num_training_samples]

            start_time = timeit.default_timer()
            (time_taken_prunning, num_nodes_prunned) = tree.train(
                curr_dataset=train_dataset,
                training_samples_indices=curr_training_samples_indices,
                max_depth=max_depth,
                min_samples_per_node=min_num_samples_allowed,
                use_stop_conditions=use_chi_sq_test,
                max_p_value_chi_sq=max_p_value_chi_sq)
            total_time_taken = timeit.default_timer() - start_time

        num_valid_nominal_attributes = sum(
            tree.get_root_node().valid_nominal_attribute)

        time_taken_tree = total_time_taken - time_taken_prunning

        # Time to test this tree's classification and save the classification information
        trivial_accuracy = tree.get_trivial_accuracy(curr_test_samples_indices)
        (_, num_correct_classifications_w_unkown,
         num_correct_classifications_wo_unkown, _, _, _, num_unkown,
         _) = tree.test(curr_test_samples_indices)

        accuracy_with_missing_values = (100.0 *
                                        num_correct_classifications_w_unkown /
                                        len(curr_test_samples_indices))
        try:
            accuracy_without_missing_values = (
                100.0 * num_correct_classifications_wo_unkown /
                (len(curr_test_samples_indices) - num_unkown))
        except ZeroDivisionError:
            accuracy_without_missing_values = None
        percentage_unkown = 100.0 * num_unkown / len(curr_test_samples_indices)

        num_nodes_found = tree.get_root_node().get_num_nodes()
        max_depth_found = tree.get_root_node().get_max_depth()

        save_trial_info(
            dataset_name, train_dataset.num_samples, num_training_samples,
            trial_number + starting_seed, use_numeric_attributes,
            criterion.name, max_depth, min_num_samples_allowed,
            decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS,
            decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS, use_chi_sq_test,
            max_p_value_chi_sq,
            decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE,
            num_valid_nominal_attributes, total_time_taken, time_taken_tree,
            time_taken_prunning, trivial_accuracy,
            accuracy_with_missing_values, accuracy_without_missing_values,
            num_unkown, percentage_unkown, num_nodes_found, max_depth_found,
            num_nodes_prunned, output_split_char, output_file_descriptor)
Example #30
def test_wrong_criterion():
    with pytest.raises(ValueError):
        tree.DecisionTree(criterion='non-existed')