Example #1
def test_test():
	dt = DecisionTree(open('data/dt_train.txt'))
	assert dt.test({
		'age': '<=30',
		'income': 'low',
		'student': 'no',
		'credit_rating': 'fair',
	}) == 'no'
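
A minimal sketch of the interface this test exercises, assuming a DecisionTree that is built from an open training-file handle and classifies a single attribute dict via test(). The node layout and tree walk below are illustrative assumptions, not the project's implementation; only the two signatures are implied by the call sites above.

class DecisionTree(object):
	# Hypothetical reconstruction of the interface used in the test.
	def __init__(self, train_file):
		self.root = self._build(train_file)  # parse records, grow the tree

	def _build(self, train_file):
		raise NotImplementedError  # ID3-style induction would go here

	def test(self, sample):
		# sample maps attribute names to discrete values,
		# e.g. {'age': '<=30', 'income': 'low', ...}
		node = self.root
		while not node.is_leaf:  # descend on the sample's attribute values
			node = node.children[sample[node.attribute]]
		return node.label  # predicted class, e.g. 'yes' or 'no'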
Example #2
import time

from numpy import zeros, save
from numpy.random import permutation
# Aliases assumed for the sklearn baselines used below
from sklearn.tree import DecisionTreeClassifier as skDecisionTree
from sklearn.ensemble import RandomForestClassifier as skRandomForest

# DecisionTree, BaggedDecisionTrees, BoostedDecisionTrees, RandomForest,
# SupportVectorMachine, generate_train_and_test_files_cv, csv_to_dict,
# construct_word_feature and extract_word_feature_and_label are
# project-local and assumed to be importable from the surrounding package.


def cross_validate(csv_file_name,
                   losses_file_name,
                   models,
                   tssp,
                   num_words,
                   max_depth,
                   n_estimators,
                   debug=False):
    '''Perform 10-fold incremental cross-validation.'''
    total_num = 2000  # size of the full dataset; prop is taken relative to this
    lists_of_dict = []
    setups = [(p, w, d, t) for p in tssp for w in num_words for d in max_depth
              for t in n_estimators]
    losses = zeros((5, len(setups), 10))  # models x setups x folds
    sklosses = zeros((2, len(setups), 10))  # sklearn DT/RF baselines, same layout
    # Generate the temporary CV fold files, then read each fold back in
    # as a list of record dicts
    generate_train_and_test_files_cv(csv_file_name, 10)
    for i in range(10):
        lists_of_dict.append(csv_to_dict('cv%d.dat' % i))
    for i, (prop, nwords, maxdep, ntrees) in enumerate(setups):
        for j in range(10):
            # Construct the training set from the other nine folds
            training_lists_of_dict = lists_of_dict[:j] + lists_of_dict[j + 1:]
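            # Flatten the nine folds into a single list of record dicts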
            training_list_of_dict = [
                item for sublist in training_lists_of_dict for item in sublist
            ]
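            # Hold out fold j for testing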
            testing_list_of_dict = lists_of_dict[j]
            # Randomly subsample int(total_num * prop) of the pooled training records
            random_indices = permutation(len(training_list_of_dict))
            random_indices = random_indices[:int(total_num * prop)]
            training_list_of_dict = [
                training_list_of_dict[k] for k in random_indices
            ]
            # Find the word features
            feature_words = construct_word_feature(training_list_of_dict,
                                                   nwords)
            # Extract features and labels
            training_X, training_y = extract_word_feature_and_label(
                training_list_of_dict, feature_words)
            testing_X, testing_y = extract_word_feature_and_label(
                testing_list_of_dict, feature_words)
            # DT
            if 'DT' in models:
                dt = DecisionTree(max_depth=maxdep)
                t1 = time.time()
                dt.train(training_X, training_y)
                t2 = time.time()
                losses[0, i, j] = dt.test(testing_X, testing_y)
                if debug:
                    print "DT training: %fs, testing: %f" % (t2 - t1,
                                                             time.time() - t2)
            # BDT
            if 'BDT' in models:
                bdt = BaggedDecisionTrees(max_depth=maxdep,
                                          n_estimators=ntrees)
                t1 = time.time()
                bdt.train(training_X, training_y)
                t2 = time.time()
                losses[1, i, j] = bdt.test(testing_X, testing_y)
                if debug:
                    print "BDT training: %fs, testing: %f" % (t2 - t1,
                                                              time.time() - t2)
            # BODT
            if 'BODT' in models:
                bodt = BoostedDecisionTrees(max_depth=maxdep,
                                            n_estimators=ntrees)
                t1 = time.time()
                bodt.train(training_X, training_y)
                t2 = time.time()
                losses[2, i, j] = bodt.test(testing_X, testing_y)
                if debug:
                    print("BODT training: %fs, testing: %fs" %
                          (t2 - t1, time.time() - t2))
            # RF
            if 'RF' in models:
                rf = RandomForest(max_depth=maxdep, n_estimators=ntrees)
                rf.train(training_X, training_y)
                losses[3, i, j] = rf.test(testing_X, testing_y)
            # SVM
            if 'SVM' in models:
                svm = SupportVectorMachine()
                svm.train(training_X, training_y)
                losses[4, i, j] = svm.test(testing_X, testing_y)
            # Library baselines (scikit-learn)
            if debug:
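                # Remap 0/1 labels to -1/+1 before fitting the sklearn baselines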
                training_y[training_y == 0] = -1
                testing_y[testing_y == 0] = -1
                skdt = skDecisionTree(max_depth=maxdep, min_samples_split=10)
                skdt.fit(training_X.T, training_y)
                sklosses[0, i, j] = 1 - skdt.score(testing_X.T, testing_y)
                print "ZERO-ONE-LOSS-SKDT %.4f" % sklosses[0, i, j]
                skrf = skRandomForest(max_depth=maxdep,
                                      n_estimators=ntrees,
                                      min_samples_split=10)
                skrf.fit(training_X.T, training_y)
                sklosses[1, i, j] = 1 - skrf.score(testing_X.T, testing_y)
                print "ZERO-ONE-LOSS-SKRF %.4f" % sklosses[1, i, j]
    save(losses_file_name, losses)
    save('debug_' + losses_file_name, sklosses)
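
A hypothetical driver for cross_validate, with placeholder file names and hyperparameter grids (none of these values come from the project), plus how the saved loss array can be read back:

# Hypothetical usage -- file names and grids are illustrative placeholders.
if __name__ == '__main__':
    cross_validate('reviews.csv',             # assumed input CSV
                   'losses.npy',              # written with numpy.save above
                   models=['DT', 'BDT', 'RF'],
                   tssp=[0.025, 0.05, 0.25],  # training-set proportions (inferred)
                   num_words=[500, 1000],     # word-feature vocabulary sizes
                   max_depth=[10],
                   n_estimators=[50],
                   debug=False)

    # The result is a (5 models, #setups, 10 folds) array of zero-one losses;
    # averaging over the fold axis gives one mean loss per model and setup.
    from numpy import load
    mean_loss = load('losses.npy').mean(axis=2)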