def test_test():
    """DecisionTree trained on data/dt_train.txt predicts 'no' for a known sample."""
    # Use a context manager so the training-file handle is closed
    # deterministically (the original leaked the open file object).
    # NOTE(review): assumes DecisionTree reads the file eagerly in its
    # constructor rather than lazily during test() -- confirm.
    with open('data/dt_train.txt') as train_file:
        dt = DecisionTree(train_file)
    assert dt.test({
        'age': '<=30',
        'income': 'low',
        'student': 'no',
        'credit_rating': 'fair',
    }) == 'no'
def cross_validate(csv_file_name, losses_file_name, models, tssp, num_words, max_depth, n_estimators, debug=False): ''' Perform 10-fold incremental cross validation. ''' total_num = 2000 lists_of_dict = [] setups = [(p, w, d, t) for p in tssp for w in num_words for d in max_depth for t in n_estimators] losses = zeros((5, len(setups), 10)) # #models, #cases, #folds sklosses = zeros((2, len(setups), 10)) generate_train_and_test_files_cv(csv_file_name, 10) # Generate temp CV files for i in range(10): lists_of_dict.append(csv_to_dict('cv%d.dat' % (i))) i = 0 for prop, nwords, maxdep, ntrees in setups: for j in range(10): # Contruct train set training_lists_of_dict = lists_of_dict[:j] + lists_of_dict[j + 1:] training_list_of_dict = [ item for sublist in training_lists_of_dict for item in sublist ] testing_list_of_dict = lists_of_dict[j] # Randomly select samples random_indices = permutation(len(training_list_of_dict)) random_indices = random_indices[:int(total_num * prop)] training_list_of_dict = [ training_list_of_dict[k] for k in random_indices ] # Find the word features feature_words = construct_word_feature(training_list_of_dict, nwords) # Extract features and labels training_X, training_y = extract_word_feature_and_label( training_list_of_dict, feature_words) testing_X, testing_y = extract_word_feature_and_label( testing_list_of_dict, feature_words) # DT if 'DT' in models: dt = DecisionTree(max_depth=maxdep) t1 = time.time() dt.train(training_X, training_y) t2 = time.time() losses[0, i, j] = dt.test(testing_X, testing_y) if debug: print "DT training: %fs, testing: %f" % (t2 - t1, time.time() - t2) # BDT if 'BDT' in models: bdt = BaggedDecisionTrees(max_depth=maxdep, n_estimators=ntrees) t1 = time.time() bdt.train(training_X, training_y) t2 = time.time() losses[1, i, j] = bdt.test(testing_X, testing_y) if debug: print "BDT training: %fs, testing: %f" % (t2 - t1, time.time() - t2) # BODT if 'BODT' in models: bodt = BoostedDecisionTrees(max_depth=maxdep, 
n_estimators=ntrees) bodt.train(training_X, training_y) t2 = time.time() losses[2, i, j] = bodt.test(testing_X, testing_y) # RF if 'RF' in models: rf = RandomForest(max_depth=maxdep, n_estimators=ntrees) rf.train(training_X, training_y) losses[3, i, j] = rf.test(testing_X, testing_y) # SVM if 'SVM' in models: svm = SupportVectorMachine() svm.train(training_X, training_y) losses[4, i, j] = svm.test(testing_X, testing_y) # Libary functions if debug: training_y[training_y == 0] = -1 testing_y[testing_y == 0] = -1 skdt = skDecisionTree(max_depth=maxdep, min_samples_split=10) skdt.fit(training_X.T, training_y) sklosses[0, i, j] = 1 - skdt.score(testing_X.T, testing_y) print "ZERO-ONE-LOSS-SKDT %.4f" % sklosses[0, i, j] skrf = skRandomForest(max_depth=maxdep, n_estimators=ntrees, min_samples_split=10) skrf.fit(training_X.T, training_y) sklosses[1, i, j] = 1 - skrf.score(testing_X.T, testing_y) print "ZERO-ONE-LOSS-SKRF %.4f" % sklosses[1, i, j] i += 1 save(losses_file_name, losses) save('debug_' + losses_file_name, sklosses)