def cross_validate(csv_file_name, losses_file_name, models, tssp, num_words,
                   max_depth, n_estimators, debug=False):
    '''
    Perform 10-fold incremental cross validation.

    Every combination of (tssp, num_words, max_depth, n_estimators) is one
    "setup". For each setup and each of the 10 folds, a random subset of the
    other nine folds is used for training and the held-out fold for testing.

    Parameters:
        csv_file_name: passed to generate_train_and_test_files_cv, which is
            expected to produce the files cv0.dat ... cv9.dat read back here.
        losses_file_name: file name handed to save() for the loss array.
            The debug array is saved next to it with a 'debug_' prefix on
            the base name.
        models: container of model tags; the tags checked are 'DT', 'BDT',
            'BODT', 'RF' and 'SVM'. Models that are not listed keep a loss
            of zero.
        tssp: iterable of training-set-size proportions; each is multiplied
            by 2000 to get the number of training samples drawn.
        num_words: iterable of values passed to construct_word_feature.
        max_depth: iterable of tree depths passed to the tree-based models.
        n_estimators: iterable of ensemble sizes passed to the ensembles.
        debug: when true, print timings for DT/BDT and additionally fit the
            library (sk*) decision tree and random forest for comparison.

    Returns:
        None. Two arrays are written with save():
          losses   -- shape (5, #setups, 10), rows DT, BDT, BODT, RF, SVM
          sklosses -- shape (2, #setups, 10), rows skDecisionTree,
                      skRandomForest (all zeros unless debug is true)
    '''
    import os  # local import: only needed to build the debug output path

    n_folds = 10
    total_num = 2000  # sample budget that the proportions in tssp scale

    setups = [(p, w, d, t)
              for p in tssp
              for w in num_words
              for d in max_depth
              for t in n_estimators]
    losses = zeros((5, len(setups), n_folds))  # #models, #cases, #folds
    sklosses = zeros((2, len(setups), n_folds))

    # Generate temp CV files, then load one list of samples per fold.
    generate_train_and_test_files_cv(csv_file_name, n_folds)
    lists_of_dict = [csv_to_dict('cv%d.dat' % fold)
                     for fold in range(n_folds)]

    for i, (prop, nwords, maxdep, ntrees) in enumerate(setups):
        for j in range(n_folds):
            # Construct train set: every fold except fold j, flattened.
            training_lists_of_dict = lists_of_dict[:j] + lists_of_dict[j + 1:]
            training_list_of_dict = [
                item
                for sublist in training_lists_of_dict
                for item in sublist
            ]
            testing_list_of_dict = lists_of_dict[j]

            # Randomly select samples. The slice silently yields fewer than
            # requested when the pool is smaller than total_num * prop.
            random_indices = permutation(len(training_list_of_dict))
            random_indices = random_indices[:int(total_num * prop)]
            training_list_of_dict = [
                training_list_of_dict[k] for k in random_indices
            ]

            # Find the word features (from the training samples only).
            feature_words = construct_word_feature(training_list_of_dict,
                                                   nwords)

            # Extract features and labels
            training_X, training_y = extract_word_feature_and_label(
                training_list_of_dict, feature_words)
            testing_X, testing_y = extract_word_feature_and_label(
                testing_list_of_dict, feature_words)

            # DT
            if 'DT' in models:
                dt = DecisionTree(max_depth=maxdep)
                t1 = time.time()
                dt.train(training_X, training_y)
                t2 = time.time()
                losses[0, i, j] = dt.test(testing_X, testing_y)
                if debug:
                    print("DT training: %fs, testing: %f"
                          % (t2 - t1, time.time() - t2))

            # BDT
            if 'BDT' in models:
                bdt = BaggedDecisionTrees(max_depth=maxdep,
                                          n_estimators=ntrees)
                t1 = time.time()
                bdt.train(training_X, training_y)
                t2 = time.time()
                losses[1, i, j] = bdt.test(testing_X, testing_y)
                if debug:
                    print("BDT training: %fs, testing: %f"
                          % (t2 - t1, time.time() - t2))

            # BODT
            if 'BODT' in models:
                bodt = BoostedDecisionTrees(max_depth=maxdep,
                                            n_estimators=ntrees)
                bodt.train(training_X, training_y)
                losses[2, i, j] = bodt.test(testing_X, testing_y)

            # RF
            if 'RF' in models:
                rf = RandomForest(max_depth=maxdep, n_estimators=ntrees)
                rf.train(training_X, training_y)
                losses[3, i, j] = rf.test(testing_X, testing_y)

            # SVM
            if 'SVM' in models:
                svm = SupportVectorMachine()
                svm.train(training_X, training_y)
                losses[4, i, j] = svm.test(testing_X, testing_y)

            # Library functions
            if debug:
                # Labels equal to 0 are rewritten to -1 in place. This runs
                # after all the models above, so they saw the original
                # labels.
                training_y[training_y == 0] = -1
                testing_y[testing_y == 0] = -1

                # The feature matrices are transposed for the library
                # models (presumably stored as features x samples here --
                # TODO confirm against extract_word_feature_and_label).
                skdt = skDecisionTree(max_depth=maxdep, min_samples_split=10)
                skdt.fit(training_X.T, training_y)
                sklosses[0, i, j] = 1 - skdt.score(testing_X.T, testing_y)
                print("ZERO-ONE-LOSS-SKDT %.4f" % sklosses[0, i, j])

                skrf = skRandomForest(max_depth=maxdep, n_estimators=ntrees,
                                      min_samples_split=10)
                skrf.fit(training_X.T, training_y)
                sklosses[1, i, j] = 1 - skrf.score(testing_X.T, testing_y)
                print("ZERO-ONE-LOSS-SKRF %.4f" % sklosses[1, i, j])

    save(losses_file_name, losses)

    # Prefix the base name rather than the whole string, so a path such as
    # 'out/losses.npy' becomes 'out/debug_losses.npy'.
    debug_dir, base_name = os.path.split(losses_file_name)
    save(os.path.join(debug_dir, 'debug_' + base_name), sklosses)
# Train a DecisionTree on the labelled data, save it to disk, and report
# accuracy on rows 6400-8000 of the training data.
print('y_train', y_train.shape)
print('X_test ', X_test.shape)

# Turn the labels into a single column so they can be appended to the
# features as the last column of `dataset`.
y_train = np.reshape(y_train, (y_train.shape[0], 1))
dataset = np.concatenate((X_train, y_train), axis=1)

# Instance of Decision Tree Object
a = DecisionTree(headers, 5)

X_tr = dataset[0:6400]
y_tr = y_train[0:6400]
X_val = X_train[6400:8000]
y_val = y_train[6400:8000]
# Flatten the validation labels back to 1-D for the comparison below.
y_val = np.reshape(y_val, (y_val.shape[0], ))

# NOTE(review): the tree is fit on the full `dataset`, which contains the
# rows 6400-8000 that are used for validation below; X_tr / y_tr are not
# used in this section. Confirm whether a.train(X_tr) was intended.
t = a.train(dataset)

# Saving Trained Model. Serialized data is binary, so the file is opened in
# binary mode, and the handles are closed deterministically.
with open("vamshi.model", "wb") as model_file:
    dill.dump(a, model_file)
with open("vamshi.model", "rb") as model_file:
    v = dill.load(model_file)

# NOTE(review): predictions come from the in-memory model `a`, not from the
# reloaded copy `v` -- confirm which one is meant to be evaluated.
y_pred = a.predict(None, X_val)
print(y_pred)

y_val = np.array(y_val)
y_pred = np.array(y_pred)
# Fraction of validation rows whose prediction equals the label.
acc = np.sum((y_val == y_pred) * 1.0) / y_val.shape[0]
print(acc)