def parameter_test_worker(i, p, num_params, alpha, group_size, data_x, data_y):
    """
    Evaluate one random forest parameter set with 10-fold cross validation,
    refine each fold's forest with the forest garrote for every alpha, and
    append the averaged results to the log file.

    :param i: index of the current parameter set
    :param p: parameter set (keyword arguments for the random forest)
    :param num_params: total number of parameter sets
    :param alpha: iterable of alpha values for the forest garrote
    :param group_size: tree group size for the forest garrote
    :param data_x: dataset features
    :param data_y: dataset labels
    """
    out_str = "# Parameter set %d of %d" % (i+1, num_params) + "\n" + json.dumps(p) + "\n\n"

    rf_split_counts = []
    rf_performance = []
    rf_num_nodes = []
    rf_train_time = []
    fg = {a: {"split_counts": [], "performance": [], "num_nodes": [], "train_time": []}
          for a in alpha}

    kf = sklearn.cross_validation.KFold(data_x.shape[0], n_folds=10)
    for kf_i, (train, test) in enumerate(kf):
        print "## kf %d of %d" % (kf_i+1, len(kf))
        train_x = data_x[train]
        train_y = data_y[train]
        test_x = data_x[test]
        test_y = data_y[test]

        # Train the rf and get the performance.
        rf = randomforest.RandomForestClassifier(n_rand_dims="auto", n_jobs=1, **p)
        start = time.time()
        rf.fit(train_x, train_y)
        end = time.time()
        pred, split_counts = rf.predict(test_x, return_split_counts=True)
        split_counts /= float(len(pred))
        count = sum(1 for a, b in zip(test_y, pred) if a == b)
        performance = count / float(len(pred))
        rf_split_counts.append(split_counts)
        rf_performance.append(performance)
        rf_num_nodes.append(rf.num_nodes())
        rf_train_time.append(end - start)

        # Train the forest garrote and get the performance.
        for a_i, a in enumerate(alpha):
            print "## forest garrote %d of %d" % (a_i+1, len(alpha))
            start = time.time()
            if rf.num_trees() <= group_size:
                refined_rf = forest_garrote(rf, train_x, train_y, group_size=None, alpha=a)
            else:
                refined_rf = forest_garrote(rf, train_x, train_y, group_size=group_size, alpha=a)
            end = time.time()
            pred, split_counts = refined_rf.predict(test_x, return_split_counts=True)
            split_counts /= float(len(pred))
            count = sum(1 for a, b in zip(test_y, pred) if a == b)
            performance = count / float(len(pred))
            fg[a]["split_counts"].append(split_counts)
            fg[a]["performance"].append(performance)
            fg[a]["num_nodes"].append(refined_rf.num_nodes())
            fg[a]["train_time"].append(end - start)

    # Create the output string.
    out_str += "# performance\n" + str(numpy.mean(rf_performance)) + " " + str(numpy.std(rf_performance)) + "\n"
    out_str += "# train_time\n" + str(numpy.mean(rf_train_time)) + " " + str(numpy.std(rf_train_time)) + "\n"
    out_str += "# split_counts\n" + str(numpy.mean(rf_split_counts)) + " " + str(numpy.std(rf_split_counts)) + "\n"
    out_str += "# num_nodes\n" + str(numpy.mean(rf_num_nodes)) + " " + str(numpy.std(rf_num_nodes)) + "\n\n"
    for a in alpha:
        out_str += "fg " + str(a) + "\n\n"
        out_str += "# performance\n" + str(numpy.mean(fg[a]["performance"])) + " " + str(numpy.std(fg[a]["performance"])) + "\n"
        out_str += "# train_time\n" + str(numpy.mean(fg[a]["train_time"])) + " " + str(numpy.std(fg[a]["train_time"])) + "\n"
        out_str += "# split_counts\n" + str(numpy.mean(fg[a]["split_counts"])) + " " + str(numpy.std(fg[a]["split_counts"])) + "\n"
        out_str += "# num_nodes\n" + str(numpy.mean(fg[a]["num_nodes"])) + " " + str(numpy.std(fg[a]["num_nodes"])) + "\n\n"

    print out_str[:-1]
    with open("LOGFILE.txt", "a") as f:
        f.write(out_str)
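# Hypothetical driver sketch (not part of the original script): the code that
# builds the parameter sets and calls parameter_test_worker is not shown in
# this section. The grid keys ("n_estimators", "max_depth"), the alpha values
# and the group size below are illustrative assumptions, not values taken from
# the original experiments.
def run_parameter_test(data_x, data_y):
    import itertools

    # Expand a small parameter grid into a list of keyword-argument dicts.
    grid = {"n_estimators": [8, 16], "max_depth": [None, 10]}  # assumed keys
    keys = sorted(grid.keys())
    param_sets = [dict(zip(keys, values))
                  for values in itertools.product(*[grid[k] for k in keys])]

    alpha = [0.0001, 0.001, 0.01]  # assumed forest garrote alphas
    for i, p in enumerate(param_sets):
        parameter_test_worker(i, p, len(param_sets), alpha, group_size=4,
                              data_x=data_x, data_y=data_y)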
def train_rf(n_trees, n_jobs, predict=True, save=False, load=False, filename=None,
             refine=False, group_size=None):
    """
    Train a random forest and compute the accuracy on a test set.

    :param n_trees: number of trees
    :param n_jobs: number of jobs
    :param predict: use the random forest to predict on a test set
    :param save: save the random forest to a file
    :param load: load the random forest from a file
    :param filename: file name
    :param refine: refine the random forest using the forest garrote
    :param group_size: tree group size for the forest garrote
    """
    # train_x, train_y, test_x, test_y = load_data([3, 8])
    train_x, train_y, test_x, test_y = load_neuro_data()

    if load:
        assert os.path.isfile(filename)
        print "Loading random forest from file %s." % filename
        with open(filename, "r") as f:
            rf_str = f.read()
        rf = randomforest.RandomForestClassifier.from_string(rf_str)
        if n_jobs is not None:
            rf._n_jobs = n_jobs
    else:
        print "Training random forest with %d trees." % n_trees
        rf = randomforest.RandomForestClassifier(
            n_estimators=n_trees,
            n_rand_dims="auto",
            n_jobs=n_jobs,
            # bootstrap_sampling=True, use_sample_label_count=True, resample_count=None,
            # bootstrap_sampling=False, use_sample_label_count=False, resample_count=None,
            bootstrap_sampling=True, use_sample_label_count=False, resample_count=None,
            # bootstrap_sampling=False, use_sample_label_count=True, resample_count=None,  # does not make sense
            # resample_count=20,
            # loggamma_tau=1e-6,
            split_selection="gini"
        )
        with Timer("Training took %.03f seconds"):
            rf.fit(train_x, train_y)
    print "The random forest has %d nodes." % rf.num_nodes()

    if save and not load:
        print "Saving random forest to file %s." % filename
        with open(filename, "w") as f:
            f.write(rf.to_string())

    if predict:
        print "Predicting on a test set with the random forest."
        with Timer("Random forest prediction took %.03f seconds."):
            pred, split_counts = rf.predict(test_x, return_split_counts=True)
        split_counts /= float(len(pred))
        count = sum(1 for a, b in zip(test_y, pred) if a == b)
        print "%d of %d correct (%.03f%%), used %.02f splits per instance" % \
            (count, len(pred), (100.0*count)/len(pred), split_counts)

    if refine:
        print "Refining the random forest using forest garrote."
        with Timer("Refining took %.03f seconds."):
            refined_rf = forest_garrote(rf, train_x, train_y, group_size=group_size)
            # refined_rf = global_refinement(rf, train_x, train_y)
        print "The refined forest has %d nodes." % refined_rf.num_nodes()

        if save:
            f0, f1 = os.path.split(filename)
            refined_filename = os.path.join(f0, "refined_" + f1)
            print "Saving refined random forest to file %s." % refined_filename
            with open(refined_filename, "w") as f:
                f.write(refined_rf.to_string())

        if predict:
            print "Predicting on a test set with the forest garrote."
            with Timer("Forest garrote prediction took %.03f seconds."):
                pred, split_counts = refined_rf.predict(test_x, return_split_counts=True)
            split_counts /= float(len(pred))
            count = sum(1 for a, b in zip(test_y, pred) if a == b)
            print "%d of %d correct (%.03f%%), used %.02f splits per instance" % \
                (count, len(pred), (100.0*count)/len(pred), split_counts)
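# Hypothetical usage sketch (not part of the original script): train a forest,
# evaluate it, save it, and refine it with the forest garrote. The tree count,
# job count, file name and group size below are illustrative assumptions.
def example_train_rf_run():
    # train_rf loads its own train/test split via load_neuro_data(), so only
    # the forest and forest garrote settings have to be passed in.
    train_rf(n_trees=100, n_jobs=4, predict=True, save=True, load=False,
             filename="rf_neuro.txt", refine=True, group_size=10)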