def main():
    """Train a GBRT ensemble and write a summary report to ``outcome.txt``.

    The datasets come from ``create_data(path_to_file, DATA)`` and the
    hyperparameters from ``parse_configuration_file()``.  The report contains
    the hyperparameters, the train/test RMSE of the trained ensemble and the
    elapsed training time in seconds.
    """
    # Get train, test dataset and hyperparameters from file.
    train, test = create_data(path_to_file, DATA)
    hyperparameters = parse_configuration_file()

    # Clearing out previous file: opening in "w" mode truncates error.txt.
    with open("error.txt", "w"):
        pass

    # Perform training.  perf_counter() is monotonic, so the measured
    # duration cannot be skewed by system clock adjustments mid-run.
    start = time.perf_counter()
    ensemble = gbrt(train, hyperparameters, test)
    training_time = time.perf_counter() - start

    # Get training statistics.
    train_rmse = ensemble.compute_dataset_rmse(train.get_dataframe_copy())
    test_rmse = ensemble.compute_dataset_rmse(test.get_dataframe_copy())

    # Write hyperparameters, statistics and training time to file.
    rule = "=" * 20
    with open("outcome.txt", "w") as outcome_file:
        outcome_file.writelines([
            "HyperParameters:\n" + rule + "\n",
            str(hyperparameters),
            "\n\nErrors:\n" + rule,
            "\nTrain error = {}".format(train_rmse),
            "\nTest error = {}".format(test_rmse),
            "\n\nRunningTime:\n" + rule + "\n",
            "{}".format(training_time),
        ])
def iterate_sample_parameter(train_set, test_set):
    """Run ``gbrt`` once per value in ``SAMPLING_OPTIONS`` and pickle the results.

    Starts from ``DEFAULT_HYPERPARAMETERS`` and overrides only
    ``sampling_portion`` for each run.

    Args:
        train_set: Training dataset, passed through to ``gbrt``.
        test_set: Test dataset, passed through to ``gbrt``.

    The pickled dict (``sampling_deliverable.pickle``) maps each sampling
    portion to a copy of the ``errors`` list that ``gbrt`` was given for that
    run, plus a ``"time"`` key holding ``(sampling_portion, seconds)`` tuples.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {"time": []}

    for sampling_portion in SAMPLING_OPTIONS:
        # Fresh list per run; gbrt receives it as its fourth argument
        # (presumably filling it in place -- confirm against gbrt).
        errors = []
        hyperparameters.sampling_portion = sampling_portion

        # perf_counter() is monotonic, unlike time.time(), so the elapsed
        # value is not distorted by wall-clock adjustments.
        start = time.perf_counter()
        gbrt(train_set, hyperparameters, test_set, errors)
        training_time = time.perf_counter() - start

        results[sampling_portion] = errors[:]
        results["time"].append((sampling_portion, training_time))

    with open('sampling_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
def iterate_threshold_parameter(train_set, test_set):
    """Run ``gbrt`` once per value in ``THRESHOLD_OPTIONS`` and pickle the results.

    Starts from ``DEFAULT_HYPERPARAMETERS`` and overrides only
    ``num_threshold`` for each run.

    Args:
        train_set: Training dataset, passed through to ``gbrt``.
        test_set: Test dataset, passed through to ``gbrt``.

    The pickled dict (``threshold_deliverable.pickle``) maps each threshold to
    a copy of the ``errors`` list that ``gbrt`` was given for that run, plus a
    ``"time"`` key holding ``(threshold, seconds)`` tuples.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {"time": []}

    for threshold in THRESHOLD_OPTIONS:
        # Fresh list per run; gbrt receives it as its fourth argument
        # (presumably filling it in place -- confirm against gbrt).
        errors = []
        hyperparameters.num_threshold = threshold

        # perf_counter() is monotonic, unlike time.time(), so the elapsed
        # value is not distorted by wall-clock adjustments.
        start = time.perf_counter()
        gbrt(train_set, hyperparameters, test_set, errors)
        training_time = time.perf_counter() - start

        results[threshold] = errors[:]
        results["time"].append((threshold, training_time))

    with open('threshold_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
def test_compute_mse(self):
    """Check ``compute_dataset_mse`` on a 12-row, single-feature fixture."""
    # Target is four plateaus of three rows each: 1, 3, 16, 20.
    targets = [level for level in (1, 3, 16, 20) for _ in range(3)]
    df = pd.DataFrame({"x1": np.arange(12), "y": targets})

    dataset = TrainingDataset(df, "y")
    params = GBRTHyperparameters(1, 3, 4, 1, 1, 0)
    ensemble = gbrt(dataset, params)

    mse = ensemble.compute_dataset_mse(df, 1)
    self.assertEqual(mse, 2.5)
def iterate_depth_parameter(train_set, test_set):
    """Run ``gbrt`` once per value in ``DEPTH_OPTIONS`` and pickle the results.

    Starts from ``DEFAULT_HYPERPARAMETERS`` and overrides only ``max_depth``
    for each run.

    Args:
        train_set: Training dataset, passed through to ``gbrt``.
        test_set: Test dataset, passed through to ``gbrt``.

    The pickled dict (``depth_deliverable.pickle``) maps each depth to a copy
    of the ``errors`` list that ``gbrt`` was given for that run.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {}

    for depth in DEPTH_OPTIONS:
        # Fresh list per run; gbrt receives it as its fourth argument
        # (presumably filling it in place -- confirm against gbrt).
        errors = []
        hyperparameters.max_depth = depth
        # Only the collected errors are kept; the returned ensemble is not needed.
        gbrt(train_set, hyperparameters, test_set, errors)
        results[depth] = errors[:]

    with open('depth_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
def nottest_gbrt_residual(self):
    """Residual check on a 12-row fixture with four target plateaus.

    NOTE(review): the name does not start with ``test``, so this is presumably
    disabled on purpose -- confirm before re-enabling.
    NOTE(review): the label passed to TrainingDataset is "SalePrice", but the
    frame only has columns "x1" and "y" -- looks stale, verify.
    NOTE(review): gbrt is called with three bare ints here rather than a
    hyperparameters object -- possibly an older signature, verify.
    """
    # Target is four plateaus of three rows each: 1, 3, 10, 20.
    targets = [level for level in (1, 3, 10, 20) for _ in range(3)]
    df = pd.DataFrame({"x1": np.arange(12), "y": targets})

    dataset = TrainingDataset(df, "SalePrice")
    ensemble = gbrt(dataset, 1, 3, 4)

    first_row, tenth_row = df.iloc[0], df.iloc[9]
    self.assertEqual(ensemble.evaluate(first_row, 1), 2)
    self.assertEqual(ensemble.evaluate(tenth_row, 1), 15)
def test_gbrt_residual(self):
    """Residual check on a six-row fixture using the default hyperparameters.

    NOTE(review): the frame has only six rows, so ``df.iloc[9]`` below is out
    of bounds; the index and the expected values 2 / 15 look carried over from
    a 12-row fixture -- confirm the intended rows and expectations.
    NOTE(review): the label passed to TrainingDataset is "SalePrice", but the
    frame only has columns "x1" and "y" -- verify.
    """
    df = pd.DataFrame({
        "x1": np.arange(6),
        "y": [0, 2, 100, 102, 106, 109.5],
    })

    params = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    dataset = TrainingDataset(df, "SalePrice")
    ensemble = gbrt(dataset, params)

    prediction = ensemble.evaluate(df.iloc[0], 1)
    self.assertEqual(prediction, 2)
    prediction = ensemble.evaluate(df.iloc[9], 1)
    self.assertEqual(prediction, 15)
def test_real_data(self):
    """Smoke test: train on the data under ``../../data/`` and compute feature importance.

    Makes no assertions; it only fails if one of the calls raises.
    """
    train_set, test_set = create_data("../../data/")
    model = gbrt(
        train_set,
        GBRTHyperparameters(0, 2, 3, 1, 1, 0),
        test_set,
    )
    get_features_importance(model)