Exemple #1
0
def main():
    """Train a GBRT ensemble and record its errors and training time.

    Loads the train/test datasets and the hyperparameters, truncates
    ``error.txt``, trains the ensemble with ``gbrt``, and writes the
    hyperparameters, the train/test RMSE and the elapsed training time
    (in seconds) to ``outcome.txt``.
    """
    # Get train,test dataset and hyperparameters from file.
    train, test = create_data(path_to_file, DATA)
    hyperparameters = parse_configuration_file()

    # Opening in "w" mode and writing nothing truncates the file left over
    # from a previous run.
    with open("error.txt", "w"):
        pass

    # Perform training. perf_counter() is a monotonic, high-resolution clock,
    # so the measured interval cannot be skewed by system clock adjustments
    # the way a time.time() difference can.
    start = time.perf_counter()
    ensemble = gbrt(train, hyperparameters, test)
    training_time = time.perf_counter() - start

    # Get training statistics.
    train_rmse = ensemble.compute_dataset_rmse(train.get_dataframe_copy())
    test_rmse = ensemble.compute_dataset_rmse(test.get_dataframe_copy())

    # Write hyperparameters, statistics and training time to file.
    separator = "=" * 20
    with open("outcome.txt", "w") as outcome_file:
        outcome_file.write("HyperParameters:\n" + separator + "\n")
        outcome_file.write("{}".format(str(hyperparameters)))

        outcome_file.write("\n\nErrors:\n" + separator)
        outcome_file.write("\nTrain error = {}".format(train_rmse))
        outcome_file.write("\nTest error = {}".format(test_rmse))

        outcome_file.write("\n\nRunningTime:\n" + separator + "\n")
        outcome_file.write("{}".format(training_time))
def iterate_sample_parameter(train_set, test_set):
    """Train once per sampling portion and pickle the collected results.

    Starting from ``DEFAULT_HYPERPARAMETERS``, ``sampling_portion`` is
    overridden with each value of ``SAMPLING_OPTIONS`` and ``gbrt`` is run.
    The ``errors`` list handed to ``gbrt`` (presumably filled in during
    training -- verify against ``gbrt``) is stored under the sampling
    portion, and ``(sampling_portion, elapsed_seconds)`` tuples are collected
    under the ``"time"`` key. The dict is written to
    ``sampling_deliverable.pickle``.

    Args:
        train_set: Training dataset forwarded to ``gbrt``.
        test_set: Test dataset forwarded to ``gbrt``.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {"time": []}

    for sampling_portion in SAMPLING_OPTIONS:
        errors = []
        hyperparameters.sampling_portion = sampling_portion

        # perf_counter() is monotonic, so the interval is not affected by
        # system clock adjustments as a time.time() difference would be.
        start = time.perf_counter()
        gbrt(train_set, hyperparameters, test_set, errors)
        training_time = time.perf_counter() - start

        results[sampling_portion] = errors[:]
        results["time"].append((sampling_portion, training_time))

    with open('sampling_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
def iterate_threshold_parameter(train_set, test_set):
    """Train once per threshold count and pickle the collected results.

    Starting from ``DEFAULT_HYPERPARAMETERS``, ``num_threshold`` is
    overridden with each value of ``THRESHOLD_OPTIONS`` and ``gbrt`` is run.
    The ``errors`` list handed to ``gbrt`` (presumably filled in during
    training -- verify against ``gbrt``) is stored under the threshold value,
    and ``(threshold, elapsed_seconds)`` tuples are collected under the
    ``"time"`` key. The dict is written to ``threshold_deliverable.pickle``.

    Args:
        train_set: Training dataset forwarded to ``gbrt``.
        test_set: Test dataset forwarded to ``gbrt``.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {"time": []}

    for threshold in THRESHOLD_OPTIONS:
        errors = []
        hyperparameters.num_threshold = threshold

        # perf_counter() is monotonic, so the interval is not affected by
        # system clock adjustments as a time.time() difference would be.
        start = time.perf_counter()
        gbrt(train_set, hyperparameters, test_set, errors)
        training_time = time.perf_counter() - start

        results[threshold] = errors[:]
        results["time"].append((threshold, training_time))

    with open('threshold_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
    def test_compute_mse(self):
        """Train on a 12-row frame and check the MSE the ensemble reports.

        The frame has ``x1`` = 0..11 and ``y`` in four runs of three equal
        values (1, 3, 16, 20); the expected ``compute_dataset_mse`` result
        for the trained ensemble is 2.5.
        """
        frame = pd.DataFrame({
            "x1": np.arange(12),
            "y": [1] * 3 + [3] * 3 + [16] * 3 + [20] * 3,
        })
        training_data = TrainingDataset(frame, "y")
        params = GBRTHyperparameters(1, 3, 4, 1, 1, 0)
        model = gbrt(training_data, params)

        observed_mse = model.compute_dataset_mse(frame, 1)
        self.assertEqual(observed_mse, 2.5)
def iterate_depth_parameter(train_set, test_set):
    """Train once per maximum depth and pickle the collected errors.

    Starting from ``DEFAULT_HYPERPARAMETERS``, ``max_depth`` is overridden
    with each value of ``DEPTH_OPTIONS`` and ``gbrt`` is run. The ``errors``
    list handed to ``gbrt`` (presumably filled in during training -- verify
    against ``gbrt``) is stored under the depth value, and the dict is
    written to ``depth_deliverable.pickle``.

    Args:
        train_set: Training dataset forwarded to ``gbrt``.
        test_set: Test dataset forwarded to ``gbrt``.
    """
    hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS)
    results = {}

    for depth in DEPTH_OPTIONS:
        errors = []
        hyperparameters.max_depth = depth
        # Only the errors list is kept; the value returned by gbrt is not
        # needed here, so it is not bound to a name.
        gbrt(train_set, hyperparameters, test_set, errors)
        results[depth] = errors[:]

    with open('depth_deliverable.pickle', 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
    def nottest_gbrt_residual(self):
        """Residual check on a 12-row frame; apparently disabled.

        NOTE(review): the ``nottest`` prefix presumably keeps the test runner
        from collecting this method (runners usually match a ``test``
        prefix) -- confirm that it is disabled on purpose.
        """
        # Frame with x1 = 0..11 and y in four runs of three equal values.
        x1 = np.arange(12)
        y = [1] * 3 + [3] * 3 + [10] * 3 + [20] * 3

        df = pd.DataFrame()
        df["x1"] = x1
        df["y"] = y
        # NOTE(review): the frame only has the columns "x1" and "y";
        # "SalePrice" is not one of them, while test_compute_mse passes "y"
        # in this position -- verify which name is intended.
        dataset = TrainingDataset(df, "SalePrice")

        # NOTE(review): the other gbrt calls in this file pass a
        # GBRTHyperparameters object as the second argument; this call passes
        # three bare integers, which looks like an older signature -- confirm.
        ensemble = gbrt(dataset, 1, 3, 4)

        # Evaluate the first row and the row at position 9 of the frame.
        self.assertEqual(ensemble.evaluate(df.iloc[0], 1), 2)
        self.assertEqual(ensemble.evaluate(df.iloc[9], 1), 15)
    def test_gbrt_residual(self):
        """Train with the default hyperparameters on a 6-row frame and check
        two evaluated rows.

        NOTE(review): as written, the second assertion indexes position 9 of
        a 6-row frame, so this test cannot pass -- see the notes below.
        """
        # Frame with x1 = 0..5 and six hand-picked y values.
        x1 = np.arange(6)
        y = [0, 2, 100, 102, 106, 109.5]

        df = pd.DataFrame()
        df["x1"] = x1
        df["y"] = y

        hyperparameters = GBRTHyperparameters(*DEFAULT_HYPERPARAMETERS, )
        # NOTE(review): the frame only has the columns "x1" and "y";
        # "SalePrice" is not one of them, while test_compute_mse passes "y"
        # in this position -- verify which name is intended.
        dataset = TrainingDataset(df, "SalePrice")

        ensemble = gbrt(dataset, hyperparameters)

        self.assertEqual(ensemble.evaluate(df.iloc[0], 1), 2)
        # NOTE(review): df has 6 rows (positions 0..5), so df.iloc[9] raises
        # IndexError before evaluate() is called. The expected values 2 and 15
        # match nottest_gbrt_residual, which uses different data -- presumably
        # copied from there; the correct index and expected values need to be
        # worked out for this dataset.
        self.assertEqual(ensemble.evaluate(df.iloc[9], 1), 15)
 def test_real_data(self):
     """Smoke test: train on the data under ``../../data/`` and request the
     feature importance of the resulting ensemble. Makes no assertions."""
     train_set, test_set = create_data("../../data/")
     params = GBRTHyperparameters(0, 2, 3, 1, 1, 0)
     get_features_importance(gbrt(train_set, params, test_set))