Example #1
from os import mkdir
from os.path import exists, join
from time import time

import json
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# NOTE: the three imports below are assumptions. MaxMarginIntervalTree, the
# pruning-aware GridSearchCV variant, and TreeExporter are expected to come
# from the mmit package, whose exact module layout may differ.
from mmit import MaxMarginIntervalTree
from mmit.model_selection import GridSearchCV
from mmit.pretty_print import TreeExporter


def evaluate_on_dataset(d, parameters, metric, result_dir, pruning=True,
                        n_margin_values=10, n_min_samples_split_values=10,
                        n_cpu=-1):
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")
    # if exists(ds_uid_file) and open(ds_uid_file).readline().strip() == str(hash(d)):
    if not exists(join(ds_result_dir, "predictions.csv")):
        start_time = time()
        fold_predictions = np.zeros(d.n_examples)
        fold_train_mse = []
        fold_cv_results = []
        for i, fold in enumerate(np.unique(d.folds)):
            fold_start = time()

            fold_train = d.folds != fold
            X_train = d.X[fold_train]
            y_train = d.y[fold_train]
            X_test = d.X[~fold_train]
            y_test = d.y[~fold_train]

            # Determine the margin grid
            sorted_limits = y_train.flatten()
            sorted_limits = sorted_limits[~np.isinf(sorted_limits)]
            sorted_limits.sort()
            range_max = sorted_limits.max() - sorted_limits.min()
            range_min = np.diff(sorted_limits)
            range_min = range_min[range_min > 0].min()
            parameters = dict(parameters)  # Make a copy
            parameters["margin"] = [0.] + np.logspace(np.log10(range_min), np.log10(range_max), n_margin_values).tolist()

            # Determine the min_samples_split grid
            if not pruning:
                range_min = 2
                range_max = X_train.shape[0]
                parameters["min_samples_split"] = np.logspace(np.log10(range_min), np.log10(range_max), n_min_samples_split_values).astype(np.uint).tolist()
            else:
                parameters["min_samples_split"] = [2]

            cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
            cv = GridSearchCV(estimator=MaxMarginIntervalTree(), param_grid=parameters, cv=cv_protocol, n_jobs=n_cpu,
                              scoring=metric, pruning=pruning)
            cv.fit(X_train, y_train, d.feature_names)
            fold_predictions[~fold_train] = cv.predict(X_test)
            fold_cv_results.append({"best": cv.best_params_, "all": cv.cv_results_})
            fold_train_mse.append(mean_squared_error(y_train, cv.predict(X_train)))
            print("........fold {0:d} took {1:.2} seconds".format(i + 1, time() - fold_start))

            # Save the tree as a LaTeX figure
            latex_exporter = TreeExporter("latex")
            with open(join(ds_result_dir, "model_fold_{0:d}.tex".format(i + 1)), "w") as f:
                f.write(latex_exporter(cv.best_estimator_))

        # Save the predictions
        with open(join(ds_result_dir, "predictions.csv"), "w") as f:
            f.write("pred.log.penalty\n" + "\n".join(str(x) for x in fold_predictions))

        # Save the cross-validation results for each fold
        with open(join(ds_result_dir, "parameters.json"), "w") as f:
            json.dump(fold_cv_results, f)

        # Generate the PDF file for each tree
        # build_cmd = "cd {0!s}; for i in ./model_fold_*.tex; do lualatex $i > /dev/null; rm ./*.aux ./*.log;done".format(ds_result_dir)
        # !$build_cmd

        # Save a hash of the data to avoid re-running
        with open(ds_uid_file, "w") as f:
            f.write(str(hash(d)))
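
A hedged driver sketch for evaluate_on_dataset: the Dataset container, the toy interval data, and the result directory are illustrative assumptions, not part of mmit, and the metric is a scorer such as the mse_metric of Example #3.

import os

import numpy as np


class Dataset(object):
    """Hypothetical container mirroring the attributes evaluate_on_dataset reads."""
    def __init__(self, name, X, y, folds, feature_names):
        self.name = name
        self.X = X
        self.y = y
        self.folds = folds
        self.n_examples = X.shape[0]
        self.feature_names = feature_names


rng = np.random.RandomState(42)
X = rng.rand(100, 5)
lower = rng.rand(100)
y = np.column_stack((lower, lower + rng.rand(100) + 0.5))  # (lower, upper) limits
d = Dataset("toy", X, y, folds=np.repeat(np.arange(5), 20),
            feature_names=["f%d" % j for j in range(5)])

os.makedirs("results", exist_ok=True)
evaluate_on_dataset(d, parameters={"max_depth": [4], "loss": ["linear_hinge"]},
                    metric=mse_metric, result_dir="results", n_cpu=1)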
Example #2
# NOTE: this snippet is an indented fragment; the imports and the wrapper
# function below are assumptions added to make it self-contained.
import time

from sklearn.metrics import mean_squared_error

from mmit import MaxMarginIntervalTree  # assumed import path


def train_and_test(x, y):
    # Use the first half of the data for training and the second half for testing
    trainx = x[:len(x) // 2]
    trainy = y[:len(y) // 2]
    testx = x[len(x) // 2:]
    testy = y[len(y) // 2:]

    start_time = time.time()
    estimator = MaxMarginIntervalTree(margin=1.0,
                                      max_depth=4,
                                      loss="linear_hinge",
                                      min_samples_split=0)
    clf = estimator.fit(trainx, trainy)
    pred = estimator.predict(testx)

    # print(time.time() - start_time)
    # print(len(x))
    print("|  ", mean_squared_error(testy, pred))
    # with open(str(i) + ".tex", "w") as f:
    #     f.write(_latex_export(estimator))
    """
	alphas, pruned_trees = min_cost_complexity_pruning(estimator)
	print alphas

	for pt in pruned_trees:
	    print sorted(pt.tree_.rules)
	 

	param_grid =  {"margin": [0.0, 2.0], "loss":["linear_hinge"], "max_depth":[np.infty], "min_samples_split":[0]}
	search = GridSearchCV(estimator, param_grid)
	search.fit(x,y)
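
A minimal sketch of inputs this snippet could consume, assuming mmit's convention of an n x 2 target array of (lower, upper) interval limits with infinite values for open-ended intervals:

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(200, 3)
lower = rng.rand(200)
y = np.column_stack((lower, lower + rng.rand(200) + 0.5))
y[:20, 0] = -np.inf   # left-censored examples
y[-20:, 1] = np.inf   # right-censored examples

train_and_test(x, y)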
Example #3
from sklearn.metrics import mean_squared_error


def mse_metric(estimator, X, y):
    """
    Negative mean squared error, since GridSearchCV maximizes a metric
    """
    return -mean_squared_error(y_pred=estimator.predict(X), y_true=y)
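
A usage sketch: mse_metric plugs straight into scikit-learn's GridSearchCV as the scoring callable; DecisionTreeRegressor is only a stand-in estimator.

from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
search = GridSearchCV(DecisionTreeRegressor(random_state=0),
                      param_grid={"max_depth": [2, 4, 8]},
                      scoring=mse_metric, cv=3)
search.fit(X, y)
print(search.best_params_, search.best_score_)  # best_score_ is a negated MSE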
Example #4
from os import mkdir
from os.path import exists, join
from time import time

import pickle
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold

# NOTE: assumed import; the module providing IntervalDecisionTreeRegressor is
# not shown in this example.
from interval_tree import IntervalDecisionTreeRegressor


def evaluate_on_dataset(d,
                        parameters,
                        metric,
                        result_dir,
                        n_margin_values=10,
                        n_min_samples_split_values=10,
                        n_cpu=-1):
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")
    if not exists(join(ds_result_dir, "predictions.csv")):
        start_time = time()
        fold_predictions = np.zeros(d.n_examples)
        fold_train_mse = []
        fold_cv_results = []
        for i, fold in enumerate(np.unique(d.folds)):
            fold_start = time()

            fold_train = d.folds != fold
            X_train = d.X[fold_train]
            y_train = d.y[fold_train]
            X_test = d.X[~fold_train]
            y_test = d.y[~fold_train]

            # Determine the margin grid
            sorted_limits = y_train.flatten()
            sorted_limits = sorted_limits[~np.isinf(sorted_limits)]
            sorted_limits.sort()
            range_max = sorted_limits.max() - sorted_limits.min()
            range_min = np.diff(sorted_limits)
            range_min = range_min[range_min > 0].min()
            parameters = dict(parameters)  # Make a copy
            parameters["margin"] = np.logspace(np.log10(range_min),
                                               np.log10(range_max),
                                               n_margin_values)

            # Determine the min_samples_split grid
            range_min = 2
            range_max = X_train.shape[0]
            parameters["min_samples_split"] = np.logspace(
                np.log10(range_min), np.log10(range_max),
                n_min_samples_split_values).astype(np.uint).tolist()

            # Fit a regression tree on the transformed data
            cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
            cv = GridSearchCV(estimator=IntervalDecisionTreeRegressor(),
                              param_grid=parameters,
                              cv=cv_protocol,
                              n_jobs=n_cpu,
                              scoring=metric)
            cv.fit(X_train, y_train)

            # Evaluate the model
            fold_predictions[~fold_train] = cv.predict(X_test)
            fold_cv_results.append({
                "best": cv.best_params_,
                "all": cv.cv_results_
            })
            fold_train_mse.append(
                mean_squared_error(y_train, cv.predict(X_train)))
            print("........fold {0:d} took {1:.2} seconds".format(
                i + 1,
                time() - fold_start))

        # Save the predictions
        with open(join(ds_result_dir, "predictions.csv"), "w") as f:
            f.write("\n".join(str(x) for x in fold_predictions))

        # Save the cross-validation results for each fold
        with open(join(ds_result_dir, "parameters.pkl"), "wb") as f:
            pickle.dump(fold_cv_results, f)

        # Save a hash of the data to avoid re-running
        with open(ds_uid_file, "w") as f:
            f.write(str(hash(d)))
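
A short sketch of reading the saved artifacts back; "results/my_dataset" is a hypothetical path following the directory layout produced above.

import pickle

import numpy as np

with open("results/my_dataset/parameters.pkl", "rb") as f:
    fold_cv_results = pickle.load(f)
predictions = np.loadtxt("results/my_dataset/predictions.csv")
print(len(fold_cv_results), predictions.shape)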