def evaluate_on_dataset(d, parameters, metric, result_dir, pruning=True, n_margin_values=10,
                        n_min_samples_split_values=10, n_cpu=-1):
    """
    Cross-validate a MaxMarginIntervalTree on every fold of a dataset and save the results.

    For each distinct value in ``d.folds``, the examples of that fold are held out, a grid
    search is run on the remaining examples and the selected model predicts the held-out
    examples. Nothing is computed when ``<result_dir>/<d.name>/predictions.csv`` already exists.

    Files written to ``<result_dir>/<d.name>/``:
      * ``model_fold_<k>.tex``: LaTeX export of the best estimator of fold ``k`` (1-based)
      * ``predictions.csv``: a ``pred.log.penalty`` header, then one prediction per example
      * ``parameters.json``: best parameters and full grid search results of each fold
      * ``dataset.uid``: ``str(hash(d))``

    Parameters
    ----------
    d: dataset object
        Must provide ``name``, ``n_examples``, ``folds``, ``X``, ``y`` and ``feature_names``.
    parameters: dict
        Base parameter grid. It is copied for each fold; the ``margin`` and
        ``min_samples_split`` entries are always overwritten.
    metric:
        Forwarded to GridSearchCV as ``scoring``.
    result_dir: str
        Existing directory in which the ``<d.name>`` sub-directory is created.
    pruning: bool
        Forwarded to GridSearchCV. When False, ``min_samples_split`` is searched on a
        log-spaced grid between 2 and the number of training examples; otherwise it is [2].
    n_margin_values: int
        Number of log-spaced margin values (0 is always added to the grid).
    n_min_samples_split_values: int
        Number of log-spaced ``min_samples_split`` values (only used when ``pruning`` is False).
    n_cpu: int
        Forwarded to GridSearchCV as ``n_jobs``.

    Raises
    ------
    ValueError
        If the training targets of a fold contain fewer than two distinct finite values, since
        no margin grid can be derived from them.

    """
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")
    predictions_file = join(ds_result_dir, "predictions.csv")

    # The presence of the predictions file marks the dataset as already processed.
    # NOTE: a stricter check comparing the content of dataset.uid with hash(d) used to be
    #       here but is disabled; the uid file is still written below.
    if exists(predictions_file):
        return

    fold_predictions = np.zeros(d.n_examples)
    # NOTE(review): the training errors are collected but not saved anywhere in this function.
    fold_train_mse = []
    fold_cv_results = []
    for i, fold in enumerate(np.unique(d.folds)):
        fold_start = time()

        fold_train = d.folds != fold
        X_train = d.X[fold_train]
        y_train = d.y[fold_train]
        X_test = d.X[~fold_train]

        # Determine the margin grid: from the smallest positive gap between two consecutive
        # finite target values up to the full range of the finite target values.
        # (presumably y holds interval limits, with infinite values for open intervals)
        sorted_limits = y_train.flatten()
        sorted_limits = sorted_limits[~np.isinf(sorted_limits)]
        sorted_limits.sort()
        gaps = np.diff(sorted_limits)
        positive_gaps = gaps[gaps > 0]
        if positive_gaps.size == 0:
            # Fail with an explicit message instead of numpy's empty-reduction error
            raise ValueError("Cannot build the margin grid for fold {0!s}: the training targets "
                             "contain fewer than two distinct finite values.".format(fold))
        range_min = positive_gaps.min()
        range_max = sorted_limits.max() - sorted_limits.min()

        fold_parameters = dict(parameters)  # Make a copy
        fold_parameters["margin"] = [0.] + np.logspace(np.log10(range_min), np.log10(range_max),
                                                       n_margin_values).tolist()

        # Determine the min_samples_split grid
        if not pruning:
            range_min = 2
            range_max = X_train.shape[0]
            fold_parameters["min_samples_split"] = np.logspace(
                np.log10(range_min), np.log10(range_max),
                n_min_samples_split_values).astype(np.uint).tolist()
        else:
            fold_parameters["min_samples_split"] = [2]

        cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
        cv = GridSearchCV(estimator=MaxMarginIntervalTree(), param_grid=fold_parameters,
                          cv=cv_protocol, n_jobs=n_cpu, scoring=metric, pruning=pruning)
        cv.fit(X_train, y_train, d.feature_names)

        fold_predictions[~fold_train] = cv.predict(X_test)
        fold_cv_results.append({"best": cv.best_params_, "all": cv.cv_results_})
        fold_train_mse.append(mean_squared_error(y_train, cv.predict(X_train)))
        # Fixed-point format: a bare ".2" would print long durations as e.g. "1.2e+02"
        print("........fold {0:d} took {1:.2f} seconds".format(i + 1, time() - fold_start))

        # Save the tree
        latex_exporter = TreeExporter("latex")
        with open(join(ds_result_dir, "model_fold_{0:d}.tex".format(i + 1)), "w") as f:
            f.write(latex_exporter(cv.best_estimator_))

    # Save the predictions
    with open(predictions_file, "w") as f:
        f.write("pred.log.penalty\n" + "\n".join(str(x) for x in fold_predictions))

    # Save the cross-validation results for each fold
    with open(join(ds_result_dir, "parameters.json"), "w") as f:
        json.dump(fold_cv_results, f)

    # The model_fold_*.tex files are not compiled here; a PDF can be produced for each tree by
    # running lualatex on them from the dataset result directory.

    # Save a hash of the data to avoid re-running
    with open(ds_uid_file, "w") as f:
        f.write(str(hash(d)))
# Quick hold-out check: fit a tree with fixed hyper-parameters on the first half of the data
# and report the mean squared error on the second half.
# NOTE(review): `x`, `y` (and `i` in the disabled export below) are expected to be defined
# earlier in the script -- confirm against the surrounding code.

# Floor division keeps the slice bounds integral, with or without true division
trainx = x[:len(x) // 2, ]
trainy = y[:len(y) // 2, ]
testx = x[len(x) // 2:, ]
testy = y[len(y) // 2:, ]

start_time = time.time()
estimator = MaxMarginIntervalTree(margin=1.0, max_depth=4, loss="linear_hinge", min_samples_split=0)
clf = estimator.fit(trainx, trainy)
fit = estimator.predict(testx)
#print time.time() - start_time
#print len(x)
# Single-argument print call: same output under the print statement and the print function
print("|  {0}".format(mean_squared_error(testy, fit)))

#file = open(str(i)+".tex", 'w')
#file.write( _latex_export(estimator))
#file.close()

# Disabled experiment (pruning path and grid search). It used to sit in a triple-quoted string
# that was never closed, which swallowed everything after it; kept here as comments.
# alphas, pruned_trees = min_cost_complexity_pruning(estimator)
# print alphas
# for pt in pruned_trees:
#     print sorted(pt.tree_.rules)
# param_grid = {"margin": [0.0, 2.0], "loss":["linear_hinge"], "max_depth":[np.infty], "min_samples_split":[0]}
# search = GridSearchCV(estimator, param_grid)
# search.fit(x,y)
def mse_metric(estimator, X, y):
    """
    Score an estimator by the negated mean squared error of its predictions on X.

    The error is negated because GridSearchCV maximizes its scoring function.

    """
    predictions = estimator.predict(X)
    error = mean_squared_error(y_true=y, y_pred=predictions)
    return -error
def evaluate_on_dataset(d, parameters, metric, result_dir, n_margin_values=10,
                        n_min_samples_split_values=10, n_cpu=-1):
    """
    Cross-validate an IntervalDecisionTreeRegressor on every fold of a dataset and save the results.

    For each distinct value in ``d.folds``, the examples of that fold are held out, a grid
    search is run on the remaining examples and the selected model predicts the held-out
    examples. Nothing is computed when ``<result_dir>/<d.name>/predictions.csv`` already exists.

    Files written to ``<result_dir>/<d.name>/``:
      * ``predictions.csv``: one prediction per example, without a header
      * ``parameters.pkl``: pickled best parameters and full grid search results of each fold
      * ``dataset.uid``: ``str(hash(d))``

    Parameters
    ----------
    d: dataset object
        Must provide ``name``, ``n_examples``, ``folds``, ``X`` and ``y``.
    parameters: dict
        Base parameter grid. It is copied for each fold; the ``margin`` and
        ``min_samples_split`` entries are always overwritten.
    metric:
        Forwarded to GridSearchCV as ``scoring``.
    result_dir: str
        Existing directory in which the ``<d.name>`` sub-directory is created.
    n_margin_values: int
        Number of log-spaced margin values.
    n_min_samples_split_values: int
        Number of log-spaced ``min_samples_split`` values, between 2 and the number of
        training examples.
    n_cpu: int
        Forwarded to GridSearchCV as ``n_jobs``.

    Raises
    ------
    ValueError
        If the training targets of a fold contain fewer than two distinct finite values, since
        no margin grid can be derived from them.

    """
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")
    predictions_file = join(ds_result_dir, "predictions.csv")

    # The presence of the predictions file marks the dataset as already processed
    if exists(predictions_file):
        return

    fold_predictions = np.zeros(d.n_examples)
    # NOTE(review): the training errors are collected but not saved anywhere in this function.
    fold_train_mse = []
    fold_cv_results = []
    for i, fold in enumerate(np.unique(d.folds)):
        fold_start = time()

        fold_train = d.folds != fold
        X_train = d.X[fold_train]
        y_train = d.y[fold_train]
        X_test = d.X[~fold_train]

        # Determine the margin grid: from the smallest positive gap between two consecutive
        # finite target values up to the full range of the finite target values.
        # (presumably y holds interval limits, with infinite values for open intervals)
        sorted_limits = y_train.flatten()
        sorted_limits = sorted_limits[~np.isinf(sorted_limits)]
        sorted_limits.sort()
        gaps = np.diff(sorted_limits)
        positive_gaps = gaps[gaps > 0]
        if positive_gaps.size == 0:
            # Fail with an explicit message instead of numpy's empty-reduction error
            raise ValueError("Cannot build the margin grid for fold {0!s}: the training targets "
                             "contain fewer than two distinct finite values.".format(fold))
        range_min = positive_gaps.min()
        range_max = sorted_limits.max() - sorted_limits.min()

        fold_parameters = dict(parameters)  # Make a copy
        fold_parameters["margin"] = np.logspace(np.log10(range_min), np.log10(range_max),
                                                n_margin_values)

        # Determine the min_samples_split grid
        range_min = 2
        range_max = X_train.shape[0]
        fold_parameters["min_samples_split"] = np.logspace(
            np.log10(range_min), np.log10(range_max),
            n_min_samples_split_values).astype(np.uint).tolist()

        # Fit a regression tree on the transformed data
        cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
        cv = GridSearchCV(estimator=IntervalDecisionTreeRegressor(), param_grid=fold_parameters,
                          cv=cv_protocol, n_jobs=n_cpu, scoring=metric)
        cv.fit(X_train, y_train)

        # Evaluate the model
        fold_predictions[~fold_train] = cv.predict(X_test)
        fold_cv_results.append({
            "best": cv.best_params_,
            "all": cv.cv_results_
        })
        fold_train_mse.append(
            mean_squared_error(y_train, cv.predict(X_train)))
        # Fixed-point format: a bare ".2" would print long durations as e.g. "1.2e+02"
        print("........fold {0:d} took {1:.2f} seconds".format(
            i + 1, time() - fold_start))

    # Save the predictions
    with open(predictions_file, "w") as f:
        f.write("\n".join(str(x) for x in fold_predictions))

    # Save the cross-validation results for each fold
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    # Pickles are byte streams: write them through a binary-mode handle
    with open(join(ds_result_dir, "parameters.pkl"), "wb") as f:
        pickle.dump(fold_cv_results, f)

    # Save a hash of the data to avoid re-running
    with open(ds_uid_file, "w") as f:
        f.write(str(hash(d)))