def mmit_fit(X, y):
    """Grid-search a ``MaxMarginIntervalTree`` on ``(X, y)`` and return the search.

    The candidate values for ``margin`` and ``min_samples_split`` are derived
    from the data through ``get_range_max_min`` followed by
    ``get_margin_range`` / ``get_min_sample_split_sample`` (10 values each).
    ``loss`` is fixed to ``"linear_hinge"`` and ``max_depth`` is searched over
    ``[1, 2, 4, 6]``.  Model selection uses a shuffled 5-fold split seeded
    with ``random_state=42`` and ``n_jobs=-1``.

    Parameters
    ----------
    X : features passed unchanged to the helpers and to ``GridSearchCV.fit``.
    y : targets passed unchanged to the helpers and to ``GridSearchCV.fit``.

    Returns
    -------
    The ``GridSearchCV`` object after ``fit(X, y)`` has been called on it.
    """
    # Data-driven bounds, then the concrete grid, for the margin.
    margin_lo, margin_hi = get_range_max_min(X, y, nature='margin')
    margin_grid = get_margin_range(margin_lo, margin_hi, n_margin_values=10)

    # Same two-step procedure for min_samples_split.
    split_lo, split_hi = get_range_max_min(X, y, nature='min_sample_split')
    split_grid = get_min_sample_split_sample(
        split_lo, split_hi, n_min_samples_split_values=10)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    grid = {
        "margin": margin_grid,
        "loss": ["linear_hinge"],
        "max_depth": [1, 2, 4, 6],
        "min_samples_split": split_grid,
    }
    base_model = MaxMarginIntervalTree()

    search = GridSearchCV(base_model, grid, cv=splitter, n_jobs=-1)
    search.fit(X, y)
    return search
def main(features, targets, folds):
    """Benchmark a tuned interval tree and a tuned interval forest per fold.

    For every distinct value in ``folds`` the rows carrying that value are
    held out as the test set.  On the remaining rows a
    ``MaxMarginIntervalTree`` and a ``RandomMaximumMarginIntervalForest`` are
    each tuned with a 5-fold ``GridSearchCV`` scored by ``interval_MSE``, then
    scored on the held-out rows.  The negated score is printed for both.

    Parameters
    ----------
    features : array indexable by a boolean mask (rows are examples).
    targets : array indexable by the same boolean mask.
    folds : array of fold labels, one per example; must support
        element-wise ``!=`` (e.g. a NumPy array).

    Returns
    -------
    list
        The raw ``interval_MSE`` scores in the order
        ``[tree_fold_a, forest_fold_a, tree_fold_b, forest_fold_b, ...]``
        with folds visited in sorted order.  Empty when ``folds`` is empty.
        (Previously the scores were collected but discarded.)
    """
    forest_param_grid = {
        'n_estimators': [50],
        'max_features': [0.01, 0.025, 0.05, 0.1, 0.25, 0.5],
        'margin': [0, 0.01, 0.1, 1, 10],
        'n_processes': [1],
    }
    tree_param_grid = {
        'max_depth': list(range(3, 20)),
        'margin': [0, 0.01, 0.1, 1, 10],
    }

    test_scores = []
    # Sorted so that the printed log and the returned list have a
    # reproducible order; plain set iteration order is unspecified.
    for fold_i in sorted(set(folds)):
        print("Testing for fold %s" % fold_i)
        train_mask = folds != fold_i
        train_X, test_X = features[train_mask], features[~train_mask]
        train_y, test_y = targets[train_mask], targets[~train_mask]

        tree_model = GridSearchCV(MaxMarginIntervalTree(), tree_param_grid,
                                  scoring=interval_MSE, cv=5, n_jobs=8)
        tree_model.fit(train_X, train_y)
        tree_test_score = interval_MSE(tree_model, test_X, test_y)
        test_scores.append(tree_test_score)
        # NOTE(review): the sign flip suggests interval_MSE returns a negated
        # error (higher is better) -- confirm against its definition.
        print("Tree: %s" % -tree_test_score)

        model = GridSearchCV(RandomMaximumMarginIntervalForest(),
                             forest_param_grid, scoring=interval_MSE,
                             cv=5, n_jobs=8)
        model.fit(train_X, train_y)
        test_score = interval_MSE(model, test_X, test_y)
        test_scores.append(test_score)
        print('Forest: %s' % -test_score)

    return test_scores
def _margin_grid(y_train, n_margin_values):
    """Return ``[0.]`` plus ``n_margin_values`` log-spaced margins for ``y_train``.

    The grid spans from the smallest strictly positive gap between two
    consecutive finite interval limits up to the full spread of the finite
    limits.  NumPy raises ``ValueError`` if no strictly positive gap exists.
    """
    limits = y_train.flatten()
    limits = limits[~np.isinf(limits)]
    limits.sort()
    range_max = limits.max() - limits.min()
    gaps = np.diff(limits)
    range_min = gaps[gaps > 0].min()
    return [0.] + np.logspace(np.log10(range_min), np.log10(range_max),
                              n_margin_values).tolist()


def _write_text(path, text):
    """Write ``text`` to ``path`` (truncating), closing the file afterwards."""
    with open(path, "w") as handle:
        handle.write(text)


def evaluate_on_dataset(d, parameters, metric, result_dir, pruning=True,
                        n_margin_values=10, n_min_samples_split_values=10,
                        n_cpu=-1):
    """Cross-validate a ``MaxMarginIntervalTree`` on dataset ``d`` and save results.

    Nothing is computed when ``<result_dir>/<d.name>/predictions.csv`` already
    exists.  Otherwise, for each distinct value of ``d.folds`` the matching
    examples are held out, hyper-parameters are selected on the remaining
    examples with a shuffled 10-fold ``GridSearchCV`` (``random_state=42``),
    and the selected model predicts the held-out examples.

    Files written into ``<result_dir>/<d.name>/``:

    * ``model_fold_<k>.tex`` -- LaTeX export of the best estimator of fold k,
    * ``predictions.csv`` -- header ``pred.log.penalty`` then one prediction
      per example,
    * ``parameters.json`` -- best parameters and full CV results per fold,
    * ``dataset.uid`` -- ``str(hash(d))``.

    Parameters
    ----------
    d : dataset object; the attributes ``name``, ``n_examples``, ``folds``,
        ``X``, ``y`` and ``feature_names`` are read.
    parameters : dict
        Base parameter grid.  It is copied for every fold; the ``"margin"``
        and ``"min_samples_split"`` entries of the copy are overwritten.
    metric : forwarded as ``scoring`` to ``GridSearchCV``.
    result_dir : str
        Existing directory in which the per-dataset directory is created.
    pruning : bool
        Forwarded to ``GridSearchCV``.  When true ``min_samples_split`` is
        fixed to ``[2]``; otherwise it is searched on a log-spaced grid from
        2 to the number of training examples.
    n_margin_values : int
        Number of log-spaced margin values (a leading ``0.`` is always added).
    n_min_samples_split_values : int
        Number of log-spaced ``min_samples_split`` values when not pruning.
    n_cpu : int
        Forwarded as ``n_jobs`` to ``GridSearchCV``.

    Returns
    -------
    None
    """
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")

    # NOTE: the uid file is written at the end, but the decision to skip is
    # based only on the presence of predictions.csv.
    if exists(join(ds_result_dir, "predictions.csv")):
        return

    fold_predictions = np.zeros(d.n_examples)
    # NOTE: the training errors are collected but not persisted anywhere in
    # this function.
    fold_train_mse = []
    fold_cv_results = []

    for i, fold in enumerate(np.unique(d.folds)):
        fold_start = time()
        fold_train = d.folds != fold
        X_train = d.X[fold_train]
        y_train = d.y[fold_train]
        X_test = d.X[~fold_train]

        # Work on a copy so the caller's grid is never modified.
        fold_params = dict(parameters)
        fold_params["margin"] = _margin_grid(y_train, n_margin_values)

        # Determine the min_samples_split grid
        if not pruning:
            fold_params["min_samples_split"] = np.logspace(
                np.log10(2), np.log10(X_train.shape[0]),
                n_min_samples_split_values).astype(np.uint).tolist()
        else:
            fold_params["min_samples_split"] = [2]

        cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
        cv = GridSearchCV(estimator=MaxMarginIntervalTree(),
                          param_grid=fold_params, cv=cv_protocol,
                          n_jobs=n_cpu, scoring=metric, pruning=pruning)
        cv.fit(X_train, y_train, d.feature_names)

        fold_predictions[~fold_train] = cv.predict(X_test)
        fold_cv_results.append({"best": cv.best_params_,
                                "all": cv.cv_results_})
        fold_train_mse.append(
            mean_squared_error(y_train, cv.predict(X_train)))
        # ".2f" (fixed point): a bare ".2" means two significant digits and
        # prints e.g. 123.4 s as "1.2e+02".
        print("........fold {0:d} took {1:.2f} seconds".format(
            i + 1, time() - fold_start))

        # Save the tree
        latex_exporter = TreeExporter("latex")
        _write_text(join(ds_result_dir, "model_fold_{0:d}.tex".format(i + 1)),
                    latex_exporter(cv.best_estimator_))

    # Save the predictions
    _write_text(join(ds_result_dir, "predictions.csv"),
                "pred.log.penalty\n" + "\n".join(
                    str(x) for x in fold_predictions))

    # Save the cross-validation results for each fold
    with open(join(ds_result_dir, "parameters.json"), "w") as handle:
        json.dump(fold_cv_results, handle)

    # Save a hash of the data to avoid re-running
    _write_text(ds_uid_file, str(hash(d)))
exists(join(path, d, "folds.csv")): yield Dataset(abspath(join(path, d))) for d in find_datasets("/home/parismita/mmit_data"): x = d.X y = d.y trainx = x[:len(x) / 2, ] trainy = y[:len(y) / 2, ] testx = x[len(x) / 2:, ] testy = y[len(y) / 2:, ] start_time = time.time() estimator = MaxMarginIntervalTree(margin=1.0, max_depth=4, loss="linear_hinge", min_samples_split=0) clf = estimator.fit(trainx, trainy) fit = estimator.predict(testx) #print time.time() - start_time #print len(x) print "| ", mean_squared_error(testy, fit) #file = open(str(i)+".tex", 'w') #file.write( _latex_export(estimator)) #file.close() """ alphas, pruned_trees = min_cost_complexity_pruning(estimator) print alphas for pt in pruned_trees:
param_template["margin"] = [0.] + np.logspace( np.log10(range_min), np.log10(range_max), n_margin_values).tolist() print ".... linear hinge" method = "mmit.linear.hinge.pruning" params = dict(param_template) params["loss"] = ["linear_hinge"] if not exists(join(predictions_path, method, d.name)) or \ not exists(join(predictions_path, method, d.name, "predictions.fulltrain.csv")): try: mkdir(join(predictions_path, method, d.name)) except: pass cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42) cv = GridSearchCV(estimator=MaxMarginIntervalTree(), param_grid=params, cv=cv_protocol, n_jobs=n_cpu, scoring=mse_scorer, pruning=True).fit(d.X, d.y) for hps, metrics in cv.cv_results_: print hps, metrics["cv"] print "BEST:" print cv.best_estimator_ print cv.best_params_ print cv.best_score_ save_predictions(cv.best_estimator_, method, d, predictions_path)