def mmit_fit(X, y):
    """Tune a MaxMarginIntervalTree on (X, y) by grid search and return the search.

    The candidate values for ``margin`` and ``min_samples_split`` are derived
    from the data through ``get_range_max_min`` followed by
    ``get_margin_range`` / ``get_min_sample_split_sample`` (10 values each).
    The loss is fixed to ``"linear_hinge"`` and ``max_depth`` is searched over
    ``[1, 2, 4, 6]``. Candidates are compared with a shuffled 5-fold split
    seeded with 42, using every available worker (``n_jobs=-1``).

    Returns the ``GridSearchCV`` object after ``fit(X, y)`` has been called on
    it.
    """
    # Data-driven bounds and candidate values for the margin.
    margin_lo, margin_hi = get_range_max_min(X, y, nature='margin')
    margin_grid = get_margin_range(margin_lo, margin_hi, n_margin_values=10)

    # Data-driven bounds and candidate values for min_samples_split.
    split_lo, split_hi = get_range_max_min(X, y, nature='min_sample_split')
    split_grid = get_min_sample_split_sample(
        split_lo, split_hi, n_min_samples_split_values=10)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    search = GridSearchCV(
        MaxMarginIntervalTree(),
        {
            "margin": margin_grid,
            "loss": ["linear_hinge"],
            "max_depth": [1, 2, 4, 6],
            "min_samples_split": split_grid,
        },
        cv=splitter,
        n_jobs=-1,
    )
    search.fit(X, y)
    return search
Exemple #2
0
def main(features, targets, folds, n_jobs=8):
    """Benchmark a tuned interval tree and a tuned interval forest fold by fold.

    For every distinct value found in ``folds``, the examples carrying that
    value are held out as the test set and the remaining examples are used to
    tune, by 5-fold grid search scored with ``interval_MSE``:

    * a ``MaxMarginIntervalTree`` over ``max_depth`` and ``margin``;
    * a ``RandomMaximumMarginIntervalForest`` over ``max_features`` and
      ``margin`` (50 estimators, 1 process per forest).

    Each tuned model is then scored on the held-out examples with
    ``interval_MSE`` and the negated score is printed.

    Parameters
    ----------
    features, targets : indexable by a boolean mask
        Examples and their targets. NOTE(review): presumably NumPy arrays
        with one row per example -- confirm against the caller.
    folds : iterable
        Fold label of each example. It must support ``folds != label``
        producing a mask usable to index ``features`` and ``targets``.
    n_jobs : int, optional
        Number of parallel jobs handed to each grid search (default 8, the
        value that used to be hard-coded).

    Returns
    -------
    list
        Held-out ``interval_MSE`` scores in the order they were computed:
        tree then forest for the first fold, tree then forest for the second
        fold, and so on. Empty when ``folds`` is empty.
    """
    forest_param_grid = {
        'n_estimators': [50],
        'max_features': [0.01, 0.025, 0.05, 0.1, 0.25, 0.5],
        'margin': [0, 0.01, 0.1, 1, 10],
        'n_processes': [1],
    }
    tree_param_grid = {
        'max_depth': list(range(3, 20)),
        'margin': [0, 0.01, 0.1, 1, 10],
    }

    test_scores = []
    # sorted() gives a reproducible fold order; plain set iteration order is
    # unspecified, which made the printed log and score order unpredictable.
    for fold_i in sorted(set(folds)):
        print("Testing for fold %s" % fold_i)
        train_mask = folds != fold_i
        train_X, test_X = features[train_mask], features[~train_mask]
        train_y, test_y = targets[train_mask], targets[~train_mask]

        tree_model = GridSearchCV(MaxMarginIntervalTree(),
                                  tree_param_grid,
                                  scoring=interval_MSE,
                                  cv=5,
                                  n_jobs=n_jobs)
        tree_model.fit(train_X, train_y)
        tree_test_score = interval_MSE(tree_model, test_X, test_y)
        test_scores.append(tree_test_score)
        # The score is negated for display.
        # NOTE(review): presumably interval_MSE returns a negative MSE -- confirm.
        print("Tree:   %s" % -tree_test_score)

        forest_model = GridSearchCV(RandomMaximumMarginIntervalForest(),
                                    forest_param_grid,
                                    scoring=interval_MSE,
                                    cv=5,
                                    n_jobs=n_jobs)
        forest_model.fit(train_X, train_y)
        forest_test_score = interval_MSE(forest_model, test_X, test_y)
        test_scores.append(forest_test_score)
        print('Forest: %s' % -forest_test_score)

    # Previously the collected scores were dropped when the function ended.
    return test_scores
Exemple #3
0
def evaluate_on_dataset(d, parameters, metric, result_dir, pruning=True,
                        n_margin_values=10, n_min_samples_split_values=10,
                        n_cpu=-1):
    """Cross-validate a MaxMarginIntervalTree on one dataset and save the results.

    For every distinct value in ``d.folds`` the matching examples are held
    out, a grid search is run on the remaining examples, and the selected
    model predicts the held-out examples. The following files are written to
    ``<result_dir>/<d.name>/``:

    * ``model_fold_<k>.tex`` -- LaTeX export of the best tree of fold ``k``;
    * ``predictions.csv`` -- header ``pred.log.penalty`` followed by one
      out-of-fold prediction per example;
    * ``parameters.json`` -- per fold, the best parameters and all
      cross-validation results;
    * ``dataset.uid`` -- ``str(hash(d))``.

    Nothing is computed when ``predictions.csv`` already exists.

    Parameters
    ----------
    d : dataset object
        Must expose ``name``, ``n_examples``, ``folds``, ``X``, ``y`` and
        ``feature_names``.
    parameters : dict
        Base hyper-parameter grid. It is copied, never modified; the
        ``"margin"`` and ``"min_samples_split"`` entries are (re)computed for
        every fold.
    metric
        Forwarded to the grid search as ``scoring``.
    result_dir : str
        Existing directory under which the dataset directory is created.
    pruning : bool, optional
        Forwarded to the grid search. When true ``min_samples_split`` is
        fixed to ``[2]``; otherwise it is searched on a log scale between 2
        and the number of training examples.
    n_margin_values : int, optional
        Number of log-spaced non-zero margin values (0 is always added).
    n_min_samples_split_values : int, optional
        Number of log-spaced ``min_samples_split`` values when not pruning.
    n_cpu : int, optional
        Forwarded to the grid search as ``n_jobs``.

    Returns
    -------
    None
    """
    ds_result_dir = join(result_dir, d.name)
    if not exists(ds_result_dir):
        mkdir(ds_result_dir)

    ds_uid_file = join(ds_result_dir, "dataset.uid")
    # NOTE: the uid file is written at the end of a run, but the check that
    # compared it with hash(d) is disabled; the presence of predictions.csv
    # is the only condition that skips the evaluation.
    if exists(join(ds_result_dir, "predictions.csv")):
        return

    fold_predictions = np.zeros(d.n_examples)
    fold_train_mse = []
    fold_cv_results = []
    for i, fold in enumerate(np.unique(d.folds)):
        fold_start = time()

        fold_train = d.folds != fold
        X_train = d.X[fold_train]
        y_train = d.y[fold_train]
        X_test = d.X[~fold_train]

        # Determine the margin grid: log-spaced between the smallest positive
        # gap separating two finite target limits and their total spread.
        sorted_limits = y_train.flatten()
        sorted_limits = sorted_limits[~np.isinf(sorted_limits)]
        sorted_limits.sort()
        range_max = sorted_limits.max() - sorted_limits.min()
        range_min = np.diff(sorted_limits)
        range_min = range_min[range_min > 0].min()
        # Work on a per-fold copy so the caller's grid is left untouched.
        fold_parameters = dict(parameters)
        fold_parameters["margin"] = [0.] + np.logspace(
            np.log10(range_min), np.log10(range_max),
            n_margin_values).tolist()

        # Determine the min_samples_split grid
        if not pruning:
            range_min = 2
            range_max = X_train.shape[0]
            fold_parameters["min_samples_split"] = np.logspace(
                np.log10(range_min), np.log10(range_max),
                n_min_samples_split_values).astype(np.uint).tolist()
        else:
            fold_parameters["min_samples_split"] = [2]

        cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
        # NOTE(review): `pruning=` and the third positional argument of fit()
        # are not part of scikit-learn's GridSearchCV; presumably this is the
        # project's own grid search -- confirm.
        cv = GridSearchCV(estimator=MaxMarginIntervalTree(),
                          param_grid=fold_parameters, cv=cv_protocol,
                          n_jobs=n_cpu, scoring=metric, pruning=pruning)
        cv.fit(X_train, y_train, d.feature_names)
        fold_predictions[~fold_train] = cv.predict(X_test)
        fold_cv_results.append({"best": cv.best_params_, "all": cv.cv_results_})
        fold_train_mse.append(mean_squared_error(y_train, cv.predict(X_train)))
        print("........fold {0:d} took {1:.2} seconds".format(i + 1, time() - fold_start))

        # Save the tree
        latex_exporter = TreeExporter("latex")
        tree_path = join(ds_result_dir, "model_fold_{0:d}.tex".format(i + 1))
        with open(tree_path, "w") as tree_file:
            tree_file.write(latex_exporter(cv.best_estimator_))

    # Save the predictions
    with open(join(ds_result_dir, "predictions.csv"), "w") as predictions_file:
        predictions_file.write(
            "pred.log.penalty\n" + "\n".join(str(x) for x in fold_predictions))

    # Save the cross-validation results for each fold
    with open(join(ds_result_dir, "parameters.json"), "w") as parameters_file:
        json.dump(fold_cv_results, parameters_file)

    # Generating a PDF for each tree (lualatex over model_fold_*.tex) is not
    # done here; the .tex files are left for a separate build step.

    # Save a hash of the data to avoid re-running
    with open(ds_uid_file, "w") as uid_file:
        uid_file.write(str(hash(d)))
Exemple #4
0
                exists(join(path, d, "folds.csv")):
            yield Dataset(abspath(join(path, d)))


# Hold-out evaluation of a fixed-parameter MaxMarginIntervalTree on every
# dataset yielded by find_datasets: fit on the first half of the examples and
# print the mean squared error obtained on the second half.
# NOTE(review): this snippet is Python 2 (print statement below); the slice
# bounds `len(x) / 2` rely on integer division to produce a valid index.
for d in find_datasets("/home/parismita/mmit_data"):
    x = d.X
    y = d.y

    # First half of the rows -> training set, second half -> test set.
    trainx = x[:len(x) / 2, ]
    trainy = y[:len(y) / 2, ]
    testx = x[len(x) / 2:, ]
    testy = y[len(y) / 2:, ]

    # start_time is only read by the disabled timing print further down.
    start_time = time.time()
    estimator = MaxMarginIntervalTree(margin=1.0,
                                      max_depth=4,
                                      loss="linear_hinge",
                                      min_samples_split=0)
    # The value returned by fit() is kept in `clf` but not used in the code
    # visible here; predictions come from `estimator` itself.
    clf = estimator.fit(trainx, trainy)
    fit = estimator.predict(testx)

    # Disabled debugging output (elapsed time and dataset size).
    #print time.time() - start_time
    #print len(x)
    # One table-like row per dataset: a "|" prefix followed by the test MSE.
    print "|  ", mean_squared_error(testy, fit)
    # Disabled LaTeX export of the fitted tree.
    #file = open(str(i)+".tex", 'w')
    #file.write( _latex_export(estimator))
    #file.close()
    """
	alphas, pruned_trees = min_cost_complexity_pruning(estimator)
	print alphas

	for pt in pruned_trees:
        param_template["margin"] = [0.] + np.logspace(
            np.log10(range_min), np.log10(range_max),
            n_margin_values).tolist()

        print ".... linear hinge"
        method = "mmit.linear.hinge.pruning"
        params = dict(param_template)
        params["loss"] = ["linear_hinge"]
        if not exists(join(predictions_path, method, d.name)) or \
                not exists(join(predictions_path, method, d.name, "predictions.fulltrain.csv")):
            try:
                mkdir(join(predictions_path, method, d.name))
            except:
                pass
            cv_protocol = KFold(n_splits=10, shuffle=True, random_state=42)
            cv = GridSearchCV(estimator=MaxMarginIntervalTree(),
                              param_grid=params,
                              cv=cv_protocol,
                              n_jobs=n_cpu,
                              scoring=mse_scorer,
                              pruning=True).fit(d.X, d.y)

            for hps, metrics in cv.cv_results_:
                print hps, metrics["cv"]

            print "BEST:"
            print cv.best_estimator_
            print cv.best_params_
            print cv.best_score_

            save_predictions(cv.best_estimator_, method, d, predictions_path)