Example #1
    def fit_penalized(self,
                      train_set,
                      penalty_params,
                      max_em_iters,
                      val_set_evaluator=None,
                      init_theta=None,
                      reference_pen_param=None,
                      pool=None):
        """
        @param penalty_params: penalty parameter for fitting penalized model
        @param val_set_evaluator: LikelihoodComparer with a given reference model
        @param reference_pen_param: the penalty parameters for the reference model

        @return the fitted model after the 2-step procedure
        """
        if init_theta is None:
            init_theta = initialize_theta(self.theta_shape,
                                          self.possible_theta_mask,
                                          self.zero_theta_mask)

        penalized_theta, _, _, _ = self.em_algo.run(
            train_set,
            self.feat_generator,
            theta=init_theta,
            possible_theta_mask=self.possible_theta_mask,
            zero_theta_mask=self.zero_theta_mask,
            burn_in=self.burn_in,
            penalty_params=penalty_params,
            max_em_iters=max_em_iters,
            max_e_samples=self.num_e_samples * 4,
            pool=pool,
        )
        curr_model_results = MethodResults(penalty_params)

        #### Calculate the validation log likelihood (EM surrogate); use it to determine whether the model is any good.
        log_lik_ratio_lower_bound, log_lik_ratio = self._do_validation_set_checks(
            penalized_theta,
            val_set_evaluator,
        )
        curr_model_results.set_penalized_theta(
            penalized_theta,
            log_lik_ratio_lower_bound,
            log_lik_ratio,
            model_masks=ModelTruncation(penalized_theta, self.feat_generator),
            reference_penalty_param=reference_pen_param,
        )

        log.info("==== Penalized theta, %s, nonzero %d ====" %
                 (penalty_params, curr_model_results.penalized_num_nonzero))
        log.info(
            get_nonzero_theta_print_lines(penalized_theta,
                                          self.feat_generator))
        return curr_model_results
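For context, here is a hedged sketch of the kind of caller this method appears designed for: sweeping a grid of penalty parameters and using the validation log-likelihood ratio against the previous (reference) fit to decide when to stop. Everything in it (fit_model, the result dictionary, the grid values) is a hypothetical stand-in for illustration, not the project's actual fitting code.

# Hypothetical sketch: sweep a decreasing penalty grid and stop once the validation
# log-likelihood ratio lower bound against the previous fit turns negative.
def fit_model(penalty_param, reference):
    """Stand-in for fit_penalized: pretend penalties below 0.01 start to overfit."""
    lower_bound = 1.0 if penalty_param >= 0.01 else -1.0
    return {"penalty_param": penalty_param, "log_lik_ratio_lower_bound": lower_bound}

penalty_grid = [1.0, 0.1, 0.01, 0.001]
best_fit = None
for penalty_param in penalty_grid:
    curr_fit = fit_model(penalty_param, reference=best_fit)
    if best_fit is not None and curr_fit["log_lik_ratio_lower_bound"] < 0:
        # The new fit no longer beats the reference model on the validation set.
        break
    best_fit = curr_fit
print("selected penalty parameter %f" % best_fit["penalty_param"])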
Example #2
def main(argv):
    num_threads = 1
    num_runs = 30

    try:
        opts, args = getopt.getopt(argv,"f:z:a:b:c:s:m:r:t:")
    except getopt.GetoptError:
        print "Bad Arguments to python script"
        sys.exit(2)

    settings = Elastic_Net_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-z':
            settings.num_nonzero_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    settings.print_settings()
    sys.stdout.flush()
    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_correlated(settings.num_features, settings.num_nonzero_features)
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method not in ["SP", "SP0"] and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
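The command-line drivers in these examples (this one and most of the ones that follow) share the same skeleton: parse options, build one Iteration_Data per run, map a crash-tolerant fit function over the runs (using a multiprocessing Pool when more than one thread is requested and the method is not one of the SP variants), then fold the non-None results into a MethodResults aggregator while counting crashes. The sketch below distills that skeleton; MethodResults, fit_safe, and the fake run data here are minimal stand-ins written for illustration, not the project's real classes.

from multiprocessing import Pool

class MethodResults(object):
    """Minimal stand-in: collects per-run result dicts for one method."""
    def __init__(self, method, result_keys=("test_err",)):
        self.method = method
        self.result_keys = list(result_keys)
        self.results = []

    def append(self, result):
        self.results.append(result)

    def get_num_runs(self):
        return len(self.results)

    def print_results(self):
        # Report the mean of each tracked key over the successful runs.
        for key in self.result_keys:
            vals = [r[key] for r in self.results if key in r]
            if vals:
                print("%s %s: mean %f over %d runs" % (self.method, key, sum(vals) / float(len(vals)), len(vals)))

def fit_safe(iteration_data):
    """Stand-in for fit_data_for_iter_safe: return a result dict, or None if the run crashed."""
    try:
        i, x = iteration_data
        return {"test_err": float(x) ** 2}
    except Exception:
        return None

if __name__ == "__main__":
    num_threads = 2
    run_data = [(i, i * 0.1) for i in range(4)]
    if num_threads > 1:
        pool = Pool(num_threads)
        results = pool.map(fit_safe, run_data)
    else:
        results = list(map(fit_safe, run_data))

    method_results = MethodResults("HC")
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print("==========TOTAL RUNS %d============" % method_results.get_num_runs())
    method_results.print_results()
    print("num crashes %d" % num_crashes)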
Example #3
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)

    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv, "m:t:r:")
    except getopt.GetoptError:
        print "Bad argument given to realdata_eval.py"
        sys.exit(2)

    settings = RealDataSettings()
    for opt, arg in opts:
        if opt == "-m":
            assert (arg in ["HC", "GS"])
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    print "TOTAL NUM RUNS %d" % num_runs
    sys.stdout.flush()

    geneset_dict = read_geneset_file()
    X_genesets, y, genesets = read_gene_expr_data(geneset_dict)
    print "num features", sum(
        [X_genesets[i].shape[1] for i in range(0, len(X_genesets))])
    print "total genesets ever", len(X_genesets)
    X_genesets = normalize_data(X_genesets)

    run_data = []
    for i in range(num_runs):
        data = Shuffled_Gene_Data(X_genesets, y, genesets)
        run_data.append(Iteration_Data(i, data, settings))

    if num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method,
                                   settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #4
def main(argv):
    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv,"m:t:r:")
    except getopt.GetoptError:
        print "Bad argument given to realdata_eval.py"
        sys.exit(2)

    settings = Simulation_Settings()
    settings.results_folder = "results/realdata"
    for opt, arg in opts:
        if opt == "-m":
            assert(arg in ["HC", "GS"])
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    print "TOTAL NUM RUNS %d" % num_runs
    sys.stdout.flush()

    geneset_dict = read_geneset_file()
    X_genesets, y, genesets = read_gene_expr_data(geneset_dict)
    print "num features", sum([X_genesets[i].shape[1] for i in range(0, len(X_genesets))])
    print "total genesets ever", len(X_genesets)
    X_genesets = normalize_data(X_genesets)

    run_data = []
    for i in range(num_runs):
        data = Shuffled_Gene_Data(X_genesets, y, genesets)
        run_data.append(Iteration_Data(i, data, settings))

    if num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #5
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)

    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv, "g:f:a:b:c:s:m:t:r:i")
    except getopt.GetoptError:
        print "Bad argument given to sgl_eval.py"
        sys.exit(2)

    settings = SGL_Settings()
    for opt, arg in opts:
        if opt == '-g':
            settings.expert_num_groups = int(arg)
        elif opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.sparse_groups()
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method,
                                   settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #6
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)

    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv, "d:z:f:a:v:s:m:t:r:i")
    except getopt.GetoptError:
        print "Bad argument given to Matrix_Completion_eval.py"
        sys.exit(2)

    settings = Matrix_Completion_Settings()
    for opt, arg in opts:
        if opt == '-d':
            arg_split = arg.split(",")
            settings.num_rows = int(arg_split[0])
            settings.num_cols = int(arg_split[1])
        elif opt == '-z':
            arg_split = arg.split(",")
            settings.num_nonzero_row_features = int(arg_split[0])
            settings.num_nonzero_col_features = int(arg_split[1])
        elif opt == '-f':
            arg_split = arg.split(",")
            settings.num_row_features = int(arg_split[0])
            settings.num_col_features = int(arg_split[1])
        elif opt == '-a':
            arg_split = arg.split(",")
            settings.train_perc = float(arg_split[0])
            settings.validate_perc = float(arg_split[1])
            settings.test_perc = float(arg_split[2])
            assert (settings.train_perc + settings.validate_perc +
                    settings.test_perc < 1)
        elif opt == "-v":
            settings.num_nonzero_s = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.big_init_set = True

    assert (settings.num_nonzero_s <= settings.num_rows
            and settings.num_nonzero_s <= settings.num_cols)
    # SP does not care about initialization
    assert (not (settings.big_init_set == True
                 and settings.method in ["SP", "SP0"]))

    settings.matrix_size = settings.num_rows * settings.num_cols
    settings.train_size = int(settings.train_perc * settings.matrix_size)
    settings.validate_size = int(settings.validate_perc * settings.matrix_size)
    settings.test_size = int(settings.test_perc * settings.matrix_size)

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.matrix_completion()
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method,
                                   settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #7
import gridsearch_interaction_effects

GENERATE_PLOT = False
NUM_RUNS = 15

# TRAIN_SIZE = 70
# NUM_EFFECTS = 40
# NUM_NONZERO_EFFECTS = 3
# NUM_NONZERO_INTERACTIONS = 300

TRAIN_SIZE = 40
NUM_EFFECTS = 20
NUM_NONZERO_EFFECTS = 3
NUM_NONZERO_INTERACTIONS = 40

hc_results = MethodResults("Hillclimb")
mu_results = MethodResults("MU")
gs_results = MethodResults("Gridsearch")

for i in range(0, NUM_RUNS):
    beta_real, theta_real, X_train, W_train, y_train, X_validate, W_validate, y_validate, X_test, W_test, y_test = \
        effects_and_interactions(TRAIN_SIZE, NUM_EFFECTS, NUM_NONZERO_EFFECTS, NUM_NONZERO_INTERACTIONS)

    def _get_test_beta_theta_err(beta_guess, theta_guess):
        test_err = testerror_interactions(X_test, W_test, y_test, beta_guess, theta_guess) / y_test.size * 2
        beta_err = betaerror(beta_real, beta_guess)
        theta_err = betaerror(theta_guess, theta_real)
        return (test_err, beta_err, theta_err)

    hc_beta_guess, hc_theta_guess, hc_costpath = hillclimb_interaction_effects.run(X_train, W_train, y_train, X_validate, W_validate, y_validate)
    hc_results.append_test_beta_theta_err(_get_test_beta_theta_err(hc_beta_guess, hc_theta_guess))
Example #8
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)

    num_threads = 1
    num_runs = 30

    try:
        opts, args = getopt.getopt(argv,"f:z:a:b:c:s:m:r:t:")
    except getopt.GetoptError:
        print "Bad Arguments to python script"
        sys.exit(2)

    settings = Elastic_Net_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-z':
            settings.num_nonzero_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    settings.print_settings()
    sys.stdout.flush()
    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_correlated(settings.num_features, settings.num_nonzero_features)
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method not in ["SP", "SP0"] and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #9
def main(argv):
    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv, "d:z:f:g:a:v:s:m:t:r:i:")
    except getopt.GetoptError:
        print "Bad argument given"
        sys.exit(2)

    settings = Matrix_Completion_Group_Settings()
    for opt, arg in opts:
        if opt == '-d':
            arg_split = arg.split(",")
            settings.num_rows = int(arg_split[0])
            settings.num_cols = int(arg_split[1])
        elif opt == '-z':
            arg_split = arg.split(",")
            settings.num_nonzero_row_groups = int(arg_split[0])
            settings.num_nonzero_col_groups = int(arg_split[1])
        elif opt == '-f':
            arg_split = arg.split(",")
            settings.num_row_features = int(arg_split[0])
            settings.num_col_features = int(arg_split[1])
        elif opt == '-g':
            arg_split = arg.split(",")
            settings.num_row_groups = int(arg_split[0])
            settings.num_col_groups = int(arg_split[1])
        elif opt == '-a':
            arg_split = arg.split(",")
            settings.train_perc = float(arg_split[0])
            settings.validate_perc = float(arg_split[1])
            assert (settings.train_perc + settings.validate_perc <= 1.0)
        elif opt == "-v":
            settings.num_nonzero_s = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.gamma_to_row_col_m = float(arg)

    assert (settings.num_nonzero_s <= settings.num_rows
            and settings.num_nonzero_s <= settings.num_cols)

    settings.matrix_size = settings.num_rows * settings.num_cols

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.matrix_completion_groups(
            gamma_to_row_col_m=settings.gamma_to_row_col_m,
            feat_factor=settings.feat_factor)
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method,
                                   settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #10
def main():
    SMOOTH_FCNS = [big_sin, identity_fcn, big_cos_sin, crazy_down_sin, pwr_small]
    smooth_fcn_list = SMOOTH_FCNS[:NUM_FUNCS]

    hc_results = MethodResults("Hillclimb")
    hc_nesterov_results = MethodResults("Hillclimb_nesterov")
    gs_results = MethodResults("Gridsearch")

    for i in range(0, NUM_RUNS):
        # Generate dataset
        X_train, y_train, X_validate, y_validate, X_test, y_test = multi_smooth_features(
            TRAIN_SIZE,
            smooth_fcn_list,
            desired_snr=SNR,
            feat_range=[f * TRAIN_SIZE/60 for f in FEATURE_RANGE],
            train_to_validate_ratio=VALIDATE_RATIO,
            test_size=NUM_TEST
        )
        X_full, train_idx, validate_idx, test_idx = GenAddModelHillclimb.stack((X_train, X_validate, X_test))

        def _create_method_result(best_thetas, runtime):
            test_err = testerror_multi_smooth(y_test, test_idx, best_thetas)
            validate_err = testerror_multi_smooth(y_validate, validate_idx, best_thetas)
            print "create_method_result", test_err
            return MethodResult(test_err=test_err, validation_err=validate_err, runtime=runtime)

        def _run_hc(results, nesterov):
            hillclimb_prob = GenAddModelHillclimb(X_train, y_train, X_validate, y_validate, X_test, nesterov=nesterov)
            thetas, cost_path, runtime = _hillclimb_coarse_grid_search(hillclimb_prob, smooth_fcn_list)
            results.append(_create_method_result(thetas, runtime))
            if PLOT_RUNS:
                _plot_res(
                    thetas[test_idx], smooth_fcn_list, X_test, y_test,
                    outfile="%s/test_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS),
                )
                _plot_res(
                    thetas[validate_idx], smooth_fcn_list, X_validate, y_validate,
                    outfile="%s/validation_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS),
                )
                _plot_res(
                    thetas[train_idx], smooth_fcn_list, X_train, y_train,
                    outfile="%s/train_%s_f%d.png" % (FIGURE_DIR, hillclimb_prob.method_label, NUM_FUNCS),
                )
            return thetas, cost_path

        hc_thetas, hc_cost_path = _run_hc(hc_results, nesterov=False)
        # hc_nesterov_thetas, hc_nesterov_cost_path = _run_hc(hc_nesterov_results, nesterov=True)
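        # Note: if PLOT_RUNS is enabled, _plot_cost_paths below also expects
        # hc_nesterov_cost_path, so the nesterov run above must be re-enabled first.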

        if PLOT_RUNS:
            _plot_cost_paths(
                cost_path_list=[hc_cost_path, hc_nesterov_cost_path],
                labels=["HC", "HC_Nesterov"],
                num_funcs=NUM_FUNCS,
            )

        print "=================================================="

        start_time = time.time()
        gs_thetas, best_lambdas = gs.run(
            y_train,
            y_validate,
            X_full,
            train_idx,
            validate_idx,
            num_lambdas=NUM_GS_LAMBDAS,
            max_lambda=MAX_LAMBDA
        )
        gs_runtime = time.time() - start_time
        gs_results.append(_create_method_result(gs_thetas, gs_runtime))

        if PLOT_RUNS:
            _plot_res(
                gs_thetas[test_idx], smooth_fcn_list, X_test, y_test,
                outfile="%s/test_gs_f%d.png" % (FIGURE_DIR, NUM_FUNCS),
            )
            _plot_gs_v_hc(
                gs_thetas[train_idx], hc_thetas[train_idx], smooth_fcn_list, X_train, y_train,
                outfile_prefix="%s/train_gs_v_hc_f%d" % (FIGURE_DIR, NUM_FUNCS),
            )

        print "===========RUN %d ============" % i
        hc_results.print_results()
        hc_nesterov_results.print_results()
        gs_results.print_results()
Example #11
def main(argv):
    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv, "f:z:a:b:c:s:m:t:r:")
    except getopt.GetoptError:
        sys.exit(2)

    settings = Sparse_Add_Models_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_funcs = int(arg)
        elif opt == '-z':
            settings.num_zero_funcs = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert (arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    assert (settings.num_funcs <= len(settings.smooth_fcns))
    smooth_fcn_list = settings.smooth_fcns[:settings.num_funcs] + [
        const_zero
    ] * settings.num_zero_funcs
    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_additive_smooth_data(smooth_fcn_list)
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method,
                                   settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
Example #12
def main():
    seed = int(np.random.rand() * 1e15)
    print "seed", seed
    np.random.seed(seed)

    geneset_dict = read_geneset_file()
    X_genesets, y, genesets = read_gene_expr_data(geneset_dict)
    print "num features", sum([X_genesets[i].shape[1] for i in range(0, len(X_genesets))])
    print "total genesets ever", len(X_genesets)
    X_genesets = normalize_data(X_genesets)

    hc_results = MethodResults("HC")
    gs_grouped_results = MethodResults("GS_Grouped")
    gs_results = MethodResults("GS_Lasso")
    for i in range(0, NUM_ITERS):
        X_groups_train, y_train, X_groups_validate, y_validate, X_groups_test, y_test = shuffle_and_split_data(
            X_genesets, y, TRAIN_SIZE, VALIDATE_SIZE)
        X_validate = np.hstack(X_groups_validate)
        X_test = np.hstack(X_groups_test)

        start = time.time()
        hc_betas, hc_cost_path = hc.run_for_lambdas(X_groups_train, y_train, X_groups_validate, y_validate, init_lambdas=INIT_LAMBDAS)
        hc_runtime = time.time() - start
        print "hc 1e-6", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-6)
        print "hc 1e-8", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-8)
        print "hc 1e-10", get_num_nonzero_betas(hc_betas, genesets, threshold=1e-10)
        hc_validate_cost, hc_validate_rate = testerror_logistic_grouped(X_validate, y_validate, hc_betas)
        print "hc_validate_cost", hc_validate_cost

        start = time.time()
        gs_grouped_betas, gs_grouped_cost = gs_grouped.run_classify(X_groups_train, y_train, X_groups_validate, y_validate)
        gs_grouped_runtime = time.time() - start
        print "gs_grouped 1e-6", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-6)
        print "gs_grouped 1e-8", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-8)
        print "gs_grouped 1e-10", get_num_nonzero_betas(gs_grouped_betas, genesets, threshold=1e-10)
        gs_grouped_validate_cost, gs_grouped_validate_rate = testerror_logistic_grouped(X_validate, y_validate, gs_grouped_betas)
        print "gs_grouped_validate_cost", gs_grouped_validate_cost

        start = time.time()
        gs_betas, gs_cost = gs.run_classify(X_groups_train, y_train, X_groups_validate, y_validate)
        gs_runtime = time.time() - start
        print "gs 1e-6", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-6)
        print "gs 1e-8", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-8)
        print "gs 1e-10", get_num_nonzero_betas(gs_betas, genesets, threshold=1e-10)
        gs_validate_cost, gs_validate_rate = testerror_logistic_grouped(X_validate, y_validate, gs_betas)
        print "gs_validate_cost", gs_validate_cost

        print "================= hc ======================"
        hc_test, hc_rate = testerror_logistic_grouped(X_test, y_test, hc_betas)
        print "hc_test", hc_test, "hc_rate", hc_rate
        hc_results.append(MethodResult(test_err=hc_test, validation_err=hc_validate_cost, sensitivity=hc_rate, runtime=hc_runtime))

        print "================= gs grouped ======================"
        gs_grouped_test, gs_grouped_rate = testerror_logistic_grouped(X_test, y_test, gs_grouped_betas)
        print "gs_grouped_test", gs_grouped_test, "gs_grouped_rate", gs_grouped_rate
        gs_grouped_results.append(MethodResult(test_err=gs_grouped_test, validation_err=gs_grouped_validate_cost, sensitivity=gs_grouped_rate, runtime=gs_grouped_runtime))

        print "================= gs ======================"
        gs_test, gs_rate = testerror_logistic_grouped(X_test, y_test, gs_betas)
        print "gs_test", gs_test, "gs_rate", gs_rate
        gs_results.append(MethodResult(test_err=gs_test, validation_err=gs_validate_cost, sensitivity=gs_rate, runtime=gs_runtime))

        print "ITERATION", i
        hc_results.print_results()
        gs_grouped_results.print_results()
        gs_results.print_results()

        if i == 0:
            pickle_data(PICKLE_DATA_FILENAME, X_groups_train, y_train, X_groups_validate, y_validate, X_groups_test, y_test, genesets)
            pickle_betas(PICKLE_BETAS_FILENAME, hc_betas, gs_grouped_betas, gs_betas)
Example #13
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)
    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv,"g:f:a:b:c:s:m:t:r:")
    except getopt.GetoptError:
        print "Bad argument given to sgl_eval.py"
        sys.exit(2)

    settings = SGL_Settings()
    for opt, arg in opts:
        if opt == '-g':
            settings.expert_num_groups = int(arg)
        elif opt == '-f':
            settings.num_features = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.sparse_groups()
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
    for init_lambda1 in COARSE_LAMBDA_GRID:
        kwargs["initial_lambda1"] = init_lambda1
        kwargs["initial_lambda2"] = init_lambda1
        beta_guess, cost_path = optimization_func(*args, **kwargs)
        validation_cost = testerror(X_validate, y_validate, beta_guess)
        if best_cost > validation_cost:
            best_start_lambdas = [kwargs["initial_lambda1"], kwargs["initial_lambda2"]]
            best_cost = validation_cost
            best_beta = beta_guess
            best_cost_path = cost_path

    end_time = time.time()
    print "HC: BEST best_cost", best_cost, "best_start_lambdas", best_start_lambdas
    return best_beta, best_cost_path, end_time - start_time

hc_results = MethodResults(HC_LAMBDA12_LABEL)
hc_results1 = MethodResults(HC_LAMBDA12_LABEL + "_SHRINK")
hc_dim_results = MethodResults(HC_LAMBDA12_DIM_LABEL)
hc_nesterov_results = MethodResults(HC_LAMBDA12_NESTEROV_LABEL)
hc_lambda_alpha_results = MethodResults(HC_LAMBDA_ALPHA_LABEL)
hc_lambda_alpha_results1 = MethodResults(HC_LAMBDA_ALPHA_LABEL + "_SHRINK")
hc_lambda_alpha_dim_results = MethodResults(HC_LAMBDA_ALPHA_DIM_LABEL)
hc_lambda_alpha_nesterov_results = MethodResults(HC_LAMBDA_ALPHA_NESTEROV_LABEL)
nm_results = MethodResults("NELDER-MEAD")
bs_results = MethodResults("BAYES_SPEARMINT")
gs_results = MethodResults(GS_LAMBDA12_LABEL)
for i in range(0, NUM_RUNS):
    beta_real, X_train, y_train, X_validate, y_validate, X_test, y_test = data_generation.correlated(
        TRAIN_SIZE, NUM_FEATURES, NUM_NONZERO_FEATURES, signal_noise_ratio=SIGNAL_NOISE_RATIO)

    def _create_method_result(beta_guess, runtime):
def main(argv):
    seed = 10
    print "seed", seed
    np.random.seed(seed)

    num_threads = 1
    num_runs = 1

    try:
        opts, args = getopt.getopt(argv,"f:z:a:b:c:s:m:t:r:i")
    except getopt.GetoptError:
        sys.exit(2)

    settings = Sparse_Add_Models_Settings()
    for opt, arg in opts:
        if opt == '-f':
            settings.num_funcs = int(arg)
        elif opt == '-z':
            settings.num_zero_funcs = int(arg)
        elif opt == '-a':
            settings.train_size = int(arg)
        elif opt == '-b':
            settings.validate_size = int(arg)
        elif opt == '-c':
            settings.test_size = int(arg)
        elif opt == "-s":
            settings.snr = float(arg)
        elif opt == "-m":
            assert(arg in METHODS)
            settings.method = arg
        elif opt == "-t":
            num_threads = int(arg)
        elif opt == "-r":
            num_runs = int(arg)
        elif opt == "-i":
            settings.big_init_set = True

    # SP does not care about initialization
    assert(not (settings.big_init_set == True and settings.method in ["SP", "SP0"]))

    print "TOTAL NUM RUNS %d" % num_runs
    settings.print_settings()
    sys.stdout.flush()

    assert(settings.num_funcs <= len(settings.smooth_fcns))
    smooth_fcn_list = settings.smooth_fcns[:settings.num_funcs] + [const_zero] * settings.num_zero_funcs
    data_gen = DataGenerator(settings)

    run_data = []
    for i in range(num_runs):
        observed_data = data_gen.make_additive_smooth_data(smooth_fcn_list)
        run_data.append(Iteration_Data(i, observed_data, settings))

    if settings.method != "SP" and num_threads > 1:
        print "Do multiprocessing"
        pool = Pool(num_threads)
        results = pool.map(fit_data_for_iter_safe, run_data)
    else:
        print "Avoiding multiprocessing"
        results = map(fit_data_for_iter_safe, run_data)

    method_results = MethodResults(settings.method, settings.method_result_keys)
    num_crashes = 0
    for r in results:
        if r is not None:
            method_results.append(r)
        else:
            num_crashes += 1
    print "==========TOTAL RUNS %d============" % method_results.get_num_runs()
    method_results.print_results()
    print "num crashes %d" % num_crashes
def main(argv):
    try:
        opts, args = getopt.getopt(argv,"d:p")
    except getopt.GetoptError:
        print "BAD REQUEST"
        print "accepts a folder name. reads the XML files inside"
        sys.exit(2)

    RUN_HC_POOLED = False
    for opt, arg in opts:
        if opt == '-d':
            data_type = int(arg)
            if data_type == 0:
                TRAIN_SIZE = 10
                TOTAL_FEATURES = 30
                NUM_GROUPS = 3
            elif data_type == 1:
                TRAIN_SIZE = 60
                TOTAL_FEATURES = 300
                NUM_GROUPS = 30
            elif data_type == 2:
                TRAIN_SIZE = 90
                TOTAL_FEATURES = 900
                NUM_GROUPS = 60
            elif data_type == 3:
                TRAIN_SIZE = 90
                TOTAL_FEATURES = 1200
                NUM_GROUPS = 100
        elif opt == '-p':
            RUN_HC_POOLED = True

    TRUE_GROUP_FEATURE_SIZES = [TOTAL_FEATURES / TRUE_NUM_GROUPS] * TRUE_NUM_GROUPS
    EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES = [TOTAL_FEATURES / NUM_GROUPS] * NUM_GROUPS
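    # Python 2 integer division: the features are split evenly across the true groups
    # and across the expert-knowledge groups handed to the fitting methods.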

    COARSE_LAMBDA1S = [1, 1e-1]
    if RUN_HC_POOLED:
        print "RUN POOLED FOR GS and HC"
    else:
        print "UNPOOLED VS. POOLED"

    seed = np.random.randint(0, 1e5)
    seed = 10
    np.random.seed(seed)
    print "RANDOM SEED", seed
    print "TRAIN_SIZE", TRAIN_SIZE
    print "TOTAL_FEATURES", TOTAL_FEATURES
    print "NUM_GROUPS", NUM_GROUPS
    print "COARSE_LAMBDA1S", COARSE_LAMBDA1S

    def _hillclimb_coarse_grid_search(optimization_func, *args, **kwargs):
        start_time = time.time()
        best_cost = 1e10
        best_beta = []
        best_cost_path = []
        best_lambda = 0
        for init_lambda in COARSE_LAMBDA1S:
            kwargs["initial_lambda1"] = init_lambda
            beta_guess, cost_path = optimization_func(*args, **kwargs)
            if best_cost > cost_path[-1]:
                best_cost = cost_path[-1]
                best_cost_path = cost_path
                best_beta = beta_guess
                best_lambda = init_lambda
                print "init_lambda better!", init_lambda
                print "HC: best_cost", best_cost
                sys.stdout.flush()
        print "HC_FINAL: best_cost", best_cost, "best_lambda", best_lambda
        end_time = time.time()
        print "runtime", end_time - start_time
        return best_beta, best_cost_path, end_time - start_time

    hc_results = MethodResults(HC_GROUPED_LASSO_LABEL)
    hc_nesterov_results = MethodResults("NESTEROV")
    hc_pooled_results = MethodResults(HC_GROUPED_LASSO_LABEL + "_POOLED")
    hc_pooled_nesterov_results = MethodResults("NESTEROV_POOLED")
    nm_results = MethodResults("NELDER_MEAD")
    gs_results = MethodResults(GS_GROUPED_LASSO_LABEL)

    for i in range(0, NUM_RUNS):
        beta_reals, X_train, y_train, X_validate, y_validate, X_test, y_test = sparse_groups(TRAIN_SIZE, TRUE_GROUP_FEATURE_SIZES)
        def _create_method_result(beta_guesses, runtime):
            test_err = testerror_grouped(X_test, y_test, beta_guesses)
            validation_err = testerror_grouped(X_validate, y_validate, beta_guesses)
            beta_guesses_all = np.concatenate(beta_guesses)
            beta_reals_all = np.concatenate(beta_reals)
            beta_err = betaerror(beta_reals_all, beta_guesses_all)
            guessed_nonzero_elems = np.where(get_nonzero_indices(beta_guesses_all, threshold=ZERO_THRESHOLD))
            true_nonzero_elems = np.where(get_nonzero_indices(beta_reals_all, threshold=ZERO_THRESHOLD))
            intersection = np.intersect1d(np.array(guessed_nonzero_elems), np.array(true_nonzero_elems))
            sensitivity = intersection.size / float(guessed_nonzero_elems[0].size) * 100
            print "test_err", test_err, "beta_err", beta_err, "sensitivity", sensitivity
            sys.stdout.flush()
            return MethodResult(test_err=test_err, validation_err=validation_err, beta_err=beta_err, sensitivity=sensitivity, runtime=runtime)

        if RUN_HC_POOLED:
            hc_pooled_beta_guesses, hc_pooled_costpath, runtime = _hillclimb_coarse_grid_search(hc_pooled.run, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            hc_pooled_results.append(_create_method_result(hc_pooled_beta_guesses, runtime))

            # hc_pooled_nesterov_beta_guesses, hc_pooled_nesterov_costpath, runtime = _hillclimb_coarse_grid_search(hc_pooled.run_nesterov, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            # hc_pooled_nesterov_results.append(_create_method_result(hc_pooled_nesterov_beta_guesses, runtime))
        else:
            hc_beta_guesses, hc_costpath, runtime = _hillclimb_coarse_grid_search(hc.run, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            hc_results.append(_create_method_result(hc_beta_guesses, runtime))

            # hc_nesterov_beta_guesses, hc_nesterov_costpath, runtime = _hillclimb_coarse_grid_search(hc.run_nesterov, X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
            # hc_nesterov_results.append(_create_method_result(hc_nesterov_beta_guesses, runtime))

        nm_beta_guesses, runtime = nm.run(X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
        nm_results.append(_create_method_result(nm_beta_guesses, runtime))

        start = time.time()
        gs_beta_guesses, gs_lowest_cost = gridsearch_grouped_lasso.run(X_train, y_train, X_validate, y_validate, EXPERT_KNOWLEDGE_GROUP_FEATURE_SIZES)
        runtime = time.time() - start
        gs_results.append(_create_method_result(gs_beta_guesses, runtime))

        print "NUM RUN", i
        print "FEATURE GROUPS", TRUE_GROUP_FEATURE_SIZES
        print "NUM_GROUPS", NUM_GROUPS
        print "TRAIN SIZE", TRAIN_SIZE

        if RUN_HC_POOLED:
            hc_pooled_results.print_results()
            nm_results.print_results()
            hc_pooled_nesterov_results.print_results()
        else:
            hc_results.print_results()
            nm_results.print_results()
            hc_nesterov_results.print_results()
        gs_results.print_results()

        if GENERATE_PLOT and i == 0:
            plt.clf()
            if RUN_HC_POOLED:
                plt.plot(hc_pooled_costpath, label="Gradient Descent", color="red")
                plt.plot(hc_pooled_nesterov_costpath, label="Nesterov's Gradient Descent", color="blue")

                # Integer ticks only
                plt.xticks(np.arange(0, max(len(hc_pooled_costpath), len(hc_pooled_nesterov_costpath)), 1.0))
            else:
                plt.plot(hc_costpath, label=HC_GROUPED_LASSO_LABEL, color=HC_GROUPED_LASSO_COLOR)
                plt.plot(hc_nesterov_costpath, label="Nesterov", color="purple")

                # Integer ticks only
                plt.xticks(np.arange(0, max(len(hc_costpath), len(hc_nesterov_costpath)), 1.0))
            plt.axhline(gs_lowest_cost, label="Grid Search", color=GS_COLOR)
            plt.legend(fontsize="x-small")
            # plt.title("Train=%d p=%d, g=%d, m=%d" % (TRAIN_SIZE, TOTAL_FEATURES, TRUE_NUM_GROUPS, NUM_GROUPS))
            plt.xlabel("Number of iterations")
            plt.ylabel("Validation test error")
            plt.savefig("figures/grouped_lasso_%d_%d_%d_%d.png" % (TRAIN_SIZE, TOTAL_FEATURES, TRUE_NUM_GROUPS, NUM_GROUPS))

        sys.stdout.flush()