Example #1
def train_m_then_n_models(m, n, counter, total_evals, start_time, **kwargs):
    """Train m models on hyperparameter assignments from get_k_sorted_hparams,
    then retrain the best assignment n more times with distinct filename
    suffixes."""
    best_assignment = None
    best_valid_err = 1
    all_assignments = get_k_sorted_hparams(m)
    for i in range(m):
        cur_assignments = all_assignments[i]
        args = ExperimentParams(**kwargs, **cur_assignments)
        cur_valid_err, _, _ = train_classifier.main(args)
        if cur_valid_err < best_valid_err:
            best_assignment = cur_assignments
            best_valid_err = cur_valid_err
        counter[0] = counter[0] + 1
        print(
            "trained {} out of {} hyperparameter assignments, so far {} seconds"
            .format(counter[0], total_evals, round(time.time() - start_time,
                                                   3)))

    for i in range(n):
        args = ExperimentParams(filename_suffix="_{}".format(i),
                                **kwargs,
                                **best_assignment)
        cur_valid_err, _, _ = train_classifier.main(args)
        counter[0] = counter[0] + 1
        print(
            "trained {} out of {} hyperparameter assignments, so far {} seconds"
            .format(counter[0], total_evals, round(time.time() - start_time,
                                                   3)))
    return best_assignment
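The helper get_k_sorted_hparams is called here but not shown in this listing. Judging from its call sites, it returns a list of hyperparameter dicts, each splatted into ExperimentParams and containing at least an "lr" key; later call sites also pass explicit learning-rate bounds and sort=False. Below is a minimal, hypothetical sketch of such a helper; the sampled keys and default bounds are assumptions borrowed from the explicit assignment dict in Example #5, not the project's real implementation.

import numpy as np

def get_k_sorted_hparams(k, lr_lower_bound=1e-3, lr_upper_bound=0.5, sort=True):
    # Sample k random hyperparameter assignments: log-uniform for the learning
    # rate and weight decay, uniform for the clipping and dropout values, and
    # (optionally) sort them by learning rate so similar runs are grouped.
    assignments = []
    for _ in range(k):
        assignments.append({
            "lr": float(np.exp(np.random.uniform(np.log(lr_lower_bound),
                                                 np.log(lr_upper_bound)))),
            "clip_grad": float(np.random.uniform(1.0, 5.0)),
            "dropout": float(np.random.uniform(0.0, 0.5)),
            "rnn_dropout": float(np.random.uniform(0.0, 0.5)),
            "embed_dropout": float(np.random.uniform(0.0, 0.5)),
            "weight_decay": float(np.exp(np.random.uniform(np.log(1e-7),
                                                           np.log(1e-4)))),
        })
    if sort:
        assignments.sort(key=lambda a: a["lr"])
    return assignments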
def main(argv):
    """Evaluate (or visualize) each saved model together with its matching
    d_out and pattern."""
    is_bert = experiment_tools.str2bool(
        experiment_tools.select_param_value('BERT_EMBED', argv.bert_embed))
    loaded_embedding = experiment_tools.preload_embed(
        os.path.join(argv.base_data_dir, argv.dataset), is_bert, False)

    models = argv.input_model.split(",")
    d_outs = argv.d_out.split("_")
    patterns = argv.pattern.split("_")

    for (model, d_out, pattern) in zip(models, d_outs, patterns):
        print("Checking model {} with pattern={} and d_out={}".format(model, d_out, pattern))
        # a basic experiment
        args = ExperimentParams(pattern=pattern, d_out=d_out,
                                seed=argv.seed, loaded_embedding=loaded_embedding,
                                dataset=argv.dataset, use_rho=False,
                                depth=argv.depth, gpu=argv.gpu,
                                batch_size=argv.batch_size, use_last_cs=argv.use_last_cs,
                                base_data_dir=argv.base_data_dir, input_model=model,
                                weight_norm=argv.weight_norm,
                                bert_embed=is_bert)

        if argv.visualize > 0:
            train_classifier.main_visualize(
                args, os.path.join(argv.base_data_dir, argv.dataset), argv.visualize)
        else:
            _ = train_classifier.main_test(args)

    return 0
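The split() calls above imply a small command-line convention: comma-separated model paths, with the matching d_out and pattern lists joined by underscores. A short illustration of how one triple per model falls out of the zip (the concrete values here are made up):

models = "model_a.pth,model_b.pth".split(",")   # --input_model
d_outs = "24_12".split("_")                     # --d_out
patterns = "4-gram_3-gram".split("_")           # --pattern
print(list(zip(models, d_outs, patterns)))
# [('model_a.pth', '24', '4-gram'), ('model_b.pth', '12', '3-gram')]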
Example #3
def try_load_data(data, category, **kwargs):
    """Append the dev set described by kwargs to data and return the built
    ExperimentParams, or None if the file is missing. (The category argument
    is accepted but unused in this snippet.)"""
    args = ExperimentParams(**kwargs)
    try:
        dev = load_from_file(args)
        data.append(dev)
        return args
    except FileNotFoundError:
        return
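A hedged usage sketch for try_load_data: loop over candidate categories, keep whichever dev sets actually exist on disk, and remember the ExperimentParams that loaded. The category names and keyword arguments below are illustrative only, and the sketch assumes the surrounding imports of the original module.

dev_sets = []
loaded_args = []
for category in ["books/", "dvd/", "kitchen/"]:  # illustrative categories
    args = try_load_data(dev_sets, category,
                         dataset="amazon_categories/" + category,
                         pattern="4-gram",
                         d_out="24")
    if args is not None:
        loaded_args.append(args)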
Example #4
def main():
    """Load saved norm values, arrange them into groups, and plot them sorted
    by decrease."""
    args = ExperimentParams()
    norms = load_norms.load_from_file(args)
    # num_groups is presumably a module-level constant in the source file
    assert len(norms) % num_groups == 0

    data, largest_point, smallest_point = arrange_data(norms)
    sorted_data = sort_data_by_decrease(data)

    many_plots(sorted_data, largest_point, smallest_point, args)
Example #5
def main(argv):
    """Assemble ExperimentParams from CLI flags, optional environment-variable
    overrides, and either fixed or randomly sampled regularization
    hyperparameters, then train a single classifier."""
    loaded_embedding = experiment_tools.preload_embed(
        os.path.join(argv.base_dir, argv.dataset))

    if argv.random_selection or 'RANDOM_SELECTION' in os.environ:
        hyper_parameters_assignments = regularization_search_experiments.hparam_sample()
    else:
        hyper_parameters_assignments = {
            "clip_grad": argv.clip,
            "dropout": argv.dropout,
            "rnn_dropout": argv.rnn_dropout,
            "embed_dropout": argv.embed_dropout,
            "lr": argv.lr,
            "weight_decay": argv.weight_decay,
            "depth": argv.depth
        }

    parameters = {
        'pattern': experiment_tools.select_param_value('PATTERN', argv.pattern),
        'd_out': experiment_tools.select_param_value('D_OUT', argv.d_out),
        'seed': int(experiment_tools.select_param_value('SEED', argv.seed)),
        'learned_structure': experiment_tools.select_param_value(
            'LEARNED_STRUCTURE', argv.learned_structure),
        'semiring': experiment_tools.select_param_value('SEMIRING', argv.semiring)
    }

    kwargs = {
        "reg_goal_params": argv.reg_goal_params,
        "filename_prefix": argv.filename_prefix,
        "loaded_embedding": loaded_embedding,
        "dataset": argv.dataset,
        "use_rho": False,
        "gpu": argv.gpu,
        "max_epoch": argv.max_epoch,
        "patience": argv.patience,
        "batch_size": argv.batch_size,
        "use_last_cs": argv.use_last_cs,
        "logging_dir": argv.logging_dir,
        "base_data_dir": argv.base_dir,
        "output_dir": argv.model_save_dir,
        "reg_strength": argv.reg_strength,
        "sparsity_type": argv.sparsity_type
    }

    args = ExperimentParams(**kwargs, **parameters,
                            **hyper_parameters_assignments)

    print(args)

    _ = train_classifier.main(args)
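Several of these examples route parameters through experiment_tools.select_param_value, pairing an environment-variable name with a command-line default. Its implementation is not shown in this listing; given how it is used alongside os.environ checks, it presumably prefers the environment override when one is set. A hypothetical stand-in for illustration only, not the project's actual code:

import os

def select_param_value(env_name, cli_value):
    # Prefer an environment-variable override (e.g. PATTERN, D_OUT, SEED)
    # over the value parsed from the command line.
    return os.environ.get(env_name, cli_value)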
def train_m_then_n_models(m, n, counter, total_evals, start_time, **kwargs):
    """Like the version in Example #1, but the learning-rate bounds for the
    hyperparameter search depend on whether BERT embeddings are used."""
    if kwargs["bert_embed"]:
        lr_lower_bound = BERT_LR_LOWER_BOUND
        lr_upper_bound = BERT_LR_UPPER_BOUND
    else:
        lr_lower_bound = LR_LOWER_BOUND
        lr_upper_bound = LR_UPPER_BOUND
    best_assignment = None
    best_valid_err = 1
    all_assignments = get_k_sorted_hparams(m, lr_lower_bound, lr_upper_bound)
    for i in range(m):
        cur_assignments = all_assignments[i]
        args = ExperimentParams(counter=counter[0],
                                **kwargs,
                                **cur_assignments)
        cur_valid_err, _, _ = train_classifier.main(args)
        if cur_valid_err < best_valid_err:
            best_assignment = cur_assignments
            best_valid_err = cur_valid_err
        counter[0] = counter[0] + 1
        print(
            "trained {} out of {} hyperparameter assignments, so far {} seconds"
            .format(counter[0], total_evals, round(time.time() - start_time,
                                                   3)))

    for i in range(n):
        args = ExperimentParams(counter=counter[0],
                                filename_suffix="_{}".format(i),
                                **kwargs,
                                **best_assignment)
        cur_valid_err, _, _ = train_classifier.main(args)
        counter[0] = counter[0] + 1
        print(
            "trained {} out of {} hyperparameter assignments, so far {} seconds"
            .format(counter[0], total_evals, round(time.time() - start_time,
                                                   3)))
    return best_assignment
def search_reg_str_entropy(cur_assignments, kwargs):
    """Tune reg_strength for rho_entropy sparsity: shrink it until enough rho
    values fall below the bound after one epoch, then grow it until not too
    many remain below the bound after five epochs."""
    starting_reg_str = kwargs["reg_strength"]
    file_base = ("/home/jessedd/projects/rational-recurrences/classification/logging/"
                 + kwargs["dataset"])
    found_small_enough_reg_str = False
    # first pass (1 epoch): shrink reg_strength until at least a quarter of the
    # rho values fall below rho_bound
    kwargs["max_epoch"] = 1
    counter = 0
    rho_bound = .99
    while not found_small_enough_reg_str:
        counter += 1
        args = ExperimentParams(**kwargs, **cur_assignments)
        cur_valid_err, cur_test_err = train_classifier.main(args)

        learned_pattern, learned_d_out, frac_under_pointnine = load_learned_structure.entropy_rhos(
            file_base + args.filename() + ".txt", rho_bound)
        print("fraction under {}: {}".format(rho_bound, frac_under_pointnine))
        print("")
        if frac_under_pointnine < .25:
            kwargs["reg_strength"] = kwargs["reg_strength"] / 2.0
            if kwargs["reg_strength"] < 10**-7:
                kwargs["reg_strength"] = starting_reg_str
                return counter, "too_big_lr"
        else:
            found_small_enough_reg_str = True

    # second pass (5 epochs): grow reg_strength until no more than a quarter of
    # the rho values fall below rho_bound
    found_large_enough_reg_str = False
    kwargs["max_epoch"] = 5
    rho_bound = .9
    while not found_large_enough_reg_str:
        counter += 1
        args = ExperimentParams(**kwargs, **cur_assignments)
        cur_valid_err, cur_test_err = train_classifier.main(args)

        learned_pattern, learned_d_out, frac_under_pointnine = load_learned_structure.entropy_rhos(
            file_base + args.filename() + ".txt", rho_bound)
        print("fraction under {}: {}".format(rho_bound, frac_under_pointnine))
        print("")
        if frac_under_pointnine > .25:
            kwargs["reg_strength"] = kwargs["reg_strength"] * 2.0
            if kwargs["reg_strength"] > 10**4:
                kwargs["reg_strength"] = starting_reg_str
                return counter, "too_small_lr"
        else:
            found_large_enough_reg_str = True
    # to set this back to the default
    kwargs["max_epoch"] = 500
    return counter, "okay_lr"
Example #8
def main():
    """Dispatch one of several hard-coded experiments, selected by exp_num."""
    exp_num = 6

    if exp_num != -2:
        loaded_embedding = preload_embed()
    else:
        loaded_data = preload_data()

    start_time = time.time()
    counter = [0]
    categories = get_categories()

    if exp_num == -2:
        patterns = ["4-gram", "3-gram", "2-gram", "1-gram"]
        m = 20
        n = 5
        total_evals = (m + n) * len(patterns)

        for pattern in patterns:
            train_m_then_n_models(
                m,
                n,
                counter,
                total_evals,
                start_time,
                pattern=pattern,
                d_out="24",
                depth=1,
                filename_prefix="all_cs_and_equal_rho/hparam_opt/",
                dataset="bert/sst/",
                use_rho=False,
                seed=None,
                bert_embed=True,
                batch_size=32,
                loaded_data=loaded_data)

    if exp_num == -1:

        args = ExperimentParams(
            pattern="4-gram",
            d_out="24",
            reg_goal_params=20,
            filename_prefix=
            "all_cs_and_equal_rho/saving_model_for_interpretability/",
            seed=314159,
            loaded_embedding=loaded_embedding,
            dataset="amazon_categories/original_mix/",
            use_rho=False,
            clip_grad=2.82,
            dropout=0.1809,
            rnn_dropout=0.1537,
            embed_dropout=0.3141,
            lr=2.407E-02,
            weight_decay=3.64E-07,
            depth=1,
            reg_strength=3.125E-04,
            sparsity_type="states")
        cur_valid_err, _, _ = train_classifier.main(args)

    # a basic experiment
    if exp_num == 0:
        args = ExperimentParams(use_rho=True,
                                pattern="4-gram",
                                sparsity_type="rho_entropy",
                                rho_sum_to_one=True,
                                reg_strength=0.01,
                                d_out="23",
                                lr=0.001,
                                seed=34159)
        train_classifier.main(args)

    # finding the largest learning rate that doesn't diverge, for evaluating the claims in this paper:
    # The Marginal Value of Adaptive Gradient Methods in Machine Learning
    # https://arxiv.org/abs/1705.08292
    # conclusion: their results don't hold for our models.
    elif exp_num == 1:
        lrs = np.linspace(2, 0.1, 10)
        for lr in lrs:
            args = ExperimentParams(pattern="4-gram",
                                    d_out="256",
                                    trainer="sgd",
                                    max_epoch=3,
                                    lr=lr,
                                    filename_prefix="lr_tuning/")
            train_classifier.main(args)

    # baseline experiments for 1-gram up to 4-gram models
    elif exp_num == 3:
        patterns = ["4-gram", "3-gram", "2-gram", "1-gram"]
        m = 20
        n = 5
        total_evals = len(categories) * (len(patterns) + 1) * (m + n)

        for category in categories:
            for pattern in patterns:
                train_m_then_n_models(
                    m,
                    n,
                    counter,
                    total_evals,
                    start_time,
                    pattern=pattern,
                    d_out="24",
                    depth=1,
                    filename_prefix="all_cs_and_equal_rho/hparam_opt/",
                    dataset="amazon_categories/" + category,
                    use_rho=False,
                    seed=None,
                    loaded_embedding=loaded_embedding)

            train_m_then_n_models(
                m,
                n,
                counter,
                total_evals,
                start_time,
                pattern="1-gram,2-gram,3-gram,4-gram",
                d_out="6,6,6,6",
                depth=1,
                filename_prefix="all_cs_and_equal_rho/hparam_opt/",
                dataset="amazon_categories/" + category,
                use_rho=False,
                seed=None,
                loaded_embedding=loaded_embedding)

    # to learn with an L_1 regularizer
    # first train with the regularizer, choose the best structure, then do hyperparameter search for that structure
    elif exp_num == 6:
        d_out = "24"
        k = 20
        l = 5
        m = 20
        n = 5
        reg_goal_params_list = [80, 60, 40, 20]
        total_evals = len(categories) * (m + n + k +
                                         l) * len(reg_goal_params_list)

        all_reg_search_counters = []

        for category in categories:
            for reg_goal_params in reg_goal_params_list:
                best, reg_search_counters = regularization_search_experiments.train_k_then_l_models(
                    k,
                    l,
                    counter,
                    total_evals,
                    start_time,
                    logging_dir=
                    "/home/jessedd/projects/rational-recurrences/classification/logging/",
                    reg_goal_params=reg_goal_params,
                    pattern="4-gram",
                    d_out=d_out,
                    sparsity_type="states",
                    use_rho=False,
                    filename_prefix=
                    "all_cs_and_equal_rho/hparam_opt/structure_search/add_reg_term_to_loss/",
                    seed=None,
                    loaded_embedding=loaded_embedding,
                    reg_strength=8 * 10**-6,
                    distance_from_target=10,
                    dataset="amazon_categories/" + category)

                all_reg_search_counters.append(reg_search_counters)

                args = train_m_then_n_models(
                    m,
                    n,
                    counter,
                    total_evals,
                    start_time,
                    pattern=best['learned_pattern'],
                    d_out=best["learned_d_out"],
                    learned_structure="l1-states-learned",
                    reg_goal_params=reg_goal_params,
                    filename_prefix=
                    "all_cs_and_equal_rho/hparam_opt/structure_search/add_reg_term_to_loss/",
                    seed=None,
                    loaded_embedding=loaded_embedding,
                    dataset="amazon_categories/" + category,
                    use_rho=False)
        print("search counters:")
        for search_counter in all_reg_search_counters:
            print(search_counter)

    # some rho_entropy experiments
    elif exp_num == 8:
        k = 20
        l = 5
        total_evals = len(categories) * (k + l)

        for d_out in ["24"]:  #, "256"]:
            for category in categories:
                # to learn the structure, and train with the regularizer
                best, reg_search_counters = regularization_search_experiments.train_k_then_l_models(
                    k,
                    l,
                    counter,
                    total_evals,
                    start_time,
                    use_rho=True,
                    pattern="4-gram",
                    sparsity_type="rho_entropy",
                    rho_sum_to_one=True,
                    reg_strength=1,
                    d_out=d_out,
                    filename_prefix="only_last_cs/hparam_opt/reg_str_search/",
                    dataset="amazon_categories/" + category,
                    seed=None,
                    distance_from_target=10,
                    loaded_embedding=loaded_embedding)

    # baseline for rho_entropy experiments
    elif exp_num == 9:
        categories = ["dvd/"]
        patterns = ["1-gram",
                    "2-gram"]  #["4-gram", "3-gram", "2-gram", "1-gram"]
        m = 20
        n = 5
        total_evals = len(categories) * (len(patterns) + 1) * (m + n)

        for category in categories:
            for pattern in patterns:
                # train and eval the learned structure
                args = train_m_then_n_models(
                    m,
                    n,
                    counter,
                    total_evals,
                    start_time,
                    pattern=pattern,
                    d_out="24",
                    filename_prefix="only_last_cs/hparam_opt/",
                    dataset="amazon_categories/" + category,
                    use_last_cs=True,
                    use_rho=False,
                    seed=None,
                    loaded_embedding=loaded_embedding)

    # baseline experiments for l1 regularization, on sst. very similar to exp_num 3
    elif exp_num == 10:
        patterns = ["4-gram", "3-gram", "2-gram", "1-gram"]
        m = 20
        n = 5
        total_evals = m * n
        for pattern in patterns:
            train_m_then_n_models(
                m,
                n,
                counter,
                total_evals,
                start_time,
                pattern=pattern,
                d_out="24",
                depth=1,
                filename_prefix="all_cs_and_equal_rho/hparam_opt/",
                dataset="sst/",
                use_rho=False,
                seed=None,
                loaded_embedding=loaded_embedding)

        train_m_then_n_models(
            m,
            n,
            counter,
            total_evals,
            start_time,
            pattern="1-gram,2-gram,3-gram,4-gram",
            d_out="6,6,6,6",
            depth=1,
            filename_prefix="all_cs_and_equal_rho/hparam_opt/",
            dataset="sst/",
            use_rho=False,
            seed=None,
            loaded_embedding=loaded_embedding)

    elif exp_num == 11:

        args = ExperimentParams(
            pattern="1-gram,2-gram,3-gram,4-gram",
            d_out="0,4,0,2",
            learned_structure="l1-states-learned",
            reg_goal_params=20,
            filename_prefix=
            "all_cs_and_equal_rho/saving_model_for_interpretability/",
            seed=None,
            loaded_embedding=loaded_embedding,
            dataset="amazon_categories/original_mix/",
            use_rho=False,
            clip_grad=1.09,
            dropout=0.1943,
            rnn_dropout=0.0805,
            embed_dropout=0.3489,
            lr=2.553E-02,
            weight_decay=1.64E-06,
            depth=1,
            batch_size=5)
        cur_valid_err, _, _ = train_classifier.main(args)
def search_reg_str_l1(cur_assignments,
                      kwargs,
                      global_counter,
                      distance_from_target=10):
    """Search for an L1 reg_strength whose learned structure lands within
    distance_from_target parameters of kwargs["reg_goal_params"], halving or
    doubling the strength (with a damped growth rate) as needed."""
    smallest_reg_str = 10**-9
    largest_reg_str = 10**2
    starting_reg_str = kwargs["reg_strength"]
    found_good_reg_str = False
    too_small = False
    too_large = False
    counter = 0
    reg_str_growth_rate = 2.0
    reduced_model_path = ""

    while not found_good_reg_str:
        # deleting models which aren't going to be used

        save_learned_structure.remove_old(reduced_model_path)

        # if more than 25 regularization strengths have been tried, throw out hparam assignment and resample
        if counter > 25:
            kwargs["reg_strength"] = starting_reg_str
            return counter, "bad_hparams", cur_valid_err, learned_d_out, reduced_model_path

        counter += 1
        args = ExperimentParams(counter=global_counter,
                                **kwargs,
                                **cur_assignments)
        cur_valid_err, learned_d_out, reduced_model_path = train_classifier.main(
            args)

        num_params = sum([
            int(learned_d_out.split(",")[i]) * (i + 1)
            for i in range(len(learned_d_out.split(",")))
        ])

        if num_params < kwargs["reg_goal_params"] - distance_from_target:
            if too_large:
                # reduce size of steps for reg strength
                reg_str_growth_rate = (reg_str_growth_rate + 1) / 2.0
                too_large = False
            too_small = True
            kwargs["reg_strength"] = kwargs["reg_strength"] / reg_str_growth_rate
            if kwargs["reg_strength"] < smallest_reg_str:
                kwargs["reg_strength"] = starting_reg_str
                return counter, "too_small_lr", cur_valid_err, learned_d_out, reduced_model_path
        elif num_params > kwargs["reg_goal_params"] + distance_from_target:
            if too_small:
                # reduce size of steps for reg strength
                reg_str_growth_rate = (reg_str_growth_rate + 1) / 2.0
                too_small = False
            too_large = True
            kwargs["reg_strength"] = kwargs["reg_strength"] * reg_str_growth_rate

            if kwargs["reg_strength"] > largest_reg_str:
                kwargs["reg_strength"] = starting_reg_str

                # it diverged, and for some reason the weights didn't drop
                if num_params == int(args.d_out) * 4 and cur_assignments[
                        "lr"] > .1 and cur_valid_err > .3:
                    return counter, "too_big_lr", cur_valid_err, learned_d_out, reduced_model_path
                else:
                    return counter, "too_small_lr", cur_valid_err, learned_d_out, reduced_model_path
        else:
            found_good_reg_str = True

    return counter, "okay_lr", cur_valid_err, learned_d_out, reduced_model_path
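Inside search_reg_str_l1, num_params weights the i-th comma-separated entry of learned_d_out by (i + 1), matching the fixed "1-gram,2-gram,3-gram,4-gram" pattern order used during structure learning. A quick worked check of that arithmetic, using the learned structure from experiment 11 in Example #8 above (the semantics of each entry aside, the sum simply weights entry i by i + 1):

learned_d_out = "0,4,0,2"   # as in exp_num == 11 above
num_params = sum(int(d) * (i + 1)
                 for i, d in enumerate(learned_d_out.split(",")))
assert num_params == 0 * 1 + 4 * 2 + 0 * 3 + 2 * 4 == 16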
def train_k_then_l_models(k, l, counter, total_evals, start_time, logging_dir,
                          distance_from_target, **kwargs):
    """Learn k regularized structures (resampling hyperparameter assignments
    whose learning rate proves unusable), fine-tune each, keep the best, then
    retrain and fine-tune the best assignment l more times."""
    if "seed" in kwargs and kwargs["seed"] is not None:
        np.random.seed(kwargs["seed"])

    assert "reg_strength" in kwargs
    if "prox_step" not in kwargs:
        kwargs["prox_step"] = False
    elif kwargs["prox_step"]:
        assert False, "It's too unstable. books/all_cs_and_equal_rho/hparam_opt/structure_search/proximal_gradient too big then too small"
    assert kwargs["sparsity_type"] == "states", \
        "setting kwargs for structure learning works only with states"
    assert "lr_patience" not in kwargs, "lr_patience is set s.t. the lr never decreases during structure learning."
    kwargs["logging_dir"] = logging_dir

    file_base = logging_dir + kwargs["dataset"]
    best = {
        "assignment": None,
        "valid_err": 1,
        "learned_pattern": None,
        "learned_d_out": None,
        "reg_strength": None
    }

    reg_search_counters = []
    if kwargs["bert_embed"]:
        lr_lower_bound = BERT_LR_LOWER_BOUND
        lr_upper_bound = BERT_LR_UPPER_BOUND
    else:
        lr_lower_bound = LR_LOWER_BOUND
        lr_upper_bound = LR_UPPER_BOUND
    all_assignments = get_k_sorted_hparams(k, lr_lower_bound, lr_upper_bound)
    for i in range(len(all_assignments)):

        valid_assignment = False
        while not valid_assignment:
            cur_assignments = all_assignments[i]

            # to prevent the learning rate from decreasing during structure learning
            kwargs["lr_patience"] = 9999999

            if kwargs["sparsity_type"] == "rho_entropy":
                one_search_counter, lr_judgement = search_reg_str_entropy(
                    cur_assignments, kwargs)
            elif kwargs["sparsity_type"] == "states":
                one_search_counter, lr_judgement, cur_valid_err, learned_d_out, reduced_model_path = search_reg_str_l1(
                    cur_assignments, kwargs, counter[0], distance_from_target)
                learned_pattern = "1-gram,2-gram,3-gram,4-gram"

            del kwargs["lr_patience"]

            reg_search_counters.append(one_search_counter)
            if lr_judgement == "okay_lr":
                valid_assignment = True
            else:
                save_learned_structure.remove_old(reduced_model_path)
                new_assignments = get_k_sorted_hparams(k - i,
                                                       lr_lower_bound,
                                                       lr_upper_bound,
                                                       sort=False)
                all_assignments[i:len(all_assignments)] = new_assignments

                #if lr_judgement == "too_big_lr":
                #    # lower the upper bound
                #    lr_upper_bound = cur_assignments['lr']
                #    reverse = True
                #elif lr_judgement == "too_small_lr":
                #    # raise lower bound
                #    lr_lower_bound = cur_assignments['lr']
                #    reverse = False
                #else:
                #    assert False, "shouldn't be here."
                #new_assignments = get_k_sorted_hparams(k-i, lr_lower_bound, lr_upper_bound)
                #if reverse:
                #    new_assignments.reverse()
                #all_assignments[i:len(all_assignments)] = new_assignments

        # to fine tune the learned model
        kwargs_fine_tune = get_kwargs_for_fine_tuning(kwargs,
                                                      reduced_model_path,
                                                      learned_d_out,
                                                      learned_pattern)
        args = ExperimentParams(counter=counter[0],
                                **kwargs_fine_tune,
                                **cur_assignments)
        cur_valid_err, _, _ = train_classifier.main(args)

        if cur_valid_err < best["valid_err"]:
            best = {
                "assignment": cur_assignments,
                "valid_err": cur_valid_err,
                "learned_pattern": learned_pattern,
                "learned_d_out": learned_d_out,
                "reg_strength": kwargs["reg_strength"]
            }

        counter[0] = counter[0] + 1
        print(
            "trained {} out of {} hyperparameter assignments, so far {} seconds"
            .format(counter[0], total_evals, round(time.time() - start_time,
                                                   3)))

    kwargs["reg_strength"] = best["reg_strength"]
    for i in range(l):
        kwargs["lr_patience"] = 9999999
        args = ExperimentParams(counter=counter[0],
                                filename_suffix="_{}".format(i),
                                **kwargs,
                                **best["assignment"])
        cur_valid_err, learned_d_out, reduced_model_path = train_classifier.main(
            args)
        del kwargs["lr_patience"]

        # to fine tune the model trained on the above line
        kwargs_fine_tune = get_kwargs_for_fine_tuning(kwargs,
                                                      reduced_model_path,
                                                      learned_d_out,
                                                      learned_pattern)
        args = ExperimentParams(counter=counter[0],
                                filename_suffix="_{}".format(i),
                                **kwargs_fine_tune,
                                **best["assignment"])
        cur_valid_err, learned_d_out, reduced_model_path = train_classifier.main(
            args)

        counter[0] = counter[0] + 1

    return best, reg_search_counters
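Throughout these examples the running tally of finished training runs is passed as counter, a one-element list, so that helpers can increment the caller's count in place (rebinding a plain int inside a helper would not be visible outside). A minimal sketch of that convention:

counter = [0]          # shared, mutable tally

def finish_one_run(counter):
    counter[0] += 1    # mutation is visible to every holder of the list

finish_one_run(counter)
finish_one_run(counter)
print(counter[0])      # 2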