Example no. 1
def run_task(*_):
    env = normalize(GymEnv("DartWalker2d-v1", record_video=False))

    policy = GaussianHMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 64 and 32 hidden units.
        hidden_sizes=(64, 32),
        #subnet_split1=[5, 6, 7, 8, 9, 10, 23, 24, 25, 26, 27, 28],
        #subnet_split2=[11, 12, 13, 14, 15, 16, 29, 30, 31, 32, 33, 34],
        hlc_output_dim=3,
        sub_out_dim=3,
        option_dim=3,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo2 = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=15000,
        max_path_length=env.horizon,
        n_itr=3,
        discount=0.99,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
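    # Alternate with a separate high-level-controller (HLC) process via signal files:
    # after each training round here, write the current iteration index to this run's
    # signal file, then poll the dual experiment's signal file until it reports the
    # same index, load its policy snapshot, and transfer its weights via hlc2llc.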
    for i in range(100):
        algo2.current_itr = 0
        algo2.train()
        llc_signal_file = 'data/local/experiment/' + cur_exp_name + '/signalfile.txt'
        with open(llc_signal_file, 'w') as f:
            f.write(str(i))

        hlc_signal_file = 'data/local/experiment/' + dual_exp_name + '/signalfile.txt'
        hlc_policy_file = 'data/local/experiment/' + dual_exp_name + '/policy.pkl'
        while True:
            if os.path.isfile(hlc_signal_file):
                with open(hlc_signal_file, 'r') as f:
                    signal = int(f.read())
                print(signal, i)
                if signal == i:
                    dual_policy = joblib.load(hlc_policy_file)
                    hlc2llc(dual_policy, policy)
                    break
            time.sleep(20)  # sleep for 20 seconds before checking again
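
Example no. 1 omits its module-level imports. A minimal sketch, assuming an rllab-style layout (the TRPO used here takes extra epopt_* arguments, so it is presumably a fork); GaussianHMLPPolicy, hlc2llc, cur_exp_name and dual_exp_name are project-specific and defined elsewhere in the source file:

import os
import time

import joblib

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
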
Example no. 2
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['start_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],  # the goal is the last 9 coords
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=v['policy_hidden_sizes'],
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['baseline'] == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif v['baseline'] == 'g_mlp':
        baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v['pg_batch_size'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters'],
        step_size=0.01,
        discount=v['discount'],
        plot=False,
    )

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    with open(
            osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states.pkl'),
            'rb') as f:
        all_feasible_starts = pickle.load(f)
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb'))
    # all_feasible_starts = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad4.pkl'), 'rb'))

    # all_feasible_starts2 = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_min_rad4.pkl'), 'rb'))
    # all_feasible_starts3 = pickle.load(
    #     open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_max_rad2.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    brownian_starts = StateCollection(
        distance_threshold=v['regularize_starts'])
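    # all_starts is the replay buffer of start states accumulated across outer
    # iterations; brownian_starts only holds the current iteration's Brownian
    # samples, regularized through its distance_threshold.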

    logger.log(
        'Generating seed starts from the goal (horizon 10, subsample 600 of them)'
    )
    with algo.env.set_kill_outside(radius=v['kill_radius']):
        seed_starts = generate_starts(
            env,
            starts=[v['start_goal']],
            horizon=10,  # this is smaller as they are seeds!
            variance=v['brownian_variance'],
            subsample=v['num_new_starts'])  # , animated=True, speedup=10)

        # seed_starts = all_feasible_starts.states
        # with env.set_kill_outside(radius=0.4):
        # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'],
    #                 zero_action=True, animated=True, speedup=10)

    for outer_iter in range(1, v['outer_iters']):

        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        with algo.env.set_kill_outside(radius=v['kill_radius']):
            starts = generate_starts(algo.env,
                                     starts=seed_starts,
                                     horizon=v['brownian_horizon'],
                                     variance=v['brownian_variance'])
        # regularization of the brownian starts
        brownian_starts.empty()
        brownian_starts.append(starts)
        starts = brownian_starts.sample(size=v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir,
                              50 * (outer_iter // 50 + 1),
                              snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            algo.env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))
            # algo.start_worker()

            logger.log("Training the algorithm")

            algo.current_itr = 0
            trpo_paths = algo.train(already_init=outer_iter > 1)

        # import pdb; pdb.set_trace()
        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths,
                n_traj=2,
                key='goal_reached',  # using the min n_traj
                as_goal=False,
                env=algo.env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts,
                                         algo.env,
                                         policy,
                                         v['horizon'],
                                         as_goals=False,
                                         n_traj=v['n_traj'],
                                         key='goal_reached',
                                         full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            algo.env.log_diagnostics(paths)

        logger.record_tabular('brownian_starts', brownian_starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict()  # this needs to be an ordered dict (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        labels = np.logical_and(labels[:, 0],
                                labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_4med_"):
            unif_starts = all_feasible_starts.sample(500)
            unif_starts = np.pad(unif_starts,
                                 ((0, 0), (0, v['start_size'] - unif_starts.shape[1])),
                                 'constant')  # pad only the feature axis up to start_size
            mean_reward, paths = evaluate_states(unif_starts,
                                                 algo.env,
                                                 policy,
                                                 v['horizon'],
                                                 n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False,
                                                 full_path=True)
            algo.env.log_diagnostics(paths)
        # with logger.tabular_prefix("Uniform_4med_bis_"):
        #     unif_starts = all_feasible_starts.sample(200)
        #     unif_starts1bis = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant')
        #     mean_reward1bis, paths1bis = evaluate_states(unif_starts1bis, algo.env, policy, v['horizon'], n_traj=1,
        #                                                  key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths1bis)
        # with logger.tabular_prefix("Uniform_4min_"):
        #     unif_starts2 = all_feasible_starts2.sample(200)
        #     unif_starts2 = np.pad(unif_starts2, ((0, v['start_size'] - unif_starts2.shape[1])), 'constant')
        #     mean_reward2, paths2 = evaluate_states(unif_starts2, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths2)
        # with logger.tabular_prefix("Uniform_2max_"):
        #     unif_starts3 = all_feasible_starts3.sample(200)
        #     unif_starts3 = np.pad(unif_starts3, ((0, v['start_size'] - unif_starts3.shape[1])), 'constant')
        #     mean_reward3, paths3 = evaluate_states(unif_starts3, algo.env, policy, v['horizon'], n_traj=1,
        #                                            key='goal_reached', as_goals=False, full_path=True)
        #     algo.env.log_diagnostics(paths3)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer):
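        # 'only_goods': reseed only from starts labeled successful (falling back to the
        # replay buffer, or to much wider Brownian noise, if none qualify);
        # 'all_previous': reuse all of this iteration's starts; 'on_policy': roll out
        # the current policy to generate the next seeds.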
        if v['seed_with'] == 'only_goods':
            logger.log("Appending good goals to replay and generating seeds")
            filtered_raw_starts = [
                start for start, label in zip(starts, labels) if label[0] == 1
            ]
            all_starts.append(filtered_raw_starts)
            if len(filtered_raw_starts) > 0:
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(
                    start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(
                    300)  # sample them from the replay
            else:  # add a ton of noise if all the states ended up being high_reward!
                with algo.env.set_kill_outside(radius=v['kill_radius']):
                    seed_starts = generate_starts(
                        algo.env,
                        starts=starts,
                        horizon=int(v['horizon'] * 10),
                        subsample=v['num_new_starts'],
                        variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            logger.log("Appending all goals to replay and generating seeds")
            all_starts.append(starts)
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            all_starts.append(starts)
            with algo.env.set_kill_outside(radius=v['kill_radius']):
                seed_starts = generate_starts(algo.env,
                                              policy,
                                              horizon=v['horizon'],
                                              subsample=v['num_new_starts'])
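
Like Example no. 1, this snippet omits its module-level imports. A minimal sketch of the standard ones, assuming stock rllab paths (the curriculum-specific helpers such as GoalStartExplorationEnv, StateCollection, generate_starts, label_states, evaluate_states, HTMLReport and config live in the surrounding project and are not shown):

import os.path as osp
import pickle
import random
from collections import OrderedDict

import numpy as np

from rllab.algos.trpo import TRPO
from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
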
Example no. 3
def run_task(*_):
    env = normalize(GymEnv("DartWalker3d-v1", record_video=False))

    policy_int = GaussianHMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 64 and 16 hidden units.
        hidden_sizes=(64, 16),
        subnet_split1=[5, 6, 7, 8, 9, 21, 22, 23, 24, 25],
        subnet_split2=[10, 11, 12, 13, 14, 26, 27, 28, 29, 30],
        sub_out_dim=6,
        option_dim=4,
    )

    policy_sep = GaussianHLCPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, with 64 and 16 hidden units.
        hidden_sizes=(64, 16),
        subnet_split1=[5, 6, 7, 8, 9, 21, 22, 23, 24, 25],
        subnet_split2=[10, 11, 12, 13, 14, 26, 27, 28, 29, 30],
        sub_out_dim=6,
        option_dim=4,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo1 = TRPO(
        env=env,
        policy=policy_int,
        baseline=baseline,
        batch_size=500,
        max_path_length=env.horizon,
        n_itr=2,
        discount=0.99,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    algo2 = TRPO(
        env=env,
        policy=policy_sep,
        baseline=baseline,
        batch_size=500,
        max_path_length=env.horizon,
        n_itr=2,
        discount=0.99,
        step_size=0.01,
        epopt_epsilon=1.0,
        epopt_after_iter=0,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    # copy parameter from integrated controller to separate controller
    def int2sep():
        # sync the weights
        hrl_pol_param = policy_int._mean_network.get_params()
        hlc_param = policy_sep._mean_network.get_params()
        llc_param = policy_sep._lowlevelnetwork.get_params()

        for param in hlc_param:
            for hrl_param in hrl_pol_param:
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

        for param in llc_param:
            for hrl_param in hrl_pol_param:
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

    # copy parameter from separate controller to integrated controller
    def sep2int():
        hrl_pol_param = policy_int._mean_network.get_params()
        hlc_param = policy_sep._mean_network.get_params()
        llc_param = policy_sep._lowlevelnetwork.get_params()
        for param in hrl_pol_param:
            for hrl_param in hlc_param:
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

        for param in hrl_pol_param:
            for hrl_param in llc_param:
                if param.name == hrl_param.name:
                    param.set_value(hrl_param.get_value(borrow=True))

    for i in range(100):
        algo1.current_itr = 0
        algo2.current_itr = 0
        algo2.train(continue_learning=(i > 0))
        sep2int()

        algo1.train(continue_learning=(i > 0))
        int2sep()
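
Both int2sep and sep2int implement the same copy-parameters-by-name pattern over lists of Theano shared variables returned by get_params(). A minimal sketch of how that pattern could be factored out (copy_params_by_name is a hypothetical helper, not part of the original source):

def copy_params_by_name(src_params, dst_params):
    # Copy values between two lists of Theano shared variables whenever names match.
    src_by_name = {p.name: p for p in src_params}
    for param in dst_params:
        if param.name in src_by_name:
            param.set_value(src_by_name[param.name].get_value(borrow=True))

# sep2int() would then reduce to:
# copy_params_by_name(policy_sep._mean_network.get_params(), policy_int._mean_network.get_params())
# copy_params_by_name(policy_sep._lowlevelnetwork.get_params(), policy_int._mean_network.get_params())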