Example #1
def cli(seed, buffer_size, n_layers, layer_size, learning_rate, entropy_scale,
        reward_scale, batch_size, num_train_steps, logdir, save_path, load_path, render,
        grad_clip, map_dims, max_steps, random_map, random_start, random_goal,
        is_slippery, default_reward, boss_freq, worker_oracle, boss_oracle):
    env = TimeLimit(
        env=FrozenLakeEnv(
            map_dims=map_dims,
            random_map=random_map,
            random_start=random_start,
            random_goal=random_goal,
            is_slippery=is_slippery,
            default_reward=default_reward,
        ),
        max_episode_steps=max_steps)

    kwargs = dict(
        sess=create_sess(),
        base_agent=MlpAgent,
        seq_len=0,
        device_num=1,
        seed=seed,
        buffer_size=buffer_size,
        activation=tf.nn.relu,
        n_layers=n_layers,
        layer_size=layer_size,
        learning_rate=learning_rate,
        entropy_scale=entropy_scale,
        reward_scale=reward_scale,
        batch_size=batch_size,
        grad_clip=grad_clip,
        num_train_steps=num_train_steps,
    )
    if boss_freq:
        trainer = HierarchicalTrainer(
            boss_act_freq=boss_freq,
            use_worker_oracle=worker_oracle,
            use_boss_oracle=boss_oracle,
            env=FrozenLakeHierarchicalWrapper(env),
            **kwargs)
    else:
        trainer = Trainer(env=env, **kwargs)

    trainer.train(load_path=load_path, logdir=logdir, render=render, save_path=save_path)
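These entry points appear to take their arguments from the command line; the decorator stack is not shown in these snippets. Below is a minimal, hypothetical wiring sketch assuming a click-style CLI (the option names and defaults are illustrative, not the project's actual flags):

# Hypothetical wiring sketch only; the real entry point exposes many more options.
import click

@click.command()
@click.option('--seed', type=int, default=0)
@click.option('--max-steps', type=int, default=100)
@click.option('--render/--no-render', default=False)
def cli(seed, max_steps, render):
    print(f'seed={seed} max_steps={max_steps} render={render}')

if __name__ == '__main__':
    cli()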
Example #2
def cli(
    seed,
    buffer_size,
    worker_n_layers,
    worker_layer_size,
    worker_learning_rate,
    worker_entropy_scale,
    worker_reward_scale,
    worker_num_train_steps,
    worker_grad_clip,
    boss_n_layers,
    boss_layer_size,
    boss_learning_rate,
    boss_entropy_scale,
    boss_reward_scale,
    boss_num_train_steps,
    boss_grad_clip,
    batch_size,
    logdir,
    save_path,
    load_path,
    render_freq,
    max_steps,
    steps_per_action,
    boss_freq,
    worker_oracle,
    boss_oracle,
    correct_boss_action,
    record,
    record_freq,
    record_path,
    image_dims,
    randomize_pose,
    geofence,
    hindsight_geofence,
    fixed_block,
    fixed_goal,
    goal_x,
    goal_y,
    temp_path,
):
    env = TimeLimit(max_episode_steps=max_steps,
                    env=ShiftEnv(
                        geofence=geofence,
                        xml_filepath=temp_path,
                        steps_per_action=steps_per_action,
                        render_freq=render_freq,
                        record=record,
                        record_path=record_path,
                        record_freq=record_freq,
                        image_dimensions=image_dims,
                        fixed_block=fixed_block,
                        fixed_goal=fixed_goal,
                        randomize_pose=randomize_pose,
                        goal_x=goal_x,
                        goal_y=goal_y,
                    ))

    kwargs = dict(
        sess=create_sess(),
        base_agent=MlpAgent,
        seq_len=0,
        device_num=1,
        seed=seed,
        buffer_size=buffer_size,
        activation=tf.nn.relu,
        batch_size=batch_size,
    )
    worker_kwargs = dict(
        n_layers=worker_n_layers,
        layer_size=worker_layer_size,
        learning_rate=worker_learning_rate,
        entropy_scale=worker_entropy_scale,
        reward_scale=worker_reward_scale,
        grad_clip=worker_grad_clip,
        num_train_steps=worker_num_train_steps,
    )

    boss_kwargs = dict(
        n_layers=boss_n_layers,
        layer_size=boss_layer_size,
        learning_rate=boss_learning_rate,
        entropy_scale=boss_entropy_scale,
        reward_scale=boss_reward_scale,
        grad_clip=boss_grad_clip,
        num_train_steps=boss_num_train_steps,
    )

    HierarchicalTrainer(env=ShiftHierarchicalWrapper(env, geofence=geofence),
                        boss_act_freq=boss_freq,
                        use_worker_oracle=worker_oracle,
                        use_boss_oracle=boss_oracle,
                        worker_kwargs=worker_kwargs,
                        boss_kwargs=boss_kwargs,
                        **kwargs).train(load_path=load_path,
                                        logdir=logdir,
                                        render=False,
                                        save_path=save_path)
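Every example obtains a TensorFlow session from create_sess(), which is not defined in these snippets. A minimal sketch of what such a helper might look like under TensorFlow 1.x (an assumption; the project's actual helper may configure the session differently):

# Hypothetical sketch of create_sess(); the real helper is not shown here.
import tensorflow as tf

def create_sess() -> tf.Session:
    # Let GPU memory grow on demand instead of reserving it all up front.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)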
Example #3
def cli(
    seed,
    buffer_size,
    worker_n_layers,
    worker_layer_size,
    worker_learning_rate,
    worker_entropy_scale,
    worker_reward_scale,
    worker_num_train_steps,
    worker_grad_clip,
    boss_n_layers,
    boss_layer_size,
    boss_learning_rate,
    boss_entropy_scale,
    boss_reward_scale,
    boss_num_train_steps,
    boss_grad_clip,
    batch_size,
    logdir,
    save_path,
    load_path,
    render,
    map_dims,
    max_steps,
    random_map,
    random_start,
    random_goal,
    is_slippery,
    default_reward,
    boss_freq,
    n_boss_actions,
    worker_oracle,
    boss_oracle,
    correct_boss_action,
):

    env = TimeLimit(
        env=FrozenLakeEnv(
            map_dims=map_dims,
            random_map=random_map,
            random_start=random_start,
            random_goal=random_goal,
            is_slippery=is_slippery,
            default_reward=default_reward,
        ),
        max_episode_steps=max_steps)

    kwargs = dict(
        sess=create_sess(),
        base_agent=MlpAgent,
        seq_len=0,
        device_num=1,
        seed=seed,
        buffer_size=buffer_size,
        activation=tf.nn.relu,
        batch_size=batch_size,
    )
    worker_kwargs = dict(
        n_layers=worker_n_layers,
        layer_size=worker_layer_size,
        learning_rate=worker_learning_rate,
        entropy_scale=worker_entropy_scale,
        reward_scale=worker_reward_scale,
        grad_clip=worker_grad_clip,
        num_train_steps=worker_num_train_steps,
    )

    boss_kwargs = dict(
        n_layers=boss_n_layers,
        layer_size=boss_layer_size,
        learning_rate=boss_learning_rate,
        entropy_scale=boss_entropy_scale,
        reward_scale=boss_reward_scale,
        grad_clip=boss_grad_clip,
        num_train_steps=boss_num_train_steps,
    )

    # n_boss_actions = (1 + 2 * boss_freq) ** 2
    HierarchicalTrainer(env=FrozenLakeHierarchicalWrapper(
        env, n_boss_actions=n_boss_actions),
                        boss_act_freq=boss_freq,
                        use_worker_oracle=worker_oracle,
                        use_boss_oracle=boss_oracle,
                        worker_kwargs=worker_kwargs,
                        boss_kwargs=boss_kwargs,
                        **kwargs).train(load_path=load_path,
                                        logdir=logdir,
                                        render=render,
                                        save_path=save_path)
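The commented-out line above suggests that n_boss_actions was at one point derived from boss_freq as (1 + 2 * boss_freq) ** 2, presumably the number of grid cells reachable within boss_freq steps in each direction (for example, boss_freq=2 would give (1 + 4) ** 2 = 25); in this example the count is passed in explicitly instead.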
Example #4
def main(worker_n_layers, worker_layer_size, worker_learning_rate,
         worker_entropy_scale, worker_reward_scale, worker_num_train_steps,
         worker_grad_clip, steps_per_action, worker_batch_size,
         worker_buffer_size, boss_n_layers, boss_layer_size,
         boss_learning_rate, boss_entropy_scale, boss_reward_scale,
         boss_num_train_steps, boss_grad_clip, boss_buffer_size,
         boss_batch_size, max_steps, min_lift_height, geofence,
         hindsight_geofence, seed, goal_space, block_space, concat_record,
         logdir, save_path, load_path, worker_load_path, render_freq, n_goals,
         record, randomize_pose, image_dims, record_freq, record_path,
         temp_path, freeze_worker):
    env = HSRHindsightWrapper(geofence=hindsight_geofence or geofence,
                              env=TimeLimit(
                                  max_episode_steps=max_steps,
                                  env=HSREnv(
                                      steps_per_action=steps_per_action,
                                      randomize_pose=randomize_pose,
                                      min_lift_height=min_lift_height,
                                      xml_filepath=temp_path,
                                      block_space=block_space,
                                      goal_space=goal_space,
                                      geofence=geofence,
                                      render_freq=render_freq,
                                      record=record,
                                      record_path=record_path,
                                      record_freq=record_freq,
                                      record_separate_episodes=concat_record,
                                      image_dimensions=image_dims,
                                  )))

    worker_kwargs = dict(
        n_layers=worker_n_layers,
        layer_size=worker_layer_size,
        learning_rate=worker_learning_rate,
        entropy_scale=worker_entropy_scale,
        reward_scale=worker_reward_scale,
        grad_clip=worker_grad_clip,
        num_train_steps=worker_num_train_steps,
        batch_size=worker_batch_size,
        buffer_size=worker_buffer_size,
    )

    boss_kwargs = dict(
        n_layers=boss_n_layers,
        layer_size=boss_layer_size,
        learning_rate=boss_learning_rate,
        entropy_scale=boss_entropy_scale,
        reward_scale=boss_reward_scale,
        grad_clip=boss_grad_clip,
        num_train_steps=boss_num_train_steps,
        batch_size=boss_batch_size,
        buffer_size=boss_buffer_size,
    )

    UnsupervisedTrainer(env=env,
                        sess=create_sess(),
                        n_goals=n_goals,
                        seq_len=None,
                        base_agent=MlpAgent,
                        seed=seed,
                        activation=tf.nn.relu,
                        worker_load_path=worker_load_path,
                        worker_kwargs=worker_kwargs,
                        update_worker=not freeze_worker,
                        boss_kwargs=boss_kwargs).train(
                            load_path=load_path,
                            logdir=logdir,
                            render=False,
                            save_path=save_path,
                            save_threshold=None,
                        )
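Unlike the earlier examples, this one builds an UnsupervisedTrainer: the worker and boss get separate batch and buffer sizes, a previously trained worker can be restored from worker_load_path, and update_worker=not freeze_worker appears to let that worker be kept frozen while the boss continues to train.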
Example #5
    def __init__(self,
                 env: gym.Env,
                 seed: Optional[int],
                 buffer_size: int,
                 batch_size: int,
                 seq_len: int,
                 num_train_steps: int,
                 sess: tf.Session = None,
                 preprocess_func=None,
                 action_space=None,
                 observation_space=None,
                 **kwargs):

        if seed is not None:
            np.random.seed(seed)
            tf.set_random_seed(seed)
            env.seed(seed)

        self.episodes = None
        self.episode_count = None
        self.num_train_steps = num_train_steps
        self.batch_size = batch_size
        self.env = env
        self.buffer = ReplayBuffer(buffer_size)
        self.sess = sess or create_sess()
        self.action_space = action_space or env.action_space
        observation_space = observation_space or env.observation_space

        obs = env.reset()
        self.preprocess_func = preprocess_func
        # If no preprocessing function was given and raw observations are not
        # already arrays, look for a wrapper exposing preprocess_obs; otherwise
        # fall back to the generic vectorize helper.
        if preprocess_func is None and not isinstance(obs, np.ndarray):
            try:
                self.preprocess_func = unwrap_env(
                    env, lambda e: hasattr(e, 'preprocess_obs')).preprocess_obs
            except RuntimeError:
                self.preprocess_func = vectorize

        observation_space = spaces.Box(
            *[
                self.preprocess_obs(get_space_attrs(observation_space, attr))
                for attr in ['low', 'high']
            ],
            dtype=np.float32)

        # The two agents share variables: 'act' is built for single-step rollouts
        # (batch_size=None, seq_len=1), while 'train' reuses the same weights with
        # the training batch size and sequence length.
        self.agents = Agents(
            act=self.build_agent(sess=self.sess,
                                 batch_size=None,
                                 seq_len=1,
                                 reuse=False,
                                 action_space=action_space,
                                 observation_space=observation_space,
                                 **kwargs),
            train=self.build_agent(sess=self.sess,
                                   batch_size=batch_size,
                                   seq_len=seq_len,
                                   reuse=True,
                                   action_space=action_space,
                                   observation_space=observation_space,
                                   **kwargs))
        self.seq_len = self.agents.act.seq_len

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.episode_time_step = tf.placeholder(tf.int32,
                                                name='episode_time_steps')
        self.increment_global_step = tf.assign_add(self.global_step,
                                                   self.episode_time_step)
        self.sess.run(self.global_step.initializer)
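The constructor above relies on an unwrap_env helper that is not shown in these snippets. A minimal, hypothetical sketch consistent with how it is called (walking the wrapper chain until a predicate matches, raising RuntimeError otherwise):

# Hypothetical sketch of unwrap_env(); the real helper is not shown here.
import gym

def unwrap_env(env: gym.Env, predicate) -> gym.Env:
    # Walk down the wrapper chain until the predicate is satisfied.
    while not predicate(env):
        if not hasattr(env, 'env'):
            raise RuntimeError('No wrapped env satisfies the predicate')
        env = env.env
    return env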