Example no. 1
def main(argv):
    log_flags(FLAGS)
    f = functools.partial(EnvWrap,
                          reward_scale=FLAGS.reward_scale,
                          action_repeat=FLAGS.action_repeat)
    vec_env = SubprocVecEnv([f for _ in range(8)])
    eval_env = EnvWrap(reward_scale=1,
                       action_repeat=FLAGS.action_repeat,
                       rm_done=False)

    config_class = A2CTrainerConfig
    train_class = A2CTrainer

    if FLAGS.visualize:
        tconf = config_class.from_json(
            os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.load_checkpoint(os.path.join(OUTPUT_DIR,
                                             FLAGS.checkpoint_path))
        eval_return, eval_steps = trainer.evaluate(render=True)
        print(f'EvalReturn {eval_return:.1f} Steps {eval_steps:.1f}')
    else:  # Train
        tconf = config_class.from_flags(FLAGS)
        tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.train()
        trainer.save_checkpoint(os.path.join(OUTPUT_DIR,
                                             FLAGS.checkpoint_path))

    vec_env.close()
    eval_env.close()
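
For context, EnvWrap itself is not shown in this example. The sketch below is only a guess at what a Gym-style wrapper exposing reward_scale, action_repeat and rm_done could look like; the base environment id and the exact meaning of rm_done are assumptions, not the original implementation.

import gym


class EnvWrapSketch(gym.Wrapper):
    """Hypothetical wrapper: scales rewards and repeats each action."""

    def __init__(self, reward_scale=1.0, action_repeat=1, rm_done=True,
                 env_id='Humanoid-v3'):  # env_id is an assumption
        super().__init__(gym.make(env_id))
        self.reward_scale = reward_scale
        self.action_repeat = action_repeat
        self.rm_done = rm_done

    def step(self, action):
        total_reward = 0.0
        for _ in range(self.action_repeat):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward * self.reward_scale
            if done:
                break
        # rm_done is assumed to suppress termination during training;
        # the eval env above passes rm_done=False to keep the true signal.
        if self.rm_done:
            done = False
        return obs, total_reward, done, info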
Example no. 2
def multi_fps(n_workers):
    """Benchmark vectorized throughput by replaying the reference actions.

    Each vec_env.step() advances all n_workers environments at once, so the
    effective frame rate is the per-call FPS times the number of workers.
    """
    vec_env = SubprocVecEnv([make_env_fn for _ in range(n_workers)])
    start_time = time.time()
    steps = 0
    for episode in range(500):
        vec_env.reset()
        for act in ref_actions[reset_step:]:
            acts = np.tile(act, (n_workers, 1))
            obs, rew, done, info = vec_env.step(acts)
            steps += 1
    elapsed = time.time() - start_time
    fps = steps / elapsed
    print(f'{n_workers}-worker FPS: {fps} EffectiveFPS: {fps*n_workers}')
    vec_env.close()
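
make_env_fn, ref_actions and reset_step above are taken from the enclosing module; a possible driver for the benchmark (purely illustrative) is:

# Hypothetical driver: compare raw and effective throughput for a few
# worker counts using the helper above.
if __name__ == '__main__':
    for n_workers in (1, 2, 4, 8):
        multi_fps(n_workers)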
Example no. 3
def main(argv):
    """ Trains a model through backward RL. """
    ref_actions = np.load(os.path.join(DATA_DIR, FLAGS.ref_actions_path))
    clip_name, start_step = parse_path(FLAGS.ref_actions_path)

    make_env_fn = lambda: RefTrackingEnv(
        clip_name, ref_actions, start_step, reset_step=0)
    vec_env = SubprocVecEnv([make_env_fn for _ in range(FLAGS.num_workers)])
    eval_env = make_env_fn()

    config_class = SACTrainerConfig
    train_class = SACTrainer

    if FLAGS.visualize:
        tconf = config_class.from_json(
            os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.load_checkpoint(os.path.join(OUTPUT_DIR,
                                             FLAGS.checkpoint_path))
        eval_env.visualize(trainer.policy, device='cpu')
    else:
        tconf = config_class.from_flags(FLAGS)
        tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)

        # Generate the curriculum
        for idx in range(len(ref_actions)):
            reset_step = len(ref_actions) - (idx + 1)

            # Modify the environments to reflect the new reset_step
            vec_env.set_attr('reset_step', reset_step)
            vec_env.reset()
            eval_env.reset_step = reset_step
            eval_env.reset()

            target_return = eval_env.ref_returns[reset_step]
            print(
                f'Curriculum Task {idx}: reset_step {reset_step} target_return {target_return:.3f}'
            )

            trainer.train(target_return)
            trainer.save_checkpoint(
                os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path))

    vec_env.close()
    eval_env.close()
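
The curriculum walks backwards through the clip: the first task resets at the last reference step (the easiest), and each subsequent task resets one step earlier until the agent starts from step 0. A minimal sketch of the reset semantics this assumes from RefTrackingEnv (hypothetical, not the real environment) is:

class ResetStepSketch:
    """Hypothetical illustration of the assumed reset_step behaviour:
    on reset, replay the reference actions up to reset_step so each
    episode starts partway through the clip."""

    def __init__(self, base_env, ref_actions, reset_step=0):
        self.base_env = base_env
        self.ref_actions = ref_actions
        self.reset_step = reset_step

    def reset(self):
        obs = self.base_env.reset()
        # Fast-forward to the reset point by replaying reference actions.
        for act in self.ref_actions[:self.reset_step]:
            obs, _, _, _ = self.base_env.step(act)
        return obs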
Example no. 4
def main(argv):
    log_flags(FLAGS)
    f = functools.partial(
        RefTrackingEnv,
        reward_type=FLAGS.reward_type,
        clip_name=FLAGS.clip_name,
        reward_scale=FLAGS.reward_scale,
        always_init_at_clip_start=FLAGS.always_init_at_clip_start,
        use_ref_observables=FLAGS.use_ref_observables,
    )
    vec_env = SubprocVecEnv([f for _ in range(FLAGS.workers)],
                            norm_obs=FLAGS.norm_obs)
    eval_env = RefTrackingEnv(
        reward_type=FLAGS.reward_type,
        clip_name=FLAGS.clip_name,
        always_init_at_clip_start=True,
        ghost_offset=None,  # 1.0 if FLAGS.visualize else None
        use_ref_observables=FLAGS.use_ref_observables,
    )

    config_class = SACTrainerConfig
    train_class = SACTrainer

    if FLAGS.visualize:
        tconf = config_class.from_json(
            os.path.join(OUTPUT_DIR, FLAGS.config_path))
        tconf.buffer_size = 100  # Only for visualization purposes
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.load_checkpoint(os.path.join(OUTPUT_DIR,
                                             FLAGS.checkpoint_path))
        eval_env.visualize(trainer.model, device='cpu', deterministic=False)
    else:  # Train
        tconf = config_class.from_flags(FLAGS)
        tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.train()
        trainer.save_checkpoint(os.path.join(OUTPUT_DIR,
                                             FLAGS.checkpoint_path))

    vec_env.close()
    eval_env.close()
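
The from_flags / to_json / from_json calls used throughout these examples form a save/restore round trip for the trainer configuration; a rough sketch of that pattern (a plain dataclass, not the actual SACTrainerConfig) is:

import dataclasses
import json


@dataclasses.dataclass
class TrainerConfigSketch:
    """Hypothetical config showing a from_flags/to_json/from_json round trip."""
    lr: float = 3e-4
    batch_size: int = 256

    @classmethod
    def from_flags(cls, flags):
        # Copy over only the flag values the config knows about.
        names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{n: getattr(flags, n) for n in names if hasattr(flags, n)})

    def to_json(self, path):
        with open(path, 'w') as fh:
            json.dump(dataclasses.asdict(self), fh, indent=2)

    @classmethod
    def from_json(cls, path):
        with open(path) as fh:
            return cls(**json.load(fh))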
Example no. 5
    # Check that setting reset_step via set_attr works correctly
    make_env_fn = lambda: RefTrackingEnv(
        'CMU_069_02', ref_actions, 0, reset_step=len(ref_actions) - 2)
    vec_env = SubprocVecEnv([make_env_fn for _ in range(2)])
    # Change the reset step and make sure we don't get an episode
    # termination until expected
    vec_env.set_attr('reset_step', reset_step)
    vec_env.reset()
    for idx, act in enumerate(ref_actions[reset_step:]):
        acts = np.tile(act, (2, 1))
        obs, rew, done, info = vec_env.step(acts)
        # Ensure we're not done until the last step
        if idx < len(ref_actions[reset_step:]) - 1:
            assert not np.any(done)
    # Make sure we're done on the last step
    assert np.all(done)
    vec_env.close()

    # Check for correctness on a single (non-vectorized) environment;
    # env is a RefTrackingEnv instance created earlier in the test.
    for _ in range(2):
        obs = env.reset()
        # Check that the reset is working properly and we get the expected sequences
        # of observations and rewards from executing the reference actions.
        rewards = []
        for idx, act in enumerate(ref_actions[reset_step:]):
            obs, rew, done, info = env.step(act)
            # Ensure we're not done until the last step
            if idx < len(ref_actions[reset_step:]) - 1:
                assert not done
            rewards.append(rew)
            # Make sure the observations match what's expected
            assert np.all(env.ref_observations[reset_step + idx + 1] -
                          obs == 0)
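
The rewards collected above are presumably checked against the stored reference return for the chosen reset step (Example no. 3 reads eval_env.ref_returns[reset_step] in the same way); a hedged sketch of such a check, continuing at the outer loop level:

        # Hypothetical continuation: replaying the reference actions from
        # reset_step should reproduce the stored reference return.
        assert np.isclose(np.sum(rewards), env.ref_returns[reset_step])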