def main(argv):
    log_flags(FLAGS)
    f = functools.partial(EnvWrap,
                          reward_scale=FLAGS.reward_scale,
                          action_repeat=FLAGS.action_repeat)
    vec_env = SubprocVecEnv([f for _ in range(8)])
    eval_env = EnvWrap(reward_scale=1,
                       action_repeat=FLAGS.action_repeat,
                       rm_done=False)

    config_class = A2CTrainerConfig
    train_class = A2CTrainer

    if FLAGS.visualize:
        tconf = config_class.from_json(
            os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.load_checkpoint(os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path))
        eval_return, eval_steps = trainer.evaluate(render=True)
        print(f'EvalReturn {eval_return:.1f} Steps {eval_steps:.1f}')
    else:  # Train
        tconf = config_class.from_flags(FLAGS)
        tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.train()
        trainer.save_checkpoint(os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path))

    vec_env.close()
    eval_env.close()

def multi_fps(n_workers):
    vec_env = SubprocVecEnv([make_env_fn for _ in range(n_workers)])
    start_time = time.time()
    steps = 0
    for episode in range(500):
        vec_env.reset()
        for idx, act in enumerate(ref_actions[reset_step:]):
            acts = np.tile(act, (n_workers, 1))
            obs, rew, done, info = vec_env.step(acts)
            steps += 1
    elapsed = time.time() - start_time
    fps = steps / elapsed
    print(f'{n_workers}-worker FPS: {fps} EffectiveFPS: {fps*n_workers}')
    vec_env.close()

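# Usage sketch (not part of the original benchmark): sweep a few worker
# counts to see how raw and effective FPS scale with the number of
# subprocess environments. The counts below are illustrative assumptions.
def benchmark_worker_scaling(worker_counts=(1, 2, 4, 8)):
    for n in worker_counts:
        multi_fps(n)
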
def main(argv): """ Trains a model through backward RL. """ ref_actions = np.load(os.path.join(DATA_DIR, FLAGS.ref_actions_path)) clip_name, start_step = parse_path(FLAGS.ref_actions_path) make_env_fn = lambda: RefTrackingEnv( clip_name, ref_actions, start_step, reset_step=0) vec_env = SubprocVecEnv([make_env_fn for _ in range(FLAGS.num_workers)]) eval_env = make_env_fn() config_class = SACTrainerConfig train_class = SACTrainer if FLAGS.visualize: tconf = config_class.from_json(FLAGS.config_path) trainer = train_class(vec_env, env, tconf, OUTPUT_DIR) trainer.load_checkpoint(os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path)) env.visualize(trainer.policy, device='cpu') else: tconf = config_class.from_flags(FLAGS) tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path)) trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR) # Generate the curriculum for idx in range(len(ref_actions)): reset_step = len(ref_actions) - (idx + 1) # Modify the environments to reflect the new reset_step vec_env.set_attr('reset_step', reset_step) vec_env.reset() eval_env.reset_step = reset_step eval_env.reset() target_return = eval_env.ref_returns[reset_step] print( f'Curriculum Task {idx}: reset_step {reset_step} target_return {target_return:.3f}' ) trainer.train(target_return) trainer.save_checkpoint( os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path)) vec_env.close() eval_env.close()
def main(argv):
    log_flags(FLAGS)
    f = functools.partial(
        RefTrackingEnv,
        reward_type=FLAGS.reward_type,
        clip_name=FLAGS.clip_name,
        reward_scale=FLAGS.reward_scale,
        always_init_at_clip_start=FLAGS.always_init_at_clip_start,
        use_ref_observables=FLAGS.use_ref_observables,
    )
    vec_env = SubprocVecEnv([f for _ in range(FLAGS.workers)],
                            norm_obs=FLAGS.norm_obs)
    eval_env = RefTrackingEnv(
        reward_type=FLAGS.reward_type,
        clip_name=FLAGS.clip_name,
        always_init_at_clip_start=True,
        ghost_offset=None,  # 1.0 if FLAGS.visualize else None,
        use_ref_observables=FLAGS.use_ref_observables,
    )

    config_class = SACTrainerConfig
    train_class = SACTrainer

    if FLAGS.visualize:
        tconf = config_class.from_json(
            os.path.join(OUTPUT_DIR, FLAGS.config_path))
        tconf.buffer_size = 100  # Only for visualization purposes
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.load_checkpoint(os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path))
        eval_env.visualize(trainer.model, device='cpu', deterministic=False)
    else:  # Train
        tconf = config_class.from_flags(FLAGS)
        tconf.to_json(os.path.join(OUTPUT_DIR, FLAGS.config_path))
        trainer = train_class(vec_env, eval_env, tconf, OUTPUT_DIR)
        trainer.train()
        trainer.save_checkpoint(os.path.join(OUTPUT_DIR, FLAGS.checkpoint_path))

    vec_env.close()
    eval_env.close()

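# Entry-point sketch, assuming the absl-py app/flags pattern implied by
# main(argv) and FLAGS in the scripts above; the actual flag definitions
# live elsewhere in the repository.
if __name__ == '__main__':
    from absl import app
    app.run(main)
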
# Check set_attr for reset_step is correct
reset_step = len(ref_actions) - 2  # assumed target step; matches the value used below
make_env_fn = lambda: RefTrackingEnv(
    'CMU_069_02', ref_actions, 0, reset_step=len(ref_actions) - 2)
vec_env = SubprocVecEnv([make_env_fn for _ in range(2)])
# Change the reset step and make sure we don't get an episode termination until expected
vec_env.set_attr('reset_step', reset_step)
vec_env.reset()
for idx, act in enumerate(ref_actions[reset_step:]):
    acts = np.tile(act, (2, 1))
    obs, rew, done, info = vec_env.step(acts)
    # Ensure we're not done until the last step
    if idx < len(ref_actions[reset_step:]) - 1:
        assert not np.any(done)
# Make sure we're done on the last step
assert np.all(done)
vec_env.close()

# Check for correctness
env = make_env_fn()  # assumed setup for the single-environment check
env.reset_step = reset_step
for _ in range(2):
    obs = env.reset()
    # Check that the reset is working properly and we get the expected sequences
    # of observations and rewards from executing the reference actions.
    rewards = []
    for idx, act in enumerate(ref_actions[reset_step:]):
        obs, rew, done, info = env.step(act)
        # Ensure we're not done until the last step
        if idx < len(ref_actions[reset_step:]) - 1:
            assert not done
        rewards.append(rew)
        # Make sure the observations match what's expected
        assert np.all(env.ref_observations[reset_step + idx + 1] -