def test_mpc_and_model(model, args, L):
    """Tests the trained ensemble model with the MPC planner (is MPC helping?)"""
    L.log("\n\n== Testing trained ensemble model with MPC planner ==\n\n")
    utils.seed(args.seed)

    # Environment
    env = Env(
        args.env_name,
        max_episode_len=args.max_episode_len,
        action_repeat=args.action_repeat,
        seed=args.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # MPC planner
    reward_measure = RewardMeasure(env, args.reward_scale)
    expl_measure = None
    mpc_agent = MpcAgent(
        model,
        args.ensemble_size,
        action_size,
        plan_horizon=args.plan_horizon,
        optimisation_iters=args.optimisation_iters,
        n_candidates=args.n_candidates,
        top_candidates=args.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        device=DEVICE,
    )

    # Data sampler
    mpc_sampler = ControlSampler(env, mpc_agent)

    # Logging
    mpc_log_fn = lambda step, reward: L.log(f"Collect Step {step}: {reward}")

    # Main loop
    rewards = []
    for episode in range(args.n_test_epi):
        L.log(f"\nEpisode {episode}")

        # Test agent (evaluation only: no updates, no exploration noise)
        L.log(f"Testing on {args.n_eval_episodes} episodes")
        stats = mpc_sampler.sample_episodes(
            args.n_eval_episodes,
            action_noise=None,
            log_fn=mpc_log_fn,
            log_every=args.mpc_log_every,
        )
        L.log_episode(stats['rewards'], stats['steps'])
        rewards.append(stats['rewards'][0])
        L.save()

    return rewards
def train_sac_and_model(args, L):
    """Trains SAC agents and an ensemble dynamics model.
    Data is collected via the SAC agent."""
    L.log("\n\n== Training SAC agents and ensemble model ==\n\n")
    utils.seed(args.seed)

    # Environment
    env = Env(
        args.env_name,
        max_episode_len=args.max_episode_len,
        action_repeat=args.action_repeat,
        seed=args.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # Buffer
    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        args.ensemble_size,
        args.batch_size,
        normalizer=normalizer,
        buffer_size=args.buffer_size,
        device=DEVICE,
    )

    # Model
    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        args.ensemble_size,
        args.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )
    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=args.n_train_epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        epsilon=args.epsilon,
        grad_clip_norm=args.grad_clip_norm,
    )

    # SAC agents
    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=args,
            device=DEVICE,
        )
        for _ in range(args.n_sac_agents)
    ]
    sac_agent = EnsembleSacAgent(sac_agents, buffer)

    # Data samplers
    random_sampler = RandomSampler(env)
    sac_sampler = ControlSampler(env, sac_agent)

    # Logging
    sac_log_fn = lambda step, reward: L.log(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: L.log(f"Train Epoch {epoch}: {loss}")

    # Collect random seed data
    random_sampler.sample_record_episodes(args.n_seed_episodes, buffer)
    L.log(f"Collected {buffer.current_size} seed frames")

    # Main loop
    for episode in range(args.n_train_epi):
        L.log(f"\nEpisode {episode} [{buffer.current_size} frames]")

        # Collect data with the (updating, stochastic) SAC agent
        L.log(f"Collecting {args.n_collect_episodes} episodes of data")
        sac_agent.toggle_updates(True)
        sac_agent.toggle_stochastic(True)
        buffer, stats = sac_sampler.sample_record_episodes(
            args.n_collect_episodes,
            buffer,
            action_noise=None,
            log_fn=sac_log_fn,
            log_every=args.mpc_log_every,
        )
        L.log_episode(stats['rewards'][0], stats['steps'][0])
        L.save()

        # Train the ensemble dynamics model on the full buffer
        n_batches = buffer.current_size // args.batch_size
        L.log(
            f"\nTraining on {n_batches * args.batch_size} frames "
            f"({n_batches} batches) | buffer size {buffer.current_size}\n"
        )
        trainer.train(n_batches=n_batches, log_fn=train_log_fn, log_every=args.train_log_every)

        # Checkpoint
        sac_agent.save(L.path)
        trainer.save_models(L.path)

    return sac_agent, model
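# A minimal sketch (assumption, not part of the original file) of how the two
# entry points above could be wired together: first train the SAC agents and
# the ensemble dynamics model, then evaluate the learned model with the MPC
# planner.  `parse_args` and `Logger` are hypothetical stand-ins for whatever
# argument parsing and logging utilities the repo actually provides (the
# logger must expose .log, .log_episode, .save and .path, as used above).
#
# if __name__ == "__main__":
#     args = parse_args()                      # hypothetical CLI/config parser
#     L = Logger(args.logdir)                  # hypothetical logger
#     sac_agent, model = train_sac_and_model(args, L)
#     test_mpc_and_model(model, args, L)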
def main(config):
    utils.seed(config.seed)

    # Environment
    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # Buffer (no ensemble; with data augmentation)
    buffer = Buffer(
        state_size,
        action_size,
        None,
        config.batch_size,
        buffer_size=config.buffer_size,
        n_augments=config.n_augments,
        augment_std=config.augment_std,
        reward_std=config.reward_std,
        sample_jitter=config.sample_jitter,
        device=DEVICE,
    )

    # SAC agents
    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=config,
            device=DEVICE,
        )
        for _ in range(config.n_sac_agents)
    ]
    sac_agent = EnsembleSacAgent(sac_agents, buffer)

    # Data samplers
    random_sampler = RandomSampler(env)
    sac_sampler = ControlSampler(env, sac_agent)

    # Logging
    sac_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")

    # Collect random seed data
    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    # Main loop
    rewards = []
    for episode in range(config.n_episodes):
        print(f"\nEpisode {episode} [{buffer.current_size} frames]")

        # Collect data with the (updating, stochastic) SAC agent
        print(f"Collecting {config.n_collect_episodes} episodes of data")
        sac_agent.toggle_updates(True)
        sac_agent.toggle_stochastic(True)
        buffer, stats = sac_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            log_fn=sac_log_fn,
            log_every=config.sac_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        # Test the (frozen, deterministic) SAC agent
        print(f"Testing on {config.n_eval_episodes} episodes")
        sac_agent.toggle_updates(False)
        sac_agent.toggle_stochastic(False)
        stats = sac_sampler.sample_episodes(
            config.n_eval_episodes, log_fn=sac_log_fn, log_every=config.sac_log_every
        )
        print(f"Test reward: {stats['rewards']} Steps: {stats['steps']}")
        rewards.append(stats["rewards"][0])

        # Periodically save rewards and rsync the log directory
        if episode % 100 == 0:
            print(f"rewards: {stats['rewards'][0]}")
            current_time = datetime.now().strftime("%H:%M:%S")
            print(f"Saving rewards at time: {current_time}")
            np.save(config.logdir + "/rewards.npy", np.array(rewards))
            subprocess.call([
                'rsync', '--archive', '--update', '--compress', '--progress',
                str(config.logdir) + "/", str(config.savedir),
            ])
            print("Rsynced files from: " + str(config.logdir) + "/ to " + str(config.savedir))

    return rewards
def main(config):
    utils.seed(config.seed)

    # Environment
    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # Buffer
    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        config.ensemble_size,
        config.batch_size,
        normalizer=normalizer,
        buffer_size=config.buffer_size,
        device=DEVICE,
    )

    # Model
    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        config.ensemble_size,
        config.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )
    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=config.n_train_epochs,
        batch_size=config.batch_size,
        learning_rate=config.learning_rate,
        epsilon=config.epsilon,
        grad_clip_norm=config.grad_clip_norm,
    )

    # MPC planner
    reward_measure = RewardMeasure(env, config.reward_scale)
    expl_measure = None
    mpc_agent = MpcAgent(
        model,
        config.ensemble_size,
        action_size,
        plan_horizon=config.plan_horizon,
        optimisation_iters=config.optimisation_iters,
        n_candidates=config.n_candidates,
        top_candidates=config.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        alpha=config.alpha,
        device=DEVICE,
    )

    # Hybrid SAC + MPC agent
    sac_agents = [
        make_sac_agent(
            state_shape=env.observation_space.shape,
            action_shape=env.action_space.shape,
            args=config,
            device=DEVICE,
        )
        for _ in range(config.n_sac_agents)
    ]
    hybrid_agent = HybridAgent(
        sac_agents,
        mpc_agent,
        model,
        buffer,
        action_size,
        n_sac_updates=config.n_sac_updates,
        cem_std=config.cem_std,
        device=DEVICE,
    )

    # Data samplers
    random_sampler = RandomSampler(env)
    hybrid_sampler = ControlSampler(env, hybrid_agent)

    # Logging
    mpc_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: print(f"Train Epoch {epoch}: {loss}")

    # Collect random seed data
    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    # Main loop
    rewards = []
    for episode in range(config.n_episodes):
        print(f"\n=== Episode {episode} [{buffer.current_size} frames] ===")

        # Train the ensemble dynamics model on the full buffer
        n_batches = buffer.current_size // config.batch_size
        print(
            f"\nTraining on {n_batches * config.batch_size} frames "
            f"({n_batches} batches) | buffer size {buffer.current_size}"
        )
        if config.warm_start == 0:
            trainer.reset_models()
        trainer.train(n_batches=n_batches, log_fn=train_log_fn, log_every=config.train_log_every)

        # Collect data with the hybrid agent
        warm_up = episode < config.n_warm_up_episodes
        print(f"\nCollecting {config.n_collect_episodes} episodes of data [warm up: {warm_up}]")
        hybrid_agent.toggle_updates(True)
        # TODO double stochastic?
        hybrid_agent.toggle_stochastic(False)
        hybrid_agent.toggle_warm_up(warm_up)
        buffer, stats = hybrid_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            action_noise=config.action_noise,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        # Periodically test the (frozen, deterministic) hybrid agent
        if episode % config.test_every == 0:
            print(f"\nTesting on {config.n_eval_episodes} episodes")
            hybrid_agent.toggle_updates(False)
            hybrid_agent.toggle_stochastic(False)
            stats = hybrid_sampler.sample_episodes(
                config.n_eval_episodes,
                action_noise=None,
                log_fn=mpc_log_fn,
                log_every=config.mpc_log_every,
            )
            print(f"Test reward: {stats['rewards']} Steps: {stats['steps']}")
            rewards.append(stats["rewards"][0])

    return rewards
def main(config):
    utils.seed(config.seed)

    # Environment
    env = Env(
        config.env_name,
        max_episode_len=config.max_episode_len,
        action_repeat=config.action_repeat,
        seed=config.seed,
    )
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # Buffer
    normalizer = Normalizer()
    buffer = Buffer(
        state_size,
        action_size,
        config.ensemble_size,
        config.batch_size,
        normalizer=normalizer,
        buffer_size=config.buffer_size,
        device=DEVICE,
    )

    # Model
    model = EnsembleDynamicsModel(
        state_size + action_size,
        state_size,
        config.ensemble_size,
        config.hidden_size,
        normalizer=normalizer,
        device=DEVICE,
    )
    trainer = Trainer(
        model,
        buffer,
        n_train_epochs=config.n_train_epochs,
        batch_size=config.batch_size,
        learning_rate=config.learning_rate,
        epsilon=config.epsilon,
        grad_clip_norm=config.grad_clip_norm,
    )

    # MPC planner
    reward_measure = RewardMeasure(env, config.reward_scale)
    expl_measure = None
    mpc_agent = MpcAgent(
        model,
        config.ensemble_size,
        action_size,
        plan_horizon=config.plan_horizon,
        optimisation_iters=config.optimisation_iters,
        n_candidates=config.n_candidates,
        top_candidates=config.top_candidates,
        reward_measure=reward_measure,
        expl_measure=expl_measure,
        device=DEVICE,
    )

    # Data samplers
    random_sampler = RandomSampler(env)
    mpc_sampler = ControlSampler(env, mpc_agent)

    # Logging
    mpc_log_fn = lambda step, reward: print(f"Collect Step {step}: {reward}")
    train_log_fn = lambda epoch, loss: print(f"Train Epoch {epoch}: {loss}")

    # Collect random seed data
    random_sampler.sample_record_episodes(config.n_seed_episodes, buffer)
    print(f"Collected {buffer.current_size} seed frames")

    # Main loop
    rewards = []
    for episode in range(config.n_episodes):
        print(f"\nEpisode {episode} [{buffer.current_size} frames]")

        # Train the ensemble dynamics model on the full buffer
        n_batches = buffer.current_size // config.batch_size
        print(
            f"Training on {n_batches * config.batch_size} frames "
            f"({n_batches} batches) | buffer size {buffer.current_size}"
        )
        if config.warm_start == 0:
            trainer.reset_models()
        trainer.train(n_batches=n_batches, log_fn=train_log_fn, log_every=config.train_log_every)

        # Collect data with the MPC planner (with action noise)
        print(f"Collecting {config.n_collect_episodes} episodes of data")
        buffer, stats = mpc_sampler.sample_record_episodes(
            config.n_collect_episodes,
            buffer,
            action_noise=config.action_noise,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every,
        )
        print(f"Train reward: {stats['rewards']} Steps: {stats['steps']}")

        # Test the MPC planner (no action noise)
        print(f"Testing on {config.n_eval_episodes} episodes")
        stats = mpc_sampler.sample_episodes(
            config.n_eval_episodes,
            action_noise=None,
            log_fn=mpc_log_fn,
            log_every=config.mpc_log_every,
        )
        print(f"Test reward: {stats['rewards']} Steps: {stats['steps']}")
        rewards.append(stats["rewards"][0])

        # Periodically checkpoint the model ensemble
        if episode % 10 == 0:
            trainer.save_models(episode)

    return rewards
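# A minimal, hypothetical invocation sketch for the MPC-only main() directly
# above (assumption, not part of the original script). The field names match
# what that main() reads from `config`; the values are illustrative
# placeholders only and would need to be replaced with the repo's real
# defaults.
if __name__ == "__main__":
    from argparse import Namespace

    config = Namespace(
        seed=0,
        env_name="cheetah_run",   # placeholder task name
        max_episode_len=1000,
        action_repeat=2,
        ensemble_size=5,
        hidden_size=200,
        batch_size=256,
        buffer_size=10 ** 6,
        n_train_epochs=1,
        learning_rate=1e-3,
        epsilon=1e-8,
        grad_clip_norm=1000,
        reward_scale=1.0,
        plan_horizon=30,
        optimisation_iters=5,
        n_candidates=500,
        top_candidates=50,
        n_seed_episodes=5,
        n_episodes=50,
        n_collect_episodes=1,
        n_eval_episodes=1,
        warm_start=1,             # 0 => reset the ensemble before each training phase
        action_noise=None,
        train_log_every=25,
        mpc_log_every=25,
    )
    main(config)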