@ray.remote
def rollout_sim_single_step_parallel(task_id, env_name, horizon, actor=None, env=None):
    # Ray remote task: launched with .remote(...) from parallel_rollout_sim()
    time_1 = time.time()
    # initialize environment
    if actor is None:
        actor = gen_actor(env_name, 512)
    if env is None:
        env = gym.make(env_name)
        # env.seed(task_id)
    # initialize logger
    old_states, new_states, raw_actions, dones, rewards, log_probs, advantages, episode_reward = \
        [], [], [], [], [], [], [], 0.
    # collect episode
    old_obs = env.reset()
    for step in range(horizon):
        # interact with environment
        action, log_prob, raw_action = actor.gen_action(torch.Tensor(old_obs).cuda())
        assert (env.action_space.low < np.array(action)).all() and (np.array(action) < env.action_space.high).all()
        new_obs, reward, done, info = env.step(action)
        # record trajectory step
        old_states.append(old_obs)
        new_states.append(new_obs)
        raw_actions.append(raw_action.view(-1))
        rewards.append(reward)
        dones.append(done)
        log_probs.append(log_prob)
        episode_reward += reward
        # update old observation
        old_obs = new_obs
        # if done:
        #     break
    dones[-1] = True
    time_2 = time.time()
    print(" id={}, reward: {}, episode_time: {:.3f}sec".format(task_id, episode_reward, time_2 - time_1))
    return [old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward]
def parallel_rollout_sim(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actor = gen_actor(env_name, 512)
    critic = gen_critic(env_name, 512)
    rolloutmem = RolloutMemory(env_number * horizon, env_name)
    time_start = time.time()
    episodes_rewards = []
    # launch one Ray task per environment and block until all episodes return
    data = ray.get(
        [rollout_sim_single_step_parallel.remote(i, env_name, horizon, None, None) for i in range(env_number)])
    time_end = time.time()
    for episode in data:
        old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward = \
            torch.Tensor(episode[0]).cuda(), torch.Tensor(episode[1]).cuda(), torch.stack(episode[2]).detach().cuda(), \
            torch.Tensor(episode[3]).cuda(), torch.Tensor(episode[4]).cuda(), torch.stack(episode[5]).detach().cuda(), \
            torch.Tensor([episode[6]]).cuda()
        gae_deltas = critic.gae_delta(old_states, new_states, rewards, 0.99)
        advantages = torch.Tensor(get_advantage_new(gae_deltas, 0.99, 0.95)).cuda()
        values = get_values(rewards, 0.99).cuda()
        if len(advantages.shape) == 1:
            advantages = advantages[:, None]
        if len(values.shape) == 1:
            values = values[:, None]
        rolloutmem.append(old_states, new_states, raw_actions, rewards, dones, log_probs, advantages, values)
        episodes_rewards.append(episode_reward)
    time_reformat = time.time()
    print(
        "parallel_time: {}, reformat_time: {:.3f}\nrollout_time: {:.3f}\ndata_len: {}\navgR: {:.3f}\nsaved_step_num: {}\n\n"
        .format(time_end - time_start, time_reformat - time_end, time_reformat - time_start, len(data),
                torch.mean(torch.Tensor(episodes_rewards)), rolloutmem.offset))
    return torch.mean(torch.Tensor(episodes_rewards)), time_end - time_start
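# Usage sketch (not part of the original file; the environment name and sizes below are
# assumptions). The Ray-based helper above requires ray.init() to have been called first:
#   ray.init()
#   avg_reward, rollout_seconds = parallel_rollout_sim('Hopper-v2', env_number=8, horizon=200)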
def serial_rollout(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actors = [gen_actor(env_name, 512) for _ in range(env_number)]
    time_start = time.time()
    data = [rollout_single_step(i, envs[i], actors[i], horizon) for i in range(env_number)]
    time_end = time.time()
    print("serial_time: {}, data:{}".format(time_end - time_start, data))
def parallel_rollout(env_name, env_number, horizon):
    actors = [gen_actor(env_name, 512) for _ in range(env_number)]
    time_start = time.time()
    data = ray.get(
        [rollout_single_step_parallel.remote(i, env_name, actors[i], horizon) for i in range(env_number)])
    time_end = time.time()
    print("parallel_time: {}, data:{}".format(time_end - time_start, data))
def test_save(env_name):
    iteration = 1000
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    rolloutmem = RolloutMemory(5 * 10, env_name)
    envs = [ParallelEnv.remote(env_name, i) for i in range(5)]
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)
    seed = 123
    tb = SummaryWriter()
    for i in range(100):
        tb.add_scalar('loss', i, i)
    rollout_time, update_time = AverageMeter(), AverageMeter()
    rollout_time.update(100)
    update_time.update(100)
    save_path = os.path.join("../base/save/model", 'dev_Hopper_resume.tar')
    torch.save({
        'iteration': iteration,
        'seed': seed,
        'actor_state_dict': actor.state_dict(),
        'critic_state_dict': critic.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'rolloutmem': rolloutmem,
        'time_recorder': [rollout_time, update_time],
    }, save_path)
    print("Save Done!")
def repeat_rollout(env_name, env_number, horizon, iter_num):
    # prepare ingredients
    time_start = time.time()
    envs = [ParallelEnv.remote(env_name, env_id) for env_id in range(env_number)]
    actor = gen_actor(env_name, 512)
    critic = gen_critic(env_name, 512)
    rolloutmem = RolloutMemory(env_number * horizon, env_name)
    print(" build_time: {}".format(time.time() - time_start))
    # repeat iteration
    for i in range(iter_num):
        print("iter_{}".format(i))
        parallel_rollout_env(envs, actor, critic, rolloutmem, horizon)
    print("Work Done!")
def test_state_dict(env_name):
    env = gen_env(env_name)
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    rolloutmem = RolloutMemory(50 * 200, env_name)
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)
    tb = SummaryWriter()
    for param_tensor in actor.state_dict():
        print(param_tensor, "\t", actor.state_dict()[param_tensor].size())
    for param_tensor in critic.state_dict():
        print(param_tensor, "\t", critic.state_dict()[param_tensor].size())
    for param_tensor in optimizer.state_dict():
        print(param_tensor, "\t", optimizer.state_dict()[param_tensor])
def loop_rollout(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actor = gen_actor(env_name, 512)
    time_start = time.time()
    for env in envs:
        time_1 = time.time()
        obs = env.reset()
        total_reward = 0.
        for _ in range(horizon):
            obs, reward, _, _ = env.step(actor.gen_action(torch.Tensor(obs))[0])
            total_reward += reward
        time_2 = time.time()
        print('episode_time={}'.format(time_2 - time_1))
    time_end = time.time()
    print("loop_time: {}".format(time_end - time_start))
def test_load(env_name):
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)
    load_path = os.path.join("../base/save/model", 'dev_Hopper_resume.tar')
    checkpoint = torch.load(load_path)
    actor.load_state_dict(checkpoint['actor_state_dict'])
    actor.train()
    critic.load_state_dict(checkpoint['critic_state_dict'])
    critic.train()
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    rolloutmem = checkpoint['rolloutmem']
    iteration = checkpoint['iteration']
    seed = checkpoint['seed']
    [rollout_time, update_time] = checkpoint['time_recorder']
    print("Load Done!")
    print('')
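# Usage sketch (assumption: test_save() has been run first so that
# ../base/save/model/dev_Hopper_resume.tar exists; the env name is illustrative):
#   test_save('Hopper-v2')
#   test_load('Hopper-v2')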
def serial_rollout_sim(env_name, env_number, horizon):
    actor = gen_actor(env_name, 512)
    envs = [gym.make(env_name) for _ in range(env_number)]
    for i in range(env_number):
        envs[i].seed(seed=i)
    data = []
    time_start = time.time()
    for env_id in range(len(envs)):
        env = envs[env_id]
        time_1 = time.time()
        # initialize logger
        old_states, new_states, raw_actions, dones, rewards, log_probs, advantages, episode_reward = \
            [], [], [], [], [], [], [], 0.
        # collect episode
        old_obs = env.reset()
        for step in range(horizon):
            # interact with environment
            action, log_prob, raw_action = actor.gen_action(torch.Tensor(old_obs))
            new_obs, reward, done, info = env.step(action)
            # record trajectory step
            old_states.append(old_obs)
            new_states.append(new_obs)
            raw_actions.append(raw_action)
            rewards.append(reward)
            dones.append(done)
            log_probs.append(log_prob)
            episode_reward += reward
            # update old observation
            old_obs = new_obs
            # if done:
            #     break
        dones[-1] = True
        time_2 = time.time()
        data.append([old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward])
        print(" env_id={}, reward: {}, episode_time: {:.3f}sec".format(env_id, episode_reward, time_2 - time_1))
    time_end = time.time()
    print("serial_time: {}\ndata_len:{}\n\n".format(time_end - time_start, len(data)))
def train(params):
    # ============
    # Preparations
    # ============
    gc.collect()
    ray.init(log_to_driver=False, local_mode=False, num_gpus=1)  # or, ray.init()
    if not params.use_pretrain:
        # algorithm ingredients instantiation
        seed = params.seed
        actor = gen_actor(params.env_name, params.policy_params.hidden_dim)
        critic = gen_critic(params.env_name, params.policy_params.hidden_dim)
        optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()),
                                     lr=params.policy_params.learning_rate)
        rollout_time, update_time = AverageMeter(), AverageMeter()
        iteration_pretrain = 0
        # set random seed (for reproducing experiment)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
    else:
        # build models
        actor = gen_actor(params.env_name, params.policy_params.hidden_dim).cuda()
        critic = gen_critic(params.env_name, params.policy_params.hidden_dim).cuda()
        optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)
        # load models
        print("\n\nLoading training checkpoint...")
        print("------------------------------")
        load_path = os.path.join('./save/model', params.pretrain_file)
        checkpoint = torch.load(load_path)
        seed = checkpoint['seed']
        actor.load_state_dict(checkpoint['actor_state_dict'])
        actor.train()
        critic.load_state_dict(checkpoint['critic_state_dict'])
        critic.train()
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        [rollout_time, update_time] = checkpoint['time_recorder']
        iteration_pretrain = checkpoint['iteration']
        # set random seed (for reproducing experiment)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        print("Loading finished!")
        print("------------------------------\n\n")
    rolloutmem = RolloutMemory(params.policy_params.envs_num * params.policy_params.horizon, params.env_name)
    envs = [ParallelEnv.remote(params.env_name, i) for i in range(params.policy_params.envs_num)]
    for i in range(len(envs)):
        envs[i].seed.remote(seed=seed + i)
    tb = SummaryWriter()
    # ============
    # Training
    # ============
    # training loop
    print("----------------------------------")
    print("Training model with {} parameters...".format(count_model_params(actor) + count_model_params(critic)))
    print("----------------------------------")
    time_start = time.time()
    for iteration in range(int(params.iter_num - iteration_pretrain)):
        # collect rollouts from current policy
        rolloutmem.reset()
        iter_start_time = time.time()
        mean_iter_reward = rollout(rolloutmem, envs, actor, critic, params)
        # optimize by gradient descent
        update_start_time = time.time()
        loss, policy_loss, critic_loss, entropy_loss, advantage, ratio, surr1, surr2, epochs_len = \
            None, None, None, None, None, None, None, None, None
        for epoch in range(params.policy_params.epochs_num):
            loss, policy_loss, critic_loss, entropy_loss, advantage, ratio, surr1, surr2, epochs_len = \
                optimize_step(optimizer, rolloutmem, actor, critic, params, iteration)
        iter_end_time = time.time()
        # log scalars to TensorBoard
        tb = logger_scalar(tb, iteration + iteration_pretrain, loss, policy_loss, critic_loss, entropy_loss,
                           advantage, ratio, surr1, surr2, epochs_len, mean_iter_reward, time_start)
        # tb = logger_histogram(tb, iteration + iteration_pretrain, actor, critic)
        rollout_time.update(update_start_time - iter_start_time)
        update_time.update(iter_end_time - update_start_time)
        tb.add_scalar('rollout_time', rollout_time.val, iteration + iteration_pretrain)
        tb.add_scalar('update_time', update_time.val, iteration + iteration_pretrain)
        print('it {}: avgR: {:.3f} avgL: {:.3f} | rollout_time: {:.3f}sec update_time: {:.3f}sec'
              .format(iteration + iteration_pretrain, mean_iter_reward, epochs_len, rollout_time.val,
                      update_time.val))
        # save rollout video
        if (iteration + 1) % int(params.plotting_iters) == 0 \
                and iteration > 0 \
                and params.log_video \
                and params.env_name not in envnames_classiccontrol:
            log_policy_rollout(params, actor, params.env_name, 'iter-{}'.format(iteration + iteration_pretrain))
        # save model
        if (iteration + 1) % int(params.checkpoint_iter) == 0 and iteration > 0 and params.save_checkpoint:
            save_model(params.prefix, iteration, iteration_pretrain, seed, actor, critic, optimizer,
                       rollout_time, update_time)
    # save final model and rollout videos
    if params.log_video:
        save_model(params.prefix, params.iter_num, iteration_pretrain, seed, actor, critic, optimizer,
                   rollout_time, update_time)
        if params.env_name not in envnames_classiccontrol:
            for i in range(3):
                log_policy_rollout(params, actor, params.env_name, 'final-{}'.format(i))
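# Minimal driver sketch (assumption: the real project builds `params` from a config file
# or argparse; here it is mocked with SimpleNamespace, filling only the fields train()
# reads, and every value below is illustrative rather than the project's defaults).
if __name__ == '__main__':
    from types import SimpleNamespace
    policy_params = SimpleNamespace(hidden_dim=64, learning_rate=3e-4, envs_num=4,
                                    horizon=200, epochs_num=10)
    params = SimpleNamespace(env_name='Hopper-v2', seed=123, use_pretrain=False,
                             pretrain_file='dev_Hopper_resume.tar', iter_num=1000,
                             policy_params=policy_params, plotting_iters=50,
                             checkpoint_iter=100, save_checkpoint=True,
                             log_video=False, prefix='dev')
    train(params)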