import copy
import os
from collections import OrderedDict

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Project-specific components (CNNActorCritic, RNDNetwork, Buffer, Normalizer,
# global_grad_norm_, Policy, QFunction, StateEncoder, ForwardModel,
# InverseModel, Memory, get_state, Actor, Critic, ReplayBuffer) are assumed to
# be importable from the repository's own modules; their import paths are not
# shown in this listing.


class PPOAgent:
    """PPO agent with a Random Network Distillation (RND) intrinsic reward."""

    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.r_dim = args.r_dim
        self.lr = args.lr
        self.gamma_e = args.gamma_e
        self.gamma_i = args.gamma_i
        self.lamda = args.lamda
        self.entropy_coef = args.entropy_coef
        self.ex_coef = args.ex_coef
        self.in_coef = args.in_coef
        self.clip_eps = args.clip_eps
        self.update_epoch = args.update_epoch
        self.batch_size = args.batch_size
        self.initialize_episode = args.initialize_episode
        self.update_proportion = args.update_proportion
        self.rollout_len = args.rollout_len
        self.obs_clip = args.obs_clip
        self.device = torch.device(args.device)

        self.actor_critic = CNNActorCritic(in_channel=self.o_dim[0],
                                           a_dim=self.a_dim).to(self.device)
        self.RND = RNDNetwork(in_channel=1).to(self.device)
        # Only the RND predictor is trained; the RND target network stays fixed.
        self.optimizer = optim.Adam(
            list(self.actor_critic.parameters()) +
            list(self.RND.predictor.parameters()),
            lr=self.lr)

        self.buffer = Buffer(capacity=self.rollout_len, o_dim=self.o_dim)
        self.normalizer_obs = Normalizer(shape=self.o_dim, clip=self.obs_clip)
        self.normalizer_ri = Normalizer(shape=1, clip=np.inf)

    def choose_action(self, obs):
        obs = torch.from_numpy(obs).float().to(self.device) / 255.
        with torch.no_grad():
            action_logits = self.actor_critic.act(obs)
        dist = Categorical(action_logits)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.cpu().numpy(), log_prob.cpu().numpy()

    def compute_intrinsic_reward(self, obs_):
        # Intrinsic reward = prediction error of the RND predictor on the
        # (normalized) last frame of the next observation.
        obs_ = self.normalizer_obs.normalize(obs_)
        obs_ = torch.from_numpy(obs_[:, 3:, :, :]).float().to(self.device)
        with torch.no_grad():
            pred_feature, tar_feature = self.RND(obs_)
            reward_in = F.mse_loss(pred_feature, tar_feature,
                                   reduction='none').mean(dim=-1)
        return reward_in.cpu().numpy()

    def GAE_caculate(self, rewards, masks, values, gamma, lamda):
        """Compute discounted returns, TD errors, and GAE advantages."""
        returns = np.zeros(shape=len(rewards), dtype=np.float32)
        deltas = np.zeros(shape=len(rewards), dtype=np.float32)
        advantages = np.zeros(shape=len(rewards), dtype=np.float32)

        pre_return = 0.
        pre_advantage = 0.
        pre_value = 0.
        for i in reversed(range(len(rewards))):
            returns[i] = rewards[i] + masks[i] * gamma * pre_return
            deltas[i] = rewards[i] + masks[i] * gamma * pre_value - values[i]
            # The mask resets the recursion at episode boundaries.
            advantages[i] = deltas[i] + masks[i] * gamma * lamda * pre_advantage
            pre_return = returns[i]
            pre_value = values[i]
            pre_advantage = advantages[i]
        return returns, deltas, advantages

    def update(self, o, a, r_i, r_e, mask, o_, log_prob):
        # Update running statistics before normalizing.
        self.normalizer_obs.update(o_.reshape(-1, 4, 84, 84).copy())
        self.normalizer_ri.update(r_i.reshape(-1).copy())
        r_i = self.normalizer_ri.normalize(r_i)
        o_ = self.normalizer_obs.normalize(o_)
        o = torch.from_numpy(o).to(self.device).float() / 255.

        returns_ex = np.zeros_like(r_e)
        returns_in = np.zeros_like(r_e)
        advantage_ex = np.zeros_like(r_e)
        advantage_in = np.zeros_like(r_e)
        for i in range(r_e.shape[0]):
            action_logits, value_ex, value_in = self.actor_critic(o[i])
            value_ex = value_ex.cpu().detach().numpy()
            value_in = value_in.cpu().detach().numpy()
            # Extrinsic stream: episodic (uses the done mask).
            returns_ex[i], _, advantage_ex[i] = self.GAE_caculate(
                r_e[i], mask[i], value_ex, self.gamma_e, self.lamda)
            # Intrinsic stream: non-episodic (mask of ones).
            returns_in[i], _, advantage_in[i] = self.GAE_caculate(
                r_i[i], np.ones_like(mask[i]), value_in, self.gamma_i, self.lamda)

        o = o.reshape((-1, 4, 84, 84))
        a = np.reshape(a, -1)
        o_ = np.reshape(o_[:, :, 3, :, :], (-1, 1, 84, 84))
        log_prob = np.reshape(log_prob, -1)
        returns_ex = np.reshape(returns_ex, -1)
        returns_in = np.reshape(returns_in, -1)
        advantage_ex = np.reshape(advantage_ex, -1)
        advantage_in = np.reshape(advantage_in, -1)

        a = torch.from_numpy(a).float().to(self.device)
        o_ = torch.from_numpy(o_).float().to(self.device)
        log_prob = torch.from_numpy(log_prob).float().to(self.device)
        returns_ex = torch.from_numpy(returns_ex).float().to(self.device).unsqueeze(dim=1)
        returns_in = torch.from_numpy(returns_in).float().to(self.device).unsqueeze(dim=1)
        advantage_ex = torch.from_numpy(advantage_ex).float().to(self.device)
        advantage_in = torch.from_numpy(advantage_in).float().to(self.device)

        sample_range = list(range(len(o)))
        for i_update in range(self.update_epoch):
            np.random.shuffle(sample_range)
            for j in range(len(o) // self.batch_size):
                idx = sample_range[self.batch_size * j:self.batch_size * (j + 1)]

                # --- update the RND predictor ---
                pred_RND, tar_RND = self.RND(o_[idx])
                loss_RND = F.mse_loss(pred_RND, tar_RND.detach(),
                                      reduction='none').mean(-1)
                # Only a random proportion of samples contributes to the RND loss.
                proportion_mask = torch.rand(len(loss_RND)).to(self.device)
                proportion_mask = (proportion_mask < self.update_proportion).float()
                loss_RND = (loss_RND * proportion_mask).sum() / torch.max(
                    proportion_mask.sum(),
                    torch.tensor([1.], device=self.device))

                # --- update the actor-critic ---
                action_logits, value_ex, value_in = self.actor_critic(o[idx])
                advantage = (self.ex_coef * advantage_ex[idx] +
                             self.in_coef * advantage_in[idx])
                dist = Categorical(action_logits)
                new_log_prob = dist.log_prob(a[idx])
                ratio = torch.exp(new_log_prob - log_prob[idx])
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1 - self.clip_eps,
                                    1 + self.clip_eps) * advantage
                # PPO maximizes the clipped surrogate and the entropy bonus,
                # so both enter the loss with a negative sign.
                loss_actor = (-torch.min(surr1, surr2).mean()
                              - self.entropy_coef * dist.entropy().mean())
                loss_critic = (F.mse_loss(value_ex, returns_ex[idx]) +
                               F.mse_loss(value_in, returns_in[idx]))
                loss_ac = loss_actor + 0.5 * loss_critic

                loss = loss_RND + loss_ac
                self.optimizer.zero_grad()
                loss.backward()
                global_grad_norm_(
                    list(self.actor_critic.parameters()) +
                    list(self.RND.predictor.parameters()))
                self.optimizer.step()

        return (loss_RND.cpu().detach().numpy(),
                loss_actor.cpu().detach().numpy(),
                loss_critic.cpu().detach().numpy())

    def save_model(self, remark):
        if not os.path.exists('pretrained_models_PPO/'):
            os.mkdir('pretrained_models_PPO/')
        path = 'pretrained_models_PPO/{}.pt'.format(remark)
        print('Saving model to {}'.format(path))
        torch.save(self.actor_critic.state_dict(), path)

    def load_model(self, load_model_remark):
        print('Loading models with remark {}'.format(load_model_remark))
        model = torch.load(
            'pretrained_models_PPO/{}.pt'.format(load_model_remark),
            map_location=lambda storage, loc: storage)
        self.actor_critic.load_state_dict(model)
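
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the agent): the clipped PPO surrogate that
# PPOAgent.update() optimizes, evaluated on dummy tensors. The function name,
# the batch size of 8, and the clip value of 0.2 are assumptions made for this
# example only; the agent above takes these from `args`.
# ---------------------------------------------------------------------------
def _ppo_clipped_surrogate_demo():
    torch.manual_seed(0)
    new_log_prob = torch.randn(8)                        # log pi_new(a|s)
    old_log_prob = new_log_prob + 0.1 * torch.randn(8)   # log pi_old(a|s)
    advantage = torch.randn(8)                           # combined advantage
    clip_eps = 0.2                                       # assumed clip range

    ratio = torch.exp(new_log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantage
    # PPO maximizes the clipped surrogate, so the loss is its negation.
    return -torch.min(surr1, surr2).mean()
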

class DDPG_Agent:
    """DDPG agent with an ICM-style forward/inverse model that provides an
    intrinsic (curiosity) reward."""

    def __init__(self, args, env_params):
        self.s_dim = env_params['o_dim'] + env_params['g_dim']
        self.a_dim = env_params['a_dim']
        self.f_dim = args.f_dim
        self.action_bound = env_params['action_max']
        self.max_timestep = env_params['max_timestep']
        self.max_episode = args.max_episode
        self.evaluate_episode = args.evaluate_episode
        self.evaluate_interval = args.evaluate_interval
        self.log_interval = args.log_interval
        self.save_model_interval = args.save_model_interval
        self.save_model_start = args.save_model_start
        self.lr = args.lr
        self.lr_model = args.lr_model
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.eta = args.eta
        self.noise_eps = args.noise_eps
        self.device = torch.device(args.device)

        self.normalizer_s = Normalizer(size=self.s_dim, eps=1e-2, clip_range=1.)
        self.memory = Memory(size=args.memory_size, s_dim=self.s_dim,
                             a_dim=self.a_dim)

        self.policy = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.policy_target = Policy(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.Q_target = QFunction(s_dim=self.s_dim, a_dim=self.a_dim).to(self.device)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q.parameters(), lr=self.lr)

        self.encoder = StateEncoder(s_dim=self.s_dim, f_dim=self.f_dim).to(self.device)
        self.EnvForward = ForwardModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        self.EnvInverse = InverseModel(f_dim=self.f_dim, a_dim=self.a_dim).to(self.device)
        # The encoder is trained jointly with both environment models.
        self.optimizer_forward = optim.Adam(
            [{'params': self.EnvForward.parameters()},
             {'params': self.encoder.parameters()}],
            lr=self.lr_model)
        self.optimizer_inverse = optim.Adam(
            [{'params': self.EnvInverse.parameters()},
             {'params': self.encoder.parameters()}],
            lr=self.lr_model)

        self.hard_update()
        self.update_num = 0

    def select_action(self, state, train_mode=True):
        # Normalize the state before feeding the policy.
        s = self.normalize_input(state)
        s = torch.tensor(s, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action = self.policy(s).cpu().numpy()
        if train_mode:
            # Gaussian exploration noise.
            action += np.random.randn(self.a_dim) * self.noise_eps * self.action_bound
        action = np.clip(action, a_min=-self.action_bound, a_max=self.action_bound)
        return action

    def get_intrisic_reward(self, s, a, s_):
        s = torch.from_numpy(s).to(self.device).float()
        a = torch.from_numpy(a).to(self.device).float()
        s_ = torch.from_numpy(s_).to(self.device).float()
        with torch.no_grad():
            feature = self.encoder(s)
            next_feature_pred = self.EnvForward(feature, a)
            next_feature = self.encoder(s_)
            # Curiosity reward: forward-model prediction error in feature space.
            r_i = self.eta * torch.norm(next_feature_pred - next_feature)
            r_i = torch.clamp(r_i, min=-0.1, max=0.1)
        return r_i.cpu().numpy()

    def train(self, env, logger=None):
        total_step = 0
        loss_pi, loss_q, loss_forward, loss_inverse = 0., 0., 0., 0.
        for i_episode in range(self.max_episode):
            obs = env.reset()
            s = get_state(obs)
            cumulative_r = 0.
            for i_step in range(self.max_timestep):
                a = self.select_action(s)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                r_i = self.get_intrisic_reward(s, a, s_)
                r = r_e + r_i
                self.memory.store(s, a, r, s_)
                s = s_
                if len(self.memory) > self.batch_size:
                    loss_pi, loss_q, loss_forward, loss_inverse = self.learn()
                cumulative_r += r_e
                total_step += 1

            print('i_episode: {} total step: {} cumulative reward: {:.4f} '
                  'is_success: {}'.format(i_episode, total_step, cumulative_r,
                                          info['is_success']))
            if logger is not None and i_episode % self.log_interval == 0:
                logger.add_scalar('Indicator/cumulative reward', cumulative_r, i_episode)
                logger.add_scalar('Loss/pi_loss', loss_pi, i_episode)
                logger.add_scalar('Loss/q_loss', loss_q, i_episode)
                logger.add_scalar('Loss/forward_loss', loss_forward, i_episode)
                logger.add_scalar('Loss/inverse_loss', loss_inverse, i_episode)

            if i_episode % self.evaluate_interval == 0:
                success_rate = self.evaluate(env)
                if logger is not None:
                    logger.add_scalar('Indicator/success rate', success_rate, i_episode)

            if (i_episode > self.save_model_start and
                    i_episode % self.save_model_interval == 0):
                self.save_model(remarks='{}_{}'.format(env.spec.id, i_episode))

    def evaluate(self, env, render=False):
        success_count = 0
        for i_episode in range(self.evaluate_episode):
            obs = env.reset()
            s = get_state(obs)
            for i_step in range(self.max_timestep):
                if render:
                    env.render()
                a = self.select_action(s, train_mode=False)
                obs_, r_e, done, info = env.step(a)
                s_ = get_state(obs_)
                s = s_
            # Count success based on the final step's info.
            success_count += info['is_success']
        return success_count / self.evaluate_episode

    def learn(self):
        s, a, r, s_ = self.memory.sample_batch(batch_size=self.batch_size)
        self.normalizer_s.update(s)
        s, s_ = self.normalize_input(s, s_)
        s = torch.from_numpy(s).to(self.device)
        a = torch.from_numpy(a).to(self.device)
        r = torch.from_numpy(r).to(self.device).unsqueeze(dim=1)
        s_ = torch.from_numpy(s_).to(self.device)

        # --- update the Q-function and the policy ---
        with torch.no_grad():
            a_next_tar = self.policy_target(s_)
            Q_next_tar = self.Q_target(s_, a_next_tar)
            q_target = r + self.gamma * Q_next_tar
        q_pred = self.Q(s, a)
        loss_q = F.mse_loss(q_pred, q_target.detach())
        self.optimizer_q.zero_grad()
        loss_q.backward()
        self.optimizer_q.step()

        loss_p = -self.Q(s, self.policy(s)).mean()
        self.optimizer_p.zero_grad()
        loss_p.backward()
        self.optimizer_p.step()

        self.soft_update()

        # --- update the environment models and the encoder ---
        feature = self.encoder(s)
        next_feature = self.encoder(s_)
        a_pred = self.EnvInverse(feature, next_feature)
        loss_inverse = F.mse_loss(a_pred, a)

        next_feature_pred = self.EnvForward(feature, a)
        with torch.no_grad():
            next_feature_tar = self.encoder(s_)
        loss_forward = F.mse_loss(next_feature_pred, next_feature_tar.detach())

        self.optimizer_forward.zero_grad()
        self.optimizer_inverse.zero_grad()
        loss_forward.backward(retain_graph=True)
        loss_inverse.backward()
        self.optimizer_forward.step()
        self.optimizer_inverse.step()

        self.update_num += 1
        return (loss_p.cpu().detach().numpy(), loss_q.cpu().detach().numpy(),
                loss_forward.cpu().detach().numpy(),
                loss_inverse.cpu().detach().numpy())

    def update_normalizer(self, states):
        states = np.array(states, dtype=np.float32)
        self.normalizer_s.update(states)

    def hard_update(self):
        self.policy_target.load_state_dict(self.policy.state_dict())
        self.Q_target.load_state_dict(self.Q.state_dict())

    def soft_update(self):
        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
        for param, param_target in zip(self.policy.parameters(),
                                       self.policy_target.parameters()):
            param_target.data.copy_(param.data * self.tau +
                                    param_target.data * (1 - self.tau))
        for param, param_target in zip(self.Q.parameters(),
                                       self.Q_target.parameters()):
            param_target.data.copy_(param.data * self.tau +
                                    param_target.data * (1 - self.tau))

    def normalize_input(self, s, s_=None):
        s = self.normalizer_s.normalize(s)
        if s_ is not None:
            s_ = self.normalizer_s.normalize(s_)
            return s, s_
        return s

    def save_model(self, remarks):
        if not os.path.exists('pretrained_models_DDPG/'):
            os.mkdir('pretrained_models_DDPG/')
        path = 'pretrained_models_DDPG/{}.pt'.format(remarks)
        print('Saving model to {}'.format(path))
        torch.save([self.normalizer_s.mean, self.normalizer_s.std,
                    self.policy.state_dict()], path)

    def load_model(self, remark):
        print('Loading models with remark {}'.format(remark))
        self.normalizer_s.mean, self.normalizer_s.std, policy_model = torch.load(
            'pretrained_models_DDPG/{}.pt'.format(remark),
            map_location=lambda storage, loc: storage)
        self.policy.load_state_dict(policy_model)
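
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the agent): the forward-model curiosity
# bonus that DDPG_Agent.get_intrisic_reward() computes, shown on dummy feature
# vectors. The function name, the feature dimension (16), and eta (0.2) are
# assumptions for this example; the agent above takes them from `args`.
# ---------------------------------------------------------------------------
def _forward_model_intrinsic_reward_demo():
    torch.manual_seed(0)
    f_dim, eta = 16, 0.2
    next_feature = torch.randn(f_dim)                             # encoder(s_)
    next_feature_pred = next_feature + 0.05 * torch.randn(f_dim)  # forward-model prediction
    # Reward is the prediction error in feature space, scaled and clipped
    # the same way as in get_intrisic_reward() above.
    r_i = eta * torch.norm(next_feature_pred - next_feature)
    r_i = torch.clamp(r_i, min=-0.1, max=0.1)
    return r_i.item()
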

class ddpgAgent(object):
    def __init__(self, params):
        """Implementation of a DDPG agent with a Hindsight Experience Replay
        (HER) sampler.

        @param params: dict containing all necessary parameters:
            dims, buffer_size, tau (= 1 - polyak), batch_size, lr_critic,
            lr_actor, norm_eps, norm_clip, clip_obs, clip_action,
            T (episode length), num_workers, clip_return,
            sample_her_transitions, gamma, replay_strategy
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.input_dims = params['dims']
        self.buffer_size = params['buffer_size']
        self.tau = params['tau']
        self.batch_size = params['batch_size']
        self.critic_lr = params['lr_critic']
        self.actor_lr = params['lr_actor']
        self.norm_eps = params['norm_eps']
        self.norm_clip = params['norm_clip']
        self.clip_obs = params['clip_obs']
        self.clip_action = params['clip_action']
        self.T = params['T']
        self.rollout_batch_size = params['num_workers']
        self.clip_return = params['clip_return']
        self.sample_transitions = params['sample_her_transitions']
        self.gamma = params['gamma']
        self.replay_strategy = params['replay_strategy']

        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, self.input_dims[key])
        stage_shapes['o_2'] = stage_shapes['o']
        stage_shapes['r'] = (None,)
        self.stage_shapes = stage_shapes

        # normalizers
        self.obs_normalizer = Normalizer(size=self.dimo, eps=self.norm_eps,
                                         clip_range=self.norm_clip)
        self.goal_normalizer = Normalizer(size=self.dimg, eps=self.norm_eps,
                                          clip_range=self.norm_clip)

        # networks
        self.actor_local = Actor(self.input_dims).to(self.device)
        self.critic_local = Critic(self.input_dims).to(self.device)
        self.actor_target = copy.deepcopy(self.actor_local)
        self.critic_target = copy.deepcopy(self.critic_local)

        # optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.critic_lr)

        # configure the replay buffer
        buffer_shapes = {key: (self.T - 1 if key != 'o' else self.T,
                               self.input_dims[key])
                         for key in self.input_dims.keys()}
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T, self.dimg)
        buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

    def act(self, o, g, noise_eps=0., random_eps=0., testing=False):
        """Choose an action: with probability `random_eps` act at random,
        otherwise use the actor output plus Gaussian noise scaled by `noise_eps`.

        @param o: observation
        @param g: desired goal
        @param noise_eps: scale of the Gaussian noise added to the action
        @param random_eps: probability of taking a uniformly random action
        @param testing: (bool) set to True when testing a single environment
        """
        obs = self.obs_normalizer.normalize(o)
        goals = self.goal_normalizer.normalize(g)
        obs = torch.tensor(obs).to(self.device)
        goals = torch.tensor(goals).to(self.device)

        # testing a single environment: deterministic action, no exploration
        if testing:
            with torch.no_grad():
                action = self.actor_local(torch.cat([obs, goals], dim=0)).cpu().data.numpy()
            return action

        actions = self.actor_local(torch.cat([obs, goals], dim=1))
        noise = (noise_eps * np.random.randn(actions.shape[0], self.dimu)).astype(np.float32)
        actions += torch.tensor(noise).to(self.device)
        eps_greedy_noise = np.random.binomial(1, random_eps, actions.shape[0]).reshape(-1, 1)
        random_action = torch.tensor(
            np.random.uniform(low=-1., high=1.,
                              size=(actions.shape[0], self.dimu)).astype(np.float32)
        ).to(self.device)
        # eps-greedy: replace the actor action with a random one where the coin flip hit
        actions += torch.tensor(eps_greedy_noise.astype(np.float32)).to(self.device) * (
            random_action - actions)
        actions = torch.clamp(actions, -self.clip_action, self.clip_action)
        return actions

    def store_episode(self, episode_batch):
        """Store episodes in the replay buffer and update the normalizers.

        @param episode_batch: array of batch_size x (T or T+1) x dim_key.
            Observation 'o' is of size T+1, all other keys are of size T.
        """
        self.buffer.store_episode(episode_batch)

        # add transitions to the normalizers
        episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
        episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
        shape = episode_batch['u'].shape
        # num_rollouts * (T - 1): number of transitions collected this cycle
        num_normalizing_transitions = shape[0] * shape[1]
        transitions = self.sample_transitions(episode_batch, num_normalizing_transitions)

        self.obs_normalizer.update(transitions['o'])
        self.goal_normalizer.update(transitions['g'])
        self.obs_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    def sample_batch(self):
        """Sample random transitions from the replay buffer (which also
        contains HER samples).

        @return: transitions, ordered according to `self.stage_shapes`
        """
        transitions = self.buffer.sample(self.batch_size)
        return [transitions[key] for key in self.stage_shapes.keys()]

    def learn(self):
        """One learning step, i.e. one optimization step of the networks."""
        batch = self.sample_batch()
        batch_dict = OrderedDict([(key, batch[i].astype(np.float32).copy())
                                  for i, key in enumerate(self.stage_shapes.keys())])
        batch_dict['r'] = np.reshape(batch_dict['r'], [-1, 1])

        # prepare state, action, reward, next state
        obs = torch.tensor(self.obs_normalizer.normalize(batch_dict['o'])).to(self.device)
        goal = torch.tensor(self.goal_normalizer.normalize(batch_dict['g'])).to(self.device)
        actions = torch.tensor(batch_dict['u']).to(self.device)
        rewards = torch.tensor(batch_dict['r'].astype(np.float32)).to(self.device)
        obs_2 = torch.tensor(self.obs_normalizer.normalize(batch_dict['o_2'])).to(self.device)

        # update critic --------------------------------------------------------
        # compute target Q values from the target networks
        with torch.no_grad():
            next_actions = self.actor_target(torch.cat([obs_2, goal], dim=1))
            next_Q_targets = self.critic_target(torch.cat([obs_2, goal], dim=1), next_actions)
        # Bellman target; the last experience of the episode is not included
        Q_targets = rewards + self.gamma * next_Q_targets
        # with sparse rewards the return is bounded, so clip the target
        Q_targets = torch.clamp(Q_targets, -self.clip_return, 0.)
        # compute loss
        Q_expected = self.critic_local(torch.cat([obs, goal], dim=1), actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # update critic weights
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update actor ---------------------------------------------------------
        pred_actions = self.actor_local(torch.cat([obs, goal], dim=1))
        actor_loss = -self.critic_local(torch.cat([obs, goal], dim=1), pred_actions).mean()
        actor_loss += (pred_actions ** 2).mean()  # penalize large pre-clip actions
        # update actor weights
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update_target_networks(self):
        """Soft-update the target networks:
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        # update critic target
        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
        # update actor target
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save_checkpoint(self, path, name):
        """Save the actor and critic networks and the normalization stats.

        @param path: directory in which to store checkpoints
        @param name: (str) name of the environment, used for naming files
        """
        torch.save(self.actor_local.state_dict(),
                   path + '/' + name + '_checkpoint_actor_her.pth')
        torch.save(self.critic_local.state_dict(),
                   path + '/' + name + '_checkpoint_critic_her.pth')
        self.obs_normalizer.save_normalizer(path + '/' + name + '_obs_normalizer.pth')
        self.goal_normalizer.save_normalizer(path + '/' + name + '_goal_normalizer.pth')
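
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the agent): the clipped Bellman target used
# in ddpgAgent.learn(). With sparse rewards in {-1, 0}, the discounted return
# lies in [-1 / (1 - gamma), 0], which motivates clamping the target to
# [-clip_return, 0]. The function name and the values of gamma (0.98) and
# clip_return (50) are assumptions for this example only.
# ---------------------------------------------------------------------------
def _clipped_q_target_demo():
    gamma, clip_return = 0.98, 50.0
    rewards = torch.tensor([[-1.0], [0.0]])      # sparse HER-style rewards
    next_q = torch.tensor([[-60.0], [-3.0]])     # critic_target(s', a')
    q_targets = rewards + gamma * next_q
    return torch.clamp(q_targets, -clip_return, 0.0)
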