def forward(self, observation, reparameterize=True, deterministic=False, return_log_prob=False):
    """
    Forward pass. Assumes the input is a torch tensor.

    :type observation: torch.Tensor
    """
    layer_input = observation
    for fc in self.fcs:
        layer_input = self.hidden_activation(fc(layer_input))
    network_output = self.output_activation(self.last_fc(layer_input))

    # Beta concentration parameters; EPSILON keeps them strictly positive.
    alpha = network_output[:, 0].unsqueeze(1) + EPSILON
    beta = network_output[:, 1].unsqueeze(1) + EPSILON
    distribution = Beta(alpha, beta)
    distribution_mean = distribution.mean

    if deterministic:
        sample = distribution_mean
    else:
        # Reparameterized (differentiable) sample in (0, 1).
        sample = distribution.rsample()

    # Transform from (0, 1) to the action range (min, max).
    action = self.min + self.max_min_difference * sample
    mean = self.min + self.max_min_difference * distribution_mean
    variance = self.max_min_difference_squared * distribution.variance
    std = torch.sqrt(variance)
    log_std = torch.log(std)

    log_prob = distribution.log_prob(sample)
    entropy = distribution.entropy()
    mean_action_log_prob = None
    pre_tanh_value = None
    return action, mean, log_std, log_prob, entropy, std, mean_action_log_prob, pre_tanh_value
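# --- Illustrative sketch (not part of the policy above) ---
# A minimal, self-contained example of the same rescaling trick: a Beta sample in
# (0, 1) is affinely mapped to an arbitrary action range. `act_min` / `act_max`
# are hypothetical bounds chosen only for illustration.
import torch
from torch.distributions import Beta

act_min, act_max = -2.0, 2.0
dist = Beta(torch.tensor([[1.5]]), torch.tensor([[2.5]]))

x = dist.rsample()                           # differentiable sample in (0, 1)
action = act_min + (act_max - act_min) * x   # affine map to (act_min, act_max)

# dist.log_prob(x) scores the unscaled sample; under the affine change of variables
# the density of the scaled action differs only by the constant -log(act_max - act_min).
log_prob_action = dist.log_prob(x) - torch.log(torch.tensor(act_max - act_min))
print(action, log_prob_action)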
def trainmodel(self):
    s = torch.tensor(self.memory.buffer['s'], dtype=torch.double).to(device)
    a = torch.tensor(self.memory.buffer['a'], dtype=torch.double).to(device)
    # r = torch.tensor(self.memory.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
    s_ = torch.tensor(self.memory.buffer['s_'], dtype=torch.double).to(device)
    # v = torch.tensor(self.memory.buffer['v'], dtype=torch.double).to(device).view(-1, 1)

    # Bootstrap the value of the final next-state, then build GAE advantages and targets.
    input = s_[-1].view(1, 4, 28, 28)
    future_value = self.net(input)[1].item()
    adv, target_v = self.getgae(future_value)
    adv = torch.tensor(np.array(adv), dtype=torch.double).to(device).view(-1, 1)
    target_v = torch.tensor(target_v, dtype=torch.double).to(device).view(-1, 1)
    adv = (adv - adv.mean()) / (adv.std() + 1e-5)
    old_a_logp = torch.tensor(self.memory.buffer['a_logp'],
                              dtype=torch.double).to(device).view(-1, 1)

    for _ in range(self.PPOepoch):
        for index in BatchSampler(
                SubsetRandomSampler(range(self.memory.buffer_capacity)),
                self.memory.batch_size, False):
            alpha, beta = self.net(s[index])[0]
            dist = Beta(alpha, beta)
            a_logp = dist.log_prob(a[index]).sum(dim=1)
            a_logp = a_logp.reshape(-1, 1)
            ratio = torch.exp(a_logp - old_a_logp[index])
            with torch.no_grad():
                entrop = dist.entropy()

            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index])
            self.storeloss(action_loss, value_loss)
            action_loss = torch.clamp(action_loss, 0, 10)
            value_loss = torch.clamp(value_loss, 0, 10)
            loss = action_loss + 2. * value_loss - args.bound * entrop.mean()

            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            self.optimizer.step()

    torch.save(self.net.state_dict(), self.path_t7)
def update(self, epochs, steps, total_obs, total_actions, advantage, real_values):
    total_obs_ = torch.from_numpy(total_obs).type(torch.FloatTensor)
    advantage_ = torch.from_numpy(advantage).type(torch.FloatTensor)
    real_values_ = torch.from_numpy(real_values).type(torch.FloatTensor)
    total_actions = torch.from_numpy(total_actions).type(torch.FloatTensor)

    for _ in range(epochs):
        inds = np.arange(steps)
        np.random.shuffle(inds)
        for t in range(steps):
            index = inds[t]
            alpha, beta, values_to_backprop = self.network(
                total_obs_[index].unsqueeze(0))
            m = Beta(alpha, beta)
            action_taken_prob = m.log_prob(total_actions[index]).sum(
                dim=1, keepdim=True)
            entropy = m.entropy()
            entropy = entropy.sum(dim=1)
            print(entropy)

            # Old-policy log-probabilities; no gradient is needed through the old network.
            with torch.no_grad():
                alpha, beta, _ = self.old_network(total_obs_[index].unsqueeze(0))
                m_old = Beta(alpha, beta)
                old_action_taken_probs = m_old.log_prob(
                    total_actions[index]).sum(dim=1, keepdim=True)

            # Importance ratio pi_new / pi_old: subtract log-probs and exponentiate.
            ratios = torch.exp(action_taken_prob - old_action_taken_probs)

            surr1 = ratios * advantage_[index]
            surr2 = torch.clamp(ratios, min=(1. - .1),
                                max=(1. + .1)) * advantage_[index]
            policy_loss = -torch.min(surr1, surr2)
            value_loss = (values_to_backprop - real_values_[index]) ** 2
            # value_loss = F.smooth_l1_loss(values_to_backprop, real_values_[index])
            total_loss = policy_loss + value_loss - 0.01 * entropy
            print(total_loss)

            self.optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
            self.optimizer.step()

    self.old_network.load_state_dict(self.dic_placeholder)
    self.dic_placeholder = self.network.state_dict()
    return value_loss
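# --- Illustrative sketch (toy numbers, not from the original code) ---
# The PPO importance ratio is exp(logp_new - logp_old), i.e. pi_new / pi_old,
# and the clipped surrogate takes the minimum of the unclipped and clipped terms.
import torch

logp_new = torch.tensor([-0.9, -1.4])
logp_old = torch.tensor([-1.0, -1.0])
advantage = torch.tensor([1.0, -0.5])
clip = 0.1

ratio = torch.exp(logp_new - logp_old)                  # pi_new / pi_old
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1 - clip, 1 + clip) * advantage
policy_loss = -torch.min(surr1, surr2).mean()
print(ratio, policy_loss)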
class CarlaImgPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[400, 300]):
        super(CarlaImgPolicy, self).__init__()
        self.main_actor = CarlaSimpleEncoder(latent_size=input_dim - 1)
        self.main_critic = CarlaSimpleEncoder(latent_size=input_dim - 1)

        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(layer_init(nn.Linear(hidden_layer[-1], 1), gain=1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        speed = x[:, -1:]
        x = x[:, :-1].view(-1, 3, 128, 128)  # image size in the CARLA driving task is 128x128
        x1 = self.main_actor(x)
        x1 = torch.cat([x1, speed], dim=1)
        x2 = self.main_critic(x)
        x2 = torch.cat([x2, speed], dim=1)

        actor_features = self.actor(x1)
        alpha = self.alpha_head(actor_features) + 1
        beta = self.beta_head(actor_features) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            # Map actions from [-1, 1] back to the Beta support [0, 1] before scoring.
            action = (action + 1) / 2
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x2)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
def forward(self, s, g, greedy=False, action_logit=None):
    """Produce an action."""
    c0, c1 = self.action_stats(s, g)
    # Mode of Beta(c0, c1); well defined when both concentrations exceed 1.
    action_mode = (c0 - 1) / (c0 + c1 - 2)
    m = Beta(c0, c1)

    # Sample a new action.
    if action_logit is None:
        if greedy:
            action_logit = action_mode
        else:
            action_logit = m.sample()
        n_ent = -m.entropy().mean()
        lprobs = m.log_prob(action_logit)
        action = self.scale_action(action_logit)
        return action, action_logit, lprobs, n_ent
    # Evaluate an action taken previously.
    else:
        n_ent = -m.entropy().mean(dim=1)
        lprobs = m.log_prob(action_logit)
        action = self.scale_action(action_logit)
        return lprobs, n_ent, action
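# --- Small illustrative check (not from the original code) ---
# For concentrations greater than 1, the Beta mode (c0 - 1) / (c0 + c1 - 2) is where
# the density peaks, which is what the greedy branch above uses instead of the mean.
import torch
from torch.distributions import Beta

c0, c1 = torch.tensor(3.0), torch.tensor(5.0)
m = Beta(c0, c1)
mode = (c0 - 1) / (c0 + c1 - 2)

xs = torch.linspace(0.01, 0.99, 999)
x_peak = xs[m.log_prob(xs).argmax()]
print(mode.item(), x_peak.item())  # both approximately 0.33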
class BetaSeparatedPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(BetaSeparatedPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        alpha_feature_layers = nn.ModuleList([])
        beta_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            alpha_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            alpha_feature_layers.append(nn.ReLU())
            beta_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            beta_feature_layers.append(nn.ReLU())
        self.alpha_body = nn.Sequential(*alpha_feature_layers)
        self.beta_body = nn.Sequential(*beta_feature_layers)
        self.alpha_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())
        self.beta_head = nn.Sequential(
            nn.Linear(hidden_layer[-1], action_dim), nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        # Softplus output plus 1 keeps both concentrations above 1 (unimodal Beta).
        alpha = self.alpha_head(self.alpha_body(x)) + 1
        beta = self.beta_head(self.beta_body(x)) + 1
        self.dist = Beta(alpha, beta)
        if action is None:
            action = self.dist.sample()
        else:
            action = (action + 1) / 2  # map [-1, 1] actions back to the Beta support [0, 1]
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)
        return action * 2 - 1, action_log_prob, value.squeeze(-1), entropy
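# --- Illustrative sketch (toy values, not from the original code) ---
# Why add 1 to the Softplus output: concentrations below 1 give a U-shaped Beta with
# mass piling up at the boundaries, while concentrations above 1 give a single
# interior peak, so a mode- or mean-based deterministic action is well defined.
import torch
from torch.distributions import Beta

xs = torch.linspace(0.01, 0.99, 5)
u_shaped = Beta(torch.tensor(0.5), torch.tensor(0.5))   # density blows up near 0 and 1
unimodal = Beta(torch.tensor(1.5), torch.tensor(1.5))   # single peak at 0.5
print(u_shaped.log_prob(xs).exp())
print(unimodal.log_prob(xs).exp())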
class MyDist(ActionDistribution):
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return 6

    def __init__(self, inputs, model):
        super(MyDist, self).__init__(inputs, model)
        self.dist = Beta(inputs[:, :3], inputs[:, 3:])

    def sample(self):
        self.sampled_action = self.dist.sample()
        return self.sampled_action

    def deterministic_sample(self):
        return self.dist.mean

    def sampled_action_logp(self):
        return self.logp(self.sampled_action)

    def logp(self, actions):
        return self.dist.log_prob(actions).sum(-1)

    # Referred from https://github.com/pytorch/pytorch/blob/master/torch/distributions/kl.py
    def kl(self, other):
        p, q = self.dist, other.dist
        sum_params_p = p.concentration1 + p.concentration0
        sum_params_q = q.concentration1 + q.concentration0
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + sum_params_p.lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + sum_params_q.lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)
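# --- Minimal sketch (added for illustration) ---
# Checks the closed-form Beta KL used above against torch.distributions.kl_divergence
# on toy concentration parameters.
import torch
from torch.distributions import Beta, kl_divergence

p = Beta(torch.tensor([2.0, 3.0]), torch.tensor([4.0, 1.5]))
q = Beta(torch.tensor([1.5, 2.5]), torch.tensor([3.0, 2.0]))

kl_ref = kl_divergence(p, q)  # built-in reference value

# Same formula as MyDist.kl, written out term by term.
sp = p.concentration1 + p.concentration0
sq = q.concentration1 + q.concentration0
kl_manual = (q.concentration1.lgamma() + q.concentration0.lgamma() + sp.lgamma()
             - p.concentration1.lgamma() - p.concentration0.lgamma() - sq.lgamma()
             + (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
             + (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
             + (sq - sp) * torch.digamma(sp))

assert torch.allclose(kl_ref, kl_manual)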
def run(self):
    updatestep = 0
    update = 0
    i_episode = 0
    while update < 100000:
        # Linearly decay the learning rate over episodes.
        self.lr = args.lr - (args.lr * (i_episode / float(10000)))
        i_episode = i_episode + 1
        observation = self.env.reset()
        step = 0
        observes_list, rewards, actions, values, old_log = [], [], [], [], []

        # Run a PPO update once enough environment steps have been collected.
        if updatestep > 2048:
            update = update + 1
            updatestep = 0
            if args.usegae:
                self.add_gae(self.trajectories, self.gamma, self.lam)
            else:
                self.add_no_gae(self.trajectories, self.gamma)
            s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata()
            minibatch = max(totalsize // args.numminibatch, 1)
            for _ in range(self.PPOepoch):
                for index in BatchSampler(
                        SubsetRandomSampler(range(totalsize)), minibatch, False):
                    alpha, beta = self.net(s[index])[0]
                    dist = Beta(alpha, beta)
                    a_logp = dist.log_prob(a[index]).sum(dim=1)
                    ratio = torch.exp(a_logp - old_a_logp[index])
                    with torch.no_grad():
                        entrop = dist.entropy()

                    surr1 = ratio * adv[index]
                    surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                        1.0 + self.clip_param) * adv[index]
                    action_loss = -torch.min(surr1, surr2).mean()
                    value_loss = F.mse_loss(self.net(s[index])[1], target_v[index])
                    self.storeloss(action_loss, value_loss)
                    loss = action_loss + 0.5 * value_loss - 0.01 * entrop.mean()

                    self.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.net.parameters(), args.maxgradnorm)
                    self.optimizer.step()
            self.trajectories = []

        # Collect one episode of experience.
        while True:
            step = step + 1
            updatestep = updatestep + 1
            # self.env.render()
            observes = observation.astype(np.float32).reshape((1, -1))
            input = torch.tensor(observes,
                                 dtype=torch.double).to(device).reshape(-1, self.inputsize)
            (alpha, beta), v = self.net(input)
            dist = Beta(alpha, beta)
            action = dist.sample()
            a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
            a_logp = a_logp.item()
            old_log.append(a_logp)
            values.append(v.item())
            observes_list.append(observes)
            actions.append(action)
            action = action.squeeze().cpu().numpy()
            # Beta samples lie in [0, 1]; rescale to the env action range [-1, 1].
            observation, reward, done, info = self.env.step(action * 2 - 1)
            rewards.append(reward)

            if done:
                print("Episode finished after {} timesteps, rewards is {}".format(
                    step, sum(rewards)))
                self.storereward(format(step))
                trajectory = {
                    'observes': np.concatenate([t for t in observes_list]),
                    'actions': np.concatenate([t.to('cpu') for t in actions]),
                    'rewards': np.array(rewards),
                    'values': np.array(values),
                    'old_log': np.array(old_log)
                }
                self.trajectories.append(trajectory)
                break
def entropy(self, datas):
    alpha, beta = datas
    distribution = Beta(alpha, beta)
    return distribution.entropy().float().to(set_device(self.use_gpu))
def run(self):
    for i_episode in range(10000 * self.envpoch):
        observation = self.env.reset()
        step = 0
        observes_list = []
        rewards = []
        actions = []
        values = []
        old_log = []

        # Every 20 episodes, run a PPO update on the collected trajectories.
        if i_episode % 20 == 19:
            self.add_gae(self.trajectories, self.gamma, self.lam)
            s, a, adv, old_a_logp, target_v, totalsize = self.gettraindata()
            minibatch = max(totalsize // args.numminibatch, 1)
            for _ in range(self.PPOepoch):
                for index in BatchSampler(
                        SubsetRandomSampler(range(totalsize)), minibatch, False):
                    alpha, beta = self.net(s[index])[0]
                    dist = Beta(alpha, beta)
                    a_logp = dist.log_prob(a[index]).sum(-1, keepdim=True)
                    ratio = torch.exp(a_logp - old_a_logp[index])
                    with torch.no_grad():
                        entrop = dist.entropy()

                    surr1 = ratio * adv[index]
                    surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                        1.0 + self.clip_param) * adv[index]
                    action_loss = -torch.min(surr1, surr2).mean()
                    value_loss = F.smooth_l1_loss(
                        self.net(s[index])[1], target_v[index])
                    self.storeloss(action_loss, value_loss)
                    action_loss = torch.clamp(action_loss, 0, 1)
                    value_loss = torch.clamp(value_loss, 0, 1)
                    loss = action_loss + 2. * value_loss - 0.01 * entrop.mean()

                    self.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.net.parameters(), 5)
                    self.optimizer.step()
            self.trajectories = []

        # Collect one episode of experience.
        while True:
            step = step + 1
            # self.env.render()
            observes = observation.astype(np.float32).reshape((1, -1))
            observes = np.append(observes, [[step]], axis=1)  # append time-step feature
            input = torch.tensor(observes,
                                 dtype=torch.double).to(device).reshape(-1, 18)
            observes_list.append(observes)
            (alpha, beta), v = self.net(input)
            values.append(v.item())
            dist = Beta(alpha, beta)
            action = dist.sample()
            actions.append(action)
            a_logp = dist.log_prob(action.view(-1, 6)).sum(dim=1)
            action = action.squeeze().cpu().numpy()
            a_logp = a_logp.item()
            old_log.append(a_logp)
            observation, reward, done, info = self.env.step(action * 2 - 1)
            rewards.append(reward)

            if done:
                print("Episode finished after {} timesteps".format(step))
                self.storereward(format(step))
                observes = observation.astype(np.float32).reshape((1, -1))
                observes = np.append(observes, [[step + 1]], axis=1)
                input = torch.tensor(observes,
                                     dtype=torch.double).to(device).reshape(-1, 18)
                (mean, std), v = self.net(input)
                values.append(v.item())

                obs = np.concatenate([t for t in observes_list])
                acs = np.concatenate([t.to('cpu') for t in actions])
                res = np.array(rewards)
                vas = np.array(values[1:])
                olog = np.array(old_log)

                self.scaler.update(obs)
                scale, offset = self.scaler.get()
                scale[-1] = 1.0   # don't scale the time-step feature
                offset[-1] = 0.0  # don't offset the time-step feature
                obs = (obs - offset) * scale
                trajectory = {
                    'observes': obs,
                    'actions': acs,
                    'rewards': res,
                    'values': vas,
                    'old_log': olog
                }
                self.trajectories.append(trajectory)
                break
def update(self):
    """
    Update the policy using the stored batch of experience.
    This happens when the buffer is full.
    """
    self.training_step += 1

    s = torch.tensor(self.buffer['s'], dtype=torch.float).to(device)
    a = torch.tensor(self.buffer['a'], dtype=torch.float).to(device)
    r = torch.tensor(self.buffer['r'], dtype=torch.float).to(device).view(-1, 1)
    s_ = torch.tensor(self.buffer['s_'], dtype=torch.float).to(device)
    if args.action_vec > 0:
        a_v = torch.tensor(self.buffer['a_v'], dtype=torch.float).to(device)
    """
    print("Weights before update: ")
    for k, v in self.net.state_dict().items():
        print("Layer {}".format(k))
        print(v.sum())
        print(v.mean(), v.median())
        print(v.max(), v.min())
    """

    if args.vae and args.tb:
        z = self.net.get_z(s[0].unsqueeze_(0))
        dec2 = self.net.vae.decode(z).squeeze(0)
        imgs = torch.cat((dec2, s[0]), dim=2)
        img_grid = torchvision.utils.make_grid(imgs)
        writer.add_image("Encoder-Img", img_grid)
        # save_image(s[0].cpu(), 'test_img/' + args.title + "_" + str(args.ndim) + 'img_update_' + str(self.training_step) + '.png')

    old_a_logp = torch.tensor(self.buffer['a_logp'],
                              dtype=torch.float).to(device).view(-1, 1)

    with torch.no_grad():
        # Compute bootstrapped value targets and advantage terms.
        if args.action_vec > 0:
            target_v = r + args.gamma * self.net((s_, a_v[:, 3:]))[1]
            adv = target_v - self.net((s, a_v[:, :-3]))[1]
        else:
            target_v = r + args.gamma * self.net(s_)[1]
            adv = target_v - self.net(s)[1]
        # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    for _ in range(self.ppo_epoch):
        # Compute the update for each mini-batch.
        for index in BatchSampler(
                SubsetRandomSampler(range(self.buffer_capacity)),
                self.batch_size, False):
            if args.action_vec > 0:
                alpha, beta = self.net((s[index], a_v[index, :-3]))[0]
            else:
                alpha, beta = self.net(s[index])[0]
            dist = Beta(alpha, beta)
            entropy = dist.entropy().mean()
            a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
            ratio = torch.exp(a_logp - old_a_logp[index])  # new/old policy ratio for the trust region
            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                1.0 + self.clip_param) * adv[index]  # clipped ratio
            action_loss = -torch.min(surr1, surr2).mean()
            # Difference between value prediction and the bootstrapped target.
            if args.action_vec > 0:
                value_loss = F.smooth_l1_loss(
                    self.net((s[index], a_v[index, :-3]))[1], target_v[index])
            else:
                value_loss = F.smooth_l1_loss(
                    self.net(s[index])[1], target_v[index])
            # loss = action_loss + 2. * value_loss
            # Loss with entropy bonus.
            loss = action_loss + 2. * value_loss - 0.001 * entropy

            self.optimizer.zero_grad()
            loss.backward()
            # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
            self.optimizer.step()

    if args.vae:
        z = self.net.get_z(s[0].unsqueeze_(0))
        dec2 = self.net.vae.decode(z)
        save_image(
            dec2, 'test_img/' + args.title + "_" + str(args.ndim) +
            '_dec_final_' + str(self.training_step) + '.png')