def forward(self, state, action=None):
    a = t.relu(self.fc1(state))
    a = t.relu(self.fc2(a))
    mu = self.mu_head(a)
    sigma = softplus(self.sigma_head(a))
    dist = Normal(mu, sigma)
    act = (atanh(action / self.action_range)
           if action is not None
           else dist.rsample())
    act_entropy = dist.entropy()

    # The suggested way to confine your actions within a valid range
    # is not clamping, but remapping the distribution.
    act_log_prob = dist.log_prob(act)
    act_tanh = t.tanh(act)
    act = act_tanh * self.action_range

    # The distribution remapping process used in the original paper.
    act_log_prob -= t.log(self.action_range * (1 - act_tanh.pow(2)) + 1e-6)
    act_log_prob = act_log_prob.sum(1, keepdim=True)

    # If your distribution is different from "Normal", you may either:
    # 1. derive the remapping function for your distribution and your
    #    squashing function (such as tanh), or
    # 2. clamp your actions, but take care:
    #    1. do not clamp actions before calculating their log probability,
    #       because the log probability of clamped actions may be
    #       extremely small and will cause nan;
    #    2. do not clamp actions after sampling and before storing them in
    #       the replay buffer, because during the update the log probability
    #       will be re-evaluated; it may also be extremely small, and the
    #       network will "nan". (This can happen in PPO, but not in SAC,
    #       because there is no re-evaluation.)
    # Only clamp actions sent to the environment. This is equivalent to
    # changing the action-reward distribution and will not cause "nan",
    # but it makes your training environment differ further from your
    # real environment.
    return act, act_log_prob, act_entropy
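# A minimal, self-contained sketch of the remapping above: the correction term
# log(range * (1 - tanh(u)^2)) is the log|det Jacobian| of a = range * tanh(u),
# and matches torch's own TransformedDistribution. `action_range` here is an
# illustrative scalar, not a value from the snippet above.
import torch
from torch.distributions import Normal, TransformedDistribution
from torch.distributions.transforms import AffineTransform, TanhTransform

torch.manual_seed(0)
action_range = 2.0
base = Normal(torch.zeros(3), torch.ones(3))
squashed = TransformedDistribution(
    base, [TanhTransform(), AffineTransform(loc=0.0, scale=action_range)])

u = base.rsample()
a = torch.tanh(u) * action_range
manual = base.log_prob(u) - torch.log(action_range * (1 - torch.tanh(u).pow(2)))
assert torch.allclose(squashed.log_prob(a), manual, atol=1e-4)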
def act(self, states, TEST):
    # states = Variable(torch.from_numpy(states))
    # if self.use_cuda:
    #     states = states.cuda()
    value, action_mu, action_sigma, (self.hx, self.cx) = self.network(
        states, (self.hx, self.cx))
    a_dist = Normal(action_mu, action_sigma)
    if not TEST:
        action = a_dist.sample()
    else:
        action = action_mu
    a_log_probs = a_dist.log_prob(action)
    a_dist_entropy = a_dist.entropy()
    # print("action_mu:", action_mu)
    print("action_sigma:", action_sigma.data)
    # print("action:", action)
    # print("hx,cx:", self.hx, self.cx)
    # print("value:", value)
    return value, action, a_log_probs, a_dist_entropy
def optimize_model(self, epochs, variance):
    self.model.train()
    actions, states, old_probs, rewards = self.memory.return_mem()
    old_probs = torch.Tensor(old_probs).detach()
    for epoch in range(epochs):
        for i in range(len(states)):
            new_action_mean = self.model(states[i])
            dist = Normal(new_action_mean, variance)
            dist_entropy = dist.entropy()
            new_prob = dist.log_prob(actions[i])
            r = torch.exp(new_prob - old_probs[i])
            # PPO loss: (new prob of action / old prob of action) * reward,
            # with the ratio clamped between two values.
            actor_loss = -torch.min(
                r * rewards[i],
                torch.clamp(r, 1 - self.clip_factor, 1 + self.clip_factor) * rewards[i])
            actor_loss = actor_loss - (0.01 * dist_entropy)  # Small bonus for entropy
            self.optimizer.zero_grad()
            actor_loss.backward()
            self.optimizer.step()
    self.memory.clear_mem()
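# The same clipped-surrogate objective, but vectorized over a whole batch
# instead of looping sample by sample. A minimal sketch: the names (policy,
# states, actions, old_log_probs, advantages) and the fixed std are
# illustrative assumptions, not part of the code above.
import torch
from torch.distributions import Normal

def ppo_clip_loss(policy, states, actions, old_log_probs, advantages,
                  std=0.5, clip=0.2, entropy_coef=0.01):
    dist = Normal(policy(states), std)
    new_log_probs = dist.log_prob(actions)
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - clip, 1 + clip) * advantages
    # Maximize the clipped surrogate plus entropy bonus -> minimize the negation.
    return -(torch.min(surr1, surr2) + entropy_coef * dist.entropy()).mean()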
def forward(self, state, action=None):
    a = t.relu(self.fc1(state))
    a = t.relu(self.fc2(a))
    mu = t.tanh(self.mu_head(a)) * self.action_range
    sigma = softplus(self.sigma_head(a))
    dist = Normal(mu, sigma)
    act = (action if action is not None else dist.rsample())
    act_entropy = dist.entropy()

    # Do not clamp actions here: the action probability
    # might be extremely small, and the network will "nan".
    act_log_prob = dist.log_prob(act)

    # Do not clamp actions here either: actions will be stored in the
    # replay buffer, and the newly evaluated log probability during the
    # update might also be extremely small, making the network "nan".
    # Clamp actions only before sending them into the environment.
    return act, act_log_prob, act_entropy
def forward(self, state, action=None):
    x = state
    x = self.actor_bn(x)
    for l in self.actor_linears:
        x = l(x)
        x = self.relu(x)
    mu = self.tanh(self.mu(x))
    log_var = -3. - self.relu(self.log_var(x))
    # log_var = -3. - self.relu(self.log_var_const)
    sigmas = log_var.exp().sqrt() + 1.0e-5
    dists = Normal(mu, sigmas)
    if action is None:
        action = dists.sample()
    log_prob = dists.log_prob(action).sum(dim=-1, keepdim=True)

    x = state
    x = self.critic_bn(x)
    for l in self.critic_linears:
        x = l(x)
        x = self.relu(x)
    v = self.v(x)
    return action, log_prob, dists.entropy(), v
def forward(self, x):
    body_actor = F.tanh(self.body_actor(x))
    y = F.tanh(self.a(body_actor))
    mean = self.mean(y)
    logstd = self.logstd(y)
    std = logstd.exp()
    dist = Normal(mean, std)
    action = dist.sample()
    a_logp = dist.log_prob(action)
    entropy = dist.entropy()

    body_critic = F.relu(self.body_critic(x))
    z = F.relu(self.v1(body_critic))
    value = self.v2(z)
    return {
        'action': action,
        'a_logp': a_logp,
        'value': value,
        'entropy': entropy,
        'mean': mean,
        'logstd': logstd,
    }
def forward(self, state, action=None):
    if type(state) != torch.Tensor:
        state = torch.FloatTensor(state).to(device)
    x = self.layers[0](state)
    for layer in self.layers[1:-1]:
        x = self.activ(layer(x))
    mean = torch.tanh(self.layers[-1](x))  # (-1, 1)
    # Always a positive value.
    # See https://sefiks.com/2017/08/11/softplus-as-a-neural-networks-activation-function/
    std = F.softplus(self.std)
    dist = Normal(mean, std)
    if action is None:
        action = dist.sample()
    log_prob = dist.log_prob(action).sum(-1).unsqueeze(-1)
    entropy = dist.entropy().sum(-1).unsqueeze(-1)
    return mean, action, log_prob, entropy
def forward_multiple_mcs(self, model_params, data, var_params, itr, num_samples=5):
    '''
    useful for analytic kl
    kl = torch.distributions.kl.kl_divergence(z_dist, self.prior).sum(-1)
    '''
    y, x = self.unpack_data(data)
    loc, log_scale = self.unpack_var_params(var_params)
    var_dist = Normal(loc, torch.exp(log_scale))
    # cov = torch.diag(torch.exp(log_scale))**2
    # scale_tril = cov.tril()
    # var_dist = MultivariateNormal(loc, scale_tril=scale_tril)
    samples = var_dist.rsample(torch.Size((num_samples,)))
    # data_term = self.model.log_joint(y, x, samples[0])
    data_terms = torch.empty(num_samples, device=device)
    for i in range(len(samples)):
        data_terms[i] = self.model.log_joint(model_params, y, x, samples[i])
    data_term = torch.mean(data_terms)
    entropy = torch.sum(var_dist.entropy())
    return (data_term + entropy)
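# The estimator above is the ELBO in its "entropy form", E_q[log p(y, x, z)] + H[q].
# A minimal sketch of the equivalent "analytic KL" form mentioned in the docstring,
# assuming a standard-normal prior and a toy stand-in for the likelihood term;
# all names here (q, prior, log_lik) are illustrative, not from the code above.
import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

loc = torch.zeros(3, requires_grad=True)
log_scale = torch.zeros(3, requires_grad=True)
q = Normal(loc, log_scale.exp())
prior = Normal(torch.zeros(3), torch.ones(3))

z = q.rsample((5,))                    # 5 Monte Carlo samples
log_lik = -(z ** 2).sum(-1).mean()     # stand-in for the likelihood part of log_joint

# ELBO = E_q[log lik] - KL(q || prior); equal in expectation to
# E_q[log lik + log prior] + H[q], the form used above.
elbo_kl = log_lik - kl_divergence(q, prior).sum(-1)
elbo_entropy = log_lik + prior.log_prob(z).sum(-1).mean() + q.entropy().sum()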
def evaluate(self, state, action):
    # Return the value of the given state, and the probability of the
    # actor taking {action}.
    state_value = self.critic(state)
    if action is None:
        return state_value, None
    act_hid = self.actor(state)
    action_mean = self.action_mean(act_hid)
    action_log_std = self.action_log_std(act_hid)
    action_log_std = torch.clamp(action_log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
    action_std = action_log_std.exp()
    normal = Normal(action_mean, action_std)
    entropy = normal.entropy()
    action = normal.sample()  # note: this overwrites the `action` argument
    log_prob = normal.log_prob(action)
    return state_value, log_prob, entropy
class GaussianPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(GaussianPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.mu_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                     nn.Tanh())
        self.std_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                      nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        actor_features = self.actor(x)
        mu = self.mu_head(actor_features)
        std = self.std_head(actor_features)
        self.dist = Normal(mu, std)
        if action is None:
            action = self.dist.sample()
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)
        return action, action_log_prob, value.squeeze(-1), entropy
def sample(self, obs):
    mean, log_std, hidden = self.actor.forward(obs)
    std = log_std.exp()
    normal = Normal(mean, std)
    x_t = normal.rsample()  # reparameterization trick (mean + std * N(0, 1))
    y_t = torch.tanh(x_t)
    action = y_t * self.hyperps['action_scale']  # + self.hyperps['action_bias']
    action[:, 0] += self.hyperps['action_bias']
    log_prob = normal.log_prob(x_t)
    # Enforcing action bound
    log_prob -= torch.log(self.hyperps['action_scale'] * (1 - y_t.pow(2))
                          + self.hyperps['epsilon'])
    log_prob = log_prob.sum(1, keepdim=True)
    mean = torch.tanh(mean) * self.hyperps['action_scale'] + self.hyperps['action_bias']
    entropy = normal.entropy()
    entropy1, entropy2 = entropy[0][0].item(), entropy[0][1].item()
    # print('Std: {:2.3f}, {:2.3f}, log_std: {:2.3f},{:2.3f}, entropy:{:2.3f}, {:2.3f}'.format(
    #     std[0][0].item(), std[0][1].item(),
    #     log_std[0][0].item(), log_std[0][1].item(), entropy1, entropy2))
    return action, log_prob, mean, std, hidden
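# Note that `normal.entropy()` above is the entropy of the *pre-tanh* Gaussian,
# not of the squashed action. The tanh-squashed distribution has no closed-form
# entropy; a common SAC-style approach estimates it as the mean of -log_prob.
# A minimal sketch with illustrative shapes:
import torch
from torch.distributions import Normal

normal = Normal(torch.zeros(4, 2), torch.ones(4, 2))
x_t = normal.rsample()
y_t = torch.tanh(x_t)
log_prob = normal.log_prob(x_t) - torch.log(1 - y_t.pow(2) + 1e-6)
squashed_entropy_estimate = -log_prob.sum(1).mean()  # Monte Carlo estimate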
class DiagGaussianDistribution(Distribution):
    """
    Gaussian distribution with diagonal covariance matrix, for continuous actions.

    :param action_dim: (int) Dimension of the action space.
    """

    def __init__(self, action_dim: int):
        super(DiagGaussianDistribution, self).__init__()
        self.distribution = None
        self.action_dim = action_dim
        self.mean_actions = None
        self.log_std = None

    def proba_distribution_net(
            self,
            latent_dim: int,
            log_std_init: float = 0.0) -> Tuple[nn.Module, nn.Parameter]:
        """
        Create the layers and parameter that represent the distribution:
        one output will be the mean of the Gaussian, the other parameter will be the
        standard deviation (log std in fact to allow negative values)

        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
        :param log_std_init: (float) Initial value for the log standard deviation
        :return: (nn.Linear, nn.Parameter)
        """
        mean_actions = nn.Linear(latent_dim, self.action_dim)
        # TODO: allow action dependent std
        log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init,
                               requires_grad=True)
        return mean_actions, log_std

    def proba_distribution(self, mean_actions: th.Tensor,
                           log_std: th.Tensor) -> 'DiagGaussianDistribution':
        """
        Create the distribution given its parameters (mean, std)

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :return: (DiagGaussianDistribution)
        """
        action_std = th.ones_like(mean_actions) * log_std.exp()
        self.distribution = Normal(mean_actions, action_std)
        return self

    def mode(self) -> th.Tensor:
        return self.distribution.mean

    def sample(self) -> th.Tensor:
        # Reparametrization trick to pass gradients
        return self.distribution.rsample()

    def entropy(self) -> th.Tensor:
        return sum_independent_dims(self.distribution.entropy())

    def actions_from_params(self,
                            mean_actions: th.Tensor,
                            log_std: th.Tensor,
                            deterministic: bool = False) -> th.Tensor:
        # Update the proba distribution
        self.proba_distribution(mean_actions, log_std)
        return self.get_actions(deterministic=deterministic)

    def log_prob_from_params(
            self, mean_actions: th.Tensor,
            log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        """
        Compute the log probability of taking an action
        given the distribution parameters.

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :return: (Tuple[th.Tensor, th.Tensor])
        """
        actions = self.actions_from_params(mean_actions, log_std)
        log_prob = self.log_prob(actions)
        return actions, log_prob

    def log_prob(self, actions: th.Tensor) -> th.Tensor:
        """
        Get the log probabilities of actions according to the distribution.
        Note that you must call ``proba_distribution()`` method before.

        :param actions: (th.Tensor)
        :return: (th.Tensor)
        """
        log_prob = self.distribution.log_prob(actions)
        return sum_independent_dims(log_prob)
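# `sum_independent_dims` above sums log-prob and entropy over the action
# dimension; torch ships the same behavior as a distribution wrapper. A small
# sketch of the equivalence (the shapes are illustrative):
import torch
from torch.distributions import Independent, Normal

mean, std = torch.zeros(8, 3), torch.ones(8, 3)
diag_gauss = Independent(Normal(mean, std), 1)  # treat the last dim as the event dim

actions = diag_gauss.sample()            # shape (8, 3)
log_prob = diag_gauss.log_prob(actions)  # shape (8,), already summed over actions
entropy = diag_gauss.entropy()           # shape (8,), already summed over actions
assert torch.allclose(log_prob, Normal(mean, std).log_prob(actions).sum(-1))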
class StateDependentNoiseDistribution(Distribution):
    """
    Distribution class for using generalized State Dependent Exploration (gSDE).
    Paper: https://arxiv.org/abs/2005.05719

    It is used to create the noise exploration matrix and
    compute the log probability of an action with that noise.

    :param action_dim: (int) Dimension of the action space.
    :param full_std: (bool) Whether to use (n_features x n_actions) parameters
        for the std instead of only (n_features,)
    :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure
        a positive standard deviation (cf paper). It keeps the variance
        above zero and prevents it from growing too fast. In practice,
        ``exp()`` is usually enough.
    :param squash_output: (bool) Whether to squash the output using a tanh function,
        which ensures boundaries.
    :param learn_features: (bool) Whether to learn features for gSDE or not.
        This will enable gradients to be backpropagated through the features
        ``latent_sde`` in the code.
    :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
    """

    def __init__(self,
                 action_dim: int,
                 full_std: bool = True,
                 use_expln: bool = False,
                 squash_output: bool = False,
                 learn_features: bool = False,
                 epsilon: float = 1e-6):
        super(StateDependentNoiseDistribution, self).__init__()
        self.distribution = None
        self.action_dim = action_dim
        self.latent_sde_dim = None
        self.mean_actions = None
        self.log_std = None
        self.weights_dist = None
        self.exploration_mat = None
        self.exploration_matrices = None
        self._latent_sde = None
        self.use_expln = use_expln
        self.full_std = full_std
        self.epsilon = epsilon
        self.learn_features = learn_features
        if squash_output:
            self.bijector = TanhBijector(epsilon)
        else:
            self.bijector = None

    def get_std(self, log_std: th.Tensor) -> th.Tensor:
        """
        Get the standard deviation from the learned parameter
        (log of it by default). This ensures that the std is positive.

        :param log_std: (th.Tensor)
        :return: (th.Tensor)
        """
        if self.use_expln:
            # From the gSDE paper: keeps the variance above zero
            # and prevents it from growing too fast
            below_threshold = th.exp(log_std) * (log_std <= 0)
            # Avoid NaN: zero out values that are below zero
            safe_log_std = log_std * (log_std > 0) + self.epsilon
            above_threshold = (th.log1p(safe_log_std) + 1.0) * (log_std > 0)
            std = below_threshold + above_threshold
        else:
            # Use normal exponential
            std = th.exp(log_std)

        if self.full_std:
            return std
        # Reduce the number of parameters:
        return th.ones(self.latent_sde_dim, self.action_dim).to(log_std.device) * std

    def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None:
        """
        Sample weights for the noise exploration matrix,
        using a centered Gaussian distribution.

        :param log_std: (th.Tensor)
        :param batch_size: (int)
        """
        std = self.get_std(log_std)
        self.weights_dist = Normal(th.zeros_like(std), std)
        # Reparametrization trick to pass gradients
        self.exploration_mat = self.weights_dist.rsample()
        # Pre-compute matrices in case of parallel exploration
        self.exploration_matrices = self.weights_dist.rsample((batch_size,))

    def proba_distribution_net(
            self,
            latent_dim: int,
            log_std_init: float = -2.0,
            latent_sde_dim: Optional[int] = None
    ) -> Tuple[nn.Module, nn.Parameter]:
        """
        Create the layers and parameter that represent the distribution:
        one output will be the deterministic action, the other parameter will be the
        standard deviation of the distribution that controls the weights of the noise matrix.

        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
        :param log_std_init: (float) Initial value for the log standard deviation
        :param latent_sde_dim: (Optional[int]) Dimension of the last layer of the feature extractor
            for gSDE. By default, it is shared with the policy network.
        :return: (nn.Linear, nn.Parameter)
        """
        # Network for the deterministic action, it represents the mean of the distribution
        mean_actions_net = nn.Linear(latent_dim, self.action_dim)
        # When we learn features for the noise, the feature dimension
        # can be different between the policy and the noise network
        self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim
        # Reduce the number of parameters if needed
        log_std = th.ones(self.latent_sde_dim, self.action_dim) if self.full_std \
            else th.ones(self.latent_sde_dim, 1)
        # Transform it to a parameter so it can be optimized
        log_std = nn.Parameter(log_std * log_std_init, requires_grad=True)
        # Sample an exploration matrix
        self.sample_weights(log_std)
        return mean_actions_net, log_std

    def proba_distribution(
            self, mean_actions: th.Tensor, log_std: th.Tensor,
            latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
        """
        Create the distribution given its parameters (mean, std)

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :param latent_sde: (th.Tensor)
        :return: (StateDependentNoiseDistribution)
        """
        # Stop gradient if we don't want to influence the features
        self._latent_sde = latent_sde if self.learn_features else latent_sde.detach()
        variance = th.mm(self._latent_sde ** 2, self.get_std(log_std) ** 2)
        self.distribution = Normal(mean_actions, th.sqrt(variance + self.epsilon))
        return self

    def mode(self) -> th.Tensor:
        actions = self.distribution.mean
        if self.bijector is not None:
            return self.bijector.forward(actions)
        return actions

    def get_noise(self, latent_sde: th.Tensor) -> th.Tensor:
        latent_sde = latent_sde if self.learn_features else latent_sde.detach()
        # Default case: only one exploration matrix
        if len(latent_sde) == 1 or len(latent_sde) != len(self.exploration_matrices):
            return th.mm(latent_sde, self.exploration_mat)
        # Use batch matrix multiplication for efficient computation
        # (batch_size, n_features) -> (batch_size, 1, n_features)
        latent_sde = latent_sde.unsqueeze(1)
        # (batch_size, 1, n_actions)
        noise = th.bmm(latent_sde, self.exploration_matrices)
        return noise.squeeze(1)

    def sample(self) -> th.Tensor:
        noise = self.get_noise(self._latent_sde)
        actions = self.distribution.mean + noise
        if self.bijector is not None:
            return self.bijector.forward(actions)
        return actions

    def entropy(self) -> Optional[th.Tensor]:
        # No analytical form,
        # entropy needs to be estimated using -log_prob.mean()
        if self.bijector is not None:
            return None
        return sum_independent_dims(self.distribution.entropy())

    def actions_from_params(self,
                            mean_actions: th.Tensor,
                            log_std: th.Tensor,
                            latent_sde: th.Tensor,
                            deterministic: bool = False) -> th.Tensor:
        # Update the proba distribution
        self.proba_distribution(mean_actions, log_std, latent_sde)
        return self.get_actions(deterministic=deterministic)

    def log_prob_from_params(
            self, mean_actions: th.Tensor, log_std: th.Tensor,
            latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        actions = self.actions_from_params(mean_actions, log_std, latent_sde)
        log_prob = self.log_prob(actions)
        return actions, log_prob

    def log_prob(self, actions: th.Tensor) -> th.Tensor:
        if self.bijector is not None:
            gaussian_actions = self.bijector.inverse(actions)
        else:
            gaussian_actions = actions
        # Log likelihood for a Gaussian
        log_prob = self.distribution.log_prob(gaussian_actions)
        # Sum along action dim
        log_prob = sum_independent_dims(log_prob)

        if self.bijector is not None:
            # Squash correction (from original SAC implementation)
            log_prob -= th.sum(self.bijector.log_prob_correction(gaussian_actions), dim=1)
        return log_prob
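# The core gSDE idea in isolation: sample a weight matrix once per rollout,
# then derive the exploration noise deterministically from the features, so
# the noise varies smoothly with the state. A minimal sketch with illustrative
# dimensions:
import torch
from torch.distributions import Normal

n_features, n_actions = 16, 2
std = 0.5 * torch.ones(n_features, n_actions)
exploration_mat = Normal(torch.zeros_like(std), std).rsample()  # fixed for a rollout

latent = torch.randn(10, n_features)  # policy features for 10 states
noise = latent @ exploration_mat      # state-dependent noise, shape (10, n_actions)
# Per state, the noise variance is (latent**2) @ (std**2), which is exactly the
# variance used by proba_distribution() above.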
def entropy(self, datas):
    mean, std = datas
    distribution = Normal(mean, std)
    return distribution.entropy().float().to(set_device(self.use_gpu))
class TanhNormal(Distribution):
    """
    Represent distribution of X where
        X ~ tanh(Z)
        Z ~ N(mean, std)

    Note: this is not very numerically stable.
    """

    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        """
        :param normal_mean: Mean of the normal distribution
        :param normal_std: Std of the normal distribution
        :param epsilon: Numerical stability epsilon when computing log-prob.
        """
        self.normal_mean = normal_mean
        self.normal_std = normal_std
        self.normal = Normal(normal_mean, normal_std)
        self.epsilon = epsilon

    def sample_n(self, n, return_pre_tanh_value=False):
        z = self.normal.sample_n(n)
        if return_pre_tanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def log_prob(self, value, pre_tanh_value=None):
        """
        :param value: some value, x
        :param pre_tanh_value: arctanh(x)
        :return:
        """
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
        return self.normal.log_prob(pre_tanh_value) - torch.log(
            1 - value * value + self.epsilon)

    def sample(self, return_pretanh_value=False):
        """
        Gradients will and should *not* pass through this operation.

        See https://github.com/pytorch/pytorch/issues/4620 for discussion.
        """
        z = self.normal.sample().detach()
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def rsample(self, return_pretanh_value=False):
        """
        Sampling in the reparameterization case.
        """
        # z = (
        #     self.normal_mean +
        #     self.normal_std *
        #     Normal(
        #         ptu.zeros(self.normal_mean.size()),
        #         ptu.ones(self.normal_std.size())
        #     ).sample()
        # )
        # z.requires_grad_()
        z = self.normal.rsample()
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def entropy(self):
        """Returns entropy of the underlying normal distribution.

        Returns:
            torch.Tensor: entropy of the underlying normal distribution.
        """
        return self.normal.entropy().sum(-1, keepdim=True)
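# As the docstring warns, recovering arctanh(x) from the squashed value is
# numerically fragile near |x| = 1. Passing the pre-tanh value through avoids
# the unstable inversion entirely. A small usage sketch of the two call paths,
# assuming the TanhNormal class above is in scope:
import torch

dist = TanhNormal(torch.zeros(2), torch.ones(2))
action, pre_tanh = dist.rsample(return_pretanh_value=True)

stable_lp = dist.log_prob(action, pre_tanh_value=pre_tanh)  # preferred
fragile_lp = dist.log_prob(action)  # re-inverts tanh; can blow up near the bounds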
state = torch.Tensor([np.swapaxes(npa(state_raw), 0, 2)]).to(device)

# encode to latent variables (mu/var)
latent_mu, latent_stddev = policy.encode(state)
m = Normal(latent_mu, latent_stddev)

rewards = []
rewards_raw = []
log_probs = []
entropies = []
for k in range(SAMPLES):
    # sample K times
    action = m.sample()
    log_probs.append(m.log_prob(action))
    entropies.append(m.entropy())
    params = policy.decode(action)
    # render out an image for each of the K samples
    # IMPORTANT: THIS CURRENTLY ASSUMES BATCH SIZE = 1
    next_state = env.render(params.detach().view(-1).cpu().numpy(),
                            data_generator.cam)
    # calculate reward for each one of the K samples
    reward_raw = -(np.square(npa(state_raw) - npa(next_state))).mean(axis=None)
    rewards_raw.append(reward_raw)

# deduct the average reward of the other K-1 samples (variance reduction)
for k in range(SAMPLES):
    baseline = np.mean(rewards_raw[:k] + rewards_raw[k + 1:])
    rewards.append(rewards_raw[k] - baseline)
def entropy(self, mean, std):
    distribution = Normal(mean, std)
    return distribution.entropy().float().to(self.device)
def train(episodes):
    # Initialize global counters
    first_batch = True
    episode_i = 0
    total_i = 0
    while episode_i < episodes:  # START MAIN LOOP
        # Initialize batch lists
        current_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        episode_in_batch = 0
        i_in_batch = 0
        # while i_in_batch < N_STEPS:  # START EPISODE BATCH LOOP
        while episode_in_batch < N_TRAJECTORIES:
            # Reset environment and get first state
            cur_state = env.reset()
            done = False
            ret = 0
            i_in_episode = 0
            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state).unsqueeze(0).float()
                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    val_out = ac_net_critic(torch_state)
                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                clamped_action_t = torch.clamp(action, -1.0, 1.0)
                clamped_action = clamped_action_t.data.numpy()
                for action_count in range(10):
                    # Step environment
                    next_state, reward, done, info = env.step(clamped_action)
                    # Append values to queues
                    current_state_q.append(cur_state)
                    next_state_q.append(next_state)
                    reward_q.append(float(reward))
                    value_q.append(val_out)
                    action_q.append(clamped_action)
                    action_log_prob_q.append(
                        distribution.log_prob(clamped_action_t).data.numpy())
                    done_q.append(1 - done)
                    ret += reward  # Sum total reward for episode
                    # Iterate counters, etc.
                    cur_state = next_state
                    i_in_episode += 1
                    i_in_batch += 1
                    total_i += 1
                    if i_in_episode % 10 == 0 and episode_i % 25 == 0 and episode_i >= 0:
                        env.render()  # TODO get args
                    if i_in_episode > 3000:
                        done = True
                    if done:
                        break
            # END SINGLE EPISODE
            episode_in_batch += 1
            episode_i += 1
            avg_reward.append(ret)
            avg_reward_batch.append(ret)
        # END EPISODE BATCH LOOP

        # START CUMULATIVE REWARD CALC
        discounted_reward = []
        cumul_reward = 0
        for reward, done in zip(reversed(reward_q), reversed(done_q)):
            if done == 1:
                cumul_reward = cumul_reward * gamma + reward
                discounted_reward.insert(0, cumul_reward)
            elif done == 0:
                cumul_reward = reward
                discounted_reward.insert(0, cumul_reward)

        # SET UP TENSORS
        batch_length = len(current_state_q)
        current_state_t = torch.tensor(current_state_q).float()
        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()

        # CALCULATE ADVANTAGE
        value_t_new = ac_net_critic(current_state_t)
        for reward_i, value_i in zip(np.asarray(discounted_reward),
                                     value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        # TODO check how this is converted between numpy and tensor
        advantage_q_new = (advantage_q_new - np.mean(advantage_q_new)) / (
            np.std(advantage_q_new))
        advantage_t = torch.tensor(advantage_q_new).float()

        # START UPDATING NETWORKS
        # START BASELINE OPTIMIZE
        for epoch in range(B_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)
                # Calculate loss compared with reward and optimize
                critic_loss_batch = criterion_val(new_val, batch_reward_t.unsqueeze(1))
                # Do optimization
                optimizer_c.zero_grad()
                critic_loss_batch.backward()
                optimizer_c.step()
                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx).float()
                batch_action_log_prob_t = torch.index_select(action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)
                # Expand advantage to dimensions of r_theta_i
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)
                # Calculate the options
                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - EPSILON, 1 + EPSILON) * batch_advantage_t4
                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(torch.pow(batch_entropy, 2))
                # Choose minimum of surrogates and calculate L_clip as final loss function
                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                L_clip = (-torch.sum(r_theta_surrogate_min) / r_theta_surrogate_min.size()[0]
                          + 0.03 * batch_entropy_loss)
                # if batch_entropy_loss > 1.2:
                #     L_clip = L_clip + 0.05 * batch_entropy_loss
                # Optimize
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()
                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END UPDATING ACTOR

        if episode_i % return_time == 0:
            print("%4d, %6.0d, %6.2f, %6.2f | %6.2f" %
                  (episode_i, total_i, np.mean(avg_reward_batch),
                   np.mean(avg_reward), torch.mean(batch_entropy).item()))
            with open('C:\\Users\\genia\\source\\repos\\Box2dEnv\\Box2dEnv\\saves\\{}.csv'
                      .format("testWrite"), 'a+') as csv:
                for ret_write in zip(np.asarray(avg_reward_batch)):
                    csv.write("{:2.2f}\n".format(ret_write[0]))
    return episode_i
def get_entropy(self, state):
    mu = self.forward(state)
    std = torch.exp(self.log_std)
    ac_dist = Normal(mu, std)
    return ac_dist.entropy()
def Worker(global_actor, n_steps, multi):
    if n_steps == 1 and multi == 1:
        mode = "SS"
    elif n_steps != 1 and multi == 1:
        mode = "MS"
    elif n_steps != 1 and multi != 1:
        mode = "MM"

    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    local_actor = ActorCritic()
    if mode == "SS":
        lr = 0.001
    elif mode == "MS":
        lr = 0.0005
    elif mode == "MM":
        lr = 0.0001
    optimizer = optim.Adam(global_actor.parameters(), lr=lr)

    t = 1
    score = 0.0
    beta = 0.05
    start_time = time.time()

    for train_episode in range(3000):
        local_actor.load_state_dict(global_actor.state_dict())
        t_start = t - 1
        state = env.reset()
        done = False
        rewards, log_probs, values, Rs = [], [], [], []
        policy_losses, value_losses = [], []
        entropies = []
        R = 0
        while True:
            # get action
            mu, sigma = local_actor.act(torch.from_numpy(state).float())
            norm_dist = Normal(mu, sigma)
            action = norm_dist.sample()
            action = torch.clamp(action, min=-ACT_LIMIT, max=ACT_LIMIT)

            # get next_state and reward according to action
            next_state, reward, done, _ = env.step(action)
            score += reward

            log_prob = norm_dist.log_prob(action)
            value = local_actor.cri(torch.from_numpy(state).float())
            entropy = norm_dist.entropy()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropies.append(entropy)

            # gradient update
            if t - t_start == n_steps or done:
                if done:
                    R = 0
                else:
                    R = local_actor.cri(torch.from_numpy(next_state).float())
                for r in rewards[::-1]:
                    R = r + GAMMA * R
                    Rs.insert(0, R)
                for log_prob, value, entropy, R in zip(log_probs, values, entropies, Rs):
                    advantage = R - value.item()
                    policy_losses.append(-(log_prob * advantage + beta * entropy))
                    value_losses.append(F.mse_loss(value, torch.tensor([R])))
                loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

                optimizer.zero_grad()
                loss.backward()
                for local_param, global_param in zip(local_actor.parameters(),
                                                     global_actor.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_actor.load_state_dict(global_actor.state_dict())

                rewards, log_probs, values, Rs = [], [], [], []
                policy_losses, value_losses = [], []
                entropies = []
                R = 0
                state = next_state
                t += 1
                t_start = t - 1

                if done:
                    if mode == "SS":
                        if score > 500:
                            optimizer.param_groups[0]['lr'] = 0.00005
                        elif score > 400:
                            optimizer.param_groups[0]['lr'] = 0.0001
                        elif score > 300:
                            optimizer.param_groups[0]['lr'] = 0.0002
                        elif score > 200:
                            optimizer.param_groups[0]['lr'] = 0.0003
                        elif score > 100:
                            optimizer.param_groups[0]['lr'] = 0.0005
                    beta = beta * 0.999 if beta > 0.025 else 0.025
                    break
            else:
                state = next_state
                t += 1

        # print("Train Episode: {}, Score: {:.1f}, Time: {:.2f}".format(
        #     train_episode, score, time.time() - start_time))
        score = 0.0

    env.close()
    print("Training process reached maximum episode.")
    # (function head elided in the source: this is the tail of get_x(theta, design),
    #  where theta1, theta2, theta3 are the three components of theta)
    x = 400. * theta2 * (torch.exp(-theta1 * design)
                         - torch.exp(-theta2 * design)) / (theta3 * (theta2 - theta1))
    return x

n_inner = 1000
n_outer = 100

loc = torch.tensor(np.log((0.1, 1., 20.)), dtype=torch.float64)
scale = torch.tensor(np.sqrt((0.05, 0.05, 0.05)), dtype=torch.float64)
prior = LogNormal(loc, scale)
prior = Independent(prior, 1)
theta_inner = prior.sample((n_inner,))
theta_outer = prior.sample((n_outer,))

loc = torch.zeros(15, dtype=torch.float64)
scale = 0.1 * torch.ones(15, dtype=torch.float64)
noise = Normal(loc, scale)
noise = Independent(noise, 1)
noise_entropy = noise.entropy()
noise_outer = noise.sample((n_outer,))

def objective(design):
    x_outer = get_x(theta_outer, design)
    x_inner = get_x(theta_inner, design)
    y_outer = x_outer + noise_outer
    # Get matrix of all y_outer - x_inner values
    diff = y_outer.unsqueeze(1) - x_inner.unsqueeze(0)
    log_prob_diff = noise.log_prob(diff)
    log_evidence = torch.logsumexp(log_prob_diff, dim=1) - np.log(n_inner)
    sig = noise_entropy - log_evidence.mean()
    print('Design ', np.sort(design))
    print('SIG {:.3f}'.format(sig.numpy()))
    return -sig.numpy()
def compute_action(self, cur_obs_tensor):
    m, std, v = self.model(cur_obs_tensor)
    dist = Normal(m, std)
    entropy = dist.entropy().sum(1, keepdim=True)
    return dist, entropy, v
def evaluate(self, state, action):
    mu, sigma = self.actor(state)
    sigma = sigma.expand_as(mu)
    dist = Normal(mu, sigma)
    return (dist.log_prob(action).sum(dim=-1),
            dist.entropy().sum(dim=-1),
            torch.squeeze(self.critic(state), 1))
class ActorCriticPPO(StochasticContinuousNeuralNet):
    def __init__(self, architecture, weight_init=gauss_weights_init(0, 0.02),
                 activation_functions=None):
        super(ActorCriticPPO, self).__init__()
        if len(architecture) < 2:
            raise Exception("Architecture needs at least two numbers to create network")
        # assert architecture[-1] % 2 == 1, \
        #     "Last layer has to represent 2*actions_space for the Gaussian + 1 for value"

        self.activation_functions = activation_functions
        self.layer_list = []
        self.layer_list_val = []

        self.siglog = tor.zeros(1, requires_grad=True)
        self.siglog = nn.Parameter(self.siglog)

        for i in range(len(architecture) - 1):
            self.layer_list.append(nn.Linear(architecture[i], architecture[i + 1]))
            setattr(self, "fc" + str(i), self.layer_list[-1])

        for i in range(len(architecture) - 2):
            self.layer_list_val.append(nn.Linear(architecture[i], architecture[i + 1]))
            setattr(self, "fc_val" + str(i), self.layer_list_val[-1])

        self.layer_list_val.append(nn.Linear(architecture[-2], 1))
        setattr(self, "fc_val" + str(len(architecture) - 1), self.layer_list_val[-1])

        self.apply(weight_init)

    def policy_forward(self, x):
        # Policy network
        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list[i](x))
        else:
            for i, layer in enumerate(self.layer_list[:-1]):
                x = self.tanh(layer(x))
        x = self.layer_list[-1](x)
        self._means = self.tanh(x)

        self._dist = Normal(self._means, tor.exp(self.siglog))
        self.sampled = self._dist.rsample()
        x = self.sampled
        return x

    def value_forward(self, x):
        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list_val[i](x))
        else:
            for i, layer in enumerate(self.layer_list_val[:-1]):
                x = self.tanh(layer(x))
        x = self.layer_list_val[-1](x)
        return x

    def forward(self, x):
        # Policy network
        action = self.policy_forward(x)
        value = self.value_forward(x)
        return tor.cat([action, value], dim=1)

    def __call__(self, state):
        # self.sigma_log -= sigma_epsilon
        action, value = self.policy_forward(state), self.value_forward(state)
        return action, value

    def sigma(self):
        return self.sigmas

    def mu(self):
        return self._means

    def logprob(self, values):
        return self._dist.log_prob(values)

    def entropy(self):
        return self._dist.entropy()
def _optimize(self, obs, acts, advs, est_rs):
    self.obs, self.acts, self.advs, self.est_rs = obs, acts, advs, est_rs

    self.obs = Tensor(self.obs)
    self.acts = Tensor(self.acts)
    self.advs = Tensor(self.advs).unsqueeze(1)
    self.est_rs = Tensor(self.est_rs).unsqueeze(1)

    # Calculate advantage & normalize it
    self.advs = (self.advs - self.advs.mean()) / (self.advs.std() + 1e-8)

    # Surrogate loss with entropy
    if self.continuous:
        mean, std, values = self.model(self.obs)
        dis = Normal(mean, std)
        log_prob = dis.log_prob(self.acts).sum(-1, keepdim=True)
        ent = dis.entropy().sum(-1, keepdim=True)
        probs_new = torch.exp(log_prob)
        probs_old = probs_new.detach() + 1e-8
    else:
        probs, values = self.model(self.obs)
        dis = F.softmax(probs, dim=1)
        self.acts = self.acts.long()
        probs_new = dis.gather(1, self.acts)
        probs_old = probs_new + 1e-8
        ent = -(dis.log() * dis).sum(-1)

    ratio = probs_new / probs_old
    surrogate_loss = -torch.mean(ratio * self.advs) - self.entropy_para * ent.mean()

    # criterion = torch.nn.MSELoss()
    # empty_value_loss = criterion(values, values.detach())

    # Calculate the gradient of the surrogate loss
    self.model.zero_grad()
    surrogate_loss.backward()
    policy_gradient = parameters_to_vector(
        [p.grad for p in self.model.policy_parameters()]).squeeze(0).detach()

    # ensure gradient is not zero
    if policy_gradient.nonzero().size()[0]:
        # Use conjugate gradient to calculate step direction
        step_direction = self.conjugate_gradient(-policy_gradient)
        # line search for step
        shs = .5 * step_direction.dot(self.hessian_vector_product(step_direction))
        lm = torch.sqrt(shs / self.max_kl)
        fullstep = step_direction / lm
        gdotstepdir = -policy_gradient.dot(step_direction)
        theta = self.linesearch(
            parameters_to_vector(self.model.policy_parameters()).detach(),
            fullstep, gdotstepdir / lm)

        # Update parameters of policy model
        old_model = copy.deepcopy(self.model)
        old_model.load_state_dict(self.model.state_dict())
        if any(np.isnan(theta.cpu().detach().numpy())):
            print("NaN detected. Skipping update...")
        else:
            # for param in self.model.policy_parameters():
            #     print(param)
            vector_to_parameters(theta, self.model.policy_parameters())
        kl_old_new = self.mean_kl_divergence(old_model)
        print('KL:{:10} , Entropy:{:10}'.format(kl_old_new.item(), ent.mean().item()))
    else:
        print("Policy gradient is 0. Skipping update...")
        print(policy_gradient.shape)

    self.model.zero_grad()
    if self.continuous:
        _, _, values = self.model(self.obs)
    else:
        _, values = self.model(self.obs)

    criterion = torch.nn.MSELoss()
    critic_loss = self.value_loss_coeff * criterion(values, self.est_rs)
    critic_loss.backward()
    self.optim.step()
    print("MSELoss for Value Net:{}".format(critic_loss.item()))
def train(episodes):
    env.env.unwrapped.seed(random_seed)
    first_batch = True
    episode_i = 0
    total_i = 0
    curious_reward_std = 0.2
    while episode_i < episodes:  # START MAIN LOOP
        cur_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        avg_curious_reward_batch = []
        curious_reward_q = []
        avg_max_height = []
        i_in_batch = 0
        completed_q = []
        while i_in_batch < N_STEPS:  # START EPISODE BATCH LOOP
            cur_state = env.reset()
            cur_state_copy = cur_state.copy()
            cur_state_copy[1] = cur_state_copy[1] / 0.035
            done = False
            ret = 0
            curious_ret = 0
            i_in_episode = 0
            episode_distance_q = []
            next_cur_state_episode_q = []
            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state_copy).unsqueeze(0).float()
                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    # val_out = ac_net_critic(torch_state)
                    # curious_out = ac_net_c_critic(torch_state)
                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                if episode_i < 15:
                    clamped_action = torch.clamp(action, min=-1, max=1).data.numpy()
                else:
                    clamped_action = torch.clamp(action, min=-1, max=1).data.numpy()
                episode_distance_q.append(cur_state[0])

                # Step environment
                next_state, reward, done, info = env.step(clamped_action)

                # Append values to queues
                cur_state_q.append(cur_state_copy)
                next_state_copy = next_state.copy()
                next_state_copy[1] = next_state_copy[1] / 0.035
                next_cur_state_episode_q.append(next_state_copy)
                next_state_q.append(next_state_copy)
                reward_i = reward / 20.0
                reward_q.append(float(reward_i))
                # value_q.append(val_out)
                action_q.append(action.data.numpy())
                action_log_prob_q.append(
                    distribution.log_prob(torch.tensor(clamped_action)).data.numpy())
                done_q.append(1 - done)  # Why 1-done?
                ret += reward  # Sum total reward for episode

                # Iterate counters, etc.
                cur_state = next_state
                cur_state_copy = next_state_copy
                i_in_episode += 1
                i_in_batch += 1
                total_i += 1
                if i_in_episode % 50 == 0 and episode_i % 10 == 0 and episode_i >= 0:
                    env.render()
                # if i_in_episode > 500:
                #     done = True
                if done:
                    break
            # END SINGLE EPISODE

            if ret > 0.01:
                completed_q += np.ones((len(episode_distance_q), 1)).tolist()
            else:
                completed_q += np.zeros((len(episode_distance_q), 1)).tolist()

            next_state_episode = np.asarray(next_cur_state_episode_q)
            next_curious_state = get_curious_state(next_state_episode, p1, p2)
            with torch.no_grad():
                rnd_val = ac_net_rnd(next_curious_state)
                pred_val = ac_net_pred(next_curious_state)
                curious_reward_episode = torch.pow((rnd_val - pred_val), 2)
            curious_rewards_episode = (curious_reward_episode.data.numpy())
            curious_reward_q += curious_rewards_episode.tolist()
            curious_ret = np.sum(curious_rewards_episode)
            avg_curious_ret = curious_ret / i_in_episode

            episode_i += 1
            avg_reward.append(ret)
            avg_curious_reward.append(curious_ret)
            avg_reward_batch.append(ret)
            avg_curious_reward_batch.append(curious_ret)
            avg_max_height_q.append(np.max(episode_distance_q))
            avg_max_height.append(np.max(episode_distance_q))
            print("%4d, %6.2f, %6.0f | " %
                  (episode_i, np.max(episode_distance_q), curious_ret))
            # print("")
        # END EPISODE BATCH LOOP

        max_achieved_height_in_batch = np.max(avg_max_height)

        # NORMALIZE CURIOUS REWARD
        if first_batch:
            curious_reward_std = np.std(np.asarray(curious_reward_q))
            first_batch = False

        # START CUMULATIVE REWARD CALC
        curious_reward_q = curious_reward_q / curious_reward_std
        discounted_reward = []
        discounted_curious_reward = []
        cul_reward = 0
        cul_curious_reward = 0
        for reward, cur_reward, done in zip(reversed(reward_q),
                                            reversed(curious_reward_q),
                                            reversed(done_q)):
            if done == 1:
                cul_reward = cul_reward * gamma1 + reward
                cul_curious_reward = cul_curious_reward * gamma2 + cur_reward
                discounted_reward.insert(0, cul_reward)
                discounted_curious_reward.insert(0, cul_curious_reward)
            elif done == 0:
                cul_reward = reward
                cul_curious_reward = cul_curious_reward * gamma2 + cur_reward
                discounted_reward.insert(0, cul_reward)
                discounted_curious_reward.insert(0, cul_curious_reward)

        # CALCULATE ADVANTAGE
        # TODO: vectorize this instead of looping
        current_state_t = torch.tensor(cur_state_q).float()
        curious_advantage_q_new = []
        advantage_q_new = []
        with torch.no_grad():
            value_t_new = ac_net_critic(current_state_t)
            curious_value_t_new = ac_net_c_critic(current_state_t)
        for reward_i, value_i in zip(np.asarray(discounted_reward),
                                     value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        for reward_i, value_i in zip(np.asarray(discounted_curious_reward),
                                     curious_value_t_new.data.numpy()):
            curious_advantage_q_new.append(reward_i - value_i)
        curious_advantage_q_new = np.asarray(curious_advantage_q_new)

        # Should advantage be recalculated at each optimize step?
        advantage_q_new = (advantage_q_new - np.mean(advantage_q_new)) / (np.std(advantage_q_new))
        # curious_advantage_q_new = (curious_advantage_q_new - np.mean(curious_advantage_q_new)) / (np.std(curious_advantage_q_new))
        curious_advantage_q_new = (np.asarray(discounted_curious_reward)
                                   - np.mean(discounted_curious_reward)) / (np.std(discounted_curious_reward))

        plotted_data = np.transpose(np.asarray(
            (np.asarray(cur_state_q)[:, 0], np.squeeze(discounted_curious_reward))))
        plotted_data = np.transpose(np.asarray(
            (np.asarray(cur_state_q)[:, 0], np.squeeze(curious_advantage_q_new))))
        plt.plot(plotted_data)
        plt.show()

        max_curious_advantage = np.max(curious_advantage_q_new)
        std_curious_advantage = np.std(curious_advantage_q_new)
        mean_curious_advantage = np.mean(curious_advantage_q_new)
        max_advantage = np.max(advantage_q_new)
        std_advantage = np.std(advantage_q_new)
        mean_advantage = np.mean(advantage_q_new)

        advantage_t = torch.tensor(advantage_q_new).float()
        curious_advantage_t = torch.tensor(curious_advantage_q_new).float()
        completed_t = torch.tensor(np.asarray(completed_q)).float()
        # advantage_t = completed_t * advantage_t
        a_prop = 0.5
        summed_advantage_t = torch.add(torch.mul(advantage_t, 1),
                                       torch.mul(curious_advantage_t, 1))

        # START UPDATING NETWORKS
        batch_length = len(cur_state_q)
        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()
        curious_reward_t = torch.tensor(discounted_curious_reward).float()
        summed_reward_t = torch.add(curious_reward_t, reward_t)

        # START BASELINE OPTIMIZE
        avg_baseline_loss = []
        for epoch in range(B_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            avg_baseline_batch_loss = []
            avg_baseline_curious_batch_loss = []
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                batch_curious_reward_t = torch.index_select(curious_reward_t, 0, batch_idx)
                batch_summed_reward_t = torch.index_select(summed_reward_t, 0, batch_idx)
                batch_start = batch_end
                n_batch += 1
                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)
                new_curious_val = ac_net_c_critic(batch_state_t)
                # Calculate loss compared with reward and optimize
                # NEEDS TO BE OPTIMIZED WITH CURIOUS VAL AS WELL
                # new_summed_val = new_val + new_curious_val
                critic_loss_batch = criterion_val(new_val, batch_reward_t.unsqueeze(1))
                critic_curious_loss_batch = criterion_val(new_curious_val, batch_curious_reward_t)
                # critic_loss_batch = criterion_val(new_summed_val, batch_summed_reward_t.unsqueeze(1))
                # critic_loss_both = critic_curious_loss_batch  # + critic_loss_batch
                optimizer_c.zero_grad()
                optimizer_cc.zero_grad()
                critic_loss_batch.backward()
                critic_curious_loss_batch.backward()
                optimizer_cc.step()
                optimizer_c.step()
                # avg_value_STD.append(critic_loss_batch.item())
                avg_baseline_batch_loss.append(critic_loss_batch.item())
                avg_baseline_curious_batch_loss.append(critic_curious_loss_batch.item())
                # print(np.mean(avg_baseline_batch_loss), np.mean(avg_baseline_curious_batch_loss), " ", end="")
            # avg_baseline_loss.append(np.mean(avg_baseline_batch_loss))
            # print("")
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                if np.max(reward_q) > 0.01:
                    if CURIOUS:
                        batch_advantage_t = torch.index_select(summed_advantage_t, 0, batch_idx)
                    else:
                        batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx)
                else:
                    if CURIOUS:
                        batch_advantage_t = torch.index_select(curious_advantage_t, 0, batch_idx)
                    else:
                        batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx)
                # batch_advantage_t = torch.index_select(summed_advantage_t, 0, batch_idx)
                batch_action_log_prob_t = torch.index_select(action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                batch_start = batch_end
                n_batch += 1
                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)
                # Advantage needs to include curious advantage.
                # Should advantage be recalculated each epoch?
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)
                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - epsilon, 1 + epsilon) * batch_advantage_t4
                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(batch_entropy)
                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                L_clip = (-torch.sum(r_theta_surrogate_min) / r_theta_surrogate_min.size()[0]
                          + 0.03 * batch_entropy_loss)
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()
        # END OPTIMIZE POLICY

        # START OPTIMIZE CURIOUS
        curious_state_t = get_curious_state(np.asarray(cur_state_q), p1, p2)
        avg_curious_loss = []
        curious_batch_length = N_CURIOUS_BATCH
        for epoch in range(R_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            # avg_curious_loss = []
            while batch_end < curious_batch_length:
                # Get batch indexes
                batch_end = batch_start + N_CURIOUS_BATCH
                if batch_end > curious_batch_length:
                    batch_end = curious_batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(curious_state_t, 0, batch_idx).float()
                # batch_state_t = batch_state_t.unsqueeze(1)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # batch_summed_reward_t = torch.index_select(summed_reward_t, 0, batch_idx)
                batch_start = batch_end
                n_batch += 1
                # The random target network stays fixed, so only its forward pass is
                # wrapped in no_grad; the predictor must keep gradients to train.
                with torch.no_grad():
                    rnd_val = ac_net_rnd(batch_state_t)
                pred_val = ac_net_pred(batch_state_t)
                # Calculate loss compared with the target and optimize
                optimizer_rnd.zero_grad()
                pred_loss_batch_curious = criterion_val(pred_val, rnd_val)
                pred_loss_batch_curious.backward()
                # nn.utils.clip_grad_norm(ac_net_pred.parameters(), 1)
                # nn.utils.clip_grad_value_(ac_net_pred.parameters(), 100)
                # clip_min_grad_value_(ac_net_pred.parameters(), 0.2)
                optimizer_rnd.step()
                avg_curious_loss.append(pred_loss_batch_curious.item())
                # print((pred_loss_batch_curious.data.numpy()), " ", end="")
            # print("")
            # print(epoch)
        # print("")

        if episode_i % return_time == 0:
            print("%4d | %6.0d | %6.1f, %6.1f | %6.1f, %6.1f | %6.2f, %6.2f, %6.2f | "
                  "%6.2f, %6.2f, %6.2f | %6.2f, %6.2f | %6.2f" %
                  (episode_i, total_i, np.mean(avg_reward_batch), np.mean(avg_reward),
                   np.mean(avg_curious_reward_batch), np.mean(avg_curious_reward),
                   max_advantage, mean_advantage, std_advantage,
                   max_curious_advantage, mean_curious_advantage, std_curious_advantage,
                   max_achieved_height_in_batch, np.mean(avg_max_height_q),
                   torch.mean(batch_entropy).item()))
        # END UPDATING ACTOR
    return episode_i
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('run_number', help="Consecutive number of this run")
    parser.add_argument('-e', '--episodes', type=int, default=None,
                        help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None,
                        help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-ns', '--network-size', type=int, default=128)
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-rl', '--reward-level', type=int, default=3)
    parser.add_argument('-rn', '--random-level', type=int, default=3)
    parser.add_argument('-sc', '--reward-scale', type=int, default=6)
    parser.add_argument('-rp', '--repeat', type=int, default=1,
                        help='How many times to repeat an action')
    parser.add_argument('-bt', '--batch-size', type=int)
    parser.add_argument('-os', '--optimization-steps', type=int)
    parser.add_argument('-bs', '--baseline-steps', type=int)
    parser.add_argument('-mb', '--mini-batch', type=int, default=128)
    parser.add_argument('-sd', '--seed', type=int, default=None,
                        help='Random seed for this trial')
    parser.add_argument('-tk', '--task', type=int, default=0)
    parser.add_argument('-gm', '--gamma', type=float, default=0.99)
    parser.add_argument('-lr', '--epsilon', type=float, default=0.2)
    parser.add_argument('-en', '--entropy', type=float, default=0.0)
    args = parser.parse_args()

    random_seed = args.seed
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    random.seed(a=random_seed)

    # Make environment and set parameters
    env = gym.make('EnvTestContinuousR-v2')
    env.unwrapped.set_reward(args.reward_level)
    env.unwrapped.set_random(args.random_level)
    env.unwrapped.set_reward_scale(args.reward_scale)
    env.unwrapped.set_task(args.task)
    env.unwrapped.seed(random_seed)
    env.unwrapped.set_repeat(args.repeat)
    return_time = 1

    # Set network parameters and initialize
    N_STATES = 5
    N_ACTIONS = 3

    # Initialise network and hyper params
    NETWORK_SIZE = args.network_size
    ac_net_critic = Net_Critic.Net(N_STATES, NETWORK_SIZE)
    ac_net_actor = Net_Actor.Net(N_STATES, N_ACTIONS, NETWORK_SIZE)
    criterion_val = nn.MSELoss()
    optimizer_c = torch.optim.Adam(ac_net_critic.parameters(), lr=0.0001,
                                   betas=(0.9, 0.999), eps=1e-08,
                                   weight_decay=0.00, amsgrad=False)
    optimizer_a = torch.optim.Adam(ac_net_actor.parameters(), lr=0.0001,
                                   betas=(0.9, 0.999), eps=1e-08,
                                   weight_decay=0.00, amsgrad=False)
    # optimizer_c = torch.optim.SGD(ac_net_critic.parameters(), lr=0.001, momentum=0.9, nesterov=True)
    # optimizer_a = torch.optim.SGD(ac_net_actor.parameters(), lr=0.001, momentum=0.9, nesterov=True)
    gamma = args.gamma
    N_TRAJECTORIES = args.batch_size
    K_EPOCHS = args.optimization_steps
    B_EPOCHS = args.baseline_steps
    N_MINI_BATCH = args.mini_batch
    EPSILON = args.epsilon

    # Initialize tracking queues
    avg_reward = deque(maxlen=100)

    # Setup filename
    run_number = args.run_number

    # Naming variables
    nNum = str(run_number).zfill(4)
    task = env.unwrapped.task
    if task == 'LIFT':
        nTask = 'L'
    else:
        nTask = 'P'
    nReward = env.unwrapped.reward_level
    nRandom = env.unwrapped.rand_level
    nSeed = str(random_seed).zfill(2)
    nAlg = 'mPPO'
    nName = ("{}-{}{}{}-{}-{}".format(nNum, nTask, nReward, nRandom, nSeed, nAlg))

    # Initialize global counters
    episode_i = 0
    total_i = 0
    episodes = args.episodes

    # noinspection PyCallingNonCallable
    while episode_i < episodes:  # START MAIN LOOP
        # Initialize batch lists
        current_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        episode_in_batch = 0
        i_in_batch = 0
        # if episode_i > 500:
        #     env.unwrapped.set_repeat(int(args.repeat / 2))
        while episode_in_batch < N_TRAJECTORIES:
            # Reset environment and get first state
            cur_state = env.reset()
            done = False
            ret = 0
            i_in_episode = 0
            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state).unsqueeze(0).float()
                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    val_out = ac_net_critic(torch_state)
                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                clamped_action_t = torch.clamp(action, -1.0, 1.0)
                clamped_action = clamped_action_t.data.numpy()

                # Step environment
                next_state, reward, done, info = env.step(clamped_action)

                # Append values to queues
                current_state_q.append(cur_state)
                next_state_q.append(next_state)
                reward_q.append(float(reward))
                value_q.append(val_out)
                action_q.append(clamped_action)
                action_log_prob_q.append(
                    distribution.log_prob(clamped_action_t).data.numpy())
                done_q.append(1 - done)
                ret += reward  # Sum total reward for episode

                # Iterate counters, etc.
                cur_state = next_state
                i_in_episode += 1
                i_in_batch += 1
                total_i += 1
                # if i_in_episode % 1 == 0 and episode_i % 10 == 0 and episode_i >= 0:
                #     env.render()
                if i_in_episode > args.max_episode_timesteps:
                    done = True
                if done:
                    break
            # END SINGLE EPISODE
            episode_in_batch += 1
            episode_i += 1
            avg_reward.append(ret)
            avg_reward_batch.append(ret)
        # END EPISODE BATCH LOOP

        # START CUMULATIVE REWARD CALC
        discounted_reward = []
        cumul_reward = 0
        for reward, done in zip(reversed(reward_q), reversed(done_q)):
            if done == 1:
                cumul_reward = cumul_reward * gamma + reward
                discounted_reward.insert(0, cumul_reward)
            elif done == 0:
                cumul_reward = reward
                discounted_reward.insert(0, cumul_reward)

        # SET UP TENSORS
        batch_length = len(current_state_q)
        current_state_t = torch.tensor(current_state_q).float()
        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()

        # CALCULATE ADVANTAGE
        value_t_new = ac_net_critic(current_state_t)
        for reward_i, value_i in zip(np.asarray(discounted_reward),
                                     value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        # TODO check how this is converted between numpy and tensor
        advantage_q_new = (advantage_q_new - np.mean(advantage_q_new)) / (
            np.std(advantage_q_new))
        advantage_t = torch.tensor(advantage_q_new).float()

        # START UPDATING NETWORKS
        # START BASELINE OPTIMIZE
        for epoch in range(B_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)
                # Calculate loss compared with reward and optimize
                critic_loss_batch = criterion_val(new_val, batch_reward_t.unsqueeze(1))
                # Do optimization
                optimizer_c.zero_grad()
                critic_loss_batch.backward()
                optimizer_c.step()
                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length
                batch_idx = indexes[batch_start:batch_end]
                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx).float()
                batch_action_log_prob_t = torch.index_select(action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)
                # Expand advantage to dimensions of r_theta_i
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)
                # Calculate the options
                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - EPSILON, 1 + EPSILON) * batch_advantage_t4
                # Calculate batch entropy
                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(torch.pow(batch_entropy, 2))
                # Choose minimum of surrogates and calculate L_clip as final loss function
                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                L_clip = (-torch.sum(r_theta_surrogate_min) / r_theta_surrogate_min.size()[0]
                          + args.entropy * batch_entropy_loss)
                # if batch_entropy_loss > 1.2:
                #     L_clip = L_clip + 0.05 * batch_entropy_loss
                # Optimize
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()
                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END UPDATING ACTOR

        if episode_i % return_time == 0:
            print("%4d, %6.0d, %6.2f, %6.2f | %6.2f" %
                  (episode_i, total_i, np.mean(avg_reward_batch),
                   np.mean(avg_reward), torch.mean(batch_entropy).item()))
            with open('/home/adf/exp715/Box2dEnv/Box2dEnv/saves/{}.csv'.format(nName),
                      'a+') as csv:
                for ret_write in zip(np.asarray(avg_reward_batch)):
                    csv.write("{:2.2f}\n".format(ret_write[0]))
        # END UPDATE OF BATCH - RETURN TO TOP WHILE STILL EPISODES TO GO
    # END MAIN LOOP
    env.close()
def lowtrain(self):
    buffer, buffer_capacity, batch_size = self.lowmemory.show()
    s = torch.tensor(buffer['s'], dtype=torch.double).to(self.device)
    option = torch.tensor(buffer['option'], dtype=torch.double).view(-1, 1).to(self.device)
    s_ = torch.tensor(buffer['s_'], dtype=torch.double).to(self.device)
    option_ = torch.tensor(buffer['option_'], dtype=torch.double).view(-1, 1).to(self.device)
    a = torch.tensor(buffer['a'], dtype=torch.double).to(self.device)
    old_a_logp = torch.tensor(buffer['a_logp'], dtype=torch.double).view(-1, 1).to(self.device)
    r = torch.tensor(buffer['r'], dtype=torch.double).view(-1, 1).to(self.device)
    done = torch.tensor(buffer['done'], dtype=torch.double).view(-1, 1).to(self.device)
    action_loss_record, value_loss_record, entropy_record, loop_record = 0, 0, 0, 0

    with torch.no_grad():
        value_next = self.lownet(s_)['value']
        option_change_next = torch.where(option_ > 5, torch.zeros_like(option_), option_)
        value_next_zeros = torch.gather(value_next, 1, option_change_next.long())
        value_next = torch.where(
            option_ > 5,
            value_next.sum(dim=1, keepdim=True) / self.config.get('num_options'),
            value_next_zeros)
        value_now = self.lownet(s)['value']
        option_change_now = torch.where(option > 5, torch.zeros_like(option), option)
        value_now_zeros = torch.gather(value_now, 1, option_change_now.long())
        value_now = torch.where(
            option > 5,
            value_now.sum(dim=1, keepdim=True) / self.config.get('num_options'),
            value_now_zeros)
        delta = r + (1 - done) * self.config.get('gamma') * value_next - value_now
        adv = torch.zeros_like(delta)
        adv[-1] = delta[-1]
        # GAE
        for i in reversed(range(buffer_capacity - 1)):
            adv[i] = delta[i] + self.config.get('tau') * (1 - done[i]) * adv[i + 1]
        target_v = value_now + adv
        adv = (adv - adv.mean()) / (adv.std() + np.finfo(np.float64).eps)  # Normalize advantage

    for _ in range(self.config.get('ppoepoch')):
        for index in BatchSampler(SubsetRandomSampler(range(buffer_capacity)),
                                  batch_size, False):
            mean, logstd = self.lownet(s[index])['mean'], self.lownet(s[index])['logstd']
            std = logstd.exp()
            dist = Normal(mean, std)
            a_logp = dist.log_prob(a[index])
            option_short = option[index]
            mask = torch.zeros_like(a_logp).double()
            index_list = [
                torch.where(option_short == i)[0]
                for i in range(self.config.get('num_options'))
            ]
            input_list = torch.zeros(self.config.get('num_options'),
                                     self.config.get('action_dim'))
            start_list = self.config.get('start_list')
            end_list = self.config.get('end_list')
            for i in range(self.config.get('num_options')):
                input_list[i][start_list[i]:end_list[i]] = 1
            for i in range(self.config.get('num_options')):
                if torch.tensor(index_list[i].shape) != 0:
                    mask[index_list[i]] = torch.ones(
                        torch.tensor(index_list[i].shape),
                        self.config.get('action_dim')).double().to(self.device) \
                        * input_list[i].double().to(self.device)
            a_logp = a_logp * mask
            a_p_1 = a_logp.sum(dim=1, keepdim=True)
            ratio = torch.exp((a_p_1 - old_a_logp[index]))
            surr1 = ratio * adv[index]
            surr2 = torch.clamp(ratio,
                                1.0 - self.config.get('clip_param'),
                                1.0 + self.config.get('clip_param')) * adv[index]
            action_loss = -torch.min(surr1, surr2).mean()
            entropy = dist.entropy() * mask

            value_now = self.lownet(s[index])['value']
            option_change_now = torch.where(option[index] > 5,
                                            torch.zeros_like(option[index]),
                                            option[index])
            value_now_zeros = torch.gather(value_now, 1, option_change_now.long())
            value_now = torch.where(
                option[index] > 5,
                value_now.sum(dim=1, keepdim=True) / self.config.get('num_options'),
                value_now_zeros)
            value_loss = F.smooth_l1_loss(value_now, target_v[index])

            self.lowoptimizition.zero_grad()
            loss = action_loss + value_loss \
                - self.config.get('entropy_para_low') * entropy.mean()
            loss.backward()
            nn.utils.clip_grad_norm_(self.lownet.parameters(),
                                     self.config.get('max_grad_norm'))
            self.lowoptimizition.step()

            action_loss_record += action_loss.cpu().detach()
            value_loss_record += value_loss.cpu().detach()
            entropy_record += entropy.mean().cpu().detach()
            loop_record += 1

    return {
        'actionloss': action_loss_record / loop_record,
        'valueloss': value_loss_record / loop_record,
        'entropy': entropy_record / loop_record,
    }
def get_entropy(mu, std):
    dist = Normal(mu, std)
    entropy = dist.entropy().mean()
    return entropy
def entropy(self):
    # Note: `scale` expects a standard deviation; if `log_var` holds a
    # log-variance, the scale should be (0.5 * self.log_var).exp() instead.
    distribution = Normal(loc=self.mu, scale=self.log_var.exp())
    return distribution.entropy().mean()
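# For reference, Normal.entropy() implements the Gaussian closed form
# H = 0.5 * log(2 * pi * e * sigma^2) per dimension, which is why passing a
# variance where a std is expected silently inflates the log-scale term.
# A quick numerical check:
import math
import torch
from torch.distributions import Normal

sigma = torch.tensor([0.5, 1.0, 2.0])
closed_form = 0.5 * torch.log(2 * math.pi * math.e * sigma ** 2)
assert torch.allclose(Normal(torch.zeros(3), sigma).entropy(), closed_form)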