Example #1
 def evaluate(self, state, epsilon=1e-6):
     '''
     generate a sampled action from the policy network, given the state as input;
     '''
     mean, log_std = self.forward(state)
     std = log_std.exp()  # no clipping in evaluation; clipping affects gradient flow
     
     normal = Normal(0, 1)
     z      = normal.sample() 
     action_0 = torch.tanh(mean + std*z.to(device)) # TanhNormal distribution as actions; reparameterization trick
     action = self.action_range*action_0
     # The log-likelihood here is for the TanhNormal distribution rather than the plain Gaussian.
     # The TanhNormal squashes the Gaussian's unbounded support into a finite action range.
     # The three terms in this log-likelihood estimate are:
     # (1) the log probability of the action under the usual stochastic Gaussian policy (without Tanh);
     # (2) the correction introduced by the Tanh(), as derived in Appendix C ("Enforcing Action Bounds")
     #     of https://arxiv.org/pdf/1801.01290.pdf; the epsilon prevents taking the log of a non-positive value;
     # (3) the correction for scaling the action to an arbitrary range instead of (-1, 1),
     #     which differs slightly from the original paper.
     log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1. - action_0.pow(2) + epsilon) -  np.log(self.action_range)
     # Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
     # Normal.log_prob returns one value per feature rather than a single probability,
     # so it must be summed over the feature dimension to get a one-dimensional log-prob
     # (or use a MultivariateNormal instead).
     log_prob = log_prob.sum(dim=1, keepdim=True)
     return action, log_prob, z, mean, log_std
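The comment block above spells out the three correction terms in words. As a quick numerical sanity check, the hand-written formula can be compared against torch's TransformedDistribution; this is only a sketch (it assumes a PyTorch version with TanhTransform, i.e. 1.7+, and the mean/std/action_range values are made up):

import torch
from torch.distributions import Normal, TransformedDistribution
from torch.distributions.transforms import TanhTransform, AffineTransform

mean, std, action_range = torch.tensor(0.3), torch.tensor(0.8), 2.0
z = torch.randn(())
u = mean + std * z                        # pre-squash sample (reparameterization)
action_0 = torch.tanh(u)                  # squashed into (-1, 1)
action = action_range * action_0          # scaled action

manual = Normal(mean, std).log_prob(u) \
    - torch.log(1. - action_0.pow(2) + 1e-6) \
    - torch.log(torch.tensor(action_range))
reference = TransformedDistribution(
    Normal(mean, std),
    [TanhTransform(), AffineTransform(loc=0., scale=action_range)]).log_prob(action)
print(manual.item(), reference.item())    # agree up to the 1e-6 epsilon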
Example #2
    def evaluate(self, state, epsilon=1e-6):
        '''
        generate a sampled action from the policy network, given the state as input;
        deterministic evaluation provides better performance according to the original paper;
        '''
        mean, log_std = self.forward(state)
        std = log_std.exp()  # no clipping in evaluation; clipping affects gradient flow

        normal = Normal(0, 1)
        z = normal.sample()
        action_0 = torch.tanh(
            mean + std * z.to(device)
        )  # TanhNormal distribution as actions; reparameterization trick
        action = self.action_range * action_0
        ''' stochastic evaluation '''
        log_prob = Normal(
            mean, std).log_prob(mean + std * z.to(device)) - torch.log(
                1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
        ''' deterministic evaluation '''
        # log_prob = Normal(mean, std).log_prob(mean) - torch.log(1. - torch.tanh(mean).pow(2) + epsilon) -  np.log(self.action_range)
        '''
         Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
         Normal.log_prob returns one value per feature rather than a single probability,
         so it must be summed over the feature dimension to get a one-dimensional log-prob
         (or use a MultivariateNormal instead).
         '''
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        return action, log_prob, z, mean, log_std
Example #3
    def evaluate_action(self, state):
        '''
        evaluate the action inside the computation graph (on the GPU) so that gradients can flow through it
        '''
        state = torch.FloatTensor(state).unsqueeze(0).to(device) # state dim: (N, dim of state)
        if DETERMINISTIC:
            action = self.forward(state)
            return action.detach().cpu().numpy()

        elif DISCRETE and not DETERMINISTIC:  # actor-critic (discrete)
            probs = self.forward(state)
            m = Categorical(probs)
            action = m.sample().to(device)
            log_prob = m.log_prob(action)

            return action.detach().cpu().numpy(), log_prob.squeeze(0), m.entropy().mean()

        elif not DISCRETE and not DETERMINISTIC: # soft actor-critic (continuous)
            self.action_range = 30.
            self.epsilon = 1e-6

            mean, log_std = self.forward(state)
            std = log_std.exp()
            normal = Normal(0, 1)
            z = normal.sample().to(device)
            action0 = torch.tanh(mean + std*z.to(device)) # TanhNormal distribution as actions; reparameterization trick
            action = self.action_range * action0
            
            log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1. - action0.pow(2) + self.epsilon) -  np.log(self.action_range)            
            log_prob = log_prob.sum(dim=1, keepdim=True)
            print('mean: ', mean, 'log_std: ', log_std)
            # return action.item(), log_prob, z, mean, log_std
            return action.detach().cpu().numpy().squeeze(0), log_prob.squeeze(0), Normal(mean, std).entropy().mean()
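For the discrete branch above, Categorical works directly on a probability vector; a minimal illustration with made-up probabilities:

import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.1, 0.6, 0.3]])      # (N=1, n_actions), hypothetical values
m = Categorical(probs)
action = m.sample()                          # shape (1,)
log_prob = m.log_prob(action)                # log-probability of the sampled action
print(action.item(), log_prob.item(), m.entropy().mean().item())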
Example #4
    def sample_actions_and_llhoods_for_all_skills(self, s, explore=True):
        x = s.clone().view(s.size(0), 1,
                           s.size(1)).repeat(1, self.n_m_actions, 1)
        m, log_stdev = self(x)
        stdev = log_stdev.exp()
        if explore:
            u = m + stdev * torch.randn_like(m)
        else:
            u = m
        a = torch.tanh(u)

        if self.log_func == 'self':
            llhoods = gaussian_likelihood(u.unsqueeze(1), m.unsqueeze(2),
                                          log_stdev.unsqueeze(2),
                                          self.EPS_sigma)
        elif self.log_func == 'torch':
            llhoods = Normal(m.unsqueeze(2),
                             stdev.unsqueeze(2)).log_prob(u.unsqueeze(1))

        if self.log_lim_method == 'clamp':
            llhoods -= torch.log(
                torch.clamp(1 - a.unsqueeze(1).pow(2), self.EPS_log_1_min_a2,
                            1.0))
        elif self.log_lim_method == 'sum':
            llhoods -= torch.log(1 - a.unsqueeze(1).pow(2) +
                                 self.EPS_log_1_min_a2)

        llhoods = llhoods.sum(
            3)  #.clamp(self.min_log_stdev, self.max_log_stdev)

        return a, llhoods
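The unsqueeze pattern above builds a full skill-by-skill likelihood matrix through broadcasting; a small shape sketch with hypothetical sizes makes the intermediate dimensions explicit:

import torch
from torch.distributions import Normal

N, K, dim = 5, 4, 3                                  # batch, n_m_actions, action dim
u = torch.randn(N, K, dim)
m, stdev = torch.zeros(N, K, dim), torch.ones(N, K, dim)
# (N, K, 1, dim) locations broadcast against (N, 1, K, dim) samples -> (N, K, K, dim)
llhoods = Normal(m.unsqueeze(2), stdev.unsqueeze(2)).log_prob(u.unsqueeze(1))
print(llhoods.shape, llhoods.sum(3).shape)           # (5, 4, 4, 3) and (5, 4, 4)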
Example #5
    def step(self, image, location, recurrent_hidden):

        # image = [batch size, n channels, height, width]
        # location = [batch size, 2]
        # recurrent_hidden = [batch size, recurrent hid dim]

        glimpse_hidden = self.glimpse_network(image, location)

        # glimpse_hidden = [batch size, glimpse hid dim + location hid dim]

        recurrent_hidden = self.core_network(glimpse_hidden, recurrent_hidden)

        # recurrent_hidden = [batch size, recurrent hid dim]

        location, location_mu = self.location_network(recurrent_hidden)

        # location = [batch size, 2]
        # location_mu = [batch size, 2]

        log_location_action = Normal(location_mu, self.std).log_prob(location)
        log_location_action = log_location_action.sum(dim=1)

        # log_location_action = [batch size]

        baseline = self.baseline_network(recurrent_hidden)

        return recurrent_hidden, log_location_action, baseline, location, location_mu
Example #6
    def evaluate(self,
                 state,
                 smooth_policy,
                 device=torch.device("cpu"),
                 epsilon=1e-6):

        mean, log_std = self.forward(state)
        normal = Normal(torch.zeros(mean.shape), torch.ones(log_std.shape))
        z = normal.sample().to(device)
        std = log_std.exp()
        if self.args.stochastic_actor:
            z = torch.clamp(z, -3, 3)
            action_0 = mean + torch.mul(z, std)
            action_1 = torch.tanh(action_0)
            action = torch.mul(self.action_range.to(device),
                               action_1) + self.action_bias.to(device)
            log_prob = Normal(mean, std).log_prob(action_0) - torch.log(
                1. - action_1.pow(2) + epsilon) - torch.log(
                    self.action_range.to(device))
            log_prob = log_prob.sum(dim=-1, keepdim=True)
            return action, log_prob, std.detach()
        else:
            action_mean = torch.mul(
                self.action_range.to(device),
                torch.tanh(mean)) + self.action_bias.to(device)
            smooth_random = torch.clamp(0.2 * z, -0.5, 0.5)
            action_random = action_mean + smooth_random
            action_random = torch.min(action_random,
                                      self.action_high.to(device))
            action_random = torch.max(action_random,
                                      self.action_low.to(device))
            action = action_random if smooth_policy else action_mean
            return action, 0 * log_std.sum(dim=-1, keepdim=True), std.detach()
Example #7
 def get_action(self, state, deterministic, epsilon=1e-6):
     mean, log_std = self.forward(state)
     normal = Normal(torch.zeros(mean.shape), torch.ones(log_std.shape))
     z = normal.sample()
     if self.args.stochastic_actor:
         std = log_std.exp()
         action_0 = mean + torch.mul(z, std)
         action_1 = torch.tanh(action_0)
         action = torch.mul(self.action_range, action_1) + self.action_bias
         log_prob = Normal(mean, std).log_prob(action_0) - torch.log(
             1. - action_1.pow(2) + epsilon) - torch.log(self.action_range)
         log_prob = log_prob.sum(dim=-1, keepdim=True)
         action_mean = torch.mul(self.action_range,
                                 torch.tanh(mean)) + self.action_bias
         action = action_mean.detach().cpu().numpy(
         ) if deterministic else action.detach().cpu().numpy()
         return action, log_prob.detach().item()
     else:
         action_mean = torch.mul(self.action_range,
                                 torch.tanh(mean)) + self.action_bias
         action = action_mean + 0.1 * torch.mul(self.action_range, z)
         action = torch.min(action, self.action_high)
         action = torch.max(action, self.action_low)
         action = action_mean.detach().cpu().numpy(
         ) if deterministic else action.detach().cpu().numpy()
         return action, 0
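The action_range / action_bias pair used in both methods above is commonly derived from the environment's box bounds; a sketch with hypothetical bounds (the attribute names simply mirror this snippet):

import torch

action_low = torch.tensor([-2.0, -1.0])           # hypothetical env bounds
action_high = torch.tensor([2.0, 1.0])
action_range = (action_high - action_low) / 2.0   # half-width per dimension
action_bias = (action_high + action_low) / 2.0    # centre per dimension

action_1 = torch.tanh(torch.randn(2))             # squashed pre-action in (-1, 1)
action = action_range * action_1 + action_bias    # lies inside [action_low, action_high]
print(action)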
Example #8
    def evaluate(self, state, deterministic, eval_noise_scale, epsilon=1e-6):
        '''
        generate an action from the policy network given the state as input, for calculating gradients
        '''
        mean, log_std = self.forward(state)
        std = log_std.exp()  # no clipping in evaluation; clipping affects gradient flow

        normal = Normal(0, 1)
        z = normal.sample()
        action_0 = torch.tanh(
            mean + std * z.to(device)
        )  # TanhNormal distribution as actions; reparameterization trick
        action = self.action_range * mean if deterministic else self.action_range * action_0
        log_prob = Normal(
            mean, std).log_prob(mean + std * z.to(device)) - torch.log(
                1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
        # Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
        # Normal.log_prob returns one value per feature rather than a single probability,
        # so it must be summed over the feature dimension to get a one-dimensional log-prob
        # (or use a MultivariateNormal instead).
        log_prob = log_prob.sum(dim=1, keepdim=True)
        ''' add noise '''
        eval_noise_clip = 2 * eval_noise_scale
        noise = normal.sample(action.shape) * eval_noise_scale
        noise = torch.clamp(noise, -eval_noise_clip, eval_noise_clip)
        action = action + noise.to(device)

        return action, log_prob, z, mean, log_std
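The clipped target-policy smoothing noise added at the end of the method, shown in isolation (a sketch with a made-up noise scale and placeholder actions):

import torch
from torch.distributions import Normal

eval_noise_scale = 0.2
eval_noise_clip = 2 * eval_noise_scale
action = torch.zeros(4, 2)                               # placeholder actions
noise = Normal(0, 1).sample(action.shape) * eval_noise_scale
noise = torch.clamp(noise, -eval_noise_clip, eval_noise_clip)
print(noise.abs().max() <= eval_noise_clip)              # tensor(True): the noise never exceeds the clip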
Example #9
 def get_KL(self, params, old_log_prob, state, old_action_raw, old_action):
     torch.nn.utils.vector_to_parameters(params, self.actor.evaluate_net.parameters())
     mean, log_std = self.actor.evaluate_net.forward(state)
     std = log_std.exp()
     new_log_prob = Normal(mean, std).log_prob(old_action_raw) - torch.log(1 - old_action.pow(2) + 1e-6)
     new_log_prob = new_log_prob.sum(-1, keepdim=True)
     KL = old_log_prob - new_log_prob
     KL = KL.mean()
     return KL.item()
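The returned value is a Monte-Carlo estimate of KL(pi_old || pi_new) evaluated at actions drawn from the old policy (the tanh correction term cancels in the difference because old_action is held fixed). For plain Gaussians this estimator converges to the analytic KL; a sketch with made-up parameters:

import torch
from torch.distributions import Normal, kl_divergence

p_old = Normal(torch.tensor(0.0), torch.tensor(1.0))
p_new = Normal(torch.tensor(0.5), torch.tensor(0.8))
a = p_old.sample((100000,))
mc_kl = (p_old.log_prob(a) - p_new.log_prob(a)).mean()
print(mc_kl.item(), kl_divergence(p_old, p_new).item())   # close for a large sample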
Example #10
 def evaluate(self, state, epsilon=1e-6):
     mean, log_std = self.forward(state)
     std = log_std.exp()
     
     normal = Normal(0, 1)
     z      = normal.sample()
     action = torch.tanh(mean+ std*z.to(device))
     log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1 - action.pow(2) + epsilon)
     log_prob = log_prob.sum(dim=-1, keepdim=True)
     return action, log_prob, z, mean, log_std
Example #11
 def get_log_probs(self, obs_rest, epsilon=1e-6):
     mean, log_std = self.forward(obs_rest)
     std = log_std.exp()  # no clipping in evaluation; clipping affects gradient flow
     action_logit = Normal(mean, std).sample()
     action = torch.tanh(action_logit)
     log_prob = Normal(
         mean, std).log_prob(action_logit) - torch.log(1. - action.pow(2) +
                                                       epsilon)
     #assert float(log_prob.mean())==float(log_prob.mean()), "Log_prob is nan"
     return log_prob.sum(dim=1, keepdim=True), action
Example #12
    def sample_actions_and_llhoods(self, s, explore=True):
        m, std = self(s)
        if explore:
            u = m + std * torch.randn_like(m)
        else:
            u = m
        a = torch.tanh(u)

        llhoods = Normal(m, std.abs()).log_prob(u)
        llhoods -= torch.log(1 - a.pow(2) + 1e-6)
        llhoods = llhoods.sum(1, keepdim=True)
        return a, llhoods
Example #13
 def get_reward(self, state):
     same_z = Normal(0, 1).sample()
     mean, log_std = self.actor.policy_net.forward(state)
     std = log_std.exp()
     action_raw = mean + std * same_z
     action = torch.tanh(action_raw)
     log_prob = Normal(mean, std).log_prob(action_raw) - torch.log(1 - action.pow(2) + 1e-6)
     log_prob = log_prob.sum(-1, keepdim=True)
     predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, action)
     predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
     loss = (predicted_new_q_value - self.alpha * log_prob).mean()
     return loss.item()
Example #14
    def get_action(self, x):
        mean, log_std = self.pi(x)
        std = log_std.exp()
        normal = Normal(0, 1)
        z      = normal.sample()
        action = mean + std*z
        log_prob = Normal(mean, std).log_prob(action)
        log_prob = log_prob.sum(dim=-1, keepdim=True)  # reduce dim
        prob = log_prob.exp()

        action = self.action_range*action # scale the action

        return action.detach().numpy(), prob
Example #15
    def evaluate(self, state, epsilon=1e-6):

        mean, log_std = self.forward(state)
        std = log_std.exp()
        normal = Normal(torch.zeros(mean.shape), torch.ones(std.shape))
        z = normal.sample().to(device)
        action_0 = mean + torch.mul(z, std)
        action_1 = torch.tanh(action_0)
        action = torch.mul(self.action_range.to(device), action_1) + self.action_bias.to(device)
        log_prob = Normal(mean, std).log_prob(action_0)-torch.log(1. - action_1.pow(2) + epsilon) - torch.log(self.action_range.to(device))
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        entropy = Normal(mean, std).entropy()

        return action, log_prob, entropy, mean.detach(), std.detach()
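Normal.entropy, returned alongside the log-probability above, has the closed form 0.5 * log(2*pi*e*sigma^2) per dimension; a one-line check (sketch with made-up standard deviations):

import math
import torch
from torch.distributions import Normal

std = torch.tensor([0.5, 1.0, 2.0])
print(Normal(torch.zeros(3), std).entropy())
print(0.5 * torch.log(2 * math.pi * math.e * std.pow(2)))   # identical values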
Example #16
 def evaluate(self,
              data,
              numerical_state,
              epsilon=1e-6):  #must check how to fine-tune it.
     mean, log_std = self.forward(data, numerical_state)
     std = torch.exp(log_std)
     policy = (
         mean + std *
         Normal(torch.zeros(4), torch.ones(4)).sample().to(self.device))
     policy.requires_grad_()
     action = torch.tanh(policy)
     log_prob = Normal(torch.zeros(4).to(self.device), torch.ones(4).to(self.device)).\
         log_prob(policy) - torch.log(1 - action.pow(2) + epsilon)
     log_prob = log_prob.sum(dim=1, keepdim=True)
     return action, log_prob, policy, mean, log_std
Example #17
    def learn(self):
        state, action, reward, next_state, end = self.memory.sample(self.batch_size)

        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        end = torch.FloatTensor(np.float32(end)).unsqueeze(1).to(device)

        # Training Q Networks
        predicted_q_value_1, predicted_q_value_2 = self.critic.predict_q(state, action)
        predicted_v_target = self.critic.predict_v_target(next_state)
        target_q_value = reward + (1 - end) * self.discount * predicted_v_target
        q_loss_1 = nn.MSELoss()(predicted_q_value_1, target_q_value.detach())
        q_loss_2 = nn.MSELoss()(predicted_q_value_2, target_q_value.detach())
        self.critic.learn_q(q_loss_1, q_loss_2)

        # Training V Network
        new_action, log_prob = self.actor.predict(state)
        predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(state, new_action)
        predicted_new_q_value = torch.min(predicted_new_q_value_1, predicted_new_q_value_2)
        target_v_value = predicted_new_q_value - self.alpha * log_prob
        predicted_v_value = self.critic.predict_v(state)
        v_loss = nn.MSELoss()(predicted_v_value, target_v_value.detach())
        self.critic.learn_v(v_loss)

        if self.debug_file is not None:
            z = Normal(0, 1).sample().to(device)
            mean, log_std = self.actor.policy_net.forward(state)
            std = log_std.exp()
            old_action_raw = mean + std * z
            old_action = torch.tanh(old_action_raw)
            old_log_prob = Normal(mean, std).log_prob(old_action_raw) - torch.log(1 - old_action.pow(2) + 1e-6)
            old_log_prob = old_log_prob.sum(-1, keepdim=True)
            old_reward = self.get_reward(state)

        # Training Policy Network
        policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean()
        self.actor.learn(policy_loss)

        if self.debug_file is not None:
            KL = self.get_KL(torch.nn.utils.parameters_to_vector(self.actor.policy_net.parameters()),
                             old_log_prob, state, old_action_raw, old_action)
            new_reward = self.get_reward(state)
            self.debug_file.write("{},{}\n".format(abs(KL), new_reward - old_reward))

        # Updating Target-V Network
        self.critic.update_target_v()
Example #18
    def evaluate(self, state, epsilon=1e-6):
        '''
        generate a sampled action from the policy network, given the state as input;
        '''
        mean, log_std, d_action_prob = self.forward(state)

        std = log_std.exp()
        normal = Normal(0, 1)
        z = normal.sample().to(d)
        c_action_0 = torch.tanh(mean + std * z)
        c_action = self.action_range * c_action_0
        log_prob = Normal(mean, std).log_prob(mean + std * z) - torch.log(
            1. - c_action_0.pow(2) + epsilon) - np.log(self.action_range)

        log_prob = log_prob.sum(dim=-1, keepdim=True)
        return d_action_prob, c_action, log_prob, z, mean, log_std
Example #19
    def forward(self, y, X):
        # Sample parameters
        b = self.b.rsample()
        sig = self.sig.rsample()

        # Compute loglike
        ll = Normal(X.matmul(b), sig).log_prob(y)

        # Compute kl_qp
        kl_qp = kld(self.b.dist(), Normal(0, 1)) + kld(self.sig.dist(),
                                                       Gamma(1, 1))

        # Compute ELBO
        elbo = ll.sum() - kl_qp.sum()

        return elbo
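A self-contained version of the same ELBO structure: a reparameterised single-sample likelihood term plus analytic KL terms, with a Gaussian posterior over the regression coefficients and a Gamma posterior over the noise scale. This is only a sketch; all shapes, hyperparameters and variable names here are made up:

import torch
from torch.distributions import Normal, Gamma, kl_divergence

X = torch.randn(50, 3)
y = X @ torch.tensor([1.0, -2.0, 0.5]) + 0.3 * torch.randn(50)

q_b = Normal(torch.zeros(3), torch.ones(3))           # variational posterior over coefficients
q_sig = Gamma(torch.tensor(2.0), torch.tensor(2.0))   # variational posterior over noise scale

b = q_b.rsample()                                     # reparameterised samples
sig = q_sig.rsample()
ll = Normal(X.matmul(b), sig).log_prob(y)             # log-likelihood per observation
kl_qp = kl_divergence(q_b, Normal(0., 1.)).sum() + kl_divergence(q_sig, Gamma(1., 1.))
elbo = ll.sum() - kl_qp
print(elbo)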
Example #20
 def elbo(self,
          qz_m,
          qz_logv,
          zode_L,
          logpL,
          X,
          XrecL,
          L,
          qz_enc_m=None,
          qz_enc_logv=None):
     ''' Input:
             qz_m - latent means [N,2q]
             qz_logv - latent logvars [N,2q]
             zode_L - latent trajectory samples [L,N,T,2q]
             logpL - densities of latent trajectory samples [L,N,T]
             X - input images [N,T,nc,d,d]
             XrecL - reconstructions [L,N,T,nc,d,d]
             qz_enc_m - encoder density means  [N*T,2*q]
             qz_enc_logv - encoder density variances [N*T,2*q]
     '''
     [N, T, nc, d, d] = X.shape
     q = qz_m.shape[1] // 2
     # prior
     log_pzt = self.mvn.log_prob(zode_L.contiguous().view(
         [L * N * T, 2 * q]))  # L*N*T
     log_pzt = log_pzt.view([L, N, T])  # L,N,T
     kl_zt = logpL - log_pzt  # L,N,T
     kl_z = kl_zt.sum(2).mean(0)  # N
     # likelihood
     XL = X.repeat([L, 1, 1, 1, 1, 1])  # L,N,T,nc,d,d
     lhood_L = torch.log(XrecL) * XL + torch.log(1 - XrecL) * (
         1 - XL)  # L,N,T,nc,d,d
     lhood = lhood_L.sum([2, 3, 4, 5]).mean(0)  # N
     if qz_enc_m is not None:  # instant encoding
         qz_enc_mL = qz_enc_m.repeat([L, 1])  # L*N*T,2*q
         qz_enc_logvL = qz_enc_logv.repeat([L, 1])  # L*N*T,2*q
         mean_ = qz_enc_mL.contiguous().view(-1)  # L*N*T*2*q
         std_ = qz_enc_logvL.exp().contiguous().view(-1)  # L*N*T*2*q
         qenc_zt_ode = Normal(mean_, std_).log_prob(
             zode_L.contiguous().view(-1)).view([L, N, T, 2 * q])
         qenc_zt_ode = qenc_zt_ode.sum([3])  # L,N,T
         inst_enc_KL = logpL - qenc_zt_ode
         inst_enc_KL = inst_enc_KL.sum(2).mean(0)  # N
         return lhood.mean(), kl_z.mean(), inst_enc_KL.mean()
     else:
         return lhood.mean(), kl_z.mean()  # mean over training samples
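The pixel-wise likelihood term above is the Bernoulli log-density written out by hand; a quick equivalence check (a sketch with random binary "pixels" and reconstructions clamped away from 0 and 1):

import torch
from torch.distributions import Bernoulli

X = torch.randint(0, 2, (4, 8)).float()              # binary targets
Xrec = torch.rand(4, 8).clamp(1e-3, 1 - 1e-3)        # reconstruction probabilities
manual = torch.log(Xrec) * X + torch.log(1 - Xrec) * (1 - X)
print(torch.allclose(manual, Bernoulli(probs=Xrec).log_prob(X), atol=1e-6))   # True (up to float32 rounding)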
Example #21
 def evaluate(self, state, last_action, hidden_in, epsilon=1e-6):
     '''
     generate a sampled action from the policy network, given the state as input;
     '''
     mean, log_std, hidden_out = self.forward(state, last_action, hidden_in)
     std = log_std.exp()  # no clipping in evaluation; clipping affects gradient flow
     
     normal = Normal(0, 1)
     z = normal.sample()
     action_0 = torch.tanh(mean + std * z.cuda())  # TanhNormal distribution as actions; reparameterization trick
     action = self.action_range * action_0
     log_prob = Normal(mean, std).log_prob(mean + std * z.cuda()) - torch.log(
         1. - action_0.pow(2) + epsilon) - np.log(self.action_range)
     # Both Normal.log_prob and -log(1 - a**2) have shape (N, dim_of_action);
     # Normal.log_prob returns one value per feature rather than a single probability,
     # so it must be summed over the feature dimension to get a one-dimensional log-prob
     # (or use a MultivariateNormal instead).
     log_prob = log_prob.sum(dim=-1, keepdim=True)
     return action, log_prob, z, mean, log_std, hidden_out
Example #22
    def sample_action(self, s):
        mean, log_std = self.forward(s)
        std = log_std.exp()

        # calculate action using reparameterization trick and action scaling
        normal = Normal(0, 1)
        xi = normal.sample()
        u = mean + std * xi.to(hyp.device)
        y = torch.tanh(u)
        a = y * self.action_scale + self.action_bias

        # enforcing action bound (appendix of paper)
        log_pi = Normal(
            mean, std).log_prob(u) - torch.log(self.action_scale *
                                               (1 - y.pow(2)) + hyp.EPSILON)
        log_pi = log_pi.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias

        return a, log_pi, mean
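The single log term above folds the tanh correction and the action scaling into one expression, log(action_scale * (1 - y^2) + eps); it differs from the two-term form used in the earlier examples only in where the epsilon enters. A quick comparison (sketch, made-up scale):

import torch

y = torch.tanh(torch.randn(5))
action_scale, eps = 2.0, 1e-6
combined = torch.log(action_scale * (1 - y.pow(2)) + eps)
split = torch.log(1 - y.pow(2) + eps) + torch.log(torch.tensor(action_scale))
print((combined - split).abs().max())   # tiny; the two forms differ only by the epsilon placement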
Example #23
 def elbo(self, qz_m, qz_logv, zode_L, logpL, X, XrecL, Ndata, qz_enc_m=None, qz_enc_logv=None):
     ''' Input:
             qz_m        - latent means [N,2q]
             qz_logv     - latent logvars [N,2q]
             zode_L      - latent trajectory samples [L,N,T,2q]
             logpL       - densities of latent trajectory samples [L,N,T]
             X           - input images [N,T,nc,d,d]
             XrecL       - reconstructions [L,N,T,nc,d,d]
             Ndata       - number of sequences in the dataset (required for the ELBO)
             qz_enc_m    - encoder density means  [N*T,2*q]
             qz_enc_logv - encoder density variances [N*T,2*q]
         Returns:
             likelihood
             prior on ODE trajectories KL[q_ode(z_{0:T})||N(0,I)]
             prior on BNN weights
             instant encoding term KL[q_ode(z_{0:T})||q_enc(z_{0:T}|X_{0:T})] (if required)
     '''
     [N,T,nc,d,d] = X.shape
     L = zode_L.shape[0]
     q = qz_m.shape[1]//2
     # prior
     log_pzt = self.mvn.log_prob(zode_L.contiguous().view([L*N*T,2*q])) # L*N*T
     log_pzt = log_pzt.view([L,N,T]) # L,N,T
     kl_zt   = logpL - log_pzt  # L,N,T
     kl_z    = kl_zt.sum(2).mean(0) # N
     kl_w    = self.bnn.kl().sum()
     # likelihood
     XL = X.repeat([L,1,1,1,1,1]) # L,N,T,nc,d,d
     lhood_L = torch.log(1e-3+XrecL)*XL + torch.log(1e-3+1-XrecL)*(1-XL) # L,N,T,nc,d,d
     lhood = lhood_L.sum([2,3,4,5]).mean(0) # N
     if qz_enc_m is not None: # instant encoding
         qz_enc_mL    = qz_enc_m.repeat([L,1])  # L*N*T,2*q
         qz_enc_logvL = qz_enc_logv.repeat([L,1])  # L*N*T,2*q
         mean_ = qz_enc_mL.contiguous().view(-1) # L*N*T*2*q
         std_  = 1e-3+qz_enc_logvL.exp().contiguous().view(-1) # L*N*T*2*q
         qenc_zt_ode = Normal(mean_,std_).log_prob(zode_L.contiguous().view(-1)).view([L,N,T,2*q])
         qenc_zt_ode = qenc_zt_ode.sum([3]) # L,N,T
         inst_enc_KL = logpL - qenc_zt_ode
         inst_enc_KL = inst_enc_KL.sum(2).mean(0) # N
         return Ndata*lhood.mean(), Ndata*kl_z.mean(), kl_w, Ndata*inst_enc_KL.mean()
     else:
         return Ndata*lhood.mean(), Ndata*kl_z.mean(), kl_w
Example #24
    def step(self, x, l_t, h_t):
        """
        @param x: image. (batch, channel, height, width)
        @param l_t: location trial. (batch, 2)
        @param h_t: last hidden state. (batch, rnn_hidden)
        @return h_t: next hidden state. (batch, rnn_hidden)
        @return l_t: next location trial. (batch, 2)
        @return b_t: baseline for step t. (batch)
        @return log_pi: probability for next location trial. (batch)
        """
        glimpse = self.glimpse_net(x, l_t)
        h_t = self.rnn(glimpse, h_t)
        mu, l_t = self.location_net(h_t)
        b_t = self.baseline_net(h_t).squeeze()

        log_pi = Normal(mu, self.std).log_prob(l_t)
        # Note: log(p_y*p_x) = log(p_y) + log(p_x)
        log_pi = log_pi.sum(dim=1)

        return h_t, l_t, b_t, log_pi
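The sum over the two location coordinates relies on their independence (the log(p_y*p_x) = log(p_y) + log(p_x) note above); summing per-coordinate Normal log-probs equals evaluating a diagonal joint, which Independent makes explicit (sketch with made-up values):

import torch
from torch.distributions import Normal, Independent

mu = torch.zeros(2)
std = 0.1
l_t = torch.tensor([0.2, -0.3])
per_coord = Normal(mu, std).log_prob(l_t).sum()
joint = Independent(Normal(mu, std * torch.ones(2)), 1).log_prob(l_t)
print(per_coord.item(), joint.item())   # identical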
Example #25
def so3_entropy_old(w_eps, std, k=10):
    '''
    w_eps(Tensor of dim 3): sample from so3
    std(Tensor of dim 3): std of distribution on so3
    k: 2k+1 samples for truncated summation
    '''
    # entropy of gaussian distribution on so3
    # see appendix C of https://arxiv.org/pdf/1807.04689.pdf
    theta = w_eps.norm(p=2)
    u = w_eps / theta  # 3
    angles = 2 * np.pi * torch.arange(
        -k, k + 1, dtype=w_eps.dtype, device=w_eps.device)  # 2k+1
    theta_hat = theta + angles  # 2k+1
    x = u[None, :] * theta_hat[:, None]  # 2k+1 , 3
    log_p = Normal(torch.zeros(3, device=w_eps.device),
                   std).log_prob(x)  # 2k+1,3
    clamp = 1e-3
    log_vol = torch.log((theta_hat**2).clamp(min=clamp) /
                        (2 - 2 * torch.cos(theta)).clamp(min=clamp))  # 2k+1
    log_p = log_p.sum(-1) + log_vol
    entropy = -logsumexp(log_p)
    return entropy
Example #26
def sample(mod, lam_draw, y_grid=None):
    if y_grid is None:
        upper = 6
        lower = -6
        grid_size = 100
        step = (upper - lower) / grid_size
        y_grid = torch.arange(start=lower, end=upper, step=step)

    # TODO: TEST
    gam, mu, sig = gam_post.sample(mod, lam_draw)

    dden = []
    for i in range(mod.I):
        gami_onehot = util.get_one_hot(gam[i], sum(mod.L))
        obs_i = 1 - mod.m[i]
        mu_i = (gami_onehot * mu[None, None, :]).sum(-1)
        dden_i = Normal(mu_i[:, :, None],
                        sig[i]).log_prob(y_grid[None, None, :]).exp()
        dden_i = dden_i * obs_i[:, :, None].double()
        dden_i = dden_i.sum(0) / obs_i.sum(0, keepdim=True).double().transpose(
            0, 1)
        dden.append(dden_i)

    return (y_grid, dden)
Example #27
def so3_entropy(w_eps, std, k=10):
    '''
    w_eps(Tensor of dim Bx3): sample from so3
    std(Tensor of dim Bx3): std of distribution on so3
    k: Use 2k+1 samples for truncated summation
    '''
    # entropy of gaussian distribution on so3
    # see appendix C of https://arxiv.org/pdf/1807.04689.pdf
    theta = w_eps.norm(p=2, dim=-1, keepdim=True)  # [B, 1]
    u = w_eps / theta  # [B, 3]
    angles = 2 * np.pi * torch.arange(
        -k, k + 1, dtype=w_eps.dtype, device=w_eps.device)  # 2k+1
    theta_hat = theta[:, None, :] + angles[:, None]  # [B, 2k+1, 1]
    x = u[:, None, :] * theta_hat  # [B, 2k+1 , 3]
    log_p = Normal(torch.zeros(3, device=w_eps.device),
                   std).log_prob(x.permute([1, 0, 2]))  # [2k+1, B, 3]
    log_p = log_p.permute([1, 0, 2])  # [B, 2k+1, 3]
    clamp = 1e-3
    log_vol = torch.log(
        (theta_hat**2).clamp(min=clamp) /
        (2 - 2 * torch.cos(theta_hat)).clamp(min=clamp))  # [B, 2k+1, 1]
    log_p = log_p.sum(-1) + log_vol.sum(-1)  #[B, 2k+1]
    entropy = -logsumexp(log_p, -1)
    return entropy
Example #28
    def forward(self, state, deterministic=False):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))

        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)

        std = torch.exp(log_std)
        log_prob = None

        if deterministic:
            action = torch.tanh(mean)
        else:
            normal = Normal(0, 1)
            z = mean + std * normal.sample().to(
                torch.device("cuda" if torch.cuda.is_available() else "cpu"))
            action = torch.tanh(z)
            log_prob = Normal(
                mean,
                std).log_prob(z) - torch.log(1 - action.pow(2) + self.epsilon)
            log_prob = log_prob.sum(dim=1, keepdim=True)

        return action, mean, log_std, log_prob, std
Example #29
    def learn(self):
        state, action, reward, next_state, end = self.memory.sample(
            self.batch_size)

        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(device)
        next_state = torch.FloatTensor(next_state).to(device)
        end = torch.FloatTensor(np.float32(end)).unsqueeze(1).to(device)

        # Training Q Networks
        predicted_q_value_1, predicted_q_value_2 = self.critic.predict_q(
            state, action)
        predicted_v_target = self.critic.predict_v_target(next_state)
        target_q_value = reward + (1 -
                                   end) * self.discount * predicted_v_target
        q_loss_1 = nn.MSELoss()(predicted_q_value_1, target_q_value.detach())
        q_loss_2 = nn.MSELoss()(predicted_q_value_2, target_q_value.detach())
        self.critic.learn_q(q_loss_1, q_loss_2)

        # Training V Network
        new_action, log_prob = self.actor.predict(state)
        predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(
            state, new_action)
        predicted_new_q_value = torch.min(predicted_new_q_value_1,
                                          predicted_new_q_value_2)
        target_v_value = predicted_new_q_value - self.alpha * log_prob
        predicted_v_value = self.critic.predict_v(state)
        v_loss = nn.MSELoss()(predicted_v_value, target_v_value.detach())
        self.critic.learn_v(v_loss)

        # Training Policy Network
        policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean()

        normal = Normal(0, 1)
        z = normal.sample().to(device)
        mean, log_std = self.actor.policy_net.forward(state)
        std = log_std.exp()
        old_action_raw = mean + std * z
        old_action = torch.tanh(old_action_raw)
        old_log_prob = Normal(mean, std).log_prob(old_action_raw) - torch.log(
            1 - old_action.pow(2) + 1e-6)
        old_log_prob = old_log_prob.sum(-1, keepdim=True)

        if self.debug_file is not None:
            old_reward = self.get_reward(state)

        params = torch.nn.utils.parameters_to_vector(
            self.actor.policy_net.parameters())
        search_direction = torch.nn.utils.parameters_to_vector(
            torch.autograd.grad(policy_loss,
                                self.actor.policy_net.parameters(),
                                retain_graph=True))

        unit_size = torch.FloatTensor([1e-4]).to(device)
        max_iteration = 5

        # Now we have the iterations
        for i in range(max_iteration):
            test_params = params - search_direction * unit_size
            KL = self.get_KL(test_params, old_log_prob, state, old_action_raw,
                             old_action)
            if abs(KL) <= self.tr:
                params = test_params
                torch.nn.utils.vector_to_parameters(
                    params, self.actor.policy_net.parameters())
                # Compute new direction
                new_action, log_prob = self.actor.predict(state)
                predicted_new_q_value_1, predicted_new_q_value_2 = self.critic.predict_q(
                    state, new_action)
                predicted_new_q_value = torch.min(predicted_new_q_value_1,
                                                  predicted_new_q_value_2)
                policy_loss = (self.alpha * log_prob -
                               predicted_new_q_value).mean()
                search_direction = torch.nn.utils.parameters_to_vector(
                    torch.autograd.grad(policy_loss,
                                        self.actor.policy_net.parameters(),
                                        retain_graph=True))
            else:
                break

        if self.debug_file is not None:
            KL = self.get_KL(
                torch.nn.utils.parameters_to_vector(
                    self.actor.policy_net.parameters()), old_log_prob, state,
                old_action_raw, old_action)
            new_reward = self.get_reward(state)
            self.debug_file.write("{},{}\n".format(abs(KL),
                                                   new_reward - old_reward))

        # Updating Target-V Network
        self.critic.update_target_v()
Example #30
                total_ll = 0.0
                aleatoric = 0.0
                epistemic = 0.0
                # total_d_ll = 0.0
                for i, (x, y) in enumerate(test):
                    x, y = x.to(device), y.to(device)

                    mus = torch.zeros(args.samples, x.size(0), device=device)
                    logvars = torch.zeros(args.samples,
                                          x.size(0),
                                          device=device)
                    for j in range(args.samples):
                        mus[j], logvars[j], _ = model(x)

                    ll = Normal(mus, torch.exp(logvars / 2)).log_prob(y)
                    total_ll += torch.logsumexp(ll.sum(dim=1), dim=0).item()

                    # mean = mus.mean(dim=0)
                    # std = (
                    #     mus.var(dim=0) + torch.exp(logvars / 2).mean(dim=0) ** 2
                    # ) ** 0.5
                    # total_d_ll += Normal(mean, std).log_prob(y).sum()

                    epistemic += mus.var(dim=0).mean().item()
                    aleatoric += torch.exp(logvars).mean(dim=0).sum().item()

                    real_y = y * test.dataset.y_sigma + test.dataset.y_mu  # type: ignore
                    real_mu = mus.mean(
                        dim=0
                    ) * test.dataset.y_sigma + test.dataset.y_mu  # type: ignore
                    squared_err += ((real_y - real_mu)**2).sum().item()