Code Example #1
File: qseq_orig.py  Project: lytzV/CS285Proj
    def update(self, ob_no, ac_na, reward_n, next_ob_no, terminal_n):
        # Observations stay as numpy arrays; q_net unpacks them itself.
        ob_no = np.array(ob_no)
        next_ob_no = np.array(next_ob_no)
        ac_na = ptu.from_numpy(ac_na).to(torch.long).to(self.device)
        reward_n = ptu.from_numpy(reward_n).to(self.device)
        terminal_n = ptu.from_numpy(terminal_n).to(self.device)

        # Q(s, a) for the actions actually taken.
        q = torch.gather(self.q_net(ob_no), 1, ac_na.unsqueeze(1)).squeeze()

        # Double-DQN-style target: pick the argmax actions with the online
        # network, evaluate them with the target network.
        ac_qmax = torch.argmax(self.q_net(next_ob_no), dim=1).unsqueeze(1)
        q_target = self.q_net_target(next_ob_no)
        q_target_plug_in = q_target.gather(1, ac_qmax).squeeze()

        # Bootstrap only on non-terminal transitions; detach the whole target
        # so no gradient flows back through the target network.
        target = (reward_n +
                  q_target_plug_in * torch.logical_not(terminal_n)).detach()

        loss = self.loss(q, target)
        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_value_(self.q_decoder.parameters(),
                               self.grad_norm_clipping)
        self.optimizer.step()
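
These snippets all rely on a small `ptu` helper module (a CS285-style `pytorch_util`) that is not shown on this page. The sketch below is only an assumption of what `from_numpy` / `to_numpy` do, added so the examples read self-contained; the project's actual helpers may differ.

# Hypothetical sketch of the `ptu` helpers assumed by these examples
# (CS285-style pytorch_util); the project's real implementation may differ.
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def from_numpy(array):
    # numpy array -> float tensor on the training device
    return torch.from_numpy(np.asarray(array)).float().to(device)

def to_numpy(tensor):
    # tensor -> numpy array, detached from the autograd graph
    return tensor.detach().cpu().numpy()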
Code Example #2
    def updateActors(self, trajectories):
        self.decoder.optimizer.zero_grad()
        self.encoder.optimizer.zero_grad()
        loss = torch.zeros(1)
        for t in trajectories:
            obs = t.observations
            decoder_input = ptu.from_numpy(obs[0]).long()[:, :, 0]
            decoder_hidden = ptu.from_numpy(obs[1])
            encoder_padded = ptu.from_numpy(obs[2]).squeeze()
            acs = ptu.from_numpy(t.actions)

            # Actions may arrive squeezed; reshape to 1*N so log_prob is
            # computed per step instead of producing an N*N matrix from the
            # categorical distribution (a normal batch would be N*1).
            acs = torch.reshape(acs, (1, -1))

            action_distribution, _, _, _ = self.decoder(
                decoder_input, decoder_hidden, encoder_padded)

            neg_log_prob = -1 * action_distribution.log_prob(acs)
            neg_log_prob = torch.squeeze(neg_log_prob)

            # Causality trick: weight each step's log-prob by the reward-to-go
            # (sum of rewards from that step onward), not the full return.
            causality_cumsum = np.flip(np.cumsum(np.flip(t.rewards))).copy()
            traj_reward = ptu.from_numpy(causality_cumsum)
            loss += torch.dot(neg_log_prob, traj_reward)

        loss.backward()
        self.decoder.optimizer.step()
        self.encoder.optimizer.step()
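
The `np.flip(np.cumsum(np.flip(...)))` line above is the reward-to-go ("causality trick") weighting. A tiny self-contained check of what it computes:

# Toy check of the causality trick (reward-to-go):
# flip, cumulative-sum, flip back => sum of rewards from each step onward.
import numpy as np

rewards = np.array([1.0, 2.0, 3.0])
reward_to_go = np.flip(np.cumsum(np.flip(rewards))).copy()
print(reward_to_go)  # [6. 5. 3.]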
Code Example #3
File: q_seq2seq.py  Project: lytzV/CS285Proj
    def q_net(self, ob):
        # Unpack the object-dtype observation columns into tensors:
        # column 1 = padded encoder output, 2 = decoder hidden, 3 = decoder input.
        encoder_padded = ptu.from_numpy(np.array(ob[:, 1].tolist()).astype(np.float32))[:, 0, :, :]
        decoder_hidden = ptu.from_numpy(np.array(ob[:, 2].tolist()).astype(np.float32))[:, 0, :, :]
        decoder_input = ptu.from_numpy(np.array(ob[:, 3].tolist()).astype(np.float32)).long()

        decoder_input = decoder_input.to(self.device)
        decoder_hidden = decoder_hidden.to(self.device)
        encoder_padded = encoder_padded.to(self.device)
        output, _, _ = self.q_decoder(decoder_input, decoder_hidden, encoder_padded)
        return output.squeeze()
Code Example #4
    def get_action_distribution(self, ob):
        ob = np.array(ob, dtype=object).reshape(-1, 5)
        encoder_padded = ptu.from_numpy(np.array(ob[:, 1].tolist()).astype(np.float32))[:, 0, :, :]
        decoder_hidden = ptu.from_numpy(np.array(ob[:, 2].tolist()).astype(np.float32))[:, 0, :, :]
        decoder_input = ptu.from_numpy(np.array(ob[:, 3].tolist()).astype(np.float32)).long()

        encoder_padded = encoder_padded.to(device)
        decoder_hidden = decoder_hidden.to(device)
        decoder_input = decoder_input.to(device)

        # Softmax over the decoder logits of the first output step, wrapped
        # in a Categorical distribution over actions.
        output, _, _ = self.action_decoder(decoder_input, decoder_hidden, encoder_padded)
        prob = F.softmax(output[:, 0, :], dim=1)
        action_distribution = torch.distributions.Categorical(probs=prob)
        return action_distribution, prob
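
For reference, a standalone sketch of the softmax-plus-Categorical pattern used above, with made-up shapes; sampling and `log_prob` both operate per batch element.

# Illustrative, self-contained version of the Categorical construction above.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                # batch of 4, 10 candidate actions (made-up shapes)
probs = F.softmax(logits, dim=1)
dist = torch.distributions.Categorical(probs=probs)
actions = dist.sample()                    # shape (4,)
log_p = dist.log_prob(actions)             # per-sample log-probabilities, shape (4,)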
Code Example #5
    def get_baseline(self, ob):
        ob = np.array(ob, dtype=object).reshape(-1,5)
        decoder_hidden = ptu.from_numpy(np.array(ob[:,2].tolist()).astype(np.float32))[:,0,:,:]

        decoder_hidden = decoder_hidden.to(device)
        value = self.baseline_decoder(decoder_hidden).squeeze()
        return value
Code Example #6
    def update(self, observations, actions, advantages, q_values=None):
        actions = ptu.from_numpy(actions)
        advantages = ptu.from_numpy(advantages)

        # Policy-gradient step: weight each action's negative log-likelihood
        # by its advantage. get_action_distribution converts the raw
        # observations itself, so they are passed through unchanged.
        action_distribution, probs = self.get_action_distribution(observations)
        negative_loglikelihood_predicted = -action_distribution.log_prob(actions)

        advantages = torch.squeeze(advantages)
        loss = torch.dot(negative_loglikelihood_predicted.squeeze(), advantages)

        self.action_decoder.optimizer.zero_grad()
        loss.backward()
        self.action_decoder.optimizer.step()

        if q_values is not None:
            # Baseline step: regress the value head onto standardized q-value targets.
            targets = ptu.normalize(q_values, np.mean(q_values), np.std(q_values))
            targets = ptu.from_numpy(targets)

            baseline_predictions = self.get_baseline(observations)
            baseline_loss = self.baseline_loss(baseline_predictions, targets)

            self.baseline_optimizer.zero_grad()
            baseline_loss.backward()
            self.baseline_optimizer.step()
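
`ptu.normalize` is not shown on this page; the sketch below is only an assumption of what it does (standardize the q-value targets before the baseline regression) and may differ from the project's own helper.

# Assumed behavior of ptu.normalize: standardize data to the given mean/std.
# This is a sketch; the project's actual helper may differ.
def normalize(data, mean, std, eps=1e-8):
    return (data - mean) / (std + eps)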