Example #1
    def _learn_rl(self, global_step):
        # sample a minibatch of experiences
        # gamma = Variable(t.Tensor([self.gamma]).float(), requires_grad=False)
        gamma = variable([self.gamma], cuda=self.cuda)
        exps, imp_weights, ids = self.memory_rl.sample(global_step)
        # how many of the samples in a batch are showdowns or all-ins
        state_vars = [variable(s, cuda=self.cuda) for s in exps[0]]
        action_vars = variable(exps[1], cuda=self.cuda)
        imp_weights = variable(imp_weights, cuda=self.cuda)
        rewards = variable(exps[2].astype(np.float32), cuda=self.cuda)
        next_state_vars = [variable(s, cuda=self.cuda) for s in exps[3]]
        # state_hashes = exps[5]

        if self.verbose and self.tensorboard is not None:
            actions = bucket_encode_actions(action_vars, cuda=self.cuda)
            for a in actions.data.cpu().numpy():
                self.tensorboard.add_scalar_value('M_RL_sampled_actions', int(a), time.time())
            for r in exps[2]:
                self.tensorboard.add_scalar_value('M_RL_sampled_rewards', int(r), time.time())
#            for h in state_hashes:
#                self.tensorboard.add_scalar_value('M_RL_sampled_states', int(h), time.time())


        if self.is_training:
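            # one-step TD target: r + gamma * max_a' Q_target(s', a')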
            Q_targets = rewards + gamma * t.max(self.strategy._target_Q.forward(*next_state_vars), 1)[0]

            if self.verbose:
                start = timer()
            td_deltas = self.strategy._Q.learn(state_vars, action_vars, Q_targets, imp_weights)
            if self.verbose:
                print('backward pass of Q network took ', timer() - start)
            self.memory_rl.update(ids, td_deltas.data.cpu().numpy())
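The `variable(...)` helper used throughout these snippets is not shown. A minimal sketch of what such a wrapper typically looks like in pre-0.4 PyTorch code, assuming it accepts array-likes plus a `cuda` flag (the signature and behaviour here are assumptions, not the project's actual implementation):

import numpy as np
import torch as t
from torch.autograd import Variable

def variable(data, cuda=False, requires_grad=False):
    # Hypothetical helper: wrap an array-like in a torch Variable,
    # optionally moving it to the GPU (pre-0.4 PyTorch style).
    tensor = data if t.is_tensor(data) else t.from_numpy(np.asarray(data))
    if cuda:
        tensor = tensor.cuda()
    return Variable(tensor, requires_grad=requires_grad)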
Example #2
    def learn(self, states, actions):
        """
        Reference CrossEntropyLoss example from the PyTorch docs:
         loss = nn.CrossEntropyLoss()
         input = autograd.Variable(torch.randn(3, 5), requires_grad=True)
         target = autograd.Variable(torch.LongTensor(3).random_(5))
         output = loss(input, target)
         output.backward()
        """
        self.optim.zero_grad()
        pi_preds = self.forward(*states).squeeze()
        criterion = nn.CrossEntropyLoss()
        one_hot_actions = bucket_encode_actions(actions, cuda=self.is_cuda)
        loss = criterion(pi_preds, (1 + one_hot_actions).long())

        # log loss history data
        #if not 'pi' in self.neural_network_loss[self.player_id]:
        #    self.neural_network_loss[self.player_id]['pi'] = []
        raw_loss = loss.data.cpu().numpy()[0]
        #self.neural_network_loss[self.player_id]['pi'].append(raw_loss)
        if self.tensorboard is not None:
            self.tensorboard.add_scalar_value(
                'p{}_pi_loss'.format(self.player_id + 1), float(raw_loss),
                time.time())

        loss.backward()
        self.optim.step()
        return loss
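`bucket_encode_actions(...)` is also referenced but not defined in these excerpts. Since its output is shifted by 1, cast to `long`, and fed to `CrossEntropyLoss` as class targets, it presumably maps each action vector to a discrete bucket index. A hypothetical sketch under that assumption:

def bucket_encode_actions(actions, cuda=False):
    # Hypothetical: collapse a batch of action vectors to discrete
    # bucket indices by taking the argmax along the action dimension.
    _, buckets = actions.max(dim=1)
    return buckets.cuda() if cuda else buckets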
Example #3
    def learn(self, states, actions, Q_targets, imp_weights):
        self.optim.zero_grad()
        all_Q_preds = self.forward(*states)
        actions_ = (bucket_encode_actions(actions, cuda=self.is_cuda) +
                    1).long()
        Q_preds = t.cat([
            all_Q_preds[i, aa] for i, aa in enumerate(actions_.data)
        ]).squeeze()  # Q(s,a)
        loss, td_deltas = self.compute_loss(Q_preds, Q_targets, imp_weights)

        # log loss history data
        #if not 'q' in self.neural_network_loss[self.player_id]:
        #    self.neural_network_loss[self.player_id]['q'] = []
        #raw_loss = loss.data.cpu().numpy()[0]
        #self.neural_network_loss[self.player_id]['q'].append(raw_loss)
        # todo: refactor the hard coded name
        if self.tensorboard is not None:
            raw_loss = loss.data.cpu().numpy()[0]
            self.tensorboard.add_scalar_value(
                'p{}_q_mse_loss'.format(self.player_id + 1), float(raw_loss),
                time.time())

        loss.backward()
        # update weights
        self.optim.step()
        return td_deltas
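`compute_loss` is assumed to return both a scalar loss and the per-sample TD errors, since `learn` passes the latter back to the prioritized replay memory in Example #1 (`self.memory_rl.update(ids, ...)`). A minimal importance-weighted squared-error sketch under that assumption:

def compute_loss(self, Q_preds, Q_targets, imp_weights):
    # Hypothetical: importance-weighted squared TD error for prioritized
    # replay; td_deltas are fed back to update sample priorities.
    td_deltas = Q_preds - Q_targets
    loss = (imp_weights * td_deltas.pow(2)).mean()
    return loss, td_deltas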
Example #4
    def learn(self, states, actions, Q_targets, imp_weights):
        self.optim.zero_grad()
        all_Q_preds = self.forward(*states)

        actions_ = (bucket_encode_actions(actions, cuda=self.is_cuda) +
                    1).long()
        Q_preds = t.cat([
            all_Q_preds[i, aa] for i, aa in enumerate(actions_.data)
        ]).squeeze()  # Q(s,a)

        loss, td_deltas = self.compute_loss(Q_preds, Q_targets, imp_weights)

        if self.tensorboard is not None:
            raw_loss = loss.data.cpu().numpy()[0]
            self.tensorboard.add_scalar_value(
                'p{}_q_loss'.format(self.player_id + 1), float(raw_loss),
                time.time())

        loss.backward()

        if self.grad_clip is not None:
            t.nn.utils.clip_grad_norm(self.parameters(), self.grad_clip)

        self.optim.step()
        return td_deltas
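Note that `clip_grad_norm` is the pre-0.4 PyTorch name; on newer versions the in-place variant is the equivalent call:

        # equivalent call on PyTorch >= 0.4 (the old name is deprecated)
        t.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip)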
Example #5
    def learn(self, states, actions):
        self.optim.zero_grad()
        pi_preds = self.forward(*states).squeeze()
        criterion = nn.CrossEntropyLoss()
        one_hot_actions = bucket_encode_actions(actions, cuda=self.is_cuda)
        loss = criterion(pi_preds, (1 + one_hot_actions).long())

        raw_loss = loss.data.cpu().numpy()[0]

        if self.tensorboard is not None:
            self.tensorboard.add_scalar_value(
                'p{}_pi_loss'.format(self.player_id + 1), float(raw_loss),
                time.time())

        loss.backward()

        if self.grad_clip is not None:
            t.nn.utils.clip_grad_norm(self.parameters(), self.grad_clip)
        self.optim.step()

        return loss
Example #6
    def _learn_sl(self, global_step):
        """
        reservoir sampling from M_sl
        """
        if self.is_training:
            exps = self.memory_sl.sample(global_step)
            state_vars = [variable(s, cuda=self.cuda) for s in exps[0]]
            # 4 x 11; each column is a torch Variable
            action_vars = variable(exps[1], cuda=self.cuda)
            #state_hashes = exps[2]
            if self.verbose and self.tensorboard is not None:
                actions = bucket_encode_actions(action_vars, cuda=self.cuda)
                for a in actions.data.cpu().numpy():
                    self.tensorboard.add_scalar_value('M_SL_sampled_actions', int(a), time.time())
#                for h in state_hashes:
#                    self.tensorboard.add_scalar_value('M_SL_sampled_states', int(h), time.time())

            if self.verbose:
                start = timer()
            self.strategy._pi.learn(state_vars, action_vars)
            if self.verbose:
                print('backward pass of pi network took ', timer() - start)
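The docstring above refers to reservoir sampling over M_sl; the supervised memory itself is not shown in these excerpts. For reference, classic reservoir sampling (Algorithm R) keeps a uniform random sample of a stream in a fixed-size buffer, roughly as follows. This is a generic illustration, not the project's `memory_sl` implementation:

import random

class ReservoirBuffer:
    # Generic reservoir sampling (Algorithm R): every item seen so far
    # has an equal probability of remaining in the fixed-size buffer.
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.n_seen = 0

    def add(self, item):
        self.n_seen += 1
        if len(self.buffer) < self.capacity:
            self.buffer.append(item)
        else:
            j = random.randint(0, self.n_seen - 1)
            if j < self.capacity:
                self.buffer[j] = item

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))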