def _learn_rl(self, global_step):
    # sample a minibatch of experiences from the prioritized replay memory,
    # along with importance-sampling weights and the slot ids of each sample
    gamma = variable([self.gamma], cuda=self.cuda)
    exps, imp_weights, ids = self.memory_rl.sample(global_step)

    state_vars = [variable(s, cuda=self.cuda) for s in exps[0]]
    action_vars = variable(exps[1], cuda=self.cuda)
    imp_weights = variable(imp_weights, cuda=self.cuda)
    rewards = variable(exps[2].astype(np.float32), cuda=self.cuda)
    next_state_vars = [variable(s, cuda=self.cuda) for s in exps[3]]

    if self.verbose and self.tensorboard is not None:
        actions = bucket_encode_actions(action_vars, cuda=self.cuda)
        for a in actions.data.cpu().numpy():
            self.tensorboard.add_scalar_value('M_RL_sampled_actions', int(a), time.time())
        for r in exps[2]:
            self.tensorboard.add_scalar_value('M_RL_sampled_rewards', int(r), time.time())

    if self.is_training:
        # one-step TD target: r + gamma * max_a' Q_target(s', a')
        Q_targets = rewards + gamma * t.max(self.strategy._target_Q.forward(*next_state_vars), 1)[0]
        if self.verbose:
            start = timer()
        td_deltas = self.strategy._Q.learn(state_vars, action_vars, Q_targets, imp_weights)
        if self.verbose:
            print('backward pass of Q network took ', timer() - start)
        # feed the new TD errors back so the memory can refresh its priorities
        self.memory_rl.update(ids, td_deltas.data.cpu().numpy())
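# The memory interface assumed above is sample(step) -> (exps, imp_weights, ids)
# plus update(ids, td_errors), i.e. a prioritized replay buffer. A minimal
# proportional-prioritization sketch of that contract (hypothetical helper,
# not the repo's actual memory class; names are illustrative):
import numpy as np


class SimplePrioritizedReplay:
    """Toy proportional prioritized replay: P(i) ~ |td_error_i|^alpha."""

    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity, self.alpha, self.beta, self.eps = capacity, alpha, beta, eps
        self.data, self.priorities = [], []

    def add(self, experience):
        # new experiences get the current max priority so they are seen at least once
        self.priorities.append(max(self.priorities, default=1.0))
        self.data.append(experience)
        if len(self.data) > self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)

    def sample(self, batch_size):
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        ids = np.random.choice(len(self.data), batch_size, p=probs)
        # importance weights correct the bias introduced by non-uniform sampling
        weights = (len(self.data) * probs[ids]) ** (-self.beta)
        weights /= weights.max()
        return [self.data[i] for i in ids], weights, ids

    def update(self, ids, td_errors):
        for i, delta in zip(ids, td_errors):
            self.priorities[i] = abs(float(delta)) + self.eps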
def learn(self, states, actions):
    """Supervised update of the average-policy network.

    Follows the standard pattern from the PyTorch docs:
        loss = nn.CrossEntropyLoss()
        input = autograd.Variable(torch.randn(3, 5), requires_grad=True)
        target = autograd.Variable(torch.LongTensor(3).random_(5))
        output = loss(input, target)
        output.backward()
    """
    self.optim.zero_grad()
    pi_preds = self.forward(*states).squeeze()
    criterion = nn.CrossEntropyLoss()
    # despite the name, these are bucket indices, not one-hot vectors;
    # shift by +1 so the targets are non-negative class indices
    one_hot_actions = bucket_encode_actions(actions, cuda=self.is_cuda)
    loss = criterion(pi_preds, (1 + one_hot_actions).long())
    raw_loss = loss.data.cpu().numpy()[0]
    if self.tensorboard is not None:
        self.tensorboard.add_scalar_value(
            'p{}_pi_loss'.format(self.player_id + 1),
            float(raw_loss), time.time())
    loss.backward()
    self.optim.step()
    return loss
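# Aside: nn.CrossEntropyLoss expects raw (unnormalized) logits of shape (N, C)
# and LongTensor class indices in [0, C); the +1 shift above suggests the
# bucket encoding starts at -1. A standalone check of that contract
# (hypothetical demo, using the same 0.3-era Variable API as this code):
def _demo_cross_entropy_target_shift():
    import torch
    import torch.nn as nn
    from torch.autograd import Variable
    logits = Variable(torch.randn(4, 6))                 # 4 samples, 6 action buckets
    buckets = Variable(torch.LongTensor([-1, 0, 2, 4]))  # raw bucket ids, minimum -1
    return nn.CrossEntropyLoss()(logits, buckets + 1)    # shift targets into [0, 6)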
def learn(self, states, actions, Q_targets, imp_weights):
    self.optim.zero_grad()
    all_Q_preds = self.forward(*states)
    actions_ = (bucket_encode_actions(actions, cuda=self.is_cuda) + 1).long()
    # gather the predicted value of the action actually taken: Q(s, a)
    Q_preds = t.cat([
        all_Q_preds[i, aa]
        for i, aa in enumerate(actions_.data)
    ]).squeeze()
    loss, td_deltas = self.compute_loss(Q_preds, Q_targets, imp_weights)
    if self.tensorboard is not None:
        # TODO: refactor the hard-coded scalar name
        raw_loss = loss.data.cpu().numpy()[0]
        self.tensorboard.add_scalar_value(
            'p{}_q_mse_loss'.format(self.player_id + 1),
            float(raw_loss), time.time())
    loss.backward()
    # update weights
    self.optim.step()
    return td_deltas
def learn(self, states, actions, Q_targets, imp_weights):
    self.optim.zero_grad()
    all_Q_preds = self.forward(*states)
    actions_ = (bucket_encode_actions(actions, cuda=self.is_cuda) + 1).long()
    Q_preds = t.cat([
        all_Q_preds[i, aa]
        for i, aa in enumerate(actions_.data)
    ]).squeeze()  # Q(s, a)
    loss, td_deltas = self.compute_loss(Q_preds, Q_targets, imp_weights)
    if self.tensorboard is not None:
        raw_loss = loss.data.cpu().numpy()[0]
        self.tensorboard.add_scalar_value(
            'p{}_q_loss'.format(self.player_id + 1),
            float(raw_loss), time.time())
    loss.backward()
    if self.grad_clip is not None:
        t.nn.utils.clip_grad_norm(self.parameters(), self.grad_clip)
    self.optim.step()
    return td_deltas
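# compute_loss is assumed to return both the importance-weighted loss and the
# raw TD errors that _learn_rl feeds back into memory_rl.update. A plausible
# sketch under that assumption (hypothetical; the repo's actual compute_loss
# may use a different loss, e.g. Huber):
def compute_loss_sketch(Q_preds, Q_targets, imp_weights):
    # no gradient should flow through the bootstrapped targets
    td_deltas = Q_targets.detach() - Q_preds
    # weighted MSE: importance weights undo the prioritized-sampling bias
    loss = (imp_weights * td_deltas.pow(2)).mean()
    return loss, td_deltas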
def learn(self, states, actions):
    self.optim.zero_grad()
    pi_preds = self.forward(*states).squeeze()
    criterion = nn.CrossEntropyLoss()
    one_hot_actions = bucket_encode_actions(actions, cuda=self.is_cuda)
    loss = criterion(pi_preds, (1 + one_hot_actions).long())
    raw_loss = loss.data.cpu().numpy()[0]
    if self.tensorboard is not None:
        self.tensorboard.add_scalar_value(
            'p{}_pi_loss'.format(self.player_id + 1),
            float(raw_loss), time.time())
    loss.backward()
    if self.grad_clip is not None:
        t.nn.utils.clip_grad_norm(self.parameters(), self.grad_clip)
    self.optim.step()
    return loss
def _learn_sl(self, global_step):
    """Supervised-learning update: reservoir sampling from M_sl."""
    if self.is_training:
        exps = self.memory_sl.sample(global_step)
        # 4 x 11: each column is a torch Variable
        state_vars = [variable(s, cuda=self.cuda) for s in exps[0]]
        action_vars = variable(exps[1], cuda=self.cuda)

        if self.verbose and self.tensorboard is not None:
            actions = bucket_encode_actions(action_vars, cuda=self.cuda)
            for a in actions.data.cpu().numpy():
                self.tensorboard.add_scalar_value('M_SL_sampled_actions', int(a), time.time())

        if self.verbose:
            start = timer()
        self.strategy._pi.learn(state_vars, action_vars)
        if self.verbose:
            print('backward pass of pi network took ', timer() - start)
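# M_sl is a reservoir-sampled memory: every (state, action) pair observed so
# far has an equal chance of being retained, which keeps the pi network
# fitting the agent's time-averaged strategy. A minimal Algorithm-R sketch of
# that idea (hypothetical helper, not the repo's actual memory class):
import random


class SimpleReservoir:
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        self.n_seen = 0

    def add(self, item):
        self.n_seen += 1
        if len(self.data) < self.capacity:
            self.data.append(item)
        else:
            # keep item with probability capacity / n_seen, evicting a random slot
            j = random.randint(0, self.n_seen - 1)
            if j < self.capacity:
                self.data[j] = item

    def sample(self, batch_size):
        return random.sample(self.data, min(batch_size, len(self.data)))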