def forward(self, state, action_mask, betsize_mask):
    mask = combined_masks(action_mask, betsize_mask)
    x = state
    if x.dim() == 2:  # add a batch dim for a single unbatched sequence
        x = x.unsqueeze(0)
    out = self.process_input(x)  # already (B,M,c); a second unsqueeze here would break the unpack below
    B, M, c = out.size()
    # Right-pad the sequence out to maxlen so the LSTM sees a fixed length.
    n_padding = max(self.maxlen - M, 0)
    padding = torch.zeros(B, n_padding, out.size(-1))
    h = torch.cat((out, padding), dim=1)
    lstm_out, _ = self.lstm(h)
    t_logits = self.fc3(lstm_out.view(-1))  # flattening assumes B == 1
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(
        action, state[:, -1, self.mapping['state']['previous_action']])
    outputs = {
        'action': action,
        'action_category': action_category,
        'action_prob': m.log_prob(action),
        'action_probs': action_probs,
        'betsize': betsize_category
    }
    return outputs
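# Every head in this file funnels its softmax through combined_masks and
# norm_frequencies, which are defined elsewhere in the repo. norm_frequencies
# is reconstructed below from the commented-out equivalent that used to sit in
# the betsize forward (mask, then renormalize). The concatenation rule in
# combined_masks_sketch is an assumption about how the action and betsize
# masks are merged, not the repo's actual definition.
def norm_frequencies(probs, mask):
    # Zero out illegal entries, then renormalize each row to sum to 1.
    masked = probs * mask
    return masked / masked.sum(-1, keepdim=True)

def combined_masks_sketch(action_mask, betsize_mask):
    # Hypothetical: append the betsize legality flags to the discrete-action
    # flags so a single Categorical covers the flattened action space.
    return torch.cat([action_mask, betsize_mask], dim=-1)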
def forward(self, state, mask):
    x = state
    if not isinstance(state, torch.Tensor):
        x = torch.tensor(x, dtype=torch.float32)
        x = x.unsqueeze(0)
    hand = x[:, self.mapping['state']['rank']].long()
    last_action = x[:, self.mapping['state']['previous_action']].long()
    hand = self.hand_emb(hand)
    last_action = self.action_emb(last_action)
    x = torch.cat([hand, last_action], dim=-1)
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.fc3(x)
    action_logits = self.noise(x)
    action_soft = F.softmax(action_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    outputs = {
        'action': action,
        'action_prob': m.log_prob(action),
        'action_probs': action_probs
    }
    return outputs
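# self.noise is applied to raw logits in every head in this file. A minimal
# stand-in consistent with that usage is additive Gaussian noise for
# exploration; the class name, default sigma, and train-only gating below are
# assumptions, not the repo's actual module.
class GaussianNoiseSketch(nn.Module):
    def __init__(self, sigma=0.1):
        super().__init__()
        self.sigma = sigma

    def forward(self, x):
        # Perturb logits during training only; identity at eval time.
        if self.training and self.sigma > 0:
            return x + torch.randn_like(x) * self.sigma
        return x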
def forward(self, state, mask, betsize_mask):
    x = state
    M, c = x.size()
    hand = x[:, self.mapping['state']['rank']].long()
    last_action = x[:, self.mapping['state']['previous_action']].long()
    hand = self.hand_emb(hand)
    embedded_action = self.action_emb(last_action)
    x = torch.cat([hand, embedded_action], dim=-1)
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    category_logits = self.fc3(x)
    category_logits = self.noise(category_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    # Second head: always generate a betsize so the graph stays uniform,
    # even when the sampled action is not a bet or raise.
    b = self.activation(self.bfc1(x))
    b = self.activation(self.bfc2(b))
    b = self.bfc3(b)
    betsize_logits = self.noise(b)
    betsize_probs = F.softmax(betsize_logits, dim=-1)
    if betsize_mask.sum(-1) == 0:
        # No legal betsize (e.g. check/fold spots): fall back to an all-ones
        # mask so the renormalization below never divides by zero.
        betsize_mask = torch.ones(M, self.nA)
    mask_betsize_probs = betsize_probs * betsize_mask
    norm_betsize_probs = mask_betsize_probs / mask_betsize_probs.sum(-1).unsqueeze(1)
    b = Categorical(norm_betsize_probs)
    betsize = b.sample()
    betsize_prob = b.log_prob(betsize)
    outputs = {
        'action': action,
        'action_prob': m.log_prob(action),
        'action_probs': action_probs,
        'action_category': action,
        'betsize': betsize,
        'betsize_prob': betsize_prob,
        'betsize_probs': betsize_probs
    }
    return outputs
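# Worked example of the betsize fallback above: when no betsize is legal the
# mask is replaced with ones, so the renormalized distribution stays valid
# instead of dividing by zero. The probabilities and 2-wide betsize space are
# illustrative values, not from the repo.
betsize_probs = torch.tensor([[0.7, 0.3]])
betsize_mask = torch.tensor([[0., 0.]])
if betsize_mask.sum(-1) == 0:
    betsize_mask = torch.ones(1, 2)
masked = betsize_probs * betsize_mask
normed = masked / masked.sum(-1).unsqueeze(1)   # tensor([[0.7, 0.3]])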
def forward(self, state, action_mask, betsize_mask):
    """
    state: B,M,39
    """
    x = state
    if not isinstance(x, torch.Tensor):
        x = torch.tensor(x, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
        betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
    mask = combined_masks(action_mask, betsize_mask)
    out = self.process_input(x)
    B, M, c = out.size()
    n_padding = self.maxlen - M
    if n_padding < 0:
        # Sequence longer than maxlen: keep only the most recent steps.
        h = out[:, -self.maxlen:, :]
    else:
        padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
        h = torch.cat((padding, out), dim=1)
    lstm_out, hidden_states = self.lstm(h)
    norm = self.batchnorm(lstm_out)
    t_logits = self.fc_final(norm.view(B, -1))
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    previous_action = torch.as_tensor(
        state[:, -1, self.state_mapping['last_action']]).to(self.device)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.batch_unwrap_action(
        action, previous_action)
    if B > 1:  # batch training
        outputs = {
            'action': action,
            'action_category': action_category,
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category
        }
    else:  # playing a hand
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': m.log_prob(action),
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
    return outputs
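# Minimal sketch of the padding/truncation step used by the LSTM heads above
# and below: sequences shorter than maxlen are zero-padded, longer ones keep
# only the most recent maxlen steps. The shapes here are illustrative.
maxlen = 10
out = torch.randn(2, 4, 8)                      # B=2, M=4, c=8
if out.size(1) > maxlen:
    h = out[:, -maxlen:, :]
else:
    pad = torch.zeros(2, maxlen - out.size(1), 8)
    h = torch.cat((pad, out), dim=1)            # -> (2, 10, 8)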
def forward(self, state, action_mask, betsize_mask):
    mask = combined_masks(action_mask, betsize_mask)
    if mask.dim() > 1:
        mask = mask[-1]
    x = state
    if x.dim() == 2:
        x = x.unsqueeze(0)
    out = self.preprocess(x)
    M, C = out.size()
    # Right-pad the sequence to max_length, then add a batch dim for the LSTM.
    n_padding = self.max_length - M
    padding = torch.zeros(n_padding, out.size(-1))
    h = torch.cat((out, padding), dim=0).unsqueeze(0)
    x, _ = self.lstm(h)
    t_logits = self.fc3(x.view(-1))
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    last_action = state[M - 1, self.mapping['state']['previous_action']].long().unsqueeze(-1)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(action, last_action)
    outputs = {
        'action': action,
        'action_category': action_category,
        'action_prob': m.log_prob(action),
        'action_probs': m.probs,
        'betsize': betsize_category
    }
    return outputs
def forward(self, state, action_mask, betsize_mask):
    x = torch.tensor(state, dtype=torch.float32).to(self.device)
    action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
    betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
    mask = combined_masks(action_mask, betsize_mask)
    out = self.process_input(x)
    # Actor
    B, M, c = out.size()
    n_padding = self.maxlen - M
    if n_padding < 0:
        h = out[:, -self.maxlen:, :]
    else:
        padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
        h = torch.cat((out, padding), dim=1)
    lstm_out, _ = self.lstm(h)
    t_logits = self.policy_out(lstm_out.view(-1))  # flattening assumes B == 1
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(
        action, state[:, -1, self.mapping['last_action']])
    outputs = {
        'action': action.item(),
        'action_category': action_category.item(),
        'action_prob': m.log_prob(action),
        'action_probs': action_probs,
        'betsize': betsize_category.item()
    }
    # Critic: dueling Q-head over the transformer features.
    q_input = self.transformer(out)
    a = self.advantage_output(q_input)
    v = self.value_output(q_input)
    v = v.expand_as(a)
    q = v + a - a.mean(-1, keepdim=True).expand_as(a)
    outputs['value'] = q.squeeze(0)
    return outputs
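# The critics above and below use the dueling aggregation
# Q(s,a) = V(s) + A(s,a) - mean_a A(s,a); subtracting the mean advantage pins
# down the otherwise unidentifiable split between V and A. A tiny numeric
# check with made-up values:
a = torch.tensor([[1., 2., 3.]])
v = torch.tensor([[0.5]]).expand_as(a)
q = v + a - a.mean(-1, keepdim=True).expand_as(a)   # tensor([[-0.5, 0.5, 1.5]])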
def forward(self, state, action_mask, betsize_mask):
    mask = combined_masks(action_mask, betsize_mask)
    if mask.dim() > 1:
        mask = mask[-1]
    x = state
    M, C = x.size()
    out = self.preprocess(x)
    x = self.activation(self.fc1(out))
    x = self.activation(self.fc2(x))
    # Pad the sequence to max_length and add learned positional embeddings.
    n_padding = self.max_length - M
    padding = torch.zeros(n_padding, out.size(-1))
    h = torch.cat((out, padding), dim=0)
    pos_emb = self.positional_emb(torch.arange(self.max_length))
    h = h + pos_emb
    t_logits = self.fc3(h.view(-1)).unsqueeze(0)
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    last_action = state[-1, self.mapping['state']['previous_action']].long().unsqueeze(-1)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(action, last_action)
    # Dueling critic over the per-step features.
    q_input = x.view(M, -1)
    a = self.advantage_output(q_input)
    v = self.value_output(q_input)
    v = v.expand_as(a)
    q = v + a - a.mean(1, keepdim=True).expand_as(a)
    outputs = {
        'action': action,
        'action_category': action_category,
        'action_prob': m.log_prob(action),
        'action_probs': m.probs,
        'betsize': betsize_category,
        'value': q
    }
    return outputs
def forward(self, state, action_mask, betsize_mask):
    x = torch.tensor(state, dtype=torch.float32).to(self.device)
    action_mask = torch.tensor(action_mask, dtype=torch.float).to(self.device)
    betsize_mask = torch.tensor(betsize_mask, dtype=torch.float).to(self.device)
    mask = combined_masks(action_mask, betsize_mask)
    out = self.process_input(x)
    B, M, c = out.size()
    n_padding = self.maxlen - M
    if n_padding < 0:
        h = out[:, -self.maxlen:, :]
    else:
        padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
        h = torch.cat((out, padding), dim=1)
    lstm_out, _ = self.lstm(h)
    norm = self.batchnorm(lstm_out)
    t_logits = self.fc_final(norm.view(-1))
    category_logits = self.noise(t_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(
        action, state[:, -1, self.state_mapping['last_action']])
    outputs = {
        'action': action.item(),
        'action_category': action_category.item(),
        'action_prob': m.log_prob(action),
        'action_probs': action_probs,
        'betsize': betsize_category.item()
    }
    return outputs
def forward(self, state, action_mask, betsize_mask):
    mask = combined_masks(action_mask, betsize_mask)
    x = state
    hand = x[:, self.mapping['state']['rank']].long()
    last_action = x[:, self.mapping['state']['previous_action']].long()
    previous_betsize = x[:, self.mapping['state']['previous_betsize']].float()
    if previous_betsize.dim() == 1:
        previous_betsize = previous_betsize.unsqueeze(1)
    hand = self.hand_emb(hand)
    last_action_emb = self.action_emb(last_action)
    x = torch.cat([hand, last_action_emb, previous_betsize], dim=-1)
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    category_logits = self.fc3(x)
    category_logits = self.noise(category_logits)
    action_soft = F.softmax(category_logits, dim=-1)
    action_probs = norm_frequencies(action_soft, mask)
    m = Categorical(action_probs)
    action = m.sample()
    action_category, betsize_category = self.helper_functions.unwrap_action(action, last_action)
    outputs = {
        'action': action,
        'action_category': action_category,
        'action_prob': m.log_prob(action),
        'action_probs': action_probs,
        'betsize': betsize_category
    }
    return outputs
def forward(self, state, action_mask, betsize_mask, target=False):
    """
    state: B,M,39
    """
    if not isinstance(state, torch.Tensor):
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        action_mask = torch.tensor(action_mask, dtype=torch.float32).to(self.device)
        betsize_mask = torch.tensor(betsize_mask, dtype=torch.float32).to(self.device)
    mask = combined_masks(action_mask, betsize_mask)
    if target and np.random.random() < self.epsilon:
        # Epsilon exploration: pick a random legal move.
        B = state.size(0)
        action_masked = self.epsilon_weights * mask
        action_probs = action_masked / action_masked.sum(-1).unsqueeze(-1)
        action = action_probs.multinomial(num_samples=1, replacement=False)
        action_prob = torch.zeros(B, 1)
    else:
        out = self.process_input(state)
        B, M, c = state.size()
        n_padding = self.maxlen - M
        if n_padding < 0:
            h = out[:, -self.maxlen:, :]
        else:
            padding = torch.zeros(B, n_padding, out.size(-1)).to(self.device)
            h = torch.cat((padding, out), dim=1)
        lstm_out, hidden_states = self.lstm(h)
        norm = self.batchnorm(lstm_out)
        t_logits = self.fc_final(norm.view(B, -1))
        category_logits = self.noise(t_logits)
        action_soft = F.softmax(category_logits, dim=-1)
        action_probs = norm_frequencies(action_soft, mask)
        m = Categorical(action_probs)
        action = m.sample()
        action_prob = m.log_prob(action)
    previous_action = torch.as_tensor(
        state[:, -1, self.state_mapping['last_action']]).to(self.device)
    action_category, betsize_category = self.helper_functions.batch_unwrap_action(
        action, previous_action)
    if B > 1:  # batch training
        outputs = {
            'action': action,
            'action_category': action_category,
            'action_prob': action_prob,
            'action_probs': action_probs,
            'betsize': betsize_category
        }
    else:  # playing a hand
        outputs = {
            'action': action.item(),
            'action_category': action_category.item(),
            'action_prob': action_prob,
            'action_probs': action_probs,
            'betsize': betsize_category.item()
        }
    return outputs
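# Sketch of the epsilon branch above: epsilon_weights acts as a prior over the
# flat action space; masking then renormalizing yields a sampler over legal
# moves only. The uniform prior and 5-wide action space are assumptions for
# illustration, not the repo's values.
epsilon_weights = torch.ones(1, 5)
mask = torch.tensor([[1., 0., 1., 1., 0.]])
probs = epsilon_weights * mask
probs = probs / probs.sum(-1).unsqueeze(-1)     # uniform over legal moves
action = probs.multinomial(num_samples=1, replacement=False)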