def test_model(model_file: str):
    # Load a trained actor-critic policy and roll out one recorded CartPole episode.
    net = ActorCriticNet(4, 2)
    net.load_state_dict(torch.load(model_file))
    net.eval()

    env = gym.make("CartPole-v1")
    env = gym.wrappers.Monitor(env, "./cart",
                               video_callable=lambda episode_id: True,
                               force=True)

    observation = env.reset()
    R = 0
    while True:
        env.render()
        cleaned_observation = torch.tensor(observation).unsqueeze(dim=0)
        action_logits = net.forward_actor(cleaned_observation)
        # Sample an action from the policy's logits.
        action = Categorical(logits=action_logits).sample()
        observation, r, done, _ = env.step(action.item())
        R += r
        if done:
            break
    env.close()
    print(R)
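
# The snippets in this section omit their import statements. A minimal set they
# appear to rely on is sketched below; this is an assumption about the original
# files, which also define project-specific classes such as ActorCriticNet.
import numpy as np
import gym
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
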
def act(self, obs):
    A, b, c0, curr_sol, (A_cuts, b_cuts) = obs
    A = torch.from_numpy(A).float()
    b = torch.from_numpy(b).unsqueeze(-1).float()
    A_cuts = torch.from_numpy(A_cuts).float()
    b_cuts = torch.from_numpy(b_cuts).unsqueeze(-1).float()

    # Stack existing constraint rows [A | b] and candidate cut rows [A_cuts | b_cuts].
    Ab = torch.cat([A, b], dim=1)
    cut_ab = torch.cat([A_cuts, b_cuts], dim=1)
    all_ob = torch.cat([Ab, cut_ab], dim=0)
    if self.normalize:
        all_ob = (all_ob - all_ob.mean()) / (all_ob.max() - all_ob.min() + 1e-8)
    constraints = all_ob[:A.shape[0], :]
    cuts = all_ob[A.shape[0]:, :]

    # Embed both sets with a shared MLP and score each candidate cut by its
    # mean attention over the constraints.
    constraints_embed = self.mlp_embed(constraints)
    cuts_embed = self.mlp_embed(cuts)
    att_map = cuts_embed.matmul(constraints_embed.T)
    score = att_map.mean(dim=1)
    score -= score.max()  # numerical stability before the softmax
    probs = F.softmax(score, dim=0)
    action = Categorical(probs).sample()
    return action.item()
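
# A minimal, self-contained sketch of the scoring step in act() above: embed
# constraint rows and candidate-cut rows with a shared network, score each cut
# by its mean attention over the constraints, and sample one cut index. The
# layer and the sizes below are placeholders, not the original self.mlp_embed.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

embed = nn.Linear(6, 16)          # stand-in for self.mlp_embed; rows are [coefficients | rhs]
constraints = torch.randn(10, 6)  # 10 existing constraint rows [A | b]
cuts = torch.randn(4, 6)          # 4 candidate cut rows [A_cuts | b_cuts]

att_map = embed(cuts).matmul(embed(constraints).T)   # (4, 10) cut-vs-constraint scores
score = att_map.mean(dim=1)                          # one score per candidate cut
probs = F.softmax(score - score.max(), dim=0)
action = Categorical(probs).sample()                 # index of the cut to add
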
def get_action(self, inv_obs):
    with torch.no_grad():
        # Run the inventory observation through the MLP policy head and sample.
        device = self.pi.weight.device
        obs = torch.from_numpy(inv_obs).to(device).float()[None, ...]
        pi = F.softmax(self.pi(self.mlp(obs)), dim=-1)
        action = Categorical(probs=pi).sample()
        return action.item()
def get_action(self, pov_obs):
    with torch.no_grad():
        device = self.conv.conv.conv[0].weight.device
        obs = torch.from_numpy(pov_obs).to(device).float()[None, ...]
        obs = obs.mul_(1. / 255)  # scale raw pixel values to [0, 1]
        pi = F.softmax(self.pi(self.conv(obs)), dim=-1)
        action = Categorical(probs=pi).sample()
        return action.item()
def play_episode(self):
    episode_actions = torch.empty(size=(0,), dtype=torch.long)
    # Logits and observations are floating point; torch.cat requires matching
    # dtypes, so these buffers must not be torch.long.
    episode_logits = torch.empty(size=(0, self.env.action_space.n), dtype=torch.float)
    episode_observs = torch.empty(size=(0, *self.env.observation_space.shape), dtype=torch.float)
    episode_rewards = np.empty(shape=(0,), dtype=np.float32)

    observation = self.env.reset()
    t = 0
    done = False
    while not done:
        # Prepare observation
        cleaned_observation = torch.tensor(observation).unsqueeze(dim=0)
        episode_observs = torch.cat((episode_observs, cleaned_observation), dim=0)

        # Get action from policy net
        action_logits = self.proc_net.forward_actor(cleaned_observation)
        action = Categorical(logits=action_logits).sample()

        # Save observation and the action from the net
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)
        episode_actions = torch.cat((episode_actions, action), dim=0)

        # Get new observation and reward from action
        observation, r, done, _ = self.env.step(action.item())

        # Save reward from net_action
        episode_rewards = np.concatenate((episode_rewards, np.asarray([r])), axis=0)
        t += 1

    # Discounted returns with a mean-reward baseline
    discounted_R = self.get_discounted_rewards(episode_rewards, GAMMA)
    discounted_R -= episode_rewards.mean()

    # Log-probability of the actions actually taken
    mask = F.one_hot(episode_actions, num_classes=self.env.action_space.n)
    episode_log_probs = torch.sum(mask.float() * F.log_softmax(episode_logits, dim=1), dim=1)

    # Advantage = return - critic value estimate (no gradient through the advantage)
    values = self.proc_net.forward_critic(episode_observs)
    action_advantage = (discounted_R.float() - values).detach()
    episode_weighted_log_probs = episode_log_probs * action_advantage

    sum_weighted_log_probs = torch.sum(episode_weighted_log_probs).unsqueeze(dim=0)
    sum_action_advantages = torch.sum(action_advantage).unsqueeze(dim=0)

    return (
        sum_weighted_log_probs,
        sum_action_advantages,
        episode_logits,
        np.sum(episode_rewards),
        t,
    )
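
# play_episode() above calls self.get_discounted_rewards(), which is not shown.
# A common implementation (an assumption, not the original) computes the
# reward-to-go G_t = r_t + gamma * G_{t+1} for each step and returns a tensor:
import numpy as np
import torch

def get_discounted_rewards(rewards: np.ndarray, gamma: float) -> torch.Tensor:
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):  # accumulate returns from the last step backwards
        running = rewards[t] + gamma * running
        discounted[t] = running
    return torch.from_numpy(discounted)
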
def generate(self, inputs, hidden, generated_seq_len, id_2_word):
    # TODO ========================
    # Compute the forward pass, as in the self.forward method (above).
    # You'll probably want to copy substantial portions of that code here.
    #
    # We "seed" the generation by providing the first inputs.
    # Subsequent inputs are generated by sampling from the output distribution,
    # as described in the tex (Problem 5.3).
    # Unlike for self.forward, you WILL need to apply the softmax activation
    # function here in order to compute the parameters of the categorical
    # distributions to be sampled from at each time-step.
    """
    Arguments:
        - inputs: A mini-batch of input tokens (NOT sequences!)
                  shape: (batch_size)
        - hidden: The initial hidden states for every layer of the stacked RNN.
                  shape: (num_layers, batch_size, hidden_size)
        - generated_seq_len: The length of the sequence to generate.
                  Note that this can be different than the length used
                  for training (self.seq_len)
    Returns:
        - Sampled sequences of tokens
                  shape: (generated_seq_len, batch_size)
    """
    samples = torch.zeros([generated_seq_len, self.batch_size],
                          dtype=torch.long, device=hidden.device)
    samples[0] = inputs
    outp = self.embedding(inputs)  # shape: (self.batch_size, self.emb_size)
    for i in range(1, generated_seq_len):
        # One step through the stacked RNN layers.
        for j in range(self.num_layers):
            inp = self.inp_dp(outp) if j == 0 else outp
            hid = hidden[j]
            outp, hidden[j] = self.model[j](inp=inp.clone(), hidden=hid.clone())
        outp = self.Wy(outp)
        dist = F.softmax(outp, dim=1)  # shape: (self.batch_size, self.vocab_size)
        for k in range(dist.size(0)):
            # Resample until the drawn token is not '<eos>'.
            while True:
                s = Categorical(dist[k]).sample()
                if id_2_word[s.item()] != '<eos>':
                    break
            samples[i, k] = s
        outp = self.embedding(samples[i])
    return samples
def episode(self, train=True, render=False, z=None, return_states=False):
    """Run one episode.

    Parameters
    ----------
    train : bool
        If True, perform update on underlying parameters and store reward
        into self.rewards.
    render : bool
        If True, display the episode with env.render and return total reward.
    z : torch.Tensor
        Skill value. If None, a random skill is sampled from self.prior.
    return_states : bool
        If True, return the list of states of the episode.
    """
    s = self.env.reset()
    if z is None:
        z = self.prior.sample()
    p_z = self.prior.log_prob(z)
    done, step, total_reward = False, 0, 0
    if return_states:
        states = [s]
    while not done:
        pi = self.actor(s, z)  # log P(a | s, z)
        a = Categorical(torch.exp(pi)).sample()  # Sample action
        new_s, _, done, _ = self.env.step(a.item())
        q = self.discriminator(s)  # log P(z | s)
        # Intrinsic reward: log q(z | s) - alpha * log pi(a | s, z) - log p(z)
        reward = q[:, z.argmax(dim=1)] - self.alpha * pi[:, a] - p_z
        if train:  # Perform update
            self._update_models(pi, a, q, reward, s, z, new_s, done)
        total_reward += reward.item()
        if render:  # Render the environment
            self.env.render()
        step += 1
        s = new_s
        if return_states:
            states.append(s)
    if train:  # Store episode score
        self.n_episode += 1
        self.rewards.append(total_reward / step)
    if render:  # Return episode score
        return total_reward
    if return_states:
        return states
def episode(self, train=True, render=False, return_states=False):
    """Run one episode.

    Parameters
    ----------
    train : bool
        If True, perform update on underlying parameters and store reward
        into self.rewards.
    render : bool
        If True, display the episode with env.render and return total reward.
    return_states : bool
        If True, return the list of states of the episode.
    """
    s = self.env.reset()
    done, step, total_reward = False, 0, 0
    if return_states:
        states = [s]
    while not done:
        pi = self.actor(s)  # log P(a | s)
        a = Categorical(torch.exp(pi)).sample()  # Sample action
        new_s, reward, done, _ = self.env.step(a.item())
        # Reward shaping: add |new_s[0] + 0.5| to the environment reward.
        reward = torch.Tensor([[reward + np.abs(new_s[0] + 0.5)]])
        if train:  # Perform update
            self._update_models(pi, a, reward, s, new_s, done)
        total_reward += reward.item()
        if render:  # Render the environment
            self.env.render()
        step += 1
        s = new_s
        if return_states:
            states.append(s)
    if train:  # Store episode score
        self.n_episode += 1
        self.rewards.append(total_reward)
    if render:  # Return episode score
        return total_reward
    if return_states:
        return states
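
# All of the snippets above draw discrete actions with torch.distributions.Categorical,
# some from raw logits and some from an explicit probability vector. A standalone
# illustration of the two equivalent construction styles (values are arbitrary):
import torch
from torch.distributions import Categorical

logits = torch.tensor([1.2, -0.3, 0.5])
a_from_logits = Categorical(logits=logits).sample()                      # softmax applied internally
a_from_probs = Categorical(probs=torch.softmax(logits, dim=0)).sample()  # same distribution
print(a_from_logits.item(), a_from_probs.item())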