Example #1
import gym
import torch
from torch.distributions import Categorical


def test_model(model_file: str):
    # ActorCriticNet is the project's actor-critic model (4 observation
    # dimensions, 2 actions for CartPole); a sketch follows this example.
    net = ActorCriticNet(4, 2)
    net.load_state_dict(torch.load(model_file))
    net.eval()

    env = gym.make("CartPole-v1")
    env = gym.wrappers.Monitor(env,
                               f"./cart",
                               video_callable=lambda episode_id: True,
                               force=True)

    observation = env.reset()

    R = 0
    while True:
        env.render()
        # Add a batch dimension and sample an action from the policy logits.
        cleaned_observation = torch.tensor(observation,
                                           dtype=torch.float32).unsqueeze(dim=0)
        action_logits = net.forward_actor(cleaned_observation)
        action = Categorical(logits=action_logits).sample()
        observation, r, done, _ = env.step(action.item())
        R += r
        if done:
            break

    env.close()

    print(R)
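Example #1 (and the proc_net used later in Example #5) assumes an ActorCriticNet class with forward_actor and forward_critic methods that is not shown here. A minimal sketch of a model with that interface; the layer sizes and attribute names below are assumptions, not the original project's definition:

import torch
import torch.nn as nn


class ActorCriticNet(nn.Module):
    """Hypothetical stand-in with the interface the examples call."""

    def __init__(self, obs_dim: int, n_actions: int, hidden: int = 64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.actor_head = nn.Linear(hidden, n_actions)  # action logits
        self.critic_head = nn.Linear(hidden, 1)         # state value

    def forward_actor(self, obs: torch.Tensor) -> torch.Tensor:
        return self.actor_head(self.body(obs))

    def forward_critic(self, obs: torch.Tensor) -> torch.Tensor:
        return self.critic_head(self.body(obs))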
Example #2
    def act(self, obs):
        A, b, c0, curr_sol, (A_cuts, b_cuts) = obs
        A = torch.from_numpy(A).float()
        b = torch.from_numpy(b).unsqueeze(-1).float()
        A_cuts = torch.from_numpy(A_cuts).float()
        b_cuts = torch.from_numpy(b_cuts).unsqueeze(-1).float()

        Ab = torch.cat([A, b], dim=1)
        cut_ab = torch.cat([A_cuts, b_cuts], dim=1)
        all_ob = torch.cat([Ab, cut_ab], dim=0)

        if self.normalize:
            all_ob = (all_ob - all_ob.mean()) / (all_ob.max() - all_ob.min() +
                                                 1e-8)

        constraints = all_ob[:A.shape[0], :]
        cuts = all_ob[A.shape[0]:, :]
        constraints_embed = self.mlp_embed(constraints)
        cuts_embed = self.mlp_embed(cuts)

        # Score each candidate cut by its mean similarity to the constraint
        # embeddings, then sample a cut index from the resulting distribution.
        att_map = cuts_embed.matmul(constraints_embed.T)
        score = att_map.mean(dim=1)
        score -= score.max()  # shift scores for numerical stability
        probs = F.softmax(score, dim=0)
        action = Categorical(probs).sample()
        return action.item()
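Side note on Example #2: the max-subtraction is the usual stabilising shift before a softmax, and Categorical applies the same kind of normalisation internally when it is constructed from logits, so the scoring tail could equivalently be written as in this small standalone sketch (score here is a made-up stand-in for the per-cut scores):

import torch
from torch.distributions import Categorical

score = torch.randn(5)  # e.g. one raw score per candidate cut
# Equivalent to the explicit max-subtraction + softmax above: Categorical
# normalises the raw scores internally when they are passed as logits.
action = Categorical(logits=score).sample()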
Example #3
    def get_action(self, inv_obs):
        with torch.no_grad():
            device = self.pi.weight.device
            # Add a batch dimension and move the observation to the model's device.
            obs = torch.from_numpy(inv_obs).to(device).float()[None, ...]
            pi = F.softmax(self.pi(self.mlp(obs)), dim=-1)
            action = Categorical(probs=pi).sample()
        return action.item()
Example #4
    def get_action(self, pov_obs):
        with torch.no_grad():
            device = self.conv.conv.conv[0].weight.device
            obs = torch.from_numpy(pov_obs).to(device).float()[None, ...]
            obs = obs.mul_(1. / 255)  # scale uint8 pixels to [0, 1]
            pi = F.softmax(self.pi(self.conv(obs)), dim=-1)
            action = Categorical(probs=pi).sample()
        return action.item()
Example #5
    def play_episode(self):
        episode_actions = torch.empty(size=(0, ), dtype=torch.long)
        # Logits and observations are floating point, so use float buffers here.
        episode_logits = torch.empty(size=(0, self.env.action_space.n),
                                     dtype=torch.float32)
        episode_observs = torch.empty(size=(0,
                                            *self.env.observation_space.shape),
                                      dtype=torch.float32)
        episode_rewards = np.empty(shape=(0, ), dtype=np.float64)

        observation = self.env.reset()

        t = 0
        done = False
        while not done:
            # Prepare observation
            cleaned_observation = torch.tensor(observation,
                                               dtype=torch.float32).unsqueeze(dim=0)
            episode_observs = torch.cat((episode_observs, cleaned_observation),
                                        dim=0)

            # Get action from policy net
            action_logits = self.proc_net.forward_actor(cleaned_observation)
            action = Categorical(logits=action_logits).sample()

            # Save observation and the action from the net
            episode_logits = torch.cat((episode_logits, action_logits), dim=0)
            episode_actions = torch.cat((episode_actions, action), dim=0)

            # Get new observation and reward from action
            observation, r, done, _ = self.env.step(action.item())

            # Save reward from net_action
            episode_rewards = np.concatenate(
                (episode_rewards, np.asarray([r])), axis=0)

            t += 1

        discounted_R = self.get_discounted_rewards(episode_rewards, GAMMA)
        discounted_R -= episode_rewards.mean()

        mask = F.one_hot(episode_actions, num_classes=self.env.action_space.n)
        episode_log_probs = torch.sum(mask.float() *
                                      F.log_softmax(episode_logits, dim=1),
                                      dim=1)

        # Flatten the critic output to shape (T,) so the advantage is elementwise.
        values = self.proc_net.forward_critic(episode_observs).reshape(-1)
        action_advantage = (discounted_R.float() - values).detach()
        episode_weighted_log_probs = episode_log_probs * action_advantage
        sum_weighted_log_probs = torch.sum(
            episode_weighted_log_probs).unsqueeze(dim=0)
        sum_action_advantages = torch.sum(action_advantage).unsqueeze(dim=0)

        return (
            sum_weighted_log_probs,
            sum_action_advantages,
            episode_logits,
            np.sum(episode_rewards),
            t,
        )
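Example #5 calls a self.get_discounted_rewards helper that is not shown. A minimal sketch of the discounted return-to-go computation such a helper typically performs; only the name and signature come from the call above, the body is an assumption:

import numpy as np
import torch


def get_discounted_rewards(rewards: np.ndarray, gamma: float) -> torch.Tensor:
    """Compute returns-to-go G_t = r_t + gamma * G_{t+1}, back to front."""
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return torch.from_numpy(returns)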
Example #6
    def generate(self, inputs, hidden, generated_seq_len, id_2_word):
        # TODO ========================
        # Compute the forward pass, as in the self.forward method (above).
        # You'll probably want to copy substantial portions of that code here.
        #
        # We "seed" the generation by providing the first inputs
        # Subsequent inputs are generated by sampling from the output distribution,
        # as described in the tex (Problem 5.3)
        # Unlike for self.forward, you WILL need to apply the softmax activation
        # function here in order to compute the parameters of the categorical
        # distributions to be sampled from at each time-step.
        """
    Arguments:
        - input: A mini-batch of input tokens (NOT sequences!)
                        shape: (batch_size)
        - hidden: The initial hidden states for every layer of the stacked RNN.
                        shape: (num_layers, batch_size, hidden_size)
        - generated_seq_len: The length of the sequence to generate.
                       Note that this can be different than the length used
                       for training (self.seq_len)
    Returns:
        - Sampled sequences of tokens
                    shape: (generated_seq_len, batch_size)
    """
        samples = torch.zeros([generated_seq_len, self.batch_size],
                              dtype=torch.long,
                              device=hidden.device)
        samples[0] = inputs

        outp = self.embedding(
            inputs)  # shape: (self.batch_size, self.emb_size)
        for i in range(1, generated_seq_len):
            for j in range(self.num_layers):
                inp = self.inp_dp(outp) if j == 0 else outp
                hid = hidden[j]
                outp, hidden[j] = self.model[j](inp=inp.clone(),
                                                hidden=hid.clone())

            outp = self.Wy(outp)
            dist = F.softmax(outp,
                             dim=1)  # shape (self.batch_size, self.vocab_size)
            for k in range(dist.size(0)):
                while True:
                    s = Categorical(dist[k]).sample()
                    if id_2_word[s.item()] != '<eos>':
                        break
                samples[i, k] = s
            outp = self.embedding(samples[i])

        return samples
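A note on the sampling loop in Example #6: Categorical accepts batched probabilities, so a (batch_size, vocab_size) tensor yields one sample per row in a single call, and the explicit loop over k is only needed because of the '<eos>' rejection step. For reference, a self-contained sketch with made-up sizes:

import torch
from torch.distributions import Categorical

dist = torch.softmax(torch.randn(8, 100), dim=1)  # (batch_size, vocab_size)
tokens = Categorical(probs=dist).sample()         # one token per row, shape (8,)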
Example #7
    def episode(self, train=True, render=False, z=None, return_states=False):
        """Run one episode.

        Parameters
        ----------
        train : bool
            If True, perform update on underlying parameters and store reward
            into self.rewards.
        render : bool
            If True, display the episode with env.render and return total
            reward.
        z : torch.Tensor
            Skill value. If None, a random skill is sampled from self.prior.
        return_states : bool
            If True, return the list of states of the episode.
        """
        s = self.env.reset()
        if z is None:
            z = self.prior.sample()
        p_z = self.prior.log_prob(z)
        done, step, total_reward = False, 0, 0
        if return_states:
            states = [s]
        while not done:
            pi = self.actor(s, z)  # log P(a | s, z)
            a = Categorical(torch.exp(pi)).sample()  # Sample action
            new_s, _, done, _ = self.env.step(a.item())
            q = self.discriminator(s)  # log P(z | s)
            reward = q[:, z.argmax(dim=1)] - self.alpha * pi[:, a] - p_z
            if train:  # Perform update
                self._update_models(pi, a, q, reward, s, z, new_s, done)
            total_reward += reward.item()
            if render:  # Render the environment
                self.env.render()
            step += 1
            s = new_s
            if return_states:
                states.append(s)
        if train:  # Store episode score
            self.n_episode += 1
            self.rewards.append(total_reward / step)
        if render:  # Return episode score
            return total_reward
        if return_states:
            return states
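Example #7 leaves self.prior, self.actor, and self.discriminator to the surrounding class. Given that z is scored with prior.log_prob(z) and indexed with z.argmax(dim=1), a uniform one-hot skill prior fits the usage; a minimal sketch under that assumption (n_skills and the batch size of 1 are made up for illustration):

import torch
from torch.distributions import OneHotCategorical

n_skills = 10  # assumed number of skills
prior = OneHotCategorical(probs=torch.full((1, n_skills), 1.0 / n_skills))

z = prior.sample()       # one-hot skill vector, shape (1, n_skills)
p_z = prior.log_prob(z)  # log-probability of the sampled skill, shape (1,)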
Example #8
    def episode(self, train=True, render=False, return_states=False):
        """
        Run one episode.

        Parameters
        ----------
        train : bool
            If True, perform update on underlying parameters and store reward
            into self.rewards.
        render : bool
            If True, display the episode with env.render and return total
            reward.
        return_states : bool
            If True, return the list of states of the episode.
        """
        s = self.env.reset()
        done, step, total_reward = False, 0, 0
        if return_states:
            states = [s]
        while not done:
            pi = self.actor(s)  # log P(a | s)
            a = Categorical(torch.exp(pi)).sample()  # Sample action
            new_s, reward, done, _ = self.env.step(a.item())
            # Reward shaping: add a bonus based on how far the first state
            # component has moved away from -0.5.
            reward = torch.Tensor([[reward + (np.abs(new_s[0] + 0.5))]])
            if train:  # Perform update
                self._update_models(pi, a, reward, s, new_s, done)
            total_reward += reward.item()
            if render:  # Render the environment
                self.env.render()
            step += 1
            s = new_s
            if return_states:
                states.append(s)
        if train:  # Store episode score
            self.n_episode += 1
            self.rewards.append(total_reward)
        if render:  # Return episode score
            return total_reward
        if return_states:
            return states