Example #1
    def gpcf(self, z_goal, h):
        # Compares the predicted next state for each action to the goal
        # and chooses the action that reduces the loss the most.

        output = []
        for action in range(len(h)):
            if self.hiddengoals:
                if self.Forward_model == 'D':
                    output.append(self.LSEloss(h[action], z_goal))
                else:
                    #compares the goal with the next state using the NLL loss of the latent representation of the next hidden state
                    #h[action][4] = next_hidden_mu, h[action][5] =  next_hidden_sigma, logpi = -1
                    output.append(
                        gmm_loss(
                            z_goal, h[action][4], h[action][5],
                            torch.tensor([-1.0], dtype=torch.float32).to(
                                self.device)) / 33)
            else:
                if self.Forward_model == 'D':
                    output.append(criterion(h[action], z_goal).item())
                else:
                    #compares the goal with the next state using the NLL loss of the latent representation of the next hidden state
                    #h[action][0] = mu, h[action][1] = sigma, logpi = logpi
                    output.append(
                        gmm_loss(z_goal, h[action][0], h[action][1],
                                 h[action][2]) / 33)

        return output.index(min(output))
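The gmm_loss helper that all of these examples call is not shown on this page. Below is a minimal sketch of a compatible implementation, assuming the call convention seen above: targets of shape (*, feature_size), mixture means and standard deviations of shape (*, n_gaussians, feature_size), and log mixture weights of shape (*, n_gaussians); it returns the mean negative log-likelihood of the targets under the diagonal-Gaussian mixture.

import torch
from torch.distributions.normal import Normal

def gmm_loss(batch, mus, sigmas, logpi, reduce=True):
    """Negative log-likelihood of `batch` under a diagonal-Gaussian mixture."""
    batch = batch.unsqueeze(-2)                      # broadcast against the component axis
    log_probs = Normal(mus, sigmas).log_prob(batch)  # per-dimension log densities
    log_probs = logpi + log_probs.sum(dim=-1)        # joint log density of each component
    log_prob = torch.logsumexp(log_probs, dim=-1)    # stable log of the mixture density
    return -log_prob.mean() if reduce else -log_prob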
Example #2
    def get_loss(self, latent_obs, action, reward, terminal, latent_next_obs,
                 include_reward: bool):
        """ Compute losses.
        
            The loss that is computed is:
            (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
                 BCE(terminal, logit_terminal)) / (LSIZE + 2)
            The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
            approximately linearly with LSIZE. All losses are averaged both on the
            batch and the sequence dimensions (the two first dimensions).
        
            :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
            :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
            :args reward: (BSIZE, SEQ_LEN) torch tensor
            :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
        
            :returns: dictionary of losses, containing the gmm, the mse, the bce and
                the averaged loss.
            """

        mus, sigmas, logpi, rs, ds = self.mdrnnBIG(action, latent_obs)
        gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
        bce = F.binary_cross_entropy_with_logits(ds, terminal)
        if include_reward:
            mse = F.mse_loss(rs, reward)
            scale = LSIZE + 2
        else:
            mse = 0
            scale = LSIZE + 1
        loss = (gmm + bce + mse) / scale
        return dict(gmm=gmm, bce=bce, mse=mse, loss=loss)
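The docstring's claim that the GMM loss grows roughly linearly with LSIZE (while the reward MSE and terminal BCE stay O(1)) is what motivates dividing by LSIZE + 2. A small self-contained illustration, using a standard-normal model whose expected negative log-likelihood per sample is D/2 * (1 + log(2*pi)):

import torch
from torch.distributions.normal import Normal

for lsize in (8, 32, 64):
    x = torch.randn(100000, lsize)
    # Summed NLL over the latent dimensions, averaged over samples.
    nll = -Normal(0.0, 1.0).log_prob(x).sum(dim=-1).mean()
    print(f"LSIZE={lsize:3d}  NLL={nll.item():8.2f}  NLL/(LSIZE+2)={nll.item() / (lsize + 2):.3f}")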
Example #3
def get_loss(latent_obs, action, reward, terminal, latent_next_obs):
    """ Compute losses.

    The loss that is computed is:
    (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
         BCE(terminal, logit_terminal)) / (LSIZE + 2)
    The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
    approximately linearly with LSIZE. All losses are averaged both on the
    batch and the sequence dimensions (the two first dimensions).

    :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
    :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
    :args reward: (BSIZE, SEQ_LEN) torch tensor
    :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor

    :returns: dictionary of losses, containing the gmm, the mse, the bce and
        the averaged loss.
    """
    latent_obs, action,\
        reward, terminal,\
        latent_next_obs = [arr.transpose(1, 0)
                           for arr in [latent_obs, action,
                                       reward, terminal,
                                       latent_next_obs]]
    mus, sigmas, logpi, rs, ds = mdrnn(action, latent_obs)
    gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
    bce = f.binary_cross_entropy_with_logits(ds, terminal)
    mse = f.mse_loss(rs, reward)
    loss = (gmm + bce + mse) / (LSIZE + 2)
    return dict(gmm=gmm, bce=bce, mse=mse, loss=loss)
Example #4
def get_loss(latent_obs, action, reward, terminal, latent_next_obs):
    """ Compute losses.

    The loss that is computed is:
    (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
         BCE(terminal, logit_terminal)) / (LSIZE + 2)
    The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
    approximately linearly with LSIZE. All losses are averaged both on the
    batch and the sequence dimensions (the two first dimensions).

    :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
    :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
    :args reward: (BSIZE, SEQ_LEN) torch tensor
    :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor

    :returns: dictionary of losses, containing the gmm, the mse, the bce and
        the averaged loss.
    """
    # transpose such that seq_len is the first dimension
    latent_obs, action,\
        reward, terminal,\
        latent_next_obs = [arr.transpose(1, 0)
                           for arr in [latent_obs, action,
                                       reward, terminal,
                                       latent_next_obs]]
    mus, sigmas, logpi, rs, ds = mdrnn(action, latent_obs)
    gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
    bce = f.binary_cross_entropy_with_logits(ds, terminal)
    mse = f.mse_loss(rs, reward)
    loss = (gmm + bce + mse) / (LSIZE + 2)
    return dict(gmm=gmm, bce=bce, mse=mse, loss=loss)
Example #5
    def mdrnn_exp_reward(self, latent_obs, action, reward, latent_next_obs,
                         hidden):
        """  # REMOVE TERMINAL

        Compute losses.

        The loss that is computed is:
        (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
             BCE(terminal, logit_terminal)) / (LSIZE + 2)
        The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
        approximately linearly with LSIZE. All losses are averaged both on the
        batch and the sequence dimensions (the two first dimensions).

        :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
        :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
        :args reward: (BSIZE, SEQ_LEN) torch tensor
        :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor

        :returns: dictionary of losses, containing the gmm, the mse, the bce and
            the averaged loss.
        """

        mus, sigmas, logpi, rs, ds, next_hidden = self.mdrnn(
            action, latent_obs, hidden)
        gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
        # bce = f.binary_cross_entropy_with_logits(ds, terminal)
        mse = f.mse_loss(rs, reward)
        loss = (gmm + mse) / (LSIZE + 2)
        return loss.squeeze().detach().cpu().numpy()
Example #6
    def test_gmm_loss_my(self):
        # seq_len x batch_size x gaussian_size x feature_size
        # 1 x 1 x 2 x 2
        mus = torch.Tensor([[
            [[0., 0.], [6., 6.]],
        ]])
        sigmas = torch.Tensor([[
            [[2., 2.], [2., 2.]],
        ]])
        # seq_len x batch_size x gaussian_size
        pi = torch.Tensor([[[.5, .5]]])
        logpi = torch.log(pi)

        # seq_len x batch_size x feature_size
        batch = torch.Tensor([[[3., 3.]]])
        gl = gmm_loss(batch, mus, sigmas, logpi)

        # first component, first dimension
        n11 = Normal(mus[0, 0, 0, 0], sigmas[0, 0, 0, 0])
        # first component, second dimension
        n12 = Normal(mus[0, 0, 0, 1], sigmas[0, 0, 0, 1])
        p1 = pi[0, 0, 0] * torch.exp(n11.log_prob(batch[0, 0, 0])) * torch.exp(
            n12.log_prob(batch[0, 0, 1]))
        # second component, first dimension
        n21 = Normal(mus[0, 0, 1, 0], sigmas[0, 0, 1, 0])
        # second component, second dimension
        n22 = Normal(mus[0, 0, 1, 1], sigmas[0, 0, 1, 1])
        p2 = pi[0, 0, 1] * torch.exp(n21.log_prob(batch[0, 0, 0])) * torch.exp(
            n22.log_prob(batch[0, 0, 1]))

        print("gmm loss={}, p1={}, p2={}, p1+p2={}, -log(p1+p2)={}".format(
            gl, p1, p2, p1 + p2, -torch.log(p1 + p2)))
        assert torch.allclose(-torch.log(p1 + p2), gl)
        print()
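The hand check above can also be written compactly with torch.logsumexp, which is the numerically stable form the mixture negative log-likelihood reduces to; a small self-contained sketch reproducing the same numbers (torch.allclose is a safer comparison than exact float equality):

import torch
from torch.distributions.normal import Normal

mus = torch.tensor([[0., 0.], [6., 6.]])       # two components, two dimensions
sigmas = torch.tensor([[2., 2.], [2., 2.]])
logpi = torch.log(torch.tensor([.5, .5]))
x = torch.tensor([3., 3.])

# log p(x) = logsumexp_k( log pi_k + sum_d log N(x_d | mu_kd, sigma_kd) )
component_log_probs = logpi + Normal(mus, sigmas).log_prob(x).sum(dim=-1)
nll = -torch.logsumexp(component_log_probs, dim=-1)
print(nll)   # same value as -log(p1 + p2) computed above, up to float precision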
Example #7
    def test_gmm_loss(self):
        """ Test case 1 """
        n_samples = 10000

        means = torch.Tensor([[0., 0.],
                              [1., 1.],
                              [-1., 1.]])
        stds = torch.Tensor([[.03, .05],
                             [.02, .1],
                             [.1, .03]])
        pi = torch.Tensor([.2, .3, .5])

        cat_dist = Categorical(pi)
        indices = cat_dist.sample((n_samples,)).long()
        rands = torch.randn(n_samples, 2)

        samples = means[indices] + rands * stds[indices]

        class _model(nn.Module):
            def __init__(self, gaussians):
                super().__init__()
                self.means = nn.Parameter(torch.Tensor(1, gaussians, 2).normal_())
                self.pre_stds = nn.Parameter(torch.Tensor(1, gaussians, 2).normal_())
                self.pi = nn.Parameter(torch.Tensor(1, gaussians).normal_())

            def forward(self, *inputs):
                return self.means, torch.exp(self.pre_stds), f.softmax(self.pi, dim=1)

        model = _model(3)
        optimizer = torch.optim.Adam(model.parameters())

        iterations = 100000
        log_step = iterations // 10
        pbar = tqdm(total=iterations)
        cum_loss = 0
        for i in range(iterations):
            batch = samples[torch.LongTensor(128).random_(0, n_samples)]
            m, s, p = model.forward()
            loss = gmm_loss(batch, m, s, p)
            cum_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pbar.set_postfix_str("avg_loss={:10.6f}".format(
                cum_loss / (i + 1)))
            pbar.update(1)
            if i % log_step == log_step - 1:
                print(m)
                print(s)
                print(p)
Example #8
    def test_gmm_loss(self):
        """ Test case 1 """
        n_samples = 10000

        means = torch.Tensor([[0., 0.], [1., 1.], [-1., 1.]])
        stds = torch.Tensor([[.03, .05], [.02, .1], [.1, .03]])
        pi = torch.Tensor([.2, .3, .5])

        cat_dist = Categorical(pi)
        indices = cat_dist.sample((n_samples, )).long()
        rands = torch.randn(n_samples, 2)

        samples = means[indices] + rands * stds[indices]

        class _model(nn.Module):
            def __init__(self, gaussians):
                super().__init__()
                self.means = nn.Parameter(
                    torch.Tensor(1, gaussians, 2).normal_())
                self.pre_stds = nn.Parameter(
                    torch.Tensor(1, gaussians, 2).normal_())
                self.pi = nn.Parameter(torch.Tensor(1, gaussians).normal_())

            def forward(self, *inputs):
                return self.means, torch.exp(self.pre_stds), f.softmax(self.pi,
                                                                       dim=1)

        model = _model(3)
        optimizer = torch.optim.Adam(model.parameters())

        iterations = 100000
        log_step = iterations // 10
        pbar = tqdm(total=iterations)
        cum_loss = 0
        for i in range(iterations):
            batch = samples[torch.LongTensor(128).random_(0, n_samples)]
            m, s, p = model.forward()
            loss = gmm_loss(batch, m, s, p)
            cum_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pbar.set_postfix_str("avg_loss={:10.6f}".format(cum_loss /
                                                            (i + 1)))
            pbar.update(1)
            if i % log_step == log_step - 1:
                print(m)
                print(s)
                print(p)
Example #9
def mdn_rnn_loss_function(latent_obs, action, reward, terminal,
                          latent_next_obs, mdn_rnn_prediction):
    latent_obs, action,\
        reward, terminal,\
        latent_next_obs = [arr.transpose(1, 0)
                           for arr in [latent_obs, action,
                                       reward, terminal,
                                       latent_next_obs]]
    mus, sigmas, logpi, rs, ds = mdn_rnn_prediction
    gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
    bce = F.binary_cross_entropy_with_logits(ds, terminal)
    mse = F.mse_loss(rs, reward)
    scale = LSIZE + 2
    loss = (gmm + bce + mse) / scale
    return dict(gmm=gmm, bce=bce, mse=mse, loss=loss)
Example #10
def mdn_rnn_loss_function(latent_obs, action, reward, terminal,
                          latent_next_obs, mdn_rnn_prediction):
    """ Compute losses.

    The loss that is computed is:
    (GMMLoss(latent_next_obs, GMMPredicted) +
         BCE(terminal, logit_terminal)) / (LSIZE + 1)
    The LSIZE + 1 factor is here to counteract the fact that the GMMLoss scales
    approximately linearly with LSIZE. Each per-episode loss is computed over the
    valid (unpadded) time steps only and then averaged over the batch.

    :args latent_obs: PackedSequence of (LSIZE,) latents
    :args action: PackedSequence of (ASIZE,) actions
    :args reward: PackedSequence of rewards
    :args latent_next_obs: PackedSequence of next latents

    :returns: dictionary of losses, containing the gmm, the bce and
        the averaged loss.
    """
    (latent_obs, lengths), (action, _),\
        (reward, _), (terminal, _),\
        (latent_next_obs, _) = [
        pad_packed_sequence(a)
        for a in [latent_obs, action,
                    reward, terminal,
                    latent_next_obs]]
    (mus, _), (sigmas, _), (logpi, _), (rs, _), (ds, _) = [
        pad_packed_sequence(o) for o in mdn_rnn_prediction
    ]
    gmm_losses = []
    bce_losses = []
    for b, length in enumerate(lengths):
        gmm_losses.append(
            gmm_loss(latent_next_obs[:length, b], mus[:length, b],
                     sigmas[:length, b], logpi[:length, b]))
        bce_losses.append(
            F.binary_cross_entropy_with_logits(ds[:length, b],
                                               terminal[:length, b]))
    gmm = torch.mean(torch.stack(gmm_losses))
    bce = torch.mean(torch.stack(bce_losses))
    loss = (gmm + bce) / (LSIZE + 1)
    return dict(gmm=gmm, bce=bce, loss=loss)
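For context, a hedged sketch (with illustrative tensor names and sizes) of how the packed inputs consumed by this version can be produced from padded, variable-length episodes: pack_padded_sequence drops the padding, and pad_packed_sequence inside the loss recovers the (seq_len, batch, ...) tensors together with the per-episode lengths used for masking.

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

LSIZE = 32
lengths = torch.tensor([5, 3, 2])                    # per-episode lengths (illustrative)
batch_size, max_len = len(lengths), int(lengths.max())

# Padded, batch-first latents: (batch, max_len, LSIZE)
padded_latents = torch.randn(batch_size, max_len, LSIZE)

# Pack before handing the batch to the loss; enforce_sorted=False allows
# episodes in arbitrary length order.
packed_latents = pack_padded_sequence(padded_latents, lengths,
                                      batch_first=True, enforce_sorted=False)

# Inside the loss, unpacking yields (max_len, batch, LSIZE) plus the lengths.
unpacked, recovered_lengths = pad_packed_sequence(packed_latents)
print(unpacked.shape, recovered_lengths)             # torch.Size([5, 3, 32]) tensor([5, 3, 2])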
Example #11
def get_loss(input, output, train):
    """ Compute losses.

    The loss that is computed is:
    (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) +
         BCE(terminal, logit_terminal)) / (LSIZE + 2)
    The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales
    approximately linearily with LSIZE. All losses are averaged both on the
    batch and the sequence dimensions (the two first dimensions).

    :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
    :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
    :args reward: (BSIZE, SEQ_LEN) torch tensor
    :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor

    :returns: dictionary of losses, containing the gmm, the mse, the bce and
        the averaged loss.
    """
    mu, sigma, pi = mdrnn(input, train)
    gmm = gmm_loss(mu, sigma, pi, output)

    return gmm
Example #12
    def rollout(self, params, render=False):
        """ Executes rollouts for number of goals

        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.fmodel)
        optimizer = optim.Adam(params=self.fmodel.parameters(), lr=0.0001)

        MDRNNoptimizer = torch.optim.RMSprop(self.mdrnnBIG.parameters(),
                                             lr=1e-3,
                                             alpha=.9)
        #MDRNNoptimizer.load_state_dict(self.rnn_state["optimizer"])

        VAEOptimizer = optim.Adam(self.vae.parameters())
        VAEOptimizer.load_state_dict(self.vae_state["optimizer"])

        HiddenVAEOptimizer = optim.Adam(self.HiddenVAE.parameters())
        HiddenVAEOptimizer.load_state_dict(self.hiddenvae_state["optimizer"])

        zstate_list = []
        self.env.seed(1337)
        #self.env.reset()
        obs = self.env.reset()

        obs = obs['image']
        expl_rate = 0.4

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        _, latent_mu, logsigma, z = self.tolatent(obs)

        i = 0

        # Bootstrapping: collect initial states (time_limit steps) for the goal space
        while True:

            action = random.randrange(6)

            _, hidden, z, zh = self.transform_obs_hidden(obs, hidden, action)
            _, _, hidden_latent = self.tohiddenlatent(hidden)
            obs, exreward, done, _ = self.env.step(action)
            obs = obs['image']

            if self.hiddengoals:
                zstate_list.append(
                    np.array(hidden_latent.cpu().detach().numpy())
                )  #if we use pure hidden
            else:
                zstate_list.append(np.array(
                    z.cpu().detach().numpy()))  #if we use latent_space

            i += 1
            if render:
                self.env.render()

            if i > self.time_limit:
                break

        s = obs
        loss_list = []
        WM_loss = []
        VAE_loss_per_rollout = []
        hiddenvae_loss_per_rollout = []
        rollout_reward = []
        visitationcount = []
        exreward_per_rollout = []
        visitationarray = np.zeros((25, 25))
        final_loss = []

        #Goal Exploration
        for c in range(self.number_goals):
            # Reset env: uncomment below if necessary to reset the agent in the environment after every episode
            '''
            self.env.seed(1337)
            self.env.reset()
            s = self.env.reset()
            
            #reset obs and hidden state
            s = s['image']
            _,_,_,z = self.tolatent(s)
            hidden = [
            torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]
            '''

            print('Goal Number', c)
            zstate_list = np.array(zstate_list)
            zstate_list = zstate_list.squeeze(1)
            kde = scipy.stats.gaussian_kde(zstate_list.T)

            z_goal = sampling_method(
                kde)  #sample goal from goal space using KDE
            z_goal = torch.tensor([z_goal], dtype=torch.float32).to(
                self.device)  #controller requires both as tensors

            if not self.hiddengoals:
                z_goal_obs = self.vae.decoder(z_goal)
                z_goal_obs = z_goal_obs.reshape(7, 7, 3)
                z_goal_obs = np.array(z_goal_obs.cpu().detach())

                plt9 = plt.figure('Zgoal')
                plt.cla()
                sn.heatmap(z_goal_obs[:, :, 0],
                           cmap='Reds',
                           annot=True,
                           cbar=False).invert_yaxis()

            total_hiddenvae_loss = 0
            total_vae_loss = 0
            total_reward = 0
            total_exreward = 0
            total_loss = 0
            goal_loss = []

            scur_rollout = []
            snext_rollout = []
            r_rollout = []
            d_rollout = []
            act_rollout = []

            zstate_list = zstate_list[:, np.newaxis, :]
            zstate_list = zstate_list.tolist()

            for goalattempts in range(100):
                if visitationarray[self.env.agent_pos[0],
                                   self.env.agent_pos[1]] == 0:
                    visitationarray[self.env.agent_pos[0],
                                    self.env.agent_pos[1]] += 1
                h = []
                for a in range(6):
                    if self.Forward_model == 'D':
                        h.append(
                            self.fmodel(
                                z.detach(), hidden[0].detach(),
                                torch.tensor([[a]], dtype=torch.float32).to(
                                    self.device)))

                    else:
                        # Predict the next state for every action and add it to a list so it can be compared with the goal
                        z, hmus, hsigmas, hlogpi, zh, next_hidden, next_hidden_latent, next_hidden_mu, next_hidden_sigma = self.predict_next(
                            s, hidden, a)
                        h.append([
                            hmus, hsigmas, hlogpi, next_hidden_latent,
                            next_hidden_mu, next_hidden_sigma
                        ])

                if expl_rate > random.random():
                    m = random.randrange(6)
                else:
                    #choose action which will bring us closer to goal
                    m = self.gpcf(z_goal, h)

                z, hmus, hsigmas, hlogpi, zh, hidden, hidden_latent, hidden_mu, hidden_sigma = self.predict_next(
                    s, hidden, m
                )  # get the mean, standard deviation and pi of the predicted next latent obs, plus the next latent and hidden state

                if not self.hiddengoals:
                    if self.Forward_model == 'D':
                        predicted_next_obs = self.vae.decoder(h[m])
                        predicted_next_obs = predicted_next_obs.reshape(
                            7, 7, 3)
                        p = np.array(predicted_next_obs.cpu().detach())
                    else:
                        predicted_next_obs = self.vae.decoder(zh)
                        predicted_next_obs = predicted_next_obs.reshape(
                            7, 7, 3)
                        p = np.array(predicted_next_obs.cpu().detach())
                else:
                    predicted_next_obs = self.vae.decoder(zh)
                    predicted_next_obs = predicted_next_obs.reshape(7, 7, 3)
                    p = np.array(predicted_next_obs.cpu().detach())

                #Show predicted next observation
                if render:
                    plt5 = plt.figure('Predicted obs')
                    plt.cla()
                    sn.heatmap(p[:, :, 0], cmap='Reds', annot=True,
                               cbar=False).invert_yaxis()

                s, exreward, _, _ = self.env.step(
                    m
                )  # perform action, get next observation and external reward (if any)
                total_exreward += exreward

                s = s['image']

                recons, next_mu, next_logsigma, next_z = self.tolatent(
                    s)  #transform observation to latent representation

                if self.hiddengoals:
                    reconhidden, hiddenmu, hiddenlogsigma = self.HiddenVAE(
                        hidden[0].detach()
                    )  # transform hidden state into a latent representation when using goals in the world model

                #Show actual observation
                if render:
                    plt6 = plt.figure('Actual obs')
                    plt.cla()
                    sn.heatmap(s[:, :, 0], cmap='Reds', annot=True,
                               cbar=False).invert_yaxis()

                #Collect information for training World Model
                scur_rollout.append(np.array(z.cpu().detach()))
                snext_rollout.append(np.array(next_z.cpu().detach()))
                r_rollout.append([0.0])
                act_rollout.append([[float(m)]])
                d_rollout.append([0.0])

                if render:
                    self.env.render()

                if self.hiddengoals:
                    hiddenvae_loss = self.VAEloss(reconhidden,
                                                  hidden[0].detach(), hiddenmu,
                                                  hiddenlogsigma)
                    total_hiddenvae_loss += hiddenvae_loss

                VAE_loss = self.VAEloss(
                    recons,
                    torch.tensor(s.flatten(),
                                 dtype=torch.float32).unsqueeze(0).to(
                                     self.device), next_mu, next_logsigma)
                total_vae_loss += VAE_loss

                #Curiosity reward is how far the next state was from the prediction
                Curiosityreward = gmm_loss(next_z.detach(), hmus, hsigmas,
                                           hlogpi) / 33

                # Uncomment below to add only sufficiently novel hidden states to the goal space
                '''
                if Curiosityreward > 1.29: #only add this to the goal space if it was new: this promotes sampling goals which we are unsure about
                    if self.hiddengoals:
                        zstate_list.append(np.array(hidden_latent.cpu().detach().numpy()))#if we use pure hidden
                    else:
                        zstate_list.append(np.array(z.cpu().detach().numpy()))#if we use latent_space
                '''

                #add all states to goal space
                if self.hiddengoals:
                    zstate_list.append(
                        np.array(hidden_latent.cpu().detach().numpy())
                    )  #if we use pure hidden
                else:
                    zstate_list.append(np.array(
                        z.cpu().detach().numpy()))  #if we use latent_space

                # If the forward model is a linear layer, different loss functions are used. This performs badly, so it is not recommended.
                if self.Forward_model == 'D':
                    if self.hiddengoals:
                        goal_loss.append(
                            self.LSEloss(hidden_latent, z_goal)
                        )  #how far away the achieved step is from the goal
                        floss = self.LSEloss(
                            h[m], hidden_latent.detach()
                        )  #difference between forward model prediction and next hidden
                    else:
                        goal_loss.append(
                            criterion(next_z.detach(), z_goal).item()
                        )  #how far away the achieved step is from the goal
                        floss = criterion(
                            h[m], next_z.detach()
                        )  #difference between forward model prediction and next latent
                else:
                    if self.hiddengoals:
                        goal_loss.append(
                            gmm_loss(
                                z_goal, hidden_mu, hidden_sigma,
                                torch.tensor([-1.0], dtype=torch.float32).to(
                                    self.device)) / 33
                        )  #how far away the achieved step is from the goal
                        floss = Curiosityreward  #difference between forward model prediction and next hidden
                    else:
                        goal_loss.append(
                            gmm_loss(
                                z_goal, next_mu, next_logsigma.exp(),
                                torch.tensor([-1.0], dtype=torch.float32).to(
                                    self.device)) / 33)
                        floss = Curiosityreward

                total_loss += floss

                #train forward model D if necessary
                if self.Forward_model == 'D':
                    optimizer.zero_grad()
                    floss.backward()
                    optimizer.step()

                # To see what goals look like at the lowest observed distance during testing
                if goal_loss[-1] < 1.5:
                    '''
                    plt84 = plt.figure('Actual obs')
                    plt.cla()
                    sn.heatmap(s[:,:,0],cmap = 'Reds', annot=True,cbar = False).invert_yaxis() 
                    
                    
                    plt85 = plt.figure('Zgoal')
                    plt.cla()
                    sn.heatmap(z_goal_obs[:,:,0],cmap = 'Reds', annot=True,cbar = False).invert_yaxis()
                    plt.show()
                    '''
                    reward = 4.0  # this reward is more of a placeholder
                else:
                    reward = 0.0

                if self.curiosityreward:
                    reward = reward + Curiosityreward

                total_reward += reward

            final_loss.append(goal_loss[-1])

            # Using every observation, action, next observation, terminal flag and reward seen in the episode, compute the world model loss
            mdrnnlosses = self.get_loss(
                torch.tensor(scur_rollout).to(self.device),
                torch.tensor(act_rollout).to(self.device),
                torch.tensor(r_rollout).to(self.device),
                torch.tensor(d_rollout).to(self.device),
                torch.tensor(snext_rollout).to(self.device),
                include_reward=False)

            #train world model
            MDRNNoptimizer.zero_grad()
            mdrnnlosses['loss'].backward()
            MDRNNoptimizer.step()

            WM_loss.append(
                mdrnnlosses['loss'])  #append to world model loss graph

            #train VAE and HiddenVAE if representation learning is not static
            if not self.static:
                VAE_loss_per_rollout.append(
                    total_vae_loss / (goalattempts + 1)
                )  #average VAE loss metric when non static representations are being used
                VAEOptimizer.zero_grad()
                VAE_loss_per_rollout[-1].backward()
                VAEOptimizer.step()

                if self.hiddengoals:
                    hiddenvae_loss_per_rollout.append(
                        total_hiddenvae_loss / (goalattempts + 1)
                    )  #average HiddenVAE loss metric when non static representations  of hiddens states are being used
                    HiddenVAEOptimizer.zero_grad()
                    hiddenvae_loss_per_rollout[-1].backward()
                    HiddenVAEOptimizer.step()

            if c % 10 == 0:  # every 10 goals, update the MDRNN cell used for predicting the next state
                self.mdrnn.load_state_dict(self.mdrnnBIG.state_dict())

            loss_list.append(total_loss / (goalattempts + 1))
            rollout_reward.append(total_reward)
            visitationcount.append(np.sum(visitationarray))
            exreward_per_rollout.append(total_exreward)

        plot1 = plt.figure('Average Forward model loss')
        plt.plot(loss_list)
        plt7 = plt.figure('WM_loss')
        plt.plot(WM_loss)
        plt4 = plt.figure('Distance to goal per step')
        plt.cla()
        plt.plot(goal_loss)
        rolloutrewardplot = plt.figure('Reward per rollout')
        plt.plot(rollout_reward)
        if not self.static:
            vaerolloutplot = plt.figure('VAE loss per rollout')
            plt.plot(VAE_loss_per_rollout)
            if self.hiddengoals:
                hiddenvaerolloutplot = plt.figure('HiddenVAE loss per rollout')
                plt.plot(hiddenvae_loss_per_rollout)
        plt8 = plt.figure('Visitation')
        plt.plot(visitationcount)
        pltexreward = plt.figure('Extrinsic Reward per rollout')
        plt.plot(exreward_per_rollout)
        pltgoalloss = plt.figure('Final Goal Loss per Episode')
        plt.plot(final_loss)
        plt.show()

        input('stop')
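The sampling_method used to draw z_goal from the kernel density estimate is not shown in this example. A minimal sketch of one plausible implementation, assuming the goal is simply resampled from the fitted density with gaussian_kde.resample (the stand-in latent dimension below is illustrative):

import numpy as np
import scipy.stats

def sampling_method(kde):
    """Hypothetical helper: draw a single goal vector from a fitted gaussian_kde."""
    return kde.resample(1)[:, 0]          # resample returns (dim, n); take the one sample

# Illustrative usage with random stand-in latent states of dimension 4.
zstate_list = np.random.randn(100, 4)
kde = scipy.stats.gaussian_kde(zstate_list.T)   # gaussian_kde expects shape (dim, n_samples)
z_goal = sampling_method(kde)
print(z_goal.shape)                             # (4,)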
Example #13
    def test_mdrnn_learning(self):
        num_epochs = 300
        num_episodes = 400
        batch_size = 200
        action_dim = 2
        seq_len = 5
        state_dim = 2
        simulated_num_gaussian = 2
        mdrnn_num_gaussian = 2
        simulated_hidden_size = 3
        mdrnn_hidden_size = 10
        mdrnn_hidden_layer = 1
        adam_lr = 0.01
        cur_state_mem = numpy.zeros((num_episodes, seq_len, state_dim))
        next_state_mem = numpy.zeros((num_episodes, seq_len, state_dim))
        action_mem = numpy.zeros((num_episodes, seq_len, action_dim))
        reward_mem = numpy.zeros((num_episodes, seq_len))
        terminal_mem = numpy.zeros((num_episodes, seq_len))
        next_mus_mem = numpy.zeros(
            (num_episodes, seq_len, simulated_num_gaussian, state_dim))

        swm = SimulatedWorldModel(
            action_dim=action_dim,
            state_dim=state_dim,
            num_gaussian=simulated_num_gaussian,
            lstm_num_layer=1,
            lstm_hidden_dim=simulated_hidden_size,
        )

        actions = torch.eye(action_dim)
        for e in range(num_episodes):
            swm.init_hidden(batch_size=1)
            next_state = torch.randn((1, 1, state_dim))
            for s in range(seq_len):
                cur_state = next_state

                action = torch.tensor(
                    actions[numpy.random.randint(action_dim)]).view(
                        1, 1, action_dim)
                next_mus, reward = swm(action, cur_state)
                terminal = 0
                if s == seq_len - 1:
                    terminal = 1

                next_pi = torch.ones(
                    simulated_num_gaussian) / simulated_num_gaussian
                index = Categorical(next_pi).sample((1, )).long().item()
                next_state = next_mus[0, 0, index].view(1, 1, state_dim)

                print(
                    "{} cur_state: {}, action: {}, next_state: {}, reward: {}, terminal: {}"
                    .format(e, cur_state, action, next_state, reward,
                            terminal))
                print("next_pi: {}, sampled index: {}".format(next_pi, index))
                print("next_mus:", next_mus, "\n")

                cur_state_mem[e, s, :] = cur_state.detach().numpy()
                action_mem[e, s, :] = action.numpy()
                reward_mem[e, s] = reward.detach().numpy()
                terminal_mem[e, s] = terminal
                next_state_mem[e, s, :] = next_state.detach().numpy()
                next_mus_mem[e, s, :, :] = next_mus.detach().numpy()

        mdrnn = MDRNN(
            latents=state_dim,
            actions=action_dim,
            gaussians=mdrnn_num_gaussian,
            hiddens=mdrnn_hidden_size,
            layers=mdrnn_hidden_layer,
        )
        mdrnn.train()
        optimizer = torch.optim.Adam(mdrnn.parameters(), lr=adam_lr)
        num_batch = num_episodes // batch_size
        earlystopping = EarlyStopping('min', patience=30)

        cum_loss = []
        cum_gmm = []
        cum_bce = []
        cum_mse = []
        for e in range(num_epochs):
            for i in range(0, num_batch):
                mdrnn.init_hidden(batch_size=batch_size)
                optimizer.zero_grad()
                sample_indices = numpy.random.randint(num_episodes,
                                                      size=batch_size)

                obs, action, reward, terminal, next_obs = \
                    cur_state_mem[sample_indices], \
                    action_mem[sample_indices], \
                    reward_mem[sample_indices], \
                    terminal_mem[sample_indices], \
                    next_state_mem[sample_indices]
                obs, action, reward, terminal, next_obs = \
                    torch.tensor(obs, dtype=torch.float), \
                    torch.tensor(action, dtype=torch.float), \
                    torch.tensor(reward, dtype=torch.float), \
                    torch.tensor(terminal, dtype=torch.float), \
                    torch.tensor(next_obs, dtype=torch.float)

                print("learning at epoch {} step {} best score {} counter {}".
                      format(e, i, earlystopping.best,
                             earlystopping.num_bad_epochs))
                losses = self.get_loss(obs, action, reward, terminal, next_obs,
                                       state_dim, mdrnn)
                losses['loss'].backward()
                optimizer.step()

                cum_loss += [losses['loss'].item()]
                cum_gmm += [losses['gmm'].item()]
                cum_bce += [losses['bce'].item()]
                cum_mse += [losses['mse'].item()]
                print(
                    "loss={loss:10.6f} bce={bce:10.6f} gmm={gmm:10.6f} mse={mse:10.6f}"
                    .format(
                        loss=losses['loss'],
                        bce=losses['bce'],
                        gmm=losses['gmm'],
                        mse=losses['mse'],
                    ))
                print(
                    "cum loss={loss:10.6f} cum bce={bce:10.6f} cum gmm={gmm:10.6f} cum mse={mse:10.6f}"
                    .format(
                        loss=numpy.mean(cum_loss),
                        bce=numpy.mean(cum_bce),
                        gmm=numpy.mean(cum_gmm),
                        mse=numpy.mean(cum_mse),
                    ))

                print()

            earlystopping.step(numpy.mean(cum_loss[-num_batch:]))
            if numpy.mean(cum_loss[-num_batch:]) < -3. and earlystopping.stop:
                break

        assert numpy.mean(cum_loss[-num_batch:]) < -3.

        sample_indices = [0]
        mdrnn.init_hidden(batch_size=len(sample_indices))
        mdrnn.eval()
        obs, action, reward, terminal, next_obs = \
            cur_state_mem[sample_indices], \
            action_mem[sample_indices], \
            reward_mem[sample_indices], \
            terminal_mem[sample_indices], \
            next_state_mem[sample_indices]
        obs, action, reward, terminal, next_obs = \
            torch.tensor(obs, dtype=torch.float), \
            torch.tensor(action, dtype=torch.float), \
            torch.tensor(reward, dtype=torch.float), \
            torch.tensor(terminal, dtype=torch.float), \
            torch.tensor(next_obs, dtype=torch.float)
        transpose_obs, transpose_action, transpose_reward, transpose_terminal, transpose_next_obs = \
            self.transpose(obs, action, reward, terminal, next_obs)
        mus, sigmas, logpi, rs, ds = mdrnn(transpose_action, transpose_obs)
        pi = torch.exp(logpi)
        gl = gmm_loss(transpose_next_obs, mus, sigmas, logpi)
        print(gl)

        print()
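The self.transpose helper called near the end of this test is not shown either; a minimal sketch, assuming it simply swaps the batch and sequence dimensions the same way the other examples on this page do with arr.transpose(1, 0):

import torch

def transpose(*tensors):
    """Hypothetical helper: swap the (batch, seq_len) leading dimensions of each tensor."""
    return tuple(t.transpose(1, 0) for t in tensors)

# Illustrative usage: (BSIZE, SEQ_LEN, ...) -> (SEQ_LEN, BSIZE, ...)
obs = torch.randn(1, 5, 2)
action = torch.randn(1, 5, 2)
reward = torch.randn(1, 5)
t_obs, t_action, t_reward = transpose(obs, action, reward)
print(t_obs.shape, t_action.shape, t_reward.shape)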