Ejemplo n.º 1
0
class Agent():
    """
    Agent for training

    Bundles three networks, all cast to double precision on the module-level
    ``device``: a VAE and an MDRNN cell restored from
    ``./training/{vae,mdrnn}/best.tar`` with gradients disabled, and a
    ``Controller`` (``self.net``) which is restored from
    ``./training/ctrl/best.tar`` when that file exists.
    """
    def __init__(self):
        """Restore the world model and build (or restore) the controller.

        Raises:
            AssertionError: if the VAE or MDRNN checkpoint file is missing.
        """

        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join("./training", m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            logger.info("Loading {} at epoch {} "
                        "with test loss {}".format(m, s['epoch'],
                                                   s['precision']))

        self.vae = VAE(3, LSIZE).to(device).double()
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device).double()
        # The checkpoint keys carry a '_l0' layer suffix that the cell's
        # parameter names do not have, so the suffix is dropped on load.
        self.mdrnn.load_state_dict(
            {self._strip_layer_suffix(k): v
             for k, v in rnn_state['state_dict'].items()})

        # The world model is frozen: only the controller is trained.
        for p in self.vae.parameters():
            p.requires_grad = False
        for p in self.mdrnn.parameters():
            p.requires_grad = False

        self.net = Controller(LSIZE, RSIZE, ASIZE).to(device).double()
        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            logger.info("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.net.load_state_dict(ctrl_state['state_dict'])

    @staticmethod
    def _strip_layer_suffix(key, suffix='_l0'):
        """Return ``key`` without a trailing ``suffix``.

        ``str.strip('_l0')`` is not used because it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the string, which
        would corrupt a key such as ``'linear.weight'``.
        """
        if suffix and key.endswith(suffix):
            return key[:-len(suffix)]
        return key

    def select_action(self, state, hidden):
        """Pick an action for ``state`` and advance the recurrent state.

        :args state: observation passed straight to the VAE
        :args hidden: MDRNN hidden state; ``hidden[0]`` is fed to the
            controller and the whole object to the MDRNN cell

        :returns: (action, next_hidden)
            - action: numpy array obtained from ``alpha / (alpha + beta)``
            - next_hidden: sixth output of the MDRNN cell
        """

        with torch.no_grad():
            _, latent_mu, _ = self.vae(state)
            # The first controller output is unpacked as an (alpha, beta) pair.
            alpha, beta = self.net(latent_mu, hidden[0])[0]

        # alpha / (alpha + beta) is the mean of a Beta(alpha, beta)
        # distribution -- presumably the controller parameterises one
        # (TODO confirm against Controller).
        action = alpha / (alpha + beta)

        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        action = action.squeeze().cpu().numpy()
        return action, next_hidden

    def load_param(self):
        """Overwrite the controller weights from 'param/ppo_net_params.pkl'."""
        self.net.load_state_dict(torch.load('param/ppo_net_params.pkl'))
Ejemplo n.º 2
0
class RolloutGeneratorSingle(object):
    """ Run single controller steps on top of a pre-trained VAE and MDRNN.

    The VAE and the MDRNN cell are restored from ``mdir``; the controller is
    supplied by the caller so it can be changed independently of this class.
    """
    def __init__(self, mdir, device, controller_model):
        """ Run one step.
        Load VAE and MDRNN from files
        Take the controller (exp/ctrl) as an input, so we can easily change stuff inside the other file.

        :args mdir: directory holding ``vae/best.tar`` and ``mdrnn/best.tar``
        :args device: device the three models are moved to
        :args controller_model: controller module, moved to ``device``

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        # single_step() moves observations to this device, so keep it.
        self.device = device
        self.controller = controller_model.to(device)

        # Locate the world-model checkpoints
        vae_file, rnn_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        # MDRNNCell
        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        # Checkpoint keys carry a '_l0' layer suffix the cell does not use.
        self.mdrnn.load_state_dict(
            {self._strip_layer_suffix(k): v
             for k, v in rnn_state['state_dict'].items()})

    @staticmethod
    def _strip_layer_suffix(key, suffix='_l0'):
        """Return ``key`` without a trailing ``suffix``.

        ``str.strip('_l0')`` is not used because it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the string.
        """
        if suffix and key.endswith(suffix):
            return key[:-len(suffix)]
        return key

    def single_step(self, obs, hidden, params=None):
        """ Compute one controller action and the next MDRNN hidden state.

        :args obs: raw observation, passed through the module-level
            ``transform`` before being batched and moved to the device
        :args hidden: current MDRNN hidden state; ``hidden[0]`` feeds the
            controller
        :args params: optional flat controller parameters; when given they
            are loaded into the controller before acting

        :returns: (action, next_obs, next_hidden)
            - action: numpy array
            - next_obs: always ``None``. This class owns no environment, so
              it cannot produce a next observation; the slot is kept so
              callers unpacking three values keep working.
            - next_hidden: sixth output of the MDRNN cell
        """
        if params is not None:
            load_parameters(params, self.controller)

        obs = transform(obs).unsqueeze(0).to(self.device)

        # GET ACTION
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        # detach() is a no-op without gradient history and lets numpy()
        # succeed when the controller output tracks gradients.
        action = action.detach().squeeze().cpu().numpy()

        return action, None, next_hidden
Ejemplo n.º 3
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    def __init__(self, mdir, device, time_limit):
        """ Build vae, rnn, controller and environment.

        :args mdir: directory holding ``<model>/best.tar`` checkpoints
        :args device: device the models are moved to
        :args time_limit: step budget used by :meth:`rollout`

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        # Checkpoint keys carry a '_l0' layer suffix the cell does not use.
        self.mdrnn.load_state_dict(
            {self._strip_layer_suffix(k): v
             for k, v in rnn_state['state_dict'].items()})

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

    @staticmethod
    def _strip_layer_suffix(key, suffix='_l0'):
        """Return ``key`` without a trailing ``suffix``.

        ``str.strip('_l0')`` is not used because it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the string.
        """
        if suffix and key.endswith(suffix):
            return key[:-len(suffix)]
        return key

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self, params, render=False):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array, or None to keep
            the controller's current weights
        :args render: when True the environment is rendered after each step

        :returns: minus cumulative reward
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            obs, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            cumulative += reward
            # NOTE(review): i is compared before being incremented, so up to
            # time_limit + 2 environment steps can run. Left unchanged to
            # keep rollout lengths identical to existing runs.
            if done or i > self.time_limit:
                return -cumulative
            i += 1
Ejemplo n.º 4
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    mdir: model directory i.e where models are stored, 
    device: cuda or cpu
    time_limit: number of samples in goal space before exploration,
    number_goals: number of goals to set over lifetime of agent,
    Forward_model: 'M' = World Model, 'D' = Linear layers(do not use),
    hiddengoals: True = Goals set in World Model, False = goals as observations(basically IMGEPs),
    curiosityreward = True/False - not relevant in this implementation,
    static: True = static VAE and HiddenVAE, False = constantly evolving VAE and HiddenVAE
    """
    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 number_goals,
                 Forward_model,
                 hiddengoals: bool,
                 curiosityreward=bool,
                 static=bool):
        """ Build vae, rnn, controller and environment. """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file, Dtild_file, hiddenvae_file = [
            join(mdir, m, 'best.tar')
            for m in ['vae', 'mdrnn', 'ctrl', 'dtild', 'hiddenvae']
        ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        vae_state, rnn_state, hiddenvae_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file, hiddenvae_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state),
                     ('HiddenVAE', hiddenvae_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.HiddenVAE = HiddenVAE(256, LSIZE).to(device)
        self.HiddenVAE.load_state_dict(hiddenvae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            {k.strip('_l0'): v
             for k, v in rnn_state['state_dict'].items()})

        self.mdrnnBIG = MDRNN(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnnBIG.load_state_dict(rnn_state["state_dict"])

        self.controller = Controller(256, 256, 6).to(device)

        self.env = gym.make('MiniGrid-MultiRoom-N6-v0')

        self.device = device
        self.number_goals = number_goals
        self.time_limit = time_limit

        self.vae_state = vae_state
        self.rnn_state = rnn_state
        self.hiddenvae_state = hiddenvae_state

        self.hiddengoals = hiddengoals
        self.curiosityreward = curiosityreward
        self.static = static
        self.Forward_model = Forward_model

        self.fmodel = Dtild(32, 256, 1, 32).to(device)

    def rollout(self, params, render=False):
        """ Executes rollouts for number of goals

        Runs three phases in order:

        1. Bootstrapping: takes random actions for ``time_limit`` + 1 steps
           and records the visited latent states as the initial goal space.
        2. Goal exploration: for each of ``number_goals`` goals, samples a
           goal from a KDE over the goal space, makes 100 attempts to move
           towards it, then trains the world model (and, when ``static`` is
           falsy, the VAE / HiddenVAE) on what was observed.
        3. Plots the collected metrics and blocks on ``input('stop')``.

        :args params: optional flat parameters loaded into ``self.fmodel``
        :args render: when True, render the environment and the
            predicted/actual observation heatmaps

        :returns: None
        """
        # copy params into the forward model
        if params is not None:
            load_parameters(params, self.fmodel)
        optimizer = optim.Adam(params=self.fmodel.parameters(), lr=0.0001)

        MDRNNoptimizer = torch.optim.RMSprop(self.mdrnnBIG.parameters(),
                                             lr=1e-3,
                                             alpha=.9)
        #MDRNNoptimizer.load_state_dict(self.rnn_state["optimizer"])

        VAEOptimizer = optim.Adam(self.vae.parameters())
        VAEOptimizer.load_state_dict(self.vae_state["optimizer"])

        HiddenVAEOptimizer = optim.Adam(self.HiddenVAE.parameters())
        HiddenVAEOptimizer.load_state_dict(self.hiddenvae_state["optimizer"])

        zstate_list = []
        self.env.seed(1337)
        #self.env.reset()
        obs = self.env.reset()

        obs = obs['image']
        expl_rate = 0.4

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        _, latent_mu, logsigma, z = self.tolatent(obs)

        i = 0

        #Bootstrapping, collect initial states for goal space
        while True:

            action = random.randrange(6)

            _, hidden, z, zh = self.transform_obs_hidden(obs, hidden, action)
            _, _, hidden_latent = self.tohiddenlatent(hidden)
            obs, exreward, done, _ = self.env.step(action)
            obs = obs['image']

            if self.hiddengoals:
                zstate_list.append(
                    np.array(hidden_latent.cpu().detach().numpy())
                )  #if we use pure hidden
            else:
                zstate_list.append(np.array(
                    z.cpu().detach().numpy()))  #if we use latent_space

            i += 1
            if render:
                self.env.render()

            if i > self.time_limit:
                break

        s = obs
        loss_list = []
        WM_loss = []
        VAE_loss_per_rollout = []
        hiddenvae_loss_per_rollout = []
        rollout_reward = []
        visitationcount = []
        exreward_per_rollout = []
        visitationarray = np.zeros((25, 25))
        final_loss = []

        #Goal Exploration
        for c in range(self.number_goals):
            #reset env, uncomment below if necessary to reset agent in environment after every episode
            '''
            self.env.seed(1337)
            self.env.reset()
            s = self.env.reset()
            
            #reset obs and hidden state
            s = s['image']
            _,_,_,z = self.tolatent(s)
            hidden = [
            torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]
            '''

            print('Goal Number', c)
            # The goal space is kept as a list between goals; it is turned
            # into a 2-D array here only to fit the KDE.
            zstate_list = np.array(zstate_list)
            zstate_list = zstate_list.squeeze(1)
            kde = scipy.stats.gaussian_kde(zstate_list.T)

            z_goal = sampling_method(
                kde)  #sample goal from goal space using KDE
            z_goal = torch.tensor([z_goal], dtype=torch.float32).to(
                self.device)  #controller requires both as tensors

            if not self.hiddengoals:
                z_goal_obs = self.vae.decoder(z_goal)
                z_goal_obs = z_goal_obs.reshape(7, 7, 3)
                z_goal_obs = np.array(z_goal_obs.cpu().detach())

                plt9 = plt.figure('Zgoal')
                plt.cla()
                sn.heatmap(z_goal_obs[:, :, 0],
                           cmap='Reds',
                           annot=True,
                           cbar=False).invert_yaxis()

            total_hiddenvae_loss = 0
            total_vae_loss = 0
            total_reward = 0
            total_exreward = 0
            total_loss = 0
            goal_loss = []

            scur_rollout = []
            snext_rollout = []
            r_rollout = []
            d_rollout = []
            act_rollout = []

            # Back to the list-of-(1, dim) layout so new states can be
            # appended during the attempts below.
            zstate_list = zstate_list[:, np.newaxis, :]
            zstate_list = zstate_list.tolist()

            for goalattempts in range(100):
                # Mark the current grid cell as visited (each cell counts once).
                if visitationarray[self.env.agent_pos[0],
                                   self.env.agent_pos[1]] == 0:
                    visitationarray[self.env.agent_pos[0],
                                    self.env.agent_pos[1]] += 1
                h = []
                for a in range(6):
                    if self.Forward_model == 'D':
                        h.append(
                            self.fmodel(
                                z.detach(), hidden[0].detach(),
                                torch.tensor([[a]], dtype=torch.float32).to(
                                    self.device)))

                    else:
                        #Perform a prediction of next state for every action. Add to a list so comparison with goal can occur
                        z, hmus, hsigmas, hlogpi, zh, next_hidden, next_hidden_latent, next_hidden_mu, next_hidden_sigma = self.predict_next(
                            s, hidden, a)
                        h.append([
                            hmus, hsigmas, hlogpi, next_hidden_latent,
                            next_hidden_mu, next_hidden_sigma
                        ])

                # epsilon-greedy choice between a random action and the one
                # predicted to land closest to the goal
                if expl_rate > random.random():
                    m = random.randrange(6)
                else:
                    #choose action which will bring us closer to goal
                    m = self.gpcf(z_goal, h)

                z, hmus, hsigmas, hlogpi, zh, hidden, hidden_latent, hidden_mu, hidden_sigma = self.predict_next(
                    s, hidden, m
                )  #gets mean, standard deviation and  pi, next latent of prediction of next latent obs

                if not self.hiddengoals:
                    if self.Forward_model == 'D':
                        predicted_next_obs = self.vae.decoder(h[m])
                        predicted_next_obs = predicted_next_obs.reshape(
                            7, 7, 3)
                        p = np.array(predicted_next_obs.cpu().detach())
                    else:
                        predicted_next_obs = self.vae.decoder(zh)
                        predicted_next_obs = predicted_next_obs.reshape(
                            7, 7, 3)
                        p = np.array(predicted_next_obs.cpu().detach())
                else:
                    predicted_next_obs = self.vae.decoder(zh)
                    predicted_next_obs = predicted_next_obs.reshape(7, 7, 3)
                    p = np.array(predicted_next_obs.cpu().detach())

                #Show predicted next observation
                if render:
                    plt5 = plt.figure('Predicted obs')
                    plt.cla()
                    sn.heatmap(p[:, :, 0], cmap='Reds', annot=True,
                               cbar=False).invert_yaxis()

                s, exreward, _, _ = self.env.step(
                    m
                )  #perform action , get next observation and external reward if any
                total_exreward += exreward

                s = s['image']

                recons, next_mu, next_logsigma, next_z = self.tolatent(
                    s)  #transform observation to latent representation

                if self.hiddengoals:
                    reconhidden, hiddenmu, hiddenlogsigma = self.HiddenVAE(
                        hidden[0].detach()
                    )  #transform hidden state into latent representation if using goals in world model

                #Show actual observation
                if render:
                    plt6 = plt.figure('Actual obs')
                    plt.cla()
                    sn.heatmap(s[:, :, 0], cmap='Reds', annot=True,
                               cbar=False).invert_yaxis()

                #Collect information for training World Model
                scur_rollout.append(np.array(z.cpu().detach()))
                snext_rollout.append(np.array(next_z.cpu().detach()))
                r_rollout.append([0.0])
                # builtin float: the np.float alias was removed from NumPy
                act_rollout.append([[float(m)]])
                d_rollout.append([0.0])

                if render:
                    self.env.render()

                if self.hiddengoals:
                    hiddenvae_loss = self.VAEloss(reconhidden,
                                                  hidden[0].detach(), hiddenmu,
                                                  hiddenlogsigma)
                    total_hiddenvae_loss += hiddenvae_loss

                VAE_loss = self.VAEloss(
                    recons,
                    torch.tensor(s.flatten(),
                                 dtype=torch.float32).unsqueeze(0).to(
                                     self.device), next_mu, next_logsigma)
                total_vae_loss += VAE_loss

                #Curiosity reward is how far the next state was from the prediction
                Curiosityreward = gmm_loss(next_z.detach(), hmus, hsigmas,
                                           hlogpi) / 33

                #Uncomment below if requiring to add only completely new hidden states to goal space
                '''
                if Curiosityreward > 1.29: #only add this to the goal space if it was new: this promotes sampling goals which we are unsure about
                    if self.hiddengoals:
                        zstate_list.append(np.array(hidden_latent.cpu().detach().numpy()))#if we use pure hidden
                    else:
                        zstate_list.append(np.array(z.cpu().detach().numpy()))#if we use latent_space
                '''

                #add all states to goal space
                if self.hiddengoals:
                    zstate_list.append(
                        np.array(hidden_latent.cpu().detach().numpy())
                    )  #if we use pure hidden
                else:
                    zstate_list.append(np.array(
                        z.cpu().detach().numpy()))  #if we use latent_space

                #if forward model is a linear layer then there are vastly different loss functions. This performs badly so is not recommended to use
                if self.Forward_model == 'D':
                    if self.hiddengoals:
                        goal_loss.append(
                            self.LSEloss(hidden_latent, z_goal)
                        )  #how far away the achieved step is from the goal
                        floss = self.LSEloss(
                            h[m], hidden_latent.detach()
                        )  #difference between forward model prediction and next hidden
                    else:
                        goal_loss.append(
                            criterion(next_z.detach(), z_goal).item()
                        )  #how far away the achieved step is from the goal
                        floss = criterion(
                            h[m], next_z.detach()
                        )  #difference between forward model prediction and next latent
                else:
                    if self.hiddengoals:
                        goal_loss.append(
                            gmm_loss(
                                z_goal, hidden_mu, hidden_sigma,
                                torch.tensor([-1.0], dtype=torch.float32).to(
                                    self.device)) / 33
                        )  #how far away the achieved step is from the goal
                        floss = Curiosityreward  #difference between forward model prediction and next hidden
                    else:
                        goal_loss.append(
                            gmm_loss(
                                z_goal, next_mu, next_logsigma.exp(),
                                torch.tensor([-1.0], dtype=torch.float32).to(
                                    self.device)) / 33)
                        floss = Curiosityreward

                total_loss += floss

                #train forward model D if necessary
                if self.Forward_model == 'D':
                    optimizer.zero_grad()
                    floss.backward()
                    optimizer.step()

                #To see what goals look like at lowest observed distance throughout testing
                if goal_loss[-1] < 1.5:
                    '''
                    plt84 = plt.figure('Actual obs')
                    plt.cla()
                    sn.heatmap(s[:,:,0],cmap = 'Reds', annot=True,cbar = False).invert_yaxis() 
                    
                    
                    plt85 = plt.figure('Zgoal')
                    plt.cla()
                    sn.heatmap(z_goal_obs[:,:,0],cmap = 'Reds', annot=True,cbar = False).invert_yaxis()
                    plt.show()
                    '''
                    reward = 4.0  #this reward is more of a place holder
                else:
                    reward = 0.0

                if self.curiosityreward:
                    reward = reward + Curiosityreward

                total_reward += reward

            final_loss.append(goal_loss[-1])

            #Using every single observation, action, next observation,terminality condition and reward seen in the episode, get the loss of the world model
            mdrnnlosses = self.get_loss(
                torch.tensor(scur_rollout).to(self.device),
                torch.tensor(act_rollout).to(self.device),
                torch.tensor(r_rollout).to(self.device),
                torch.tensor(d_rollout).to(self.device),
                torch.tensor(snext_rollout).to(self.device),
                include_reward=False)

            #train world model
            MDRNNoptimizer.zero_grad()
            mdrnnlosses['loss'].backward()
            MDRNNoptimizer.step()

            WM_loss.append(
                mdrnnlosses['loss'])  #append to world model loss graph

            #train VAE and HiddenVAE if representation learning is not static
            if not self.static:
                VAE_loss_per_rollout.append(
                    total_vae_loss / (goalattempts + 1)
                )  #average VAE loss metric when non static representations are being used
                VAEOptimizer.zero_grad()
                VAE_loss_per_rollout[-1].backward()
                VAEOptimizer.step()

                if self.hiddengoals:
                    hiddenvae_loss_per_rollout.append(
                        total_hiddenvae_loss / (goalattempts + 1)
                    )  #average HiddenVAE loss metric when non static representations  of hiddens states are being used
                    HiddenVAEOptimizer.zero_grad()
                    hiddenvae_loss_per_rollout[-1].backward()
                    HiddenVAEOptimizer.step()

            # Every 10 goals update the MDRNN cell for use in predicting the
            # next state. The test is on the goal index c: goalattempts is
            # always 99 once the attempt loop has finished, so testing it
            # never refreshed the cell. The sequence model's keys carry the
            # '_l0' layer suffix that the cell's keys lack (see __init__),
            # so the suffix is dropped while copying.
            if c % 10 == 0:
                self.mdrnn.load_state_dict({
                    (k[:-len('_l0')] if k.endswith('_l0') else k): v
                    for k, v in self.mdrnnBIG.state_dict().items()
                })

            loss_list.append(total_loss / (goalattempts + 1))
            rollout_reward.append(total_reward)
            visitationcount.append(np.sum(visitationarray))
            exreward_per_rollout.append(total_exreward)

        plot1 = plt.figure('Average Forward model loss')
        plt.plot(loss_list)
        plt7 = plt.figure('WM_loss')
        plt.plot(WM_loss)
        plt4 = plt.figure('Distance to goal per step')
        plt.cla()
        # goal_loss holds the per-step distances of the LAST goal only
        plt.plot(goal_loss)
        rolloutrewardplot = plt.figure('Reward per rollout')
        plt.plot(rollout_reward)
        if not self.static:
            vaerolloutplot = plt.figure('VAE loss per rollout')
            plt.plot(VAE_loss_per_rollout)
            if self.hiddengoals:
                hiddenvaerolloutplot = plt.figure('HiddenVAE loss per rollout')
                plt.plot(hiddenvae_loss_per_rollout)
        plt8 = plt.figure('Visitation')
        plt.plot(visitationcount)
        pltexreward = plt.figure('Extrinsic Reward per rollout')
        plt.plot(exreward_per_rollout)
        pltgoalloss = plt.figure('Final Goal Loss per Episode')
        plt.plot(final_loss)
        plt.show()

        # Block until the user presses Enter so the figures stay readable.
        input('stop')

    def transform_obs_hidden(self, obs, hidden, m):
        obs = torch.tensor(obs.flatten(),
                           dtype=torch.float32).unsqueeze(0).to(self.device)

        action = torch.Tensor([[m]]).to(self.device)
        reconx, latent_mu, logsigma = self.vae(obs)

        sigma = logsigma.exp()
        eps = torch.randn_like(sigma)
        z = eps.mul(sigma).add_(latent_mu)

        hmus, hsigmas, hlogpi, _, _, next_hidden = self.mdrnn(
            action, z, tuple(hidden))

        hlogpi = hlogpi.squeeze()
        mixt = Categorical(torch.exp(hlogpi)).sample().item()

        zh = hmus[:, mixt, :] + hsigmas[:, mixt, :] * torch.randn_like(
            hmus[:, mixt, :])

        return action.squeeze().cpu().numpy(), next_hidden, z, zh

    def gpcf(self, z_goal, h):
        #Compares possible next state as a result of each action to goal.
        #Chooses the action which will reduce the loss the most

        output = []
        for action in range(len(h)):
            if self.hiddengoals:
                if self.Forward_model == 'D':
                    output.append(self.LSEloss(h[action], z_goal))
                else:
                    #compares the goal with the next state using the NLL loss of the latent representation of the next hidden state
                    #h[action][4] = next_hidden_mu, h[action][5] =  next_hidden_sigma, logpi = -1
                    output.append(
                        gmm_loss(
                            z_goal, h[action][4], h[action][5],
                            torch.tensor([-1.0], dtype=torch.float32).to(
                                self.device)) / 33)
            else:
                if self.Forward_model == 'D':
                    output.append(criterion(h[action], z_goal).item())
                else:
                    #compares the goal with the next state using the NLL loss of the latent representation of the next hidden state
                    #h[action][0] = mu, h[action][1] = sigma, logpi = logpi
                    output.append(
                        gmm_loss(z_goal, h[action][0], h[action][1],
                                 h[action][2]) / 33)

        return output.index(min(output))

    def get_loss(self, latent_obs, action, reward, terminal, latent_next_obs,
                 include_reward: bool):
        """ Compute the training losses of the sequence MDRNN.

        The combined loss is

            (GMMLoss(latent_next_obs, GMMPredicted)
             + BCE(terminal, logit_terminal)
             + MSE(reward, predicted_reward)) / scale

        where the MSE term is only present when ``include_reward`` is true;
        ``scale`` is LSIZE + 2 with the reward term and LSIZE + 1 without it.

        :args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
        :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor
        :args reward: (BSIZE, SEQ_LEN) torch tensor
        :args terminal: target for the terminal logits
        :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor
        :args include_reward: whether the reward MSE is part of the loss

        :returns: dictionary with keys 'gmm', 'bce', 'mse' and 'loss';
            'mse' is the integer 0 when the reward term is skipped.
        """
        mus, sigmas, logpi, pred_reward, pred_terminal = self.mdrnnBIG(
            action, latent_obs)

        gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
        bce = F.binary_cross_entropy_with_logits(pred_terminal, terminal)

        mse = F.mse_loss(pred_reward, reward) if include_reward else 0
        scale = LSIZE + (2 if include_reward else 1)

        return {
            'gmm': gmm,
            'bce': bce,
            'mse': mse,
            'loss': (gmm + bce + mse) / scale,
        }

    def tolatent(self, obs):
        obs = torch.tensor(obs.flatten(),
                           dtype=torch.float32).unsqueeze(0).to(self.device)

        reconx, latent_mu, logsigma = self.vae(obs)

        sigma = logsigma.exp()
        eps = torch.randn_like(sigma)
        z = eps.mul(sigma).add_(latent_mu)

        return reconx, latent_mu, logsigma, z

    def tohiddenlatent(self, hidden):
        '''

        Parameters
        ----------
        hidden : tensor.
            hidden state of mdrnn
        Returns
        -------
        latent_mu : tensor
            mean of latent representation of hidden state.
        sigma : tensor
            standard deviation of latent representation of hidden state.
        zhidden : tensor
           latent representation of hidden state.

        '''
        _, latent_mu, logsigma = self.HiddenVAE(hidden[0].detach())

        sigma = logsigma.exp()
        eps = torch.randn_like(sigma)
        zhidden = eps.mul(sigma).add_(latent_mu)

        return latent_mu, sigma, zhidden

    def predict_next(self, obs, hidden, m):
        '''
        Parameters
        ----------
        obs : array(7x7x3)
            Observation of current state.
        hidden : tensor
            current hidden state.
        m : integer
            action to be taken or could be taken.

        Returns
        -------
        z : tensor
            Latent representation of current observation.
        hmus : tensor
            mean of gaussians of prediction of latent representation of next observation.
        hsigmas : tensor
             standard deviation of gaussians of prediction of latent representation of next observation.
        hlogpi : tensor
            Mixture proportion of gaussians of prediction of latent representation of next observation.
        zh : tensor
            predicition of next observation using categorical distribution.
        next_hidden : tenosr
            next hidden state given the action.
        next_hidden_latent : tensor
            latent representation of next hidden state.
        next_hidden_mu : tensor
            mean of latent representation of next hidden state.
        next_hidden_sigma : tensor
            standard deviation of latent representation of next hidden state.
        '''

        obs = torch.tensor(obs.flatten(),
                           dtype=torch.float32).unsqueeze(0).to(self.device)

        action = torch.Tensor([[m]]).to(self.device)
        reconx, latent_mu, logsigma = self.vae(obs)

        sigma = logsigma.exp()
        eps = torch.randn_like(sigma)
        z = eps.mul(sigma).add_(latent_mu)

        hmus, hsigmas, hlogpi, _, _, next_hidden = self.mdrnn(
            action, z, tuple(hidden))

        hlogpi = hlogpi.squeeze()
        mixt = Categorical(torch.exp(hlogpi)).sample().item()
        #gets prediction of next latent using categorical distribution over gaussians predicted with MDRNN
        zh = hmus[:, mixt, :] + hsigmas[:, mixt, :] * torch.randn_like(
            hmus[:, mixt, :])

        next_hidden_mu, next_hidden_sigma, next_hidden_latent = self.tohiddenlatent(
            next_hidden)

        return z, hmus, hsigmas, hlogpi, zh, next_hidden, next_hidden_latent, next_hidden_mu, next_hidden_sigma

    def LSEloss(self, yHat, y):
        """Return the sum of squared differences between ``yHat`` and ``y``."""
        residual = yHat - y
        return torch.sum(residual**2)

    def VAEloss(self, recon_x, x, mu, logsigma):
        """ VAE loss function

        Sum of the summed squared reconstruction error and the KL
        divergence between the encoder posterior N(mu, sigma^2) and a
        standard normal prior.

        :args recon_x: reconstruction produced by the decoder
        :args x: reconstruction target, same shape as ``recon_x``
        :args mu: posterior mean
        :args logsigma: log of the posterior standard deviation

        :returns: scalar tensor, reconstruction error + KL divergence
        """

        # reduction='sum' is the supported spelling of the deprecated
        # size_average=False: the squared errors are summed, not averaged.
        reconstruction = F.mse_loss(recon_x, x, reduction='sum')

        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
        KLD = -0.5 * torch.sum(1 + 2 * logsigma - mu.pow(2) -
                               (2 * logsigma).exp())

        return reconstruction + KLD
class Agent():
    """
    Agent for training

    Holds a frozen VAE and MDRNN cell loaded from
    ``./training/{vae,mdrnn}/best.tar`` and a trainable controller network
    (``self.net``) that is optimised with a clipped-surrogate (PPO-style)
    loss over a fixed-size transition buffer.
    """
    max_grad_norm = 0.5  # only referenced by the disabled clipping in update()
    clip_param = 0.1  # epsilon in clipped loss
    ppo_epoch = 10  # passes over the whole buffer per update()
    buffer_capacity, batch_size = 1500, 128

    @staticmethod
    def _strip_layer_suffix(state_dict):
        """Return a copy of ``state_dict`` with a trailing ``_l0`` removed from keys.

        ``str.strip('_l0')`` is not used on purpose: it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the key, so a key
        such as ``lstm.weight_ih_l0`` would lose its leading ``l`` as well.
        Only an exact ``_l0`` suffix is dropped here.
        """
        suffix = '_l0'
        return {
            (key[:-len(suffix)] if key.endswith(suffix) else key): value
            for key, value in state_dict.items()
        }

    def __init__(self):
        """Load VAE/MDRNN (required) and controller (optional) checkpoints.

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join("./training", m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            logger.info("Loading {} at epoch {} "
                        "with test loss {}".format(
                            m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device).double()
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device).double()
        # The checkpoint keys carry a `_l0` suffix that the cell's parameter
        # names do not have, so it is removed before loading.
        self.mdrnn.load_state_dict(
            self._strip_layer_suffix(rnn_state['state_dict']))

        # The world model is frozen: only the controller is trained.
        for p in self.vae.parameters():
            p.requires_grad = False
        for p in self.mdrnn.parameters():
            p.requires_grad = False

        self.net = Controller(LSIZE, RSIZE, ASIZE).to(device).double()
        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)})
            logger.info("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.net.load_state_dict(ctrl_state['state_dict'])

        self.training_step = 0

        # Structured array of transitions; `transition` is the module-level
        # dtype describing one record.
        self.buffer = np.empty(self.buffer_capacity, dtype=transition)
        self.counter = 0

        self.optimizer = optim.Adam(self.net.parameters(), lr=1e-3)

    def select_action(self, state, hidden):
        """Sample an action from the controller's Beta policy.

        :args state: observation passed to the VAE
        :args hidden: MDRNN hidden state; ``hidden[0]`` is fed to the
            controller

        :returns: (action, a_logp, latent_mu, next_hidden)
            - action: sampled action as a squeezed numpy array
            - a_logp: python float, log-probability of the action summed
              over dim 1 (``.item()`` requires a single sample)
            - latent_mu: VAE latent mean of ``state``
            - next_hidden: hidden state returned by the MDRNN cell
        """
        with torch.no_grad():
            _, latent_mu, _ = self.vae(state)
            alpha, beta = self.net(latent_mu, hidden[0])[0]

        dist = Beta(alpha, beta)
        action = dist.sample()
        a_logp = dist.log_prob(action).sum(dim=1)

        a_logp = a_logp.item()
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)

        return action.squeeze().cpu().numpy(), a_logp, latent_mu, next_hidden

    def save_param(self):
        """Save the controller's state dict to ``param/ppo_net_params.pkl``."""
        import os

        # torch.save does not create missing directories.
        os.makedirs('param', exist_ok=True)
        torch.save(self.net.state_dict(), 'param/ppo_net_params.pkl')

    def store(self, transition):
        """Write ``transition`` into the buffer.

        :returns: True when this write filled the buffer (the write index is
            then reset to 0), False otherwise
        """
        self.buffer[self.counter] = transition
        self.counter += 1
        if self.counter == self.buffer_capacity:
            self.counter = 0
            return True
        else:
            return False

    def update(self):
        """Run ``ppo_epoch`` passes of clipped-surrogate updates over the buffer."""
        self.training_step += 1
        mu = torch.tensor(self.buffer['mu'], dtype=torch.double).to(device)
        hidden = torch.tensor(self.buffer['hidden'], dtype=torch.double).to(device).view(-1, RSIZE)
        a = torch.tensor(self.buffer['a'], dtype=torch.double).to(device)
        r = torch.tensor(self.buffer['r'], dtype=torch.double).to(device).view(-1, 1)
        mu_ = torch.tensor(self.buffer['mu_'], dtype=torch.double).to(device)
        hidden_ = torch.tensor(self.buffer['hidden_'], dtype=torch.double).to(device).view(-1, RSIZE)

        old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.double).to(device).view(-1, 1)

        # One-step TD target and advantage, computed once with the current
        # network and held fixed for all epochs below.
        with torch.no_grad():
            target_v = r + args.gamma * self.net(mu_, hidden_)[1]
            adv = target_v - self.net(mu, hidden)[1]
            # adv = (adv - adv.mean()) / (adv.std() + 1e-8)

        for _ in range(self.ppo_epoch):
            # Random mini-batches without replacement; the final short batch
            # is kept (drop_last=False).
            for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False):

                alpha, beta = self.net(mu[index], hidden[index])[0]
                dist = Beta(alpha, beta)
                a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
                ratio = torch.exp(a_logp - old_a_logp[index])

                surr1 = ratio * adv[index]
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                value_loss = F.smooth_l1_loss(self.net(mu[index], hidden[index])[1], target_v[index])
                loss = action_loss + 2. * value_loss

                self.optimizer.zero_grad()
                loss.backward()
                # nn.utils.clip_grad_norm_(self.net.parameters(), self.max_grad_norm)
                self.optimizer.step()
Ejemplo n.º 6
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNNCell model loaded from mdir/mdrnn
    :attr mdrnn_notcell: MDRNN loaded from the same checkpoint as mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl (mdir/exp when
        explorer is set) or randomly initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    :attr explorer: when True, rollouts are scored with the MDRNN prediction
        loss instead of the environment reward
    """
    @staticmethod
    def _strip_layer_suffix(state_dict):
        """Return a copy of ``state_dict`` with a trailing ``_l0`` removed from keys.

        ``str.strip('_l0')`` is not used on purpose: it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the key, so a key
        such as ``lstm.weight_ih_l0`` would lose its leading ``l`` as well.
        Only an exact ``_l0`` suffix is dropped here.
        """
        suffix = '_l0'
        return {
            (key[:-len(suffix)] if key.endswith(suffix) else key): value
            for key, value in state_dict.items()
        }

    def __init__(self, mdir, device, time_limit, explorer=False):
        """ Build vae, rnn, controller and environment.

        :args mdir: directory holding the vae/, mdrnn/, ctrl/ and exp/
            checkpoint sub-directories
        :args device: device the models are moved to
        :args time_limit: maximum number of timesteps per rollout
        :args explorer: load the controller from mdir/exp and use the MDRNN
            loss as reward

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        self.explorer = explorer

        # Load controllers
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        if self.explorer:
            ctrl_file = join(mdir, 'exp', 'best.tar')

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)
        ]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        # MDRNNCell: its parameter names lack the `_l0` suffix present in the
        # checkpoint keys (the MDRNN below loads the same dict unmodified).
        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        self.mdrnn.load_state_dict(
            self._strip_layer_suffix(rnn_state['state_dict']))

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file,
                                    map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

        self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5)
        self.mdrnn_notcell.to(device)
        self.mdrnn_notcell.load_state_dict(rnn_state['state_dict'])

    # The two helpers below were copied from the MDRNN training script
    # (originally: from trainmdrnn import get_loss, to_latent).

    def to_latent(self, obs, next_obs):
        """ Transform observations to latent space.

        Both inputs are reshaped to (-1, 3, SIZE, SIZE), resized to RED_SIZE,
        encoded with the VAE and sampled as mu + exp(logsigma) * noise.

        :args obs: torch tensor of frames, reshapeable to (-1, 3, SIZE, SIZE)
        :args next_obs: torch tensor of frames, same layout as obs

        :returns: (latent_obs, latent_next_obs)
            - latent_obs: 3D torch tensor (N, 1, LSIZE)
            - latent_next_obs: 3D torch tensor (N, 1, LSIZE)
            where N is the number of frames after the reshape
        """

        with torch.no_grad():
            # f.interpolate is the non-deprecated name of f.upsample.
            obs, next_obs = [
                f.interpolate(x.view(-1, 3, SIZE, SIZE),
                              size=RED_SIZE,
                              mode='bilinear',
                              align_corners=True) for x in (obs, next_obs)
            ]

            (obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma) = [
                self.vae(x)[1:] for x in (obs, next_obs)
            ]

            SEQ_LEN = 1

            # The leading dimension is inferred (-1) rather than taken from
            # the global BSIZE, so single-frame calls from rollout() work
            # whatever batch size the training script uses.
            latent_obs, latent_next_obs = [
                (x_mu + x_logsigma.exp() * torch.randn_like(x_mu)).view(
                    -1, SEQ_LEN, LSIZE)
                for x_mu, x_logsigma in [(
                    obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma)]
            ]

        return latent_obs, latent_next_obs

    def mdrnn_exp_reward(self, latent_obs, action, reward, latent_next_obs,
                         hidden):
        """ Compute the MDRNN prediction loss used as exploration reward.

        The loss that is computed is:
        (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward))
             / (LSIZE + 2)
        The terminal (BCE) term of the training loss is intentionally left
        out; the LSIZE + 2 divisor is kept from the training loss.

        :args latent_obs: latent of the current observation
        :args action: action tensor fed to the MDRNN cell
        :args reward: environment reward tensor
        :args latent_next_obs: latent of the next observation
        :args hidden: MDRNN hidden state before the step

        :returns: the loss as a numpy value
        """

        mus, sigmas, logpi, rs, ds, next_hidden = self.mdrnn(
            action, latent_obs, hidden)
        gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi)
        # bce = f.binary_cross_entropy_with_logits(ds, terminal)
        mse = f.mse_loss(rs, reward)
        loss = (gmm + mse) / (LSIZE + 2)
        return loss.squeeze().cpu().numpy()

    # TODO: a reconstruction-error based reward (recon_error_reward) was
    # started here but never finished.

    def rollout(self, params, render=False):
        """ Execute a rollout and return minus the cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array, or None to keep
            the controller's current parameters
        :args render: render the environment after every step

        :returns: minus cumulative environment reward in ctrl mode, minus
            cumulative MDRNN prediction loss in explorer mode
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)

            # GET ACTION
            _, latent_mu, _ = self.vae(obs)
            action = self.controller(latent_mu, hidden[0])
            _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
            action = action.squeeze().cpu().numpy()

            next_obs, reward, done, _ = self.env.step(action)

            if self.explorer:
                # Replace the environment reward by how badly the MDRNN
                # predicted this transition.
                latent_obs, latent_next_obs = self.to_latent(
                    obs.unsqueeze(0),
                    transform(next_obs).unsqueeze(0).to(self.device))
                action = torch.from_numpy(action).unsqueeze(0)
                latent_obs = latent_obs.to(self.device).squeeze().unsqueeze(0)
                latent_next_obs = latent_next_obs.to(
                    self.device).squeeze().unsqueeze(0)
                action = action.to(self.device)
                # float32 tensor of shape (1,) on the configured device
                # (previously hard-coded to a CUDA tensor type).
                reward = torch.as_tensor(reward,
                                         dtype=torch.float32,
                                         device=self.device).unsqueeze(0)
                reward = self.mdrnn_exp_reward(latent_obs, action, reward,
                                               latent_next_obs, hidden)

            obs = next_obs
            hidden = next_hidden

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                return -cumulative
            i += 1
Ejemplo n.º 7
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the CarRacing-v0 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    @staticmethod
    def _strip_layer_suffix(state_dict):
        """Return a copy of ``state_dict`` with a trailing ``_l0`` removed from keys.

        ``str.strip('_l0')`` is not used on purpose: it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the key, so a key
        such as ``lstm.weight_ih_l0`` would lose its leading ``l`` as well.
        Only an exact ``_l0`` suffix is dropped here.
        """
        suffix = '_l0'
        return {
            (key[:-len(suffix)] if key.endswith(suffix) else key): value
            for key, value in state_dict.items()
        }

    def __init__(self, mdir, device, time_limit):
        """ Build vae, rnn, controller and environment.

        :args mdir: directory holding the vae/, mdrnn/ and ctrl/ checkpoint
            sub-directories
        :args device: device the models are moved to
        :args time_limit: maximum number of timesteps per rollout

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        # Loading world model and vae
        vae_file, rnn_file, ctrl_file = \
            [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']]

        assert exists(vae_file) and exists(rnn_file),\
            "Either vae or mdrnn is untrained."

        vae_state, rnn_state = [
            torch.load(fname, map_location={'cuda:0': str(device)})
            for fname in (vae_file, rnn_file)]

        for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(
                      m, s['epoch'], s['precision']))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state['state_dict'])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        # Checkpoint keys carry a `_l0` suffix that is removed before the
        # state dict is loaded into the cell.
        self.mdrnn.load_state_dict(
            self._strip_layer_suffix(rnn_state['state_dict']))

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state['reward']))
            self.controller.load_state_dict(ctrl_state['state_dict'])

        self.env = gym.make('CarRacing-v0')
        self.device = device

        self.time_limit = time_limit

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self, params, render=False):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class.

        :args params: parameters as a single 1D np array, or None to keep
            the controller's current parameters
        :args render: render the environment after every step

        :returns: minus cumulative reward
        """
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        obs = self.env.reset()

        # This first render is required !
        self.env.render()

        hidden = [
            torch.zeros(1, RSIZE).to(self.device)
            for _ in range(2)]

        cumulative = 0
        i = 0
        while True:
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            obs, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            cumulative += reward
            # Stop on episode end or once the step counter exceeds the limit.
            if done or i > self.time_limit:
                return - cumulative
            i += 1
Ejemplo n.º 8
0
def generate_data(rollouts, data_dir, noise_type, mdir=r'D:\steps1000',
                  device='cuda', rollout_len=125):  # pylint: disable=R0914
    """ Generates data

    Steps a MiniGrid-MultiRoom-N6-v0 environment with uniformly random
    actions and saves, per rollout, the observed images, rewards, actions,
    terminal flags and the MDRNN hidden state after each step to
    ``data_dir/rollout_<i>.npz``.

    The environment is reset once before the first rollout only, its own
    ``done`` flag is ignored, and the MDRNN hidden state is carried over from
    one rollout to the next.

    :args rollouts: number of rollouts to generate
    :args data_dir: existing directory the .npz files are written to
    :args noise_type: unused
    :args mdir: directory holding the vae/ and mdrnn/ checkpoint
        sub-directories
    :args device: device the models and tensors are placed on
    :args rollout_len: number of steps per rollout (at least 1); the last
        step is the only one flagged as terminal

    :raises AssertionError: if data_dir or one of the checkpoints is missing
    """
    assert exists(data_dir), "The data directory does not exist..."

    vae_file, rnn_file = [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn']]

    assert exists(vae_file) and exists(rnn_file),\
        "Either vae or mdrnn is untrained."

    vae_state, rnn_state = [
        torch.load(fname, map_location={'cuda:0': str(device)})
        for fname in (vae_file, rnn_file)
    ]

    for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)):
        print("Loading {} at epoch {} "
              "with test loss {}".format(m, s['epoch'], s['precision']))

    vae = VAE(3, 32).to(device)
    vae.load_state_dict(vae_state['state_dict'])

    # Drop only an exact `_l0` suffix from the checkpoint keys;
    # str.strip('_l0') would also eat matching characters at the start of a
    # key.
    suffix = '_l0'
    mdrnn = MDRNNCell(32, 1, 256, 5).to(device)
    mdrnn.load_state_dict({
        (k[:-len(suffix)] if k.endswith(suffix) else k): v
        for k, v in rnn_state['state_dict'].items()
    })

    hidden = [torch.zeros(1, 256).to(device) for _ in range(2)]

    env = gym.make('MiniGrid-MultiRoom-N6-v0')
    env.reset()

    for i in range(rollouts):
        # Call env.reset() here if a new environment must be produced every
        # episode.
        s_rollout = []
        r_rollout = []
        d_rollout = []
        a_rollout = []
        h_rollout = []

        for t in range(1, rollout_len + 1):
            action = random.randint(0, env.action_space.n - 1)
            s, r, _, _ = env.step(action)

            tu = s['image']
            obs = torch.tensor(tu.flatten(),
                               dtype=torch.float32).unsqueeze(0).to(device)

            # Inference only: without no_grad the autograd graph would keep
            # growing through `hidden` across every step and rollout.
            with torch.no_grad():
                _, latent_mu, _ = vae(obs)
                act = torch.Tensor([[action]]).to(device)
                _, _, _, _, _, hidden = mdrnn(act, latent_mu, tuple(hidden))

            s_rollout.append(tu)
            r_rollout.append(r)
            d_rollout.append(t == rollout_len)
            a_rollout.append([action])
            h_rollout.append(np.array(hidden[0].cpu().detach().numpy()))

        print("> End of rollout {}, {} frames...".format(
            i, len(s_rollout)))
        np.savez(join(data_dir, 'rollout_{}'.format(i)),
                 observations=np.array(s_rollout),
                 rewards=np.array(r_rollout),
                 actions=np.array(a_rollout),
                 terminals=np.array(d_rollout),
                 hiddens=np.array(h_rollout).squeeze(1))
Ejemplo n.º 9
0
class RolloutGenerator(object):
    """ Utility to generate rollouts.

    Encapsulate everything that is needed to generate rollouts in the TRUE ENV
    using a controller with previously trained VAE and MDRNN.

    :attr vae: VAE model loaded from mdir/vae
    :attr mdrnn: MDRNN model loaded from mdir/mdrnn
    :attr controller: Controller, either loaded from mdir/ctrl or randomly
        initialized
    :attr env: instance of the BipedalWalkerHardcore-v2 gym environment
    :attr device: device used to run VAE, MDRNN and Controller
    :attr time_limit: rollouts have a maximum of time_limit timesteps
    """
    @staticmethod
    def _strip_layer_suffix(state_dict):
        """Return a copy of ``state_dict`` with a trailing ``_l0`` removed from keys.

        ``str.strip("_l0")`` is not used on purpose: it removes any of the
        characters ``_``, ``l`` and ``0`` from BOTH ends of the key, so a key
        such as ``lstm.weight_ih_l0`` would lose its leading ``l`` as well.
        Only an exact ``_l0`` suffix is dropped here.
        """
        suffix = "_l0"
        return {
            (key[:-len(suffix)] if key.endswith(suffix) else key): value
            for key, value in state_dict.items()
        }

    def __init__(self,
                 mdir,
                 device,
                 time_limit,
                 iteration_num=None,
                 video_dir=None):
        """ Build vae, rnn, controller and environment.

        :args mdir: directory holding the vae/, mdrnn/ and ctrl/ checkpoint
            sub-directories
        :args device: device the models are moved to
        :args time_limit: maximum number of timesteps per rollout
        :args iteration_num: when given, checkpoints are read from
            mdir/<model>/iter_<iteration_num>/best.tar instead of
            mdir/<model>/best.tar
        :args video_dir: unused here; see rollout()

        :raises AssertionError: if the VAE or MDRNN checkpoint is missing
        """
        # Loading world model and vae
        if iteration_num is None:
            vae_file, rnn_file, ctrl_file = [
                join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"]
            ]
        else:
            vae_file, rnn_file, ctrl_file = [
                join(mdir, m, "iter_{}".format(iteration_num), "best.tar")
                for m in ["vae", "mdrnn", "ctrl"]
            ]

        assert exists(vae_file) and exists(
            rnn_file), "Either vae or mdrnn is untrained."

        print("\nRollout Generator")

        vae_state, rnn_state = [
            torch.load(fname, map_location={"cuda:0": str(device)})
            for fname in (vae_file, rnn_file)
        ]

        print("Loading VAE from {}".format(vae_file))
        print("Loading RNN from {}".format(rnn_file))
        for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)):
            print("Loading {} at epoch {} "
                  "with test loss {}".format(m, s["epoch"], s["precision"]))

        self.vae = VAE(3, LSIZE).to(device)
        self.vae.load_state_dict(vae_state["state_dict"])

        self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
        # Checkpoint keys carry a `_l0` suffix that is removed before the
        # state dict is loaded into the cell.
        self.mdrnn.load_state_dict(
            self._strip_layer_suffix(rnn_state["state_dict"]))

        self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device)

        # load controller if it was previously saved
        if exists(ctrl_file):
            print("Loading Controller from {}".format(ctrl_file))
            ctrl_state = torch.load(ctrl_file,
                                    map_location={"cuda:0": str(device)})
            print("Loading Controller with reward {}".format(
                ctrl_state["reward"]))
            self.controller.load_state_dict(ctrl_state["state_dict"])

        self.env = gym.make("BipedalWalkerHardcore-v2")
        self.device = device

        self.time_limit = time_limit

    def get_action_and_transition(self, obs, hidden):
        """ Get action and transition.

        Encode obs to latent using the VAE, then obtain estimation for next
        latent and next hidden state using the MDRNN and compute the controller
        corresponding action.

        :args obs: current observation (1 x 3 x 64 x 64) torch tensor
        :args hidden: current hidden state (1 x 256) torch tensor

        :returns: (action, next_hidden)
            - action: 1D np array
            - next_hidden (1 x 256) torch tensor
        """
        _, latent_mu, _ = self.vae(obs)
        action = self.controller(latent_mu, hidden[0])
        _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden)
        return action.squeeze().cpu().numpy(), next_hidden

    def rollout(self,
                params,
                render=False,
                rollout_dir=None,
                rollout_num=0,
                video_dir=None):
        """ Execute a rollout and returns minus cumulative reward.

        Load :params: into the controller and execute a single rollout. This
        is the main API of this class. The controller is driven by rendered
        RGB frames (resized to 64x64), not by the observation returned from
        ``env.step``.

        :args params: parameters as a single 1D np array, or None to keep
            the controller's current parameters
        :args render: additionally call env.render() after every step
        :args rollout_dir: when given, frames, rewards, actions and terminal
            flags are saved to rollout_dir/rollout_<rollout_num>.npz
        :args rollout_num: index used in the saved file and video directory
            names
        :args video_dir: when given, the environment is wrapped in a Monitor
            writing to ./<video_dir>/rollout_<rollout_num>/

        :returns: minus cumulative reward
        """
        if video_dir is not None:
            self.env = wrappers.Monitor(
                self.env, "./{}/rollout_{}/".format(video_dir, rollout_num))
        # copy params into the controller
        if params is not None:
            load_parameters(params, self.controller)

        self.env.reset()

        # This first render is required !
        obs = self.env.render(mode='rgb_array')

        hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)]

        cumulative = 0
        i = 0

        s_rollout = []
        r_rollout = []
        d_rollout = []
        a_rollout = []

        print('Starting to create the rollouts')

        while True:
            if i % 100 == 0:
                print("{} steps done of rollout".format(i))
            obs = transform(obs).unsqueeze(0).to(self.device)
            action, hidden = self.get_action_and_transition(obs, hidden)
            _, reward, done, _ = self.env.step(action)

            # Save rollout data; the resized frame is also the next input
            # to the VAE.
            im_frame = self.env.render(mode="rgb_array")
            img = PIL.Image.fromarray(im_frame)
            img = img.resize((64, 64))
            obs = np.array(img)
            s_rollout += [obs]
            r_rollout += [reward]
            d_rollout += [done]
            a_rollout += [action]

            if render:
                self.env.render()

            cumulative += reward
            if done or i > self.time_limit:
                print('Completed rollout with {} steps'.format(i))
                if rollout_dir is not None:
                    print("> End of rollout {}, {} frames...".format(
                        rollout_num, len(s_rollout)))
                    np.savez(
                        join(rollout_dir, "rollout_{}".format(rollout_num)),
                        observations=np.array(s_rollout),
                        rewards=np.array(r_rollout),
                        actions=np.array(a_rollout),
                        terminals=np.array(d_rollout),
                    )
                self.env.reset()
                return -cumulative
            i += 1
Ejemplo n.º 10
0
# Reload the best VAE checkpoint from <logdir>/vae.
vae_dir = join(args.logdir, 'vae')
reload_file = join(vae_dir, 'best.tar')
state = torch.load(reload_file)
print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
vae_model.load_state_dict(state['state_dict'])

# Reload the best MDRNN checkpoint from <logdir>/mdrnn.
mdrnn_dir = join(args.logdir, 'mdrnn')
reload_file = join(mdrnn_dir, 'best.tar')
state = torch.load(reload_file)

print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
# Drop only an exact `_l0` suffix from the checkpoint keys before loading;
# str.strip('_l0') would also remove matching characters from the start of
# a key.
mdrnn_model.load_state_dict({
    (k[:-len('_l0')] if k.endswith('_l0') else k): v
    for k, v in state['state_dict'].items()
})

# Linear controller built from a flat parameter vector stored as JSON.
control = torch.nn.Linear(288, 3)
with open('./log/ctrl/carracing.cma.16.64.best.json') as ctrl_fh:
    control_weight = json.load(ctrl_fh)[0]
#control weight shape is 867= (256+32)*3+3, not sure how to assign the weight to controller
# NOTE(review): `weights` is assigned as a flat 1-D vector, whereas
# Linear(288, 3) is created with a (3, 288) weight. The layout of the JSON
# vector (row/column order, bias first or last) is not visible from here —
# confirm it and reshape before relying on `control`.
weights = np.array(control_weight[:-3])
bias = np.array(control_weight[-3:])
control.weight = torch.nn.Parameter(torch.from_numpy(weights).float())
control.bias = torch.nn.Parameter(torch.from_numpy(bias).float())
control = control.cuda(7)
# exit()
print('dreaming')
with torch.no_grad():
    for i in range(20):
        sample = torch.randn(1, LSIZE).cuda(7)
        #print(sample.shape)