Example #1
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            Qsa = self.eval_state(self.states, self.loc)
            actions = np.argmax(Qsa, axis=1)
            random = np.random.uniform(size=(self.num_envs))
            random_actions = np.random.randint(self.action_size,
                                               size=(self.num_envs))
            actions = np.where(random < self.epsilon, random_actions, actions)
            next_states, rewards, dones, infos = self.env.step(actions)
            values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1)
            rollout.append((self.states, self.loc, actions, rewards, dones,
                            infos, values))
            self.states = next_states
            self.epsilon = self.scheduler.step()
            self.loc = self.get_locs()

        states, locs, actions, rewards, dones, infos, values = stack_many(*zip(
            *rollout))

        last_Qsa = self.eval_state(next_states, self.loc)  # Q(s,a|theta)
        last_actions = np.argmax(last_Qsa, axis=1)
        last_values = np.sum(last_Qsa *
                             one_hot(last_actions, self.action_size),
                             axis=-1)
        return states, locs, actions, rewards, dones, infos, values, last_values
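Example #1 relies on two helpers that are not shown, one_hot and stack_many. A minimal sketch of what they might look like, assuming the starred call style stack_many(*zip(*rollout)) used here (some later examples call an unstarred variant that takes the zipped iterable directly):

import numpy as np

def one_hot(indices, depth):
    # [num_envs] integer actions -> [num_envs, depth] one-hot matrix
    return np.eye(depth, dtype=np.float32)[indices]

def stack_many(*sequences):
    # each argument is a tuple of per-step arrays; stack each along a new
    # leading time axis, giving arrays of shape [nsteps, num_envs, ...]
    return tuple(np.stack(seq) for seq in sequences)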
Example #2
    def rollout(self, ):
        rollout = []
        for t in range(self.nsteps):
            policies, values = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)
            rollout.append((self.states, actions, rewards, values, dones))
            self.states = next_states

        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values = self.model.evaluate(next_states)
        return states, actions, rewards, dones, values, last_values
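fastsample is not defined in the example; presumably it draws one action per environment from a batch of categorical policies. A minimal vectorised sketch, assuming policies has shape [num_envs, action_size] with rows summing to 1:

import numpy as np

def fastsample(policies):
    # inverse-CDF sampling: one uniform draw per row, then take the first
    # index whose cumulative probability exceeds it
    cdf = np.cumsum(policies, axis=1)
    u = np.random.uniform(size=(policies.shape[0], 1))
    return np.argmax(cdf > u, axis=1)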
Example #3
    def rollout(self,):
        rollout = []
        for t in range(self.nsteps):
            policies, values = self.model.evaluate(self.states)
            # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)

            rollout.append((self.states, actions, rewards, values, dones))
            self.replay.append((self.states, actions, rewards, values, dones)) # add to replay memory
            self.states = next_states
        
        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values = self.model.evaluate(next_states)
        return states, actions, rewards, values, dones, last_values
Example #4
def train(global_model, model, env, nsteps, num_episodes, ID):
    opt = torch.optim.RMSprop(global_model.parameters(), lr=1e-3)
    episode = 0
    episode_steps = 0
    episode_score = 0
    T = 0
    state = env.reset()
    start = time.time()
    while episode < num_episodes:
        rollout = []
        for t in range(nsteps):
            with torch.no_grad():
                policy, value = model(totorch(state[None], device='cpu'))
                policy, value = tonumpy(policy), tonumpy(value)
            action = np.random.choice(policy.shape[1], p=policy[0])
            next_state, reward, done, info = env.step(action)
            episode_score += reward
            rollout.append((state, action, reward, value, done))
            state = next_state

            T += 1
            episode_steps += 1

            if done or t == nsteps-1:
                states, actions, rewards, values, dones = stack_many(*zip(*rollout))
                with torch.no_grad():
                    _, last_values = model.forward(totorch(next_state[None], device='cpu'))
                    last_values = last_values.cpu().numpy()

                # bootstrapped lambda-return targets, then one update of the shared global model
                R = lambda_return(rewards, values, last_values, dones, gamma=0.9, lambda_=0.95, clip=False)
                loss = update_params(model, global_model, opt, states, actions, R)

                if done:
                    episode += 1
                    state = env.reset()
                    if episode % 1 == 0:
                        time_taken = time.time() - start 
                        print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {episode_steps/time_taken:.2f}')
                    episode_steps = 0
                    episode_score = 0
                    start = time.time()
                    break
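lambda_return is assumed to compute a bootstrapped TD(lambda) return from the rollout, which is consistent with how it is called above. A minimal sketch, assuming rewards, values and dones share the same leading time dimension and last_values matches a single step:

import numpy as np

def lambda_return(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False):
    # backwards recursion:
    #   R_t = r_t + gamma * (1 - done_t) * ((1 - lambda) * V(s_{t+1}) + lambda * R_{t+1})
    # `clip` is accepted for signature compatibility but ignored in this sketch
    R = np.zeros_like(values)
    next_return, next_value = last_values, last_values
    for t in reversed(range(len(rewards))):
        R[t] = rewards[t] + gamma * (1.0 - dones[t]) * ((1.0 - lambda_) * next_value + lambda_ * next_return)
        next_return, next_value = R[t], values[t]
    return R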
Example #5
        def run(self, ):
            rollout = []
            for t in range(self.num_steps):
                policies, values = self.model.forward(self.states)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, rewards, dones, infos = self.env.step(actions)
                rollout.append((self.states, actions, rewards, values, dones,
                                np.array(infos)))
                self.states = next_states

            states, actions, rewards, values, dones, infos = stack_many(
                zip(*rollout))
            _, last_values = self.model.forward(next_states)
            return states, actions, rewards, dones, infos, values, last_values
Example #6
    def rollout(self, ):
        rollout = []
        first_hidden = self.prev_hidden
        for t in range(self.nsteps):
            policies, values, hidden = self.model.evaluate(
                self.states[None], self.prev_hidden)
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)
            rollout.append((self.states, actions, rewards, values, dones))
            self.states = next_states
            self.prev_hidden = self.model.mask_hidden(
                hidden, dones)  # reset hidden state at end of episode

        states, actions, rewards, values, dones = stack_many(*zip(*rollout))
        _, last_values, _ = self.model.evaluate(self.states[None],
                                                self.prev_hidden)
        return states, actions, rewards, first_hidden, dones, values, last_values
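mask_hidden is presumably what zeroes the recurrent state for environments whose episode just ended, so hidden state never leaks across episode boundaries. A minimal free-function sketch of that masking logic, assuming the hidden state is a tuple of arrays shaped [1, num_envs, hidden_size] (e.g. an LSTM's (h, c)):

import numpy as np

def mask_hidden(hidden, dones):
    # dones: [num_envs] 0/1 flags; keep hidden where the episode continues,
    # reset it to zeros where done == 1
    mask = (1.0 - np.asarray(dones, dtype=np.float32))[None, :, None]
    return tuple(h * mask for h in hidden)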
Example #7
    def rollout(self,):
        rollout = []
        first_hidden = self.prev_hidden
        for t in range(self.nsteps):
            policies, values, hidden = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
            #Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[None])
            actions = fastsample(policies)
            next_states, rewards, dones, infos = self.env.step(actions)

            rollout.append((self.states, actions, rewards, self.prev_actions_rewards, dones, infos))
            self.replay.append((self.states, actions, rewards, self.prev_hidden, self.prev_actions_rewards, dones)) # add to replay memory
            self.states = next_states
            self.prev_hidden = self.model.mask_hidden(hidden, dones) # reset hidden state at end of episode
            self.prev_actions_rewards = concat_action_reward(actions, rewards, self.action_size+1)
        
        states, actions, rewards, prev_actions_rewards, dones, infos = stack_many(*zip(*rollout))
        _, last_values, _ = self.model.evaluate(self.states[None], self.prev_actions_rewards, self.prev_hidden)
        return states, actions, rewards, first_hidden, prev_actions_rewards, dones, last_values
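concat_action_reward is assumed to build the UNREAL-style recurrent input of one-hot previous action concatenated with the previous scalar reward, which would explain the self.action_size + 1 argument. A minimal sketch:

import numpy as np

def concat_action_reward(actions, rewards, size):
    # size = action_size + 1: one-hot action in the first action_size slots,
    # the scalar reward in the last slot -> [num_envs, action_size + 1]
    out = np.zeros((len(actions), size), dtype=np.float32)
    out[np.arange(len(actions)), actions] = 1.0
    out[:, -1] = rewards
    return out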
Example #8
 def run(self,):
     rollout = []
     for t in range(self.num_steps):
         start = time.time()
         policies, values = self.model.forward(self.states)
         actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
         next_states, extr_rewards, dones, infos = self.env.step(actions)
         
         mean, std = np.stack([self.state_mean for i in range(4)], -1), np.stack([self.state_std for i in range(4)], -1)
         intr_rewards = self.model.intrinsic_reward(self.states, actions, next_states, mean, std)
         #print('intr_rewards', intr_rewards)
         rewards = extr_rewards + intr_rewards
         #print('rewards', rewards)
         rollout.append((self.states, next_states, actions, rewards, values, dones))
         self.states = next_states
    
     states, next_states, actions, rewards, values, dones = stack_many(zip(*rollout))
     return states, next_states, actions, rewards, dones, values
Example #9
    def rollout(self, ):
        rollout = []
        for t in range(self.nsteps):
            start = time.time()
            policies, values = self.model.evaluate(self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            mean, std = self.state_mean[None], self.state_std[None]
            intr_rewards = self.model.intrinsic_reward(
                (self.states - mean) / std, actions,
                (next_states - mean) / std)
            rewards = extr_rewards + intr_rewards
            rollout.append(
                (self.states, next_states, actions, rewards, values, dones))
            self.states = next_states

        states, next_states, actions, rewards, values, dones = stack_many(*zip(
            *rollout))
        return states, next_states, actions, rewards, dones, values
Example #10
        def run(self, ):
            rollout = []
            for t in range(self.num_steps):
                policies, values, hidden = self.model.forward(
                    self.states, self.prev_hidden)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, rewards, dones, infos = self.env.step(actions)
                rollout.append((self.states, actions, rewards, values,
                                self.prev_hidden, dones, infos))
                self.states = next_states

                self.prev_hidden = self.model.reset_batch_hidden(
                    hidden, 1 - dones)  # reset hidden state at end of episode

            states, actions, rewards, values, hidden_batch, dones, infos = stack_many(
                zip(*rollout))
            _, last_values, _ = self.model.forward(next_states,
                                                   self.prev_hidden)
            return states, actions, rewards, hidden_batch, dones, infos, values, last_values
Example #11
        def run(self, ):
            rollout = []
            first_state = self.first_state
            for t in range(self.num_steps):
                policies, values = self.model.forward(self.states)
                #Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, rewards, dones, infos = self.env.step(actions)

                rollout.append((self.states, actions, rewards, values, dones))
                self.replay.append((self.states, actions, rewards,
                                    dones))  # add to replay memory
                self.first_state = self.states.copy()
                self.states = next_states

            states, actions, rewards, values, dones = stack_many(zip(*rollout))
            _, last_values = self.model.forward(next_states)
            Qaux = self.model.get_pixel_control(next_states)
            return states, actions, rewards, values, dones, last_values, first_state, Qaux
Example #12
 def init_state_obs(self, num_steps):
     rollout = []
     states = self.env.reset()
     for i in range(1, num_steps + 1):
         rand_actions = np.random.randint(0,
                                          self.model.action_size,
                                          size=self.num_envs)
         #print('rand_actions.shape', rand_actions.shape)
         next_states, rewards, dones, infos = self.env.step(rand_actions)
         rollout.append([states, next_states, rand_actions, rewards])
         states = next_states
         if i % self.nsteps == 0:
             mb_states, mb_next_states, mb_actions, mb_rewards = stack_many(
                 zip(*rollout))
             #print('states, next_states, actions, rewards', mb_states.shape, mb_next_states.shape, mb_actions.shape, mb_rewards.shape)
             self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                 mb_states)
             self.forward_model.backprop(mb_states[0],
                                         fold_batch(mb_next_states),
                                         fold_batch(mb_actions),
                                         fold_batch(mb_rewards),
                                         len(mb_states))
             rollout = []
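rolling_obs / RunningMeanStd are assumed to track running observation statistics for normalisation. A minimal sketch of such a tracker using the standard parallel mean/variance combination, returning the updated mean and standard deviation the way the example unpacks them (the batch is assumed to arrive on the leading axis):

import numpy as np

class RunningMeanStd:
    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        # combine batch statistics with the running statistics
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        M2 = self.var * self.count + batch_var * batch_count + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, M2 / total, total
        return self.mean, np.sqrt(self.var)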
Example #13
        def run(self, ):
            rollout = []
            for t in range(self.num_steps):
                policies, values_extr, values_intr = self.model.forward(
                    self.states)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, extr_rewards, dones, infos = self.env.step(
                    actions)
                rollout.append(
                    (self.states, next_states, actions, extr_rewards,
                     values_extr, values_intr, policies, dones))
                self.states = next_states

            states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones = stack_many(
                zip(*rollout))
            intr_rewards = self.model.intrinsic_reward(fold_batch(states),
                                                       fold_batch(actions),
                                                       fold_batch(next_states),
                                                       self.state_mean,
                                                       self.state_std)
            intr_rewards = unfold_batch(intr_rewards, self.num_steps,
                                        len(self.env))
            return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones
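fold_batch and unfold_batch are assumed to flatten and restore the leading [time, num_envs] axes so a whole rollout can be pushed through the model as one batch. A minimal sketch:

import numpy as np

def fold_batch(x):
    # [time, num_envs, ...] -> [time * num_envs, ...]
    time, num_envs = x.shape[:2]
    return x.reshape(time * num_envs, *x.shape[2:])

def unfold_batch(x, time, num_envs):
    # [time * num_envs, ...] -> [time, num_envs, ...]
    return x.reshape(time, num_envs, *x.shape[2:])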
Example #14
        def run(self, ):
            rollout = []
            for t in range(self.num_steps):
                policies, values_extr, values_intr = self.model.forward(
                    self.states)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, extr_rewards, dones, infos = self.env.step(
                    actions)

                next_states__ = next_states[..., -1:] if len(
                    next_states.shape) == 4 else next_states
                intr_rewards = self.model.intrinsic_reward(
                    next_states__, self.state_mean, self.state_std)
                #print('intr rewards', intr_rewards)
                rollout.append(
                    (self.states, next_states, actions, extr_rewards,
                     intr_rewards, values_extr, values_intr, policies, dones))
                self.replay.append(
                    (self.states, actions, extr_rewards, values_extr,
                     dones))  # add to replay memory
                self.states = next_states

            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
                zip(*rollout))
            return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones
Example #15
 def run(self,):
     rollout = []
     for t in range(self.num_steps):
         start = time.time()
         policies, extr_values, intr_values = self.model.forward(self.states)
         actions = [np.random.choice(policies.shape[1], p=policies[i]) for i in range(policies.shape[0])]
         next_states, extr_rewards, dones, infos = self.env.step(actions)
         next_states__ = next_states[...,-1:] if len(next_states.shape) == 4 else next_states
         intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std)
         #print('intr_rewards', self.model.intr_coeff * intr_rewards)
         rollout.append((self.states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, np.array(infos)))
         self.states = next_states
     
     states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = stack_many(zip(*rollout))
     return states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos
Example #16
    def rollout(self):
        rollout = []
        for t in range(self.nsteps):
            policies, values_extr, values_intr = self.model.evaluate(
                self.states)
            actions = fastsample(policies)
            next_states, extr_rewards, dones, infos = self.env.step(actions)

            next_states__ = next_states[:, -1:] if len(
                next_states.shape
            ) == 4 else next_states  # [num_envs, channels, height, width] for convolutions
            intr_rewards = self.model.intrinsic_reward(next_states__,
                                                       self.state_mean,
                                                       self.state_std)

            rollout.append(
                (self.states, next_states__, actions, extr_rewards,
                 intr_rewards, values_extr, values_intr, policies, dones))
            self.states = next_states

        states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
            *zip(*rollout))
        last_policy, last_values_extr, last_values_intr = self.model.evaluate(
            self.states)
        return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones
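The bootstrap values returned here are presumably consumed by a GAE-style advantage computation, as in the self.GAE(...) calls in Example #18 below. A minimal sketch of generalised advantage estimation over a [time, num_envs] rollout, assuming dones are 0/1 flags and last_values covers a single step:

import numpy as np

def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
    # delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
    # A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    advantages = np.zeros_like(values)
    gae = np.zeros_like(last_values)
    for t in reversed(range(len(rewards))):
        next_values = last_values if t == len(rewards) - 1 else values[t + 1]
        delta = rewards[t] + gamma * (1.0 - dones[t]) * next_values - values[t]
        gae = delta + gamma * lambda_ * (1.0 - dones[t]) * gae
        advantages[t] = gae
    return advantages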
Example #17
        def run(self, ):
            rollout = []
            for t in range(self.num_steps):
                policies, values_extr, values_intr = self.model.forward(
                    self.states)
                #actions = np.argmax(policies, axis=1)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, extr_rewards, dones, infos = self.env.step(
                    actions)

                intr_rewards = self.model.intrinsic_reward(
                    next_states[..., -1:], self.state_mean, self.state_std)
                #print('intr rewards', intr_rewards)
                rollout.append(
                    (self.states, next_states, actions, extr_rewards,
                     intr_rewards, values_extr, values_intr, policies, dones))
                self.states = next_states

            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(
                zip(*rollout))
            return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones
Example #18
    def _train_nstep(self):
        start = time.time()
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        alpha_step = 1 / num_updates
        s = 0
        rolling = RunningMeanStd(shape=())
        self.state_rolling = rolling_obs(shape=())
        self.init_state_obs(129)
        #self.runner.state_mean, self.runner.state_std = self.state_rolling.mean, np.sqrt(self.state_rolling.var)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)

        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run(
            )
            policy, extr_last_values, intr_last_values = self.model.forward(
                next_states[-1])
            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            #R_intr_mean, R_intr_std = rolling.update(self.discount(intr_rewards, self.gamma).ravel().mean()) #
            rolling.update(int_rff.ravel())
            R_intr_std = np.sqrt(rolling.var)
            intr_rewards /= R_intr_std
            #print('intr reward', intr_rewards)

            forward_loss = self.forward_model.backprop(
                states[0], fold_batch(next_states), fold_batch(actions),
                fold_batch(extr_rewards), self.nsteps)

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                extr_last_values,
                                dones,
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                values_intr,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + values_extr
            R_intr = Adv_intr + values_intr
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            #self.runner.state_mean, self.runner.state_std = state_rolling.update(fold_batch(next_states)[:,:,:,-1:]) # update state normalisation statistics
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                batch_size = self.nsteps // self.num_minibatches
                np.random.shuffle(idxs)
                for batch in range(0, len(states), batch_size):
                    batch_idxs = idxs[batch:batch + batch_size]
                    # stack all states, next_states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                                                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                    mb_nextstates = mb_nextstates[np.where(
                        np.random.uniform(
                            size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                    #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                    mean, std = self.runner.state_mean, self.runner.state_std
                    l += self.model.backprop(mb_states, mb_nextstates,
                                             mb_Rextr, mb_Rintr, mb_Adv,
                                             mb_actions, mb_old_policies,
                                             self.alpha, mean, std)

            l /= (self.num_epochs * self.num_minibatches)

            # Imagined future rollout

            hidden = self.forward_model.get_initial_hidden(self.num_envs)
            obs = next_states[-1]
            encoded_last_state = self.forward_model.encode_state(
                next_states[-1])  # o_t -> s_t
            actions = [
                np.random.choice(policy.shape[1], p=policy[i])
                for i in range(policy.shape[0])
            ]
            imagined_rollout = []
            with tf.variable_scope('forward_model/latent-space-rnn',
                                   reuse=tf.AUTO_REUSE):
                for i in range(self.nsteps):
                    next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(
                        encoded_last_state, hidden, actions)
                    #print('imagined obs', next_obs.shape)
                    intr_rewards = self.model.intrinsic_reward(
                        next_obs[..., -1:], self.runner.state_mean,
                        self.runner.state_std)
                    policies, extr_values, intr_values = self.model.forward(
                        obs)
                    actions = [
                        np.random.choice(policies.shape[1], p=policies[i])
                        for i in range(policies.shape[0])
                    ]  # sample the next imagined action from the newly predicted policies
                    imagined_rollout.append([
                        obs, next_obs, actions, extr_rewards[:, 0],
                        intr_rewards, extr_values, intr_values, policies
                    ])
                    obs = next_obs

            obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(
                zip(*imagined_rollout))
            #print('imagined obs', obs.shape)
            #print('imagined extr rew', extr_rewards.shape)
            #print('imagined extr_values', extr_values.shape)
            #print('imagined intr_values', intr_values.shape)

            intr_rewards /= R_intr_std

            policies, extr_last_values, intr_last_values = self.model.forward(
                next_obs[-1])
            Adv_extr = self.GAE(extr_rewards,
                                extr_values,
                                extr_last_values,
                                np.zeros_like(dones),
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                intr_values,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + extr_values
            R_intr = Adv_intr + intr_values
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            for batch in range(0, len(obs), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                                                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                mb_nextstates = mb_nextstates[np.where(
                    np.random.uniform(
                        size=(batch_size)) < self.pred_prob)][..., -1:]
                #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr,
                                         mb_Rintr, mb_Adv, mb_actions,
                                         mb_old_policies, self.alpha, mean,
                                         std)

            if self.render_freq > 0 and t % (self.validate_freq *
                                             self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % self.save_freq == 0:
                s += 1
                self.saver.save(
                    self.sess,
                    str(self.model_dir + self.current_time + '/' + str(s) +
                        ".ckpt"))
                print('saved model')
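RewardForwardFilter, used above to build int_rff before normalising the intrinsic rewards, is assumed to keep a discounted running sum of per-environment rewards, as in the reference RND implementation. A minimal sketch:

import numpy as np

class RewardForwardFilter:
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted reward sum, shape [num_envs]

    def update(self, rewards):
        # rewems_t = gamma * rewems_{t-1} + r_t
        if self.rewems is None:
            self.rewems = np.asarray(rewards, dtype=np.float64)
        else:
            self.rewems = self.rewems * self.gamma + rewards
        return self.rewems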