Example 1
import os

import numpy as np
import nnabla as nn
from nnabla.monitor import Monitor, MonitorSeries

# pixel_to_float is a preprocessing helper assumed to come from the surrounding project.

def train_loop(env, model, num_actions, return_fn, logdir, eval_fn, args):
    monitor = Monitor(logdir)
    reward_monitor = MonitorSeries('reward', monitor, interval=1)
    eval_reward_monitor = MonitorSeries('eval_reward', monitor, interval=1)
    policy_loss_monitor = MonitorSeries('policy_loss', monitor, interval=10000)
    value_loss_monitor = MonitorSeries('value_loss', monitor, interval=10000)
    sample_action = lambda x: np.random.choice(num_actions, p=x)

    step = 0
    obs_t = env.reset()
    cumulative_reward = np.zeros(len(env.envs), dtype=np.float32)
    obss_t, acts_t, vals_t, rews_tp1, ters_tp1 = [], [], [], [], []
    while step <= args.final_step:
        # infer action probabilities and state values for the current observations
        probs_t, val_t = model.infer(pixel_to_float(obs_t))
        # sample actions from the policy distribution
        act_t = list(map(sample_action, probs_t))
        # step the environments with the sampled actions
        obs_tp1, rew_tp1, ter_tp1, _ = env.step(act_t)
        # clip rewards to the range [-1.0, 1.0]
        clipped_rew_tp1 = np.clip(rew_tp1, -1.0, 1.0)

        obss_t.append(obs_t)
        acts_t.append(act_t)
        vals_t.append(val_t)
        rews_tp1.append(clipped_rew_tp1)
        ters_tp1.append(ter_tp1)

        # update parameters once a full time horizon has been collected
        if len(obss_t) == args.time_horizon:
            # append the last value estimate as the bootstrap value for the returns
            vals_t.append(val_t)
            rets_t = return_fn(vals_t, rews_tp1, ters_tp1)
            advs_t = rets_t - vals_t[:-1]
            policy_loss, value_loss = model.train(pixel_to_float(obss_t),
                                                  acts_t, rets_t, advs_t, step)
            policy_loss_monitor.add(step, policy_loss)
            value_loss_monitor.add(step, value_loss)
            obss_t, acts_t, vals_t, rews_tp1, ters_tp1 = [], [], [], [], []

        # accumulate rewards and advance to the next observation
        cumulative_reward += rew_tp1
        obs_t = obs_tp1

        for i, ter in enumerate(ter_tp1):
            step += 1
            if ter:
                # log the episodic reward and reset the accumulator for this env
                reward_monitor.add(step, cumulative_reward[i])
                cumulative_reward[i] = 0.0
            if step % 10**6 == 0:
                # periodically save parameters and run an evaluation
                path = os.path.join(logdir, 'model_{}.h5'.format(step))
                nn.save_parameters(path)
                eval_reward_monitor.add(step, eval_fn())
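
Example 1 delegates target computation to return_fn(vals_t, rews_tp1, ters_tp1), whose definition is not part of this listing. The sketch below is only a hedged illustration of a compatible function, assuming vectorized environments, a discount factor gamma, and that the extra entry appended to vals_t is used as the bootstrap value; the name discounted_returns and the default gamma=0.99 are illustrative, not the repository's own implementation.

import numpy as np

def discounted_returns(vals_t, rews_tp1, ters_tp1, gamma=0.99):
    # bootstrap from the extra value estimate appended before the update
    ret = np.asarray(vals_t[-1], dtype=np.float32)
    rets_t = []
    # walk backwards through the horizon, cutting the bootstrap at terminals
    for rew, ter in zip(reversed(rews_tp1), reversed(ters_tp1)):
        ter = np.asarray(ter, dtype=np.float32)
        ret = np.asarray(rew, dtype=np.float32) + gamma * (1.0 - ter) * ret
        rets_t.append(ret)
    # shape: (time_horizon, num_envs), matching vals_t[:-1] for the advantages
    return np.array(list(reversed(rets_t)))
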
Example 2
    def evaluate(self, obs_t):
        # epsilon-greedy evaluation with a small exploration rate
        if np.random.random() < 0.05:
            return np.random.randint(self.num_actions)
        # forward the observation through every head
        self.infer_obs_t.d = np.array(pixel_to_float([obs_t]))
        self.infer_all.forward(clear_buffer=True)
        # majority vote over the greedy action of each head
        votes = np.zeros(self.num_actions)
        for q_value in self.infer_qs_t:
            votes[np.argmax(q_value.d[0])] += 1
        return np.argmax(votes)
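
Example 1 calls eval_fn() periodically, and evaluate methods like the one above are typically driven by such an evaluation loop. The sketch below is a hedged illustration only, assuming a Gym-style environment with the classic 4-tuple step API; run_eval_episode, env and model are illustrative names, not part of the listed code.

def run_eval_episode(env, model):
    # roll out one episode with the model's evaluate() method and
    # return the undiscounted episodic reward
    obs = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = model.evaluate(obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward
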
Example 3
    def _func(step):
        # sample a prioritized minibatch together with importance-sampling weights
        indices, experiences, weights = buffer.sample()
        obss_t = []
        acts_t = []
        rews_tp1 = []
        obss_tp1 = []
        ters_tp1 = []
        for experience in experiences:
            obss_t.append(experience['obs_t'])
            acts_t.append(experience['act_t'])
            rews_tp1.append(experience['rew_tp1'])
            obss_tp1.append(experience['obs_tp1'])
            ters_tp1.append(experience['ter_tp1'])
        # one gradient step; returns per-sample TD errors and the scalar loss
        td, loss = model.train(pixel_to_float(obss_t), acts_t, rews_tp1,
                               pixel_to_float(obss_tp1), ters_tp1, weights)
        # refresh priorities of the sampled transitions with the new TD errors
        buffer.update_priorities(indices, td)

        # periodically synchronize the target network
        if step % target_update_interval == 0:
            model.update_target()

        return [loss]
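
The call to buffer.update_priorities(indices, td) suggests a prioritized replay buffer. How raw TD errors are mapped to priorities depends on the buffer implementation (it may well happen inside update_priorities itself); the snippet below only illustrates the common proportional scheme from prioritized experience replay, with alpha and the small eps constant as assumed hyperparameters.

import numpy as np

def priorities_from_td(td_errors, alpha=0.6, eps=1e-6):
    # proportional prioritization: p_i = (|delta_i| + eps) ** alpha
    return (np.abs(td_errors) + eps) ** alpha
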
Example 4
    def evaluate(self, obs_t):
        # evaluation simply reuses the inference path on the preprocessed observation
        return self.infer(pixel_to_float(obs_t))
Example 5
    def evaluate(self, obs_t):
        # forward the observation through the policy network
        self.eval_obs_t.d = np.array(pixel_to_float([obs_t]))
        self.eval_pi_t.forward(clear_buffer=True)
        # sample an action from the categorical policy distribution
        pi = self.eval_pi_t.d[0]
        return np.random.choice(pi.shape[0], p=pi)
Example 6
    def infer(self, obs_t):
        # greedy action from the currently selected head
        self.infer_obs_t.d = np.array(pixel_to_float([obs_t]))
        self.infer_qs_t[self.current_head].forward(clear_buffer=True)
        return np.argmax(self.infer_qs_t[self.current_head].d[0])
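
Every example in this listing preprocesses observations with pixel_to_float, whose definition is not shown here. A minimal sketch, assuming uint8 image observations that should be scaled to float32 values in [0, 1], could look like the following; the exact behavior in the project may differ.

import numpy as np

def pixel_to_float(obs):
    # hypothetical helper: scale uint8 pixel observations to float32 in [0, 1]
    return np.asarray(obs, dtype=np.float32) / 255.0
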