import os

import numpy as np
import nnabla as nn
from nnabla.monitor import Monitor, MonitorSeries


def train_loop(env, model, num_actions, return_fn, logdir, eval_fn, args):
    monitor = Monitor(logdir)
    reward_monitor = MonitorSeries('reward', monitor, interval=1)
    eval_reward_monitor = MonitorSeries('eval_reward', monitor, interval=1)
    policy_loss_monitor = MonitorSeries('policy_loss', monitor, interval=10000)
    value_loss_monitor = MonitorSeries('value_loss', monitor, interval=10000)

    sample_action = lambda x: np.random.choice(num_actions, p=x)

    step = 0
    obs_t = env.reset()
    cumulative_reward = np.zeros(len(env.envs), dtype=np.float32)
    obss_t, acts_t, vals_t, rews_tp1, ters_tp1 = [], [], [], [], []

    while step <= args.final_step:
        # inference
        probs_t, val_t = model.infer(pixel_to_float(obs_t))

        # sample actions from the policy distributions
        act_t = list(map(sample_action, probs_t))

        # move environment
        obs_tp1, rew_tp1, ter_tp1, _ = env.step(act_t)

        # clip reward to [-1.0, 1.0]
        clipped_rew_tp1 = np.clip(rew_tp1, -1.0, 1.0)

        obss_t.append(obs_t)
        acts_t.append(act_t)
        vals_t.append(val_t)
        rews_tp1.append(clipped_rew_tp1)
        ters_tp1.append(ter_tp1)

        # update parameters every time_horizon steps
        if len(obss_t) == args.time_horizon:
            # bootstrap the n-step return with the value of the next
            # observation (not val_t, which belongs to the last rollout step)
            _, val_tp1 = model.infer(pixel_to_float(obs_tp1))
            vals_t.append(val_tp1)
            rets_t = return_fn(vals_t, rews_tp1, ters_tp1)
            advs_t = rets_t - vals_t[:-1]
            policy_loss, value_loss = model.train(
                pixel_to_float(obss_t), acts_t, rets_t, advs_t, step)
            policy_loss_monitor.add(step, policy_loss)
            value_loss_monitor.add(step, value_loss)
            obss_t, acts_t, vals_t, rews_tp1, ters_tp1 = [], [], [], [], []

        cumulative_reward += rew_tp1
        obs_t = obs_tp1

        for i, ter in enumerate(ter_tp1):
            step += 1
            if ter:
                reward_monitor.add(step, cumulative_reward[i])
                cumulative_reward[i] = 0.0
            # save parameters and evaluate every 10^6 steps
            if step % 10 ** 6 == 0:
                path = os.path.join(logdir, 'model_{}.h5'.format(step))
                nn.save_parameters(path)
                eval_reward_monitor.add(step, eval_fn())
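The loop leaves return_fn abstract. As a minimal sketch (not the source's implementation), a compatible bootstrapped n-step return could look like the following, where gamma is an assumed hyperparameter and the last entry of vals_t is the bootstrap value appended above:

import numpy as np

# Hypothetical sketch of a return_fn compatible with train_loop.
# vals_t has T + 1 entries (the last is the bootstrap value),
# rews_tp1 and ters_tp1 have T entries each; every entry is an
# array with one element per parallel environment.
def nstep_return_fn(vals_t, rews_tp1, ters_tp1, gamma=0.99):
    ret = np.asarray(vals_t[-1], dtype=np.float32)  # bootstrap from V(s_T)
    rets_t = []
    for rew, ter in zip(reversed(rews_tp1), reversed(ters_tp1)):
        # cut the bootstrapped part at episode boundaries
        ret = rew + gamma * ret * (1.0 - np.asarray(ter, dtype=np.float32))
        rets_t.append(ret)
    return np.array(list(reversed(rets_t)), dtype=np.float32)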
def evaluate(self, obs_t):
    # epsilon-greedy exploration with epsilon=0.05
    if np.random.random() < 0.05:
        return np.random.randint(self.num_actions)
    self.infer_obs_t.d = np.array(pixel_to_float([obs_t]))
    self.infer_all.forward(clear_buffer=True)
    # majority vote over the greedy actions of all heads
    votes = np.zeros(self.num_actions)
    for q_value in self.infer_qs_t:
        votes[np.argmax(q_value.d[0])] += 1
    return np.argmax(votes)
def _func(step):
    # sample a prioritized mini-batch
    indices, experiences, weights = buffer.sample()
    obss_t = []
    acts_t = []
    rews_tp1 = []
    obss_tp1 = []
    ters_tp1 = []
    for experience in experiences:
        obss_t.append(experience['obs_t'])
        acts_t.append(experience['act_t'])
        rews_tp1.append(experience['rew_tp1'])
        obss_tp1.append(experience['obs_tp1'])
        ters_tp1.append(experience['ter_tp1'])
    # train with importance-sampling weights and get the TD errors back
    td, loss = model.train(pixel_to_float(obss_t), acts_t, rews_tp1,
                           pixel_to_float(obss_tp1), ters_tp1, weights)
    # refresh priorities with the latest TD errors
    buffer.update_priorities(indices, td)
    # synchronize the target network periodically
    if step % target_update_interval == 0:
        model.update_target()
    return [loss]
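_func assumes a replay buffer whose sample() returns (indices, experiences, weights) and which accepts new TD errors via update_priorities. A minimal proportional prioritized replay sketch matching that interface (capacity, batch_size, alpha, and beta here are illustrative assumptions, not values from the source) might be:

import numpy as np

# Minimal proportional prioritized replay sketch (Schaul et al. style)
# matching the interface _func expects; not the source's buffer.
class PrioritizedBuffer:
    def __init__(self, capacity, batch_size, alpha=0.6, beta=0.4):
        self.capacity = capacity
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.buffer = []
        self.priorities = np.zeros(capacity, dtype=np.float32)
        self.index = 0

    def append(self, experience):
        # new experiences get the current maximum priority
        max_p = self.priorities.max() if self.buffer else 1.0
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self.index] = experience
        self.priorities[self.index] = max_p
        self.index = (self.index + 1) % self.capacity

    def sample(self):
        ps = self.priorities[:len(self.buffer)] ** self.alpha
        probs = ps / ps.sum()
        indices = np.random.choice(len(self.buffer), self.batch_size, p=probs)
        # importance-sampling weights, normalized by their maximum
        weights = (len(self.buffer) * probs[indices]) ** -self.beta
        weights /= weights.max()
        experiences = [self.buffer[i] for i in indices]
        return indices, experiences, weights

    def update_priorities(self, indices, td):
        # small constant keeps every priority strictly positive
        self.priorities[indices] = np.abs(np.asarray(td)).flatten() + 1e-6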
def evaluate(self, obs_t):
    return self.infer(pixel_to_float(obs_t))
def evaluate(self, obs_t):
    self.eval_obs_t.d = np.array(pixel_to_float([obs_t]))
    self.eval_pi_t.forward(clear_buffer=True)
    # sample an action from the policy distribution
    pi = self.eval_pi_t.d[0]
    return np.random.choice(pi.shape[0], p=pi)
def infer(self, obs_t):
    self.infer_obs_t.d = np.array(pixel_to_float([obs_t]))
    # forward only the currently active head and act greedily on it
    self.infer_qs_t[self.current_head].forward(clear_buffer=True)
    return np.argmax(self.infer_qs_t[self.current_head].d[0])
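All of the snippets above rely on a pixel_to_float helper defined elsewhere in the source. Under the usual convention for uint8 Atari frames it would simply rescale observations to [0, 1]; a minimal sketch, assuming that convention:

import numpy as np

# Assumed helper: rescale uint8 pixel observations to [0, 1] floats.
def pixel_to_float(obs):
    return np.array(obs, dtype=np.float32) / 255.0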