def run_episode(env, agent, deterministic, do_training=True, rendering=False,
                max_timesteps=1000):
    """
    Runs one episode in a gym environment.
    deterministic == True => the agent executes only greedy actions according to
                             the Q-function approximator (no random actions).
    do_training == True   => train the agent.
    """
    stats = EpisodeStats()  # save statistics like episode reward or action usage

    state = env.reset()
    step = 0
    while True:
        action_id = agent.act(state=state, deterministic=deterministic)
        next_state, reward, terminal, info = env.step(action_id)

        if do_training:
            agent.train(state, action_id, next_state, reward, terminal)

        stats.step(reward, action_id)
        state = next_state

        if rendering:
            env.render()

        if terminal or step > max_timesteps:
            break

        step += 1

    return stats
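# A hypothetical usage sketch for run_episode above (the environment id and agent
# class are illustrative, not taken from the original source):
# env = gym.make("CartPole-v0")
# agent = DQNAgent(...)          # any agent exposing act(...) and train(...)
# for i in range(100):
#     stats = run_episode(env, agent, deterministic=False, do_training=True)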
def run_episode(env, agent, deterministic, skip_frames=0, do_training=True,
                rendering=False, max_timesteps=1000, history_length=0):
    """
    Runs one episode in a gym environment.
    deterministic == True => the agent executes only greedy actions according to
                             the Q-function approximator (no random actions).
    do_training == True   => train the agent.
    """
    stats = EpisodeStats()

    # Save history
    image_hist = []

    step = 0
    state = env.reset()

    # fix bug of corrupted states without rendering in the gym environment
    env.viewer.window.dispatch_events()

    # append image history to first state
    state = state_preprocessing(state)
    image_hist.extend([state] * (history_length + 1))
    state = np.array(image_hist).reshape(96, 96, history_length + 1)

    while True:
        # TODO: get action_id from the agent.
        # Hint: adapt the probabilities of the 5 actions for random sampling so that
        # the agent explores properly.
        # Hint: frame skipping might help you to get better results.
        # Minimal completion so the loop runs; id_to_action is an assumed helper that
        # maps a discrete action id to the continuous CarRacing action vector.
        action_id = agent.act(state=state, deterministic=deterministic)
        action = id_to_action(action_id)

        reward = 0
        for _ in range(skip_frames + 1):
            next_state, r, terminal, info = env.step(action)
            reward += r

            if rendering:
                env.render()

            if terminal:
                break

        next_state = state_preprocessing(next_state)
        image_hist.append(next_state)
        image_hist.pop(0)
        next_state = np.array(image_hist).reshape(96, 96, history_length + 1)

        if do_training:
            agent.train(state, action_id, next_state, reward, terminal)

        stats.step(reward, action_id)

        state = next_state

        if terminal or (step * (skip_frames + 1)) > max_timesteps:
            break

        step += 1

    return stats
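# state_preprocessing is referenced above but not defined in this snippet; a minimal
# sketch, assuming the usual CarRacing setup of converting the 96x96 RGB frame to a
# single grayscale channel (the original project may normalise or crop differently):
def state_preprocessing(state):
    rgb_weights = np.array([0.2125, 0.7154, 0.0721])
    gray = np.dot(state[..., :3], rgb_weights)   # (96, 96, 3) -> (96, 96)
    return gray.astype(np.float32)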
def test_agent(env, agent, run=0, episodes=5, time_steps=500, initial_state=None,
               initial_noise=None, render=True, deterministic=True):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    print_header(3, 'Testing')
    for e in range(episodes):
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

        for t in range(time_steps):
            if render:
                env.render()

            a = agent.get_action(s, deterministic=deterministic)
            s, r, d, _ = env.step(tn(a))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t

            if d:
                break

        pr_stats = {'run': run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e]}
        print_stats(pr_stats)

    if render:
        env.viewer.close()

    return stats
def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    self._run += 1

    for e in range(episodes):
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

        for t in range(time_steps):
            a = self._actor.get_action(s, deterministic=False)
            ns, r, d, _ = env.step(tn(a))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t

            self._steps += 1
            self._replay_buffer.add_transition(s, a, ns, r, d)

            # Sample replay buffer
            b_states, b_actions, b_nstates, b_rewards, b_terminal = \
                self._replay_buffer.random_next_batch(self._batch_size)

            # Get action according to the target actor policy
            b_nactions = self._actor_target.get_action(b_nstates, deterministic=False)

            # Compute the target Q value from the target critics
            target_Q1, target_Q2 = self._critic_target(b_nstates, b_nactions)
            target_Q = torch.min(target_Q1, target_Q2).reshape((-1))
            target_Q = b_rewards + (1 - b_terminal) * self._gamma * target_Q
            target_Q = target_Q.reshape((-1, 1)).detach()

            # Get current Q estimates from the critic
            current_Q1, current_Q2 = self._critic(b_states, b_actions)

            # Compute critic loss
            critic_loss = self._critic_loss(current_Q1, target_Q) + \
                self._critic_loss(current_Q2, target_Q)
            stats.episode_loss[e] += critic_loss.item()

            # Optimize the critic
            self._critic_optimizer.zero_grad()
            critic_loss.backward()
            self._critic_optimizer.step()

            # Delayed policy updates
            if self._steps % self._policy_freq == 0:
                # Compute the actor loss via the deterministic policy gradient
                actor_loss = -self._critic.Q1(
                    b_states,
                    self._actor.get_action(b_states, deterministic=True)).mean()

                # Optimize the actor
                self._actor_optimizer.zero_grad()
                actor_loss.backward()
                self._actor_optimizer.step()

                # Soft-update the target models
                soft_update(self._critic_target, self._critic, self._tau)
                soft_update(self._actor_target, self._actor, self._tau)

            if d:
                break

            s = ns

        pr_stats = {'run': self._run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e],
                    'loss': stats.episode_loss[e]}
        print_stats(pr_stats)

    return stats
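# The soft_update helper used above is not defined in this snippet; a minimal sketch,
# assuming the standard Polyak-averaging form used by TD3/DDPG (argument order matches
# the calls above: target first, source second):
def soft_update(target_net, source_net, tau):
    """Move each target parameter a small step (tau) towards the source parameter."""
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.data.copy_(tau * source_param.data +
                                    (1.0 - tau) * target_param.data)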
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)
    bestscore = 0

    new_tb_log = self._init_num_timesteps(reset_num_timesteps)
    callback = self._init_callback(callback)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    x, y = [0], [0]

    with SetVerbosity(self.verbose), \
            TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name,
                              new_tb_log) as writer:
        self._setup_learn()

        episode_stats = EpisodeStats(self.n_steps, self.n_envs)
        t_first_start = time.time()
        n_updates = total_timesteps // self.n_batch

        callback.on_training_start(locals(), globals())

        for update in range(1, n_updates + 1):
            assert self.n_batch % self.nminibatches == 0, (
                "The number of minibatches (`nminibatches`) "
                "is not a factor of the total number of samples "
                "collected per rollout (`n_batch`), "
                "some samples won't be used.")
            batch_size = self.n_batch // self.nminibatches
            t_start = time.time()
            frac = 1.0 - (update - 1.0) / n_updates
            lr_now = self.learning_rate(frac)
            cliprange_now = self.cliprange(frac)
            cliprange_vf_now = cliprange_vf(frac)

            callback.on_rollout_start()
            # true_reward is the reward without discount
            rollout = self.runner.run(callback)
            # Unpack
            obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout
            callback.update_locals(locals())
            callback.on_rollout_end()

            # Early stopping due to the callback
            if not self.runner.continue_training:
                break

            episode_stats.feed(true_reward, masks)
            self.ep_info_buf.extend(ep_infos)
            mb_loss_vals = []
            if states is None:  # nonrecurrent version
                update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1)
                inds = np.arange(self.n_batch)
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(inds)
                    for start in range(0, self.n_batch, batch_size):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_batch + start) // batch_size)
                        end = start + batch_size
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions,
                                                          values, neglogpacs))
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             writer=writer, update=timestep,
                                             cliprange_vf=cliprange_vf_now))
            else:  # recurrent version
                update_fac = max(
                    self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1)
                assert self.n_envs % self.nminibatches == 0
                env_indices = np.arange(self.n_envs)
                flat_indices = np.arange(self.n_envs * self.n_steps).reshape(
                    self.n_envs, self.n_steps)
                envs_per_batch = batch_size // self.n_steps
                for epoch_num in range(self.noptepochs):
                    np.random.shuffle(env_indices)
                    for start in range(0, self.n_envs, envs_per_batch):
                        timestep = self.num_timesteps // update_fac + (
                            (epoch_num * self.n_envs + start) // envs_per_batch)
                        end = start + envs_per_batch
                        mb_env_inds = env_indices[start:end]
                        mb_flat_inds = flat_indices[mb_env_inds].ravel()
                        slices = (arr[mb_flat_inds] for arr in (obs, returns, masks,
                                                                actions, values,
                                                                neglogpacs))
                        mb_states = states[mb_env_inds]
                        mb_loss_vals.append(
                            self._train_step(lr_now, cliprange_now, *slices,
                                             update=timestep, writer=writer,
                                             states=mb_states,
                                             cliprange_vf=cliprange_vf_now))

            loss_vals = np.mean(mb_loss_vals, axis=0)
            t_now = time.time()
            fps = int(self.n_batch / (t_now - t_start))

            if writer is not None:
                total_episode_reward_logger(
                    self.episode_reward,
                    true_reward.reshape((self.n_envs, self.n_steps)),
                    masks.reshape((self.n_envs, self.n_steps)),
                    writer, self.num_timesteps)

            if self.verbose == 1 and (update % log_interval == 0 or update == 1):
                explained_var = explained_variance(values, returns)
                logger.logkv("serial_timesteps", update * self.n_steps)
                logger.logkv("n_updates", update)
                logger.logkv("total_timesteps", self.num_timesteps)
                logger.logkv("fps", fps)
                logger.logkv("explained_variance", float(explained_var))
                # if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                #     logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                #     logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                logger.logkv("mean_episode_length", episode_stats.mean_length())
                logger.logkv("mean_episode_reward", episode_stats.mean_reward())
                logger.logkv('time_elapsed', t_start - t_first_start)
                for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                    logger.logkv(loss_name, loss_val)
                logger.dumpkvs()

            if self.verbose == 2 and (update % log_interval == 0 or update == 1) \
                    and episode_stats.mean_reward() > bestscore:
                bestscore = episode_stats.mean_reward()
                logger.logkv('time_elapsed', t_start - t_first_start)
                logger.logkv("mean_episode_reward", bestscore)
                logger.dumpkvs()

                x.append(self.num_timesteps)
                y.append(bestscore)
                ax.plot(x, y, marker='.', color='b')
                fig.canvas.draw()
                ax.set_xlim(left=0, right=total_timesteps)
                ax.set(title='Street Fighter 2 AI - PPO2 Algorithm',
                       ylabel='Fitness score', xlabel='Timesteps')
                fig.show()
                plt.pause(0.001)

        callback.on_training_end()
        return self
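# A note on the annealing above: get_schedule_fn wraps the learning rate and clip
# ranges so they can be called with the remaining-progress fraction `frac`, which this
# loop computes as 1.0 at the first update and close to 0.0 at the last one. A hedged
# example of the two forms this supports (the values are illustrative):
#   constant:  learning_rate = 2.5e-4                      -> lr_now == 2.5e-4 every update
#   callable:  learning_rate = lambda frac: 2.5e-4 * frac  -> lr_now decays linearly towards 0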
def learn(seed, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5,
          max_grad_norm=0.5, gamma=0.99, lam=0.95, nminibatches=4, noptepochs=4,
          cliprange=0.1, next_n=10, nslupdates=10, seq_len=10, ext_coef=1,
          int_coef=0.1, K=10):
    rng = np.random.RandomState(seed)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    loc_space = 2
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nbatch_sl_train = nenvs * seq_len // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, loc_space=loc_space,
                               ac_space=ac_space, nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nbatch_sl_train=nbatch_sl_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm, seq_len=seq_len, seed=seed)
    model = make_model()
    replay_buffer = Buffer(max_size=1000, seed=seed)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    next_n=next_n, seq_len=seq_len, int_coef=int_coef,
                    ext_coef=ext_coef, replay_buffer=replay_buffer, seed=seed)

    episode_raw_stats = EpisodeStats(nsteps, nenvs)
    episode_stats = EpisodeStats(nsteps, nenvs)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    sl_acc = 0
    p = 0

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        p = update * nbatch / (total_timesteps * 0.875)
        nbatch_train = nbatch // nminibatches
        tstart = time.time()

        obs, locs, goals, raw_rewards, rewards, returns, masks, rnn_masks, actions, \
            values, neglogpacs, states = runner.run(K, p)
        episode_raw_stats.feed(raw_rewards, masks)
        episode_stats.feed(rewards, masks)

        mblossvals = []
        assert nenvs % nminibatches == 0
        envsperbatch = nenvs // nminibatches
        envinds = np.arange(nenvs)
        flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        envsperbatch = nbatch_train // nsteps
        for _ in range(noptepochs):
            rng.shuffle(envinds)
            for start in range(0, nenvs, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds] for arr in (obs, locs, goals, returns,
                                                      rnn_masks, actions, values,
                                                      neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(model.train(lr, cliprange, *slices, mbstates))

        if nslupdates > 0 and sl_acc < 0.75:
            sl_acc, sl_loss = sl_train(model, replay_buffer, nslupdates=nslupdates,
                                       seq_len=seq_len, nenvs=nenvs,
                                       envsperbatch=envsperbatch, lr=lr)
        elif nslupdates > 0:
            sl_acc, sl_loss = sl_train(model, replay_buffer, nslupdates=1,
                                       seq_len=seq_len, nenvs=nenvs,
                                       envsperbatch=envsperbatch, lr=lr)

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        logger.logkv("serial_timesteps", update * nsteps)
        logger.logkv("nupdates", update)
        logger.logkv("total_timesteps", update * nbatch)
        logger.logkv("fps", fps)
        logger.logkv('episode_raw_reward', episode_raw_stats.mean_reward())
        logger.logkv('imitation_episode_reward', np.mean(runner.recent_imitation_rewards))
        logger.logkv('episode_reward', episode_stats.mean_reward())
        logger.logkv('episode_success_ratio', np.mean(runner.recent_success_ratio))
        logger.logkv('time_elapsed', tnow - tfirststart)
        if nslupdates > 0:
            logger.logkv('sl_loss', sl_loss)
            logger.logkv('sl_acc', sl_acc)
        logger.logkv('replay_buffer_num', replay_buffer.num_episodes())
        logger.logkv('replay_buffer_best', replay_buffer.max_reward())
        if noptepochs > 0:
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
        logger.dumpkvs()

    print(logger.get_dir())
    env.close()
    return model
class MainTrainingLoop:
    """ Resembles ray.Trainable """

    @ex.capture
    def __init__(self, *, task_name):
        logger.info("Executing training...")
        tmp_env = get_env(record=False)
        self.is_done = tmp_env.unwrapped.is_done
        self.eval_tasks = {task_name: tmp_env.tasks()[task_name]}
        self.exploitation_task = tmp_env.tasks()[task_name]
        del tmp_env

        # Constitute the state of the Trainable
        ex.step_i = 0
        self.model = get_model()
        self.reward_model = get_reward_model()
        self.model_optimizer = get_model_optimizer(self.model.parameters())
        self.reward_model_optimizer = get_reward_model_optimizer(
            self.reward_model.parameters())
        self.buffer = get_buffer()
        self.agent = get_agent(mode='train')
        self.agent.setup_normalizer(self.buffer.normalizer)
        self.stats = EpisodeStats(self.eval_tasks)
        self.last_avg_eval_score = None
        self.neptune_ex = None
        ex.mlog = None

        # Not considered part of the state
        self.new_experiment = True  # Tracks whether a new (neptune) experiment was created or an old one is continued
        self.random_agent = get_random_agent()
        self._common_setup()

    @ex.capture
    def _common_setup(self, *, render, record, dump_dir, _run):
        """ Called in __init__ but also needs to be called after restore
        (due to reinitialized randomness). """
        video_file_base = (dump_dir + "/max_exploitation_step_{}.mp4"
                           if dump_dir is not None else None)
        self.env_loop = EnvLoop(get_env, render=render, record=record,
                                video_file_base=video_file_base, run=_run)

    def _setup_if_new(self):
        """ Executed for a new experiment only. This is a workaround for Trainable. """
        if self.new_experiment:
            self.new_experiment = False
            self.neptune_ex = get_neptune_ex()
            ex.mlog = MetricLogger(ex, self.neptune_ex)

    @ex.capture
    def train(self, *, device, n_total_steps, n_warm_up_steps, record_freq, record,
              model_training_freq, policy_training_freq, eval_freq, task_name,
              model_training_n_batches, train_reward):
        """ A single step of interaction with the environment. """
        self._setup_if_new()
        ex.step_i += 1

        behavioral_agent = self.random_agent if ex.step_i <= n_warm_up_steps else self.agent
        with torch.no_grad():
            action = behavioral_agent.get_action(self.env_loop.state,
                                                 deterministic=False).to('cpu')
        prev_state = self.env_loop.state.clone().to(device)

        if record and (ex.step_i == 1 or ex.step_i % record_freq == 0):
            self.env_loop.record_next_episode()

        state, next_state, done = self.env_loop.step(to_np(action),
                                                     video_file_suffix=ex.step_i)

        reward = self.exploitation_task(state, action, next_state).item()
        self.buffer.add(state, action, next_state,
                        torch.from_numpy(np.array([[reward]], dtype=np.float64)))
        self.stats.add(state, action, next_state, done)

        if done:
            log_last_episode(self.stats)

        tasks_rewards = {f'{task_name}': self.stats.get_recent_reward(task_name)
                         for task_name in self.eval_tasks}
        step_stats = dict(
            step=ex.step_i,
            done=done,
            action_abs_mean=action.abs().mean().item(),
            reward=self.exploitation_task(state, action, next_state).item(),
            action_value=self.agent.get_action_value(prev_state, action).item(),
        )
        ex.mlog.add_scalars('main_loop', {**step_stats, **tasks_rewards})

        # (Re)train the model on the current buffer
        if model_training_freq is not None and model_training_n_batches > 0 \
                and ex.step_i % model_training_freq == 0:
            self.model.setup_normalizer(self.buffer.normalizer)
            self.reward_model.setup_normalizer(self.buffer.normalizer)
            timed(train_model)(self.model, self.model_optimizer, self.buffer, mode='train')
            if train_reward:
                task = self.exploitation_task
                timed(train_reward_model)(self.reward_model, self.reward_model_optimizer,
                                          self.buffer, mode='train', task=task)

        # (Re)train the policy using the current buffer and model
        if ex.step_i >= n_warm_up_steps and ex.step_i % policy_training_freq == 0:
            task = self.exploitation_task
            self.agent.setup_normalizer(self.buffer.normalizer)
            self.agent = timed(train_agent)(self.agent, self.model, self.reward_model,
                                            self.buffer, task=task, task_name=task_name,
                                            is_done=self.is_done, mode='train',
                                            context_i={})

        # Evaluate the agent
        if eval_freq is not None and ex.step_i % eval_freq == 0:
            self.last_avg_eval_score = evaluate_on_tasks(agent=self.agent,
                                                         model=self.model,
                                                         buffer=self.buffer,
                                                         task_name=task_name,
                                                         context='eval')

        experiment_finished = ex.step_i >= n_total_steps
        return DotMap(done=experiment_finished,
                      avg_eval_score=self.last_avg_eval_score,
                      action_abs_mean=action.abs().mean().item(),  # Just for regression tests
                      step_i=ex.step_i)

    def stop(self):
        self.env_loop.close()
        if ex.mlog is not None:
            ex.mlog.save_artifacts()
            if ex.mlog.neptune_ex is not None:
                logger.info("Stopping neptune...")
                ex.mlog.neptune_ex.stop()
def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    self._run += 1

    for e in range(episodes):
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples.
        episode = []
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)

        total_r = 0
        for t in range(time_steps):
            a = self._get_action(s)
            ns, r, d, _ = env.step(tn(self._action_fun.act2env(a)))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t

            episode.append((s, a, r))
            total_r += r

            if d:
                break

            s = ns

        gamma_t = 1
        for t in range(len(episode)):
            s, a, r = episode[t]

            # Discounted Monte-Carlo return from step t onwards
            g = 0
            gamma_kt = 1
            for k in range(t, len(episode)):
                gamma_kt = gamma_kt * self._gamma
                _, _, r_k = episode[k]
                g = g + (gamma_kt * r_k)
            g = float(g)

            p = self._pi(s, a)

            # For numerical stability: clamp so the probability is neither larger than
            # one (e.g. a delta distribution) nor zero, which would break the log in
            # the score function.
            eps = 1e-8
            p = p.clamp(eps, 1)
            log_p = torch.log(p)

            gamma_t = gamma_t * self._gamma

            if self._baseline:
                bl = self.baseline_fun(s)
                delta = g - bl

                bl_loss = self._bl_loss_function(self.baseline_fun(s), tt([g]))
                self._bl_optimizer.zero_grad()
                bl_loss.backward()
                self._bl_optimizer.step()

                score_fun = torch.mean(-(gamma_t * delta) * log_p)
            else:
                score_fun = torch.mean(-(gamma_t * g) * log_p)

            stats.episode_loss[e] += score_fun.item()

            self._pi_optimizer.zero_grad()
            score_fun.backward()
            self._pi_optimizer.step()

        pr_stats = {'run': self._run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e],
                    'loss': stats.episode_loss[e]}
        print_stats(pr_stats)

    return stats
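# The inner loop above recomputes the discounted return from scratch for every t,
# which is O(T^2) per episode. A minimal sketch of an equivalent single backward pass;
# note that the loop above weights r_k by gamma**(k - t + 1), i.e. the standard return
# scaled by a constant factor gamma, which the recurrence below reproduces:
def discounted_returns(rewards, gamma):
    """Return a list g where g[t] = sum over k >= t of gamma**(k - t + 1) * rewards[k]."""
    g = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = gamma * (rewards[t] + running)
        g[t] = running
    return g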
def learn(policy, env, seed, ob_space, ac_space, save_name, nsteps=5,
          total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5,
          lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99,
          log_interval=100):
    set_global_seeds(seed)

    nenvs = env.num_envs
    # ob_space = env.observation_space
    # ac_space = env.action_space
    save_dir = './model/' + save_name + '.ckt'
    summary_dir = './summary/' + save_name

    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule,
                  summary_dir=summary_dir)
    runner = Runner(env, model, ob_space=ob_space, nsteps=nsteps, gamma=gamma)

    nbatch = nenvs * nsteps
    tstart = time.time()
    train_writer = model.train_writer
    episode_stats = EpisodeStats(nsteps, nenvs)

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        mean_reward = episode_stats.mean_reward()
        mean_reward = np.asarray(mean_reward, dtype=np.float32)

        policy_loss, value_loss, policy_entropy, summary = model.train(
            obs, states, mean_reward, rewards, masks, actions, values)
        train_writer.add_summary(summary, update * nbatch)

        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.record_tabular("episode_length", episode_stats.mean_length())
            logger.dump_tabular()
            model.save(save_dir)

    env.close()
    return model
def learn(policy, env, seed, nsteps=5, nstack=4, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100,
          max_episode_length=None, optimizer=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha,
                  epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule, optimizer=optimizer)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    stats = EpisodeStats(nsteps, nenvs, maxlen=100)

    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in itertools.count():
        obs, states, rewards, masks, actions, values = runner.run()
        total_loss, policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        stats.feed(rewards, masks)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("total_loss", float(total_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("mean_episode_length", stats.mean_length())
            logger.record_tabular("mean_episode_reward", stats.mean_reward())
            logger.dump_tabular()

        if max_episode_length and stats.mean_length() >= max_episode_length:
            break

    env.close()
def train(self, env, episodes, time_steps, initial_state=None, initial_noise=0.5):
    stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                         episode_rewards=np.zeros(episodes),
                         episode_loss=np.zeros(episodes))

    self._run += 1

    for e in range(episodes):
        s = env.reset(initial_state=initial_state, noise_amplitude=initial_noise)
        total_r = 0

        # Step the policy to advance the epsilon scheduler
        epsilon = self._pi.epsilon()
        # print("\t\t\tStep: {:5d} Epsilon: {:6.5f}".format(t, epsilon))
        self._pi.step()

        for t in range(time_steps):
            a = self._get_action(s)
            ns, r, d, _ = env.step(self._action_fun.act2env(a))

            stats.episode_rewards[e] += r
            stats.episode_lengths[e] = t
            total_r += r

            if self._use_rbuffer:
                self._replay_buffer.add_transition(s, a, ns, r, d)
                b_states, b_actions, b_nstates, b_rewards, b_terminal = \
                    self._replay_buffer.random_next_batch(self._batch_size)
                dim = 1
            else:
                b_states = s
                b_actions = a
                b_nstates = ns
                b_rewards = r
                b_terminal = d
                dim = 0

            if self._doubleQ:
                # Q-values of the next states from [Q], used only to determine the optimal next actions
                q_nstates = self._q(b_nstates)
                # Optimal action prediction [Q]
                nactions = torch.argmax(q_nstates, dim=dim)
                if self._use_rbuffer:
                    nactions = [torch.arange(self._batch_size).long(), nactions]
                # Q-values from the [Q_target] function using the action indices from the [Q] function
                q_target_nstates = self._q_target(b_nstates)[nactions]
            else:
                q_target_nstates = self._q_target(b_nstates)
                # torch.max with dim returns (values, indices); keep the values only
                q_target_nstates = torch.max(q_target_nstates, dim=dim).values

            target_prediction = b_rewards + (1 - b_terminal) * self._gamma * q_target_nstates

            if self._use_rbuffer:
                q_actions = [torch.arange(self._batch_size).long(), b_actions.long()]
            else:
                q_actions = b_actions
            current_prediction = self._q(b_states)[q_actions]

            loss = self._loss_function(current_prediction, target_prediction.detach())
            stats.episode_loss[e] += loss.item()

            self._q_optimizer.zero_grad()
            loss.backward()
            self._q_optimizer.step()

            soft_update(self._q_target, self._q, self._tau)

            if d:
                break

            s = ns

        pr_stats = {'run': self._run,
                    'steps': int(stats.episode_lengths[e] + 1),
                    'episode': e + 1,
                    'episodes': episodes,
                    'reward': stats.episode_rewards[e],
                    'loss': stats.episode_loss[e]}
        print_stats(pr_stats, ', Epsilon: {:6.5f}'.format(epsilon))

    return stats
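# _get_action is not defined in this snippet; a minimal sketch of such a method,
# assuming an epsilon-greedy choice over the Q-network outputs. self._pi.epsilon()
# is the scheduled exploration rate logged above, and tt is assumed to be the same
# tensor-conversion helper used elsewhere in these snippets; the exact return type
# in the original code may differ.
def _get_action(self, s):
    q_values = self._q(tt(s))
    if np.random.rand() < self._pi.epsilon():
        # Explore: uniform random discrete action
        return np.random.randint(q_values.shape[-1])
    # Exploit: greedy action w.r.t. the current Q estimates
    with torch.no_grad():
        return int(torch.argmax(q_values))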