import os
from glob import glob

import matplotlib.pyplot as plt
# Assumed import: load_results as provided by baselines.bench, which returns a
# monitor-style DataFrame with episode reward 'r', length 'l' and time 't'.
from baselines.bench import load_results


def plot(experiment_path, roll=50, save_name="results"):
    fig = plt.figure(figsize=(20, 10))

    if len(glob(os.path.join(experiment_path, "*monitor*"))) != 0:
        exps = glob(experiment_path)
        print(exps)

        # Load monitor data and convert units: steps to millions, time to hours
        df_train = load_results(experiment_path)
        df_train['steps'] = df_train['l'].cumsum() / 1000000
        df_train['time'] = df_train['t'] / 3600

        # Smoothed training reward curve
        ax = plt.subplot(1, 1, 1)
        df_train.rolling(roll).mean().plot('steps', 'r', style='-',
                                           ax=ax, legend=False)

        fig.legend(["train"], loc="lower center", ncol=2)
        ax.set_xlabel('Num steps (M)')
        ax.set_ylabel('Reward')
        ax.grid(True)

        fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                            wspace=0.1, hspace=0.2)

        # Save figure
        save_name = os.path.join(experiment_path, save_name) + ".jpg"
        ax.get_figure().savefig(save_name)
        print("Plot saved as: {}".format(save_name))
        plt.clf()
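
# A minimal sketch (not part of the original code) of the data layout the plot()
# helpers here assume: load_results() yields one row per finished episode, with
# 'r' the episode reward, 'l' the episode length in steps and 't' the wall-clock
# time in seconds. The synthetic frame and the _make_fake_monitor name below are
# illustrative only.
import numpy as np
import pandas as pd


def _make_fake_monitor(num_episodes=500):
    """Build a DataFrame shaped like a monitor log, filled with synthetic data."""
    return pd.DataFrame({
        'r': np.random.randn(num_episodes).cumsum(),           # episode rewards
        'l': np.random.randint(100, 1000, size=num_episodes),  # episode lengths (steps)
        't': np.arange(num_episodes) * 30.0,                   # seconds since start
    })

# Example of the smoothing used above, on synthetic data:
#     df = _make_fake_monitor()
#     df['steps'] = df['l'].cumsum() / 1000000
#     df.rolling(50).mean().plot('steps', 'r', style='-')
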
def plot(experiment_path, save_dir="/tmp/", save_name="results", limit_steps=None):
    fig = plt.figure(figsize=(20, 10))

    if len(glob(os.path.join(experiment_path, "train/*monitor*"))) != 0:
        exps = glob(experiment_path)
        print(exps)

        # Get data
        df = load_results(os.path.join(experiment_path, "train"))
        df['steps'] = df['l'].cumsum()
        if 'rrr' in df:
            # If a raw reward column exists, keep only rows with no remaining
            # lives and plot it instead of 'r'
            df = df[df['lives'] == 0].copy()
            df['r'] = df['rrr']

        # Build the rolling window after the data has been prepared, so the
        # mean curve and the max/min envelope come from the same frame
        roll = 5
        rdf = df.rolling(roll)

        ax = plt.subplot(1, 1, 1)
        df.rolling(roll).mean().plot('steps', 'r', style='-', ax=ax, legend=False)
        rdf.max().plot('steps', 'r', style='-', ax=ax, legend=False,
                       color="#28B463", alpha=0.65)
        rdf.min().plot('steps', 'r', style='-', ax=ax, legend=False,
                       color="#F39C12", alpha=0.65)

        # X axis
        gap = 1
        # ax.set_xticks(np.arange(0, ((df['steps'].iloc[-1] // gap) + 1) * gap, gap))
        ax.set_xlabel('Num steps')
        if limit_steps:
            plt.xlim(0, limit_steps)

        # Y axis
        gap = 25
        # ax.set_yticks(np.arange(((df['r'].min() // gap) - 1) * gap, ((df['r'].max() // gap) + 1) * gap, gap))
        ax.set_ylabel('Reward')
        ax.grid(True)

        fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                            wspace=0.1, hspace=0.2)

        # Save figure
        ax.get_figure().savefig(os.path.join(save_dir, save_name) + ".jpg")
        plt.clf()
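
# Usage sketch (the experiment path below is hypothetical): the first variant
# above reads "*monitor*" files directly from experiment_path and saves the
# figure next to them, while this second variant expects the logs under
# "<experiment_path>/train/" and writes the figure into save_dir. If the
# directory holds no monitor files, nothing is plotted.
if __name__ == "__main__":
    plot("/tmp/experiments/pong_a2c", save_dir="/tmp/plots/",
         save_name="pong", limit_steps=None)
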
def train(self, env_id, seed, num_workers, max_timesteps, gamma, ent_coef,
          value_coef, num_steps_update, max_grad_norm, log_interval,
          optimizer, optimizer_params, epsilon_greedy=False):
    """Performs training of an A2C agent.

    Parameters
    ----------
    env_id: string
        environment to train on, using Gym's id
    seed: int
        random seed
    num_workers: int
        number of parallel workers (environments)
    max_timesteps: int
        total number of training steps
    gamma: float
        discount factor
    ent_coef: float
        controls the strength of the entropy regularization term
    value_coef: float
        controls the strength of the value loss term
    num_steps_update: int
        number of forward steps per A2C update (n-step rollout length)
    max_grad_norm: float
        maximum norm of the gradients (gradient clipping threshold)
    log_interval: int
        number of steps between logging updates
    optimizer: torch.optim.Optimizer
        the optimizer class used for the network (instantiated internally)
    optimizer_params: dict
        lr: float
            learning rate
        alpha: float
            smoothing constant
        eps: float
            term added to the denominator to improve numerical stability
    epsilon_greedy: bool
        whether to use an ε-greedy policy on top of the sampled actions
    """
    env = self.create_env_vec(env_id, seed, num_workers)

    # TODO: move the network initialization elsewhere
    self.net = mlp([env.observation_space.shape[0]], env.action_space.n, [16])
    self.net.train()

    optimizer = optimizer(self.net.parameters(), **optimizer_params)

    episode_len = 0
    state = env.reset()
    # TODO: handle stacking of frames in ale/rle
    state = torch.from_numpy(state)
    assert (state.shape[0] == num_workers)
    assert (state.shape[1:] == torch.Size(list(env.observation_space.shape)))

    avg_value_estimate = AverageMeter()
    avg_value_loss = AverageMeter()
    avg_policy_loss = AverageMeter()
    avg_entropy_loss = AverageMeter()

    while self.T < max_timesteps:
        rewards = []
        values = []
        entropies = []
        log_probs = []
        terminals = []

        # Linearly annealed exploration rate
        # TODO: set the parameters through args
        if epsilon_greedy:
            init_eps = 0.5
            end_eps = 0.15
            steps_eps = 50000
            epsilon = max(end_eps,
                          init_eps - self.T * (init_eps - end_eps) / steps_eps)

        # Collect a rollout of num_steps_update steps from all workers
        for t in range(num_steps_update):
            # env.render()
            action_prob, value = self.net(Variable(state))
            avg_value_estimate.update(value.data.mean())
            # print(action_prob.mean(0).data)
            action = action_prob.multinomial().data
            action_log_probs = torch.log(action_prob)
            entropy = -(action_log_probs * action_prob).sum(1)

            if epsilon_greedy:
                # With probability epsilon, replace the sampled action with a
                # uniformly random one
                rand_numbers = torch.rand(num_workers)
                action_mask = rand_numbers.le(
                    epsilon * torch.ones(rand_numbers.size()))
                random_actions = torch.multinomial(
                    torch.ones(env.action_space.n), num_workers,
                    replacement=True)
                action[action_mask] = random_actions[action_mask]

            state, reward, terminal, info = env.step(action.numpy())
            # TODO: is the code below necessary?
            # for n, done in enumerate(dones):
            #     if done:
            #         self.obs[n] = self.obs[n] * 0
            state = torch.from_numpy(state)
            episode_len += 1

            # save rewards and values for later
            rewards.append(reward)
            terminals.append(terminal)
            values.append(value)
            entropies.append(entropy)
            log_probs.append(action_log_probs.gather(1, Variable(action)))
            self.T += 1

        # Convert lists to torch.Tensor/Variable, shaped (num_workers, num_steps)
        rewards = torch.from_numpy(
            np.asarray(rewards, dtype=np.float32)).transpose(0, 1)
        terminals = torch.from_numpy(
            np.asarray(terminals, dtype=np.uint8)).transpose(0, 1)
        values = torch.cat(values, 1)
        # Stack/concatenate along dim 1 so entropies and log_probs stay aligned
        # with values (worker-major layout)
        entropies = torch.stack(entropies, 1)
        log_probs = torch.cat(log_probs, 1)
        rewards = Variable(rewards, requires_grad=False)

        # Value of the state reached at the end of the rollout, used to bootstrap
        _, last_value = self.net(Variable(state))
        last_value.squeeze_()

        mask = Variable(torch.ones(terminals.size()) - terminals.float(),
                        requires_grad=False)
        R = Variable(torch.zeros(rewards.size()))  # VALIDATE: is this the correct place for Variable()?
        # n-step returns: bootstrap from the value of the last state and work
        # backwards through the rollout, cutting at terminal steps via mask
        R[:, -1] = last_value * mask[:, -1]  # bootstrap from last state
        for i in reversed(range(num_steps_update - 1)):
            R[:, i] = (rewards[:, i] + gamma * R[:, i + 1]) * mask[:, i]

        advantage = R - values
        value_loss = advantage.pow(2)[:, :-1].mean()
        # The advantage is treated as a constant in the policy gradient, so it
        # is detached from the graph here
        policy_loss = (-advantage.detach() * log_probs)[:, :-1].mean()
        entropy_loss = entropies[:, :-1].mean()

        optimizer.zero_grad()
        # The entropy term is subtracted: higher entropy is encouraged as an
        # exploration bonus
        (policy_loss + value_coef * value_loss
         - ent_coef * entropy_loss).backward()

        avg_entropy_loss.update(entropy_loss.data[0])
        avg_value_loss.update(value_loss.data[0])
        avg_policy_loss.update(policy_loss.data[0])

        torch.nn.utils.clip_grad_norm(self.net.parameters(), max_grad_norm)
        optimizer.step()

        episode_len = 0

        if self.T % log_interval == 0:
            # save results
            json_results = bench.load_results(self.save_path)
            self.results.add(
                step=self.T,
                value=avg_value_estimate.avg(),
                avg_entropy_loss=avg_entropy_loss.avg(),
                avg_policy_loss=avg_policy_loss.avg(),
                avg_value_loss=avg_value_loss.avg(),
                time=time.time() - json_results['initial_reset_time'],
                mean_reward=np.mean(json_results['episode_rewards'][-10:]))

            avg_value_estimate.reset()
            avg_value_loss.reset()
            avg_policy_loss.reset()
            avg_entropy_loss.reset()

            # self.results.smooth('reward', window=10)
            # self.results.smooth('value', window=10)
            # self.results.smooth('avg_policy_loss', window=10)
            # self.results.smooth('avg_value_loss', window=10)
            # self.results.smooth('avg_entropy_loss', window=10)
            # self.results.plot(x='step', y='reward_smoothed',
            #                   title='Reward', ylabel='Reward')
            self.results.plot(x='time', y='mean_reward',
                              title='mean_reward', ylabel='average reward')
            # self.results.plot(x='step', y='epsilon',
            #                   title='epsilon', ylabel='epsilon')
            self.results.plot(x='step', y='value',
                              title='value', ylabel='Avg value estimate')
            self.results.plot(x='step', y='avg_policy_loss',
                              title='avg_policy_loss', ylabel='avg_policy_loss')
            self.results.plot(x='step', y='avg_value_loss',
                              title='avg_value_loss', ylabel='avg_value_loss')
            self.results.plot(x='step', y='avg_entropy_loss',
                              title='avg_entropy_loss', ylabel='avg_entropy_loss')
            self.results.save()

    env.close()
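
# A standalone sketch (not part of the class above) of the n-step return
# recursion used in train(), written in plain NumPy so it can be checked by
# hand. The last column holds the bootstrapped value of the state reached at
# the end of the rollout, earlier columns accumulate discounted rewards, and
# mask (1 - terminal) cuts the recursion at episode boundaries. Note that the
# reward of the final step is never used and the last column is only the
# bootstrap value, which is why the losses in train() drop it with [:, :-1].
# The function name is illustrative only.
import numpy as np


def nstep_returns(rewards, last_value, mask, gamma):
    """rewards, mask: (num_workers, n); last_value: (num_workers,)."""
    R = np.zeros(rewards.shape)
    R[:, -1] = last_value * mask[:, -1]          # bootstrap from last state
    for i in reversed(range(rewards.shape[1] - 1)):
        R[:, i] = (rewards[:, i] + gamma * R[:, i + 1]) * mask[:, i]
    return R


# One worker, a 4-step rollout with no terminal, gamma = 0.9 and V(s_4) = 10:
# R = [[1 + 0.9 * 12.8, 2 + 0.9 * 12, 3 + 0.9 * 10, 10]] = [[12.52, 12.8, 12.0, 10.0]]
print(nstep_returns(np.array([[1.0, 2.0, 3.0, 4.0]]),
                    last_value=np.array([10.0]),
                    mask=np.ones((1, 4)),
                    gamma=0.9))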