Example #1
import os
from glob import glob

import matplotlib.pyplot as plt
from baselines.bench import load_results  # assumed source of load_results


def plot(experiment_path, roll=50, save_name="results"):

    fig = plt.figure(figsize=(20, 10))

    if len(glob(os.path.join(experiment_path, "*monitor*"))) == 0:
        # Nothing to plot: no monitor files were found for this experiment.
        print("No monitor files found in {}".format(experiment_path))
        return

    exps = glob(experiment_path)
    print(exps)

    # Load monitor data and derive step (in millions) and wall-clock (hours) axes.
    df_train = load_results(experiment_path)
    df_train['steps'] = df_train['l'].cumsum() / 1000000
    df_train['time'] = df_train['t'] / 3600

    ax = plt.subplot(1, 1, 1)
    df_train.rolling(roll).mean().plot('steps', 'r', style='-', ax=ax, legend=False)

    fig.legend(["train"], loc="lower center", ncol=2)
    ax.set_xlabel('Num steps (M)')
    ax.set_ylabel('Reward')
    ax.grid(True)

    fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.1, hspace=0.2)

    # Save figure next to the monitor files
    save_name = os.path.join(experiment_path, save_name) + ".jpg"
    fig.savefig(save_name)
    print("Plot saved as: {}".format(save_name))
    plt.clf()
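
The function above assumes the experiment directory contains *monitor* CSV files in the format read by load_results (for example, written by OpenAI Baselines' bench.Monitor wrapper). A minimal usage sketch, with a hypothetical experiment path:

# Hypothetical directory holding monitor CSV files for one training run.
experiment_path = "/tmp/my_experiment"

# Smooth the reward over a 100-episode rolling window and save
# /tmp/my_experiment/reward_curve.jpg next to the monitor files.
plot(experiment_path, roll=100, save_name="reward_curve")
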
Example #2
import os
from glob import glob

import matplotlib.pyplot as plt
from baselines.bench import load_results  # assumed source of load_results


def plot(experiment_path, save_dir="/tmp/", save_name="results", limit_steps=None):

    fig = plt.figure(figsize=(20, 10))

    if len(glob(os.path.join(experiment_path, "train/*monitor*"))) == 0:
        # Nothing to plot: no training monitor files were found.
        print("No monitor files found in {}".format(experiment_path))
        return

    exps = glob(experiment_path)
    print(exps)

    # Get data
    df = load_results(os.path.join(experiment_path, "train"))
    df['steps'] = df['l'].cumsum()
    if 'rrr' in df:
        # If an 'rrr' column is present, keep only rows with lives == 0
        # and use 'rrr' as the reward.
        df = df[df['lives'] == 0].copy()
        df['r'] = df['rrr']

    # Rolling window over the (possibly filtered) data.
    roll = 5
    rdf = df.rolling(roll)

    ax = plt.subplot(1, 1, 1)
    rdf.mean().plot('steps', 'r', style='-', ax=ax, legend=False)
    rdf.max().plot('steps', 'r', style='-', ax=ax, legend=False, color="#28B463", alpha=0.65)
    rdf.min().plot('steps', 'r', style='-', ax=ax, legend=False, color="#F39C12", alpha=0.65)

    # X axis
    ax.set_xlabel('Num steps (M)')
    if limit_steps:
        plt.xlim(0, limit_steps)

    # Y axis
    ax.set_ylabel('Reward')
    ax.grid(True)

    fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.1, hspace=0.2)

    # Save figure
    fig.savefig(os.path.join(save_dir, save_name) + ".jpg")
    plt.clf()
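
The three curves drawn above are the rolling mean, max, and min of the episode reward over a 5-episode window, so the max/min pair forms an envelope around the smoothed curve. An illustrative sketch of the same rolling statistics on a toy reward table (the column names mirror the monitor format; the data here is made up):

import numpy as np
import pandas as pd

# Toy monitor-style data: per-episode reward 'r' and episode length 'l'.
toy = pd.DataFrame({
    "r": np.random.randn(100).cumsum(),     # a drifting reward curve
    "l": np.random.randint(100, 200, 100),  # episode lengths
})
toy["steps"] = toy["l"].cumsum()

roll = 5
rolling = toy.rolling(roll)
smoothed = rolling.mean()  # main curve
upper = rolling.max()      # upper envelope
lower = rolling.min()      # lower envelope
print(smoothed[["steps", "r"]].tail())
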
Example #3
    def train(self,
              env_id,
              seed,
              num_workers,
              max_timesteps,
              gamma,
              ent_coef,
              value_coef,
              num_steps_update,
              max_grad_norm,
              log_interval,
              optimizer,
              optimizer_params,
              epsilon_greedy=False):
        """Performs training of an A2C thread
        Parameters
        ----------
        env_id: string
            environment to train on, using Gym's id
        seed: int
            random seed
        num_workers: int
            number of workers
        max_timesteps: int
            total training steps
        gamma: float
            discount factor
        ent_coef: float
            controls the strength of the entropy regularization term
        value_coef: float
            controls the strength of the value loss term
        num_steps_update: int
            number of steps in A2C
        max_grad_norm: float
            maximum gradient of the norm of the weights
        log_interval: int
            frequency of logging
        epsilon_greedy: bool
            whether to use an ε-greedy policy
        optimizer: torch.Optimizer
            the network's optimizer
        optimizer_params: dict
            lr: float
                learning rate
            alpha: float
                smoothing constant
            eps: float
                term added to the denominator to improve numerical stability
        """

        env = self.create_env_vec(env_id, seed, num_workers)

        # TODO: move the network initialization elsewhere
        self.net = mlp([env.observation_space.shape[0]], env.action_space.n,
                       [16])
        self.net.train()

        optimizer = optimizer(self.net.parameters(), **optimizer_params)

        episode_len = 0
        state = env.reset()
        # TODO: handle stacking of frames in ale/rle
        state = torch.from_numpy(state)
        assert state.shape[0] == num_workers
        assert state.shape[1:] == torch.Size(list(env.observation_space.shape))

        avg_value_estimate = AverageMeter()
        avg_value_loss = AverageMeter()
        avg_policy_loss = AverageMeter()
        avg_entropy_loss = AverageMeter()

        while self.T < max_timesteps:

            rewards = []
            values = []
            entropies = []
            log_probs = []
            terminals = []

            # TODO: set the parameters through args
            if epsilon_greedy:
                # Linearly anneal epsilon from init_eps down to end_eps
                # over steps_eps steps.
                init_eps = 0.5
                end_eps = 0.15
                steps_eps = 50000
                epsilon = max(
                    end_eps,
                    init_eps - self.T * (init_eps - end_eps) / steps_eps)

            for t in range(num_steps_update):
                # env.render()
                # Forward pass: per-worker action probabilities and value estimates.
                action_prob, value = self.net(Variable(state))
                avg_value_estimate.update(value.data.mean())
                # print(action_prob.mean(0).data)
                # Sample one action per worker from the policy distribution.
                action = action_prob.multinomial().data

                # Log-probabilities and per-worker policy entropy.
                action_log_probs = torch.log(action_prob)
                entropy = -(action_log_probs * action_prob).sum(1)

                if epsilon_greedy:
                    # With probability epsilon, replace the sampled action
                    # with a uniformly random one.
                    rand_numbers = torch.rand(num_workers)
                    action_mask = rand_numbers.le(
                        epsilon * torch.ones(rand_numbers.size()))
                    random_actions = torch.multinomial(
                        torch.ones(env.action_space.n), num_workers,
                        replacement=True)
                    action[action_mask] = random_actions[action_mask]

                state, reward, terminal, info = env.step(action.numpy())
                # TODO: is the code below necessary?
                # for n, done in enumerate(dones):
                #     if done:
                #         self.obs[n] = self.obs[n] * 0
                state = torch.from_numpy(state)

                episode_len += 1

                # save rewards and values for later
                rewards.append(reward)
                terminals.append(terminal)
                values.append(value)
                entropies.append(entropy)
                log_probs.append(action_log_probs.gather(1, Variable(action)))

                self.T += 1

            # Convert lists to torch.Tensor/Variable of shape
            # (num_workers, num_steps_update)
            rewards = torch.from_numpy(
                np.asarray(rewards, dtype=np.float32)).transpose(0, 1)
            terminals = torch.from_numpy(
                np.asarray(terminals, dtype=np.uint8)).transpose(0, 1)
            values = torch.cat(values, 1)
            entropies = torch.cat(entropies, 0).view(values.size())
            log_probs = torch.cat(log_probs, 0).view(values.size())

            rewards = Variable(rewards, requires_grad=False)

            # Bootstrap the n-step return from the value of the final state.
            _, last_value = self.net(Variable(state))
            last_value.squeeze_()
            mask = Variable(torch.ones(terminals.size()) - terminals.float(),
                            requires_grad=False)
            # VALIDATE: is this the correct place for Variable()?
            R = Variable(torch.zeros(rewards.size()))

            R[:, -1] = last_value * mask[:, -1]  # bootstrap from last state
            # n-step return recursion: R_t = r_t + gamma * R_{t+1},
            # zeroed out at terminal steps by the mask.
            for i in reversed(range(num_steps_update - 1)):
                R[:, i] = (rewards[:, i] + gamma * R[:, i + 1]) * mask[:, i]

            # The last column only carries the bootstrap value,
            # so it is excluded from the losses.
            advantage = R - values
            value_loss = advantage.pow(2)[:, :-1].mean()
            policy_loss = (-advantage * log_probs)[:, :-1].mean()
            entropy_loss = entropies[:, :-1].mean()

            optimizer.zero_grad()

            (policy_loss + value_coef * value_loss +
             ent_coef * entropy_loss).backward()
            avg_entropy_loss.update(entropy_loss.data[0])
            avg_value_loss.update(value_loss.data[0])
            avg_policy_loss.update(policy_loss.data[0])

            torch.nn.utils.clip_grad_norm(self.net.parameters(), max_grad_norm)
            optimizer.step()

            episode_len = 0
            if self.T % log_interval == 0:
                # save results
                json_results = bench.load_results(self.save_path)
                self.results.add(
                    step=self.T,
                    value=avg_value_estimate.avg(),
                    avg_entropy_loss=avg_entropy_loss.avg(),
                    avg_policy_loss=avg_policy_loss.avg(),
                    avg_value_loss=avg_value_loss.avg(),
                    time=time.time() - json_results['initial_reset_time'],
                    mean_reward=np.mean(json_results['episode_rewards'][-10:]))
                avg_value_estimate.reset()
                avg_value_loss.reset()
                avg_policy_loss.reset()
                avg_entropy_loss.reset()
                # self.results.smooth('reward', window=10)
                # self.results.smooth('value', window=10)
                # self.results.smooth('avg_policy_loss', window=10)
                # self.results.smooth('avg_value_loss', window=10)
                # self.results.smooth('avg_entropy_loss', window=10)
                # self.results.plot(x='step', y='reward_smoothed',
                #                   title='Reward', ylabel='Reward')
                self.results.plot(x='time',
                                  y='mean_reward',
                                  title='mean_reward',
                                  ylabel='average reward')
                # self.results.plot(x='step', y='epsilon',
                #                   title='epsilon', ylabel='epsilon')
                self.results.plot(x='step',
                                  y='value',
                                  title='value',
                                  ylabel='Avg value estimate')
                self.results.plot(x='step',
                                  y='avg_policy_loss',
                                  title='avg_policy_loss',
                                  ylabel='avg_policy_loss')
                self.results.plot(x='step',
                                  y='avg_value_loss',
                                  title='avg_value_loss',
                                  ylabel='avg_value_loss')
                self.results.plot(x='step',
                                  y='avg_entropy_loss',
                                  title='avg_entropy_loss',
                                  ylabel='avg_entropy_loss')
                self.results.save()

        env.close()
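
The alpha and eps entries documented for optimizer_params match the signature of torch.optim.RMSprop, the usual optimizer for A2C, and train() expects the optimizer class itself rather than an instance. A minimal call sketch, assuming a hypothetical agent class that exposes the train() method above:

import torch.optim as optim

# Hypothetical agent wrapping the train() method shown above.
agent = A2CAgent(save_path="/tmp/a2c_cartpole")

agent.train(
    env_id="CartPole-v0",
    seed=123,
    num_workers=8,            # parallel environments
    max_timesteps=1000000,    # total environment steps
    gamma=0.99,               # discount factor
    ent_coef=0.01,            # entropy term weight
    value_coef=0.5,           # value loss weight
    num_steps_update=5,       # rollout length per update
    max_grad_norm=0.5,        # gradient clipping threshold
    log_interval=10000,
    optimizer=optim.RMSprop,  # passed as a class, constructed inside train()
    optimizer_params={"lr": 7e-4, "alpha": 0.99, "eps": 1e-5},
    epsilon_greedy=False,
)
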