def post_project(self):
    # Log validation loss
    expected_loss = np.sum(self.validation_sa_weights *
                           (self.current_q - self.all_target_q_np)**2)
    logger.record_tabular('validation_loss_reweighted', expected_loss)
    expected_loss = np.sum(self.sample_visit_sa *
                           (self.current_q - self.all_target_q_np)**2)
    logger.record_tabular('validation_loss_sampling', expected_loss)
Example #2
def record_tabular_moving(key, value, n=100, fill_value=0.0):
    vals = KEY_TO_VALUES[key]
    if len(vals) == 0:
        vals.extend([fill_value] * n)
    vals.append(value)
    vals = vals[-n:]
    KEY_TO_VALUES[key] = vals
    rllablogger.record_tabular(key + '_%d_step_mean' % n, np.mean(vals))
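A minimal setup sketch under which the helper above runs; the concrete definitions here are assumptions, not taken from the source: KEY_TO_VALUES as a module-level defaultdict(list), and rllablogger as any logger exposing record_tabular(key, value) (e.g. rllab's rllab.misc.logger).

import collections

import numpy as np
from rllab.misc import logger as rllablogger  # assumed logger backend

# Assumed definition: rolling history of logged values, keyed by metric name.
KEY_TO_VALUES = collections.defaultdict(list)

# Usage sketch: log the raw return and its 50-step moving average each iteration.
episode_return = 1.23  # placeholder value
rllablogger.record_tabular('returns', episode_return)
record_tabular_moving('returns', episode_return, n=50)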
Example #3
def pre_project(self):
    if self.sampling_policy == 'adversarial':
        q_vals = ptu.to_numpy(
            self.evaluate_qvalues(np.arange(0, self.env.num_states),
                                  None,
                                  mode=fqi.MULTIPLE_HEADS))
        errors = np.abs(q_vals - self.all_target_q_np)**0.5
        # pick adversarial distribution - reward is bellman error
        adversarial_qs = q_iteration.softq_iteration_custom_reward(
            self.env,
            reward=errors,
            num_itrs=self.time_limit,
            discount=self.discount,
            ent_wt=self.ent_wt,
            atol=1e-5)
        self.adversarial_qs = adversarial_qs
    self.batch_s, self.batch_a, self.batch_ns, self.batch_r = self.collect_samples(
    )
    self._total_samples += len(self.batch_s)
    logger.record_tabular('total_samples', self._total_samples)
Example #4
def record_tabular_stats(key, array, stats=(MEAN, MAX, MIN)):
    if MEAN in stats:
        rllablogger.record_tabular(key + '_mean', np.mean(array))
    if MAX in stats:
        rllablogger.record_tabular(key + '_max', np.max(array))
    if MIN in stats:
        rllablogger.record_tabular(key + '_min', np.min(array))
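The stats helper above assumes MEAN, MAX and MIN are module-level flag constants; a usage sketch under that assumption (the flag values and the error array are placeholders, not from the source):

import numpy as np

# Assumed flag constants; the helper only needs them to be distinct values.
MEAN, MAX, MIN = 'mean', 'max', 'min'

# Usage sketch: log summary statistics for an array of per-state Bellman errors.
bellman_errors = np.abs(np.random.randn(128))  # placeholder data
record_tabular_stats('bellman_error', bellman_errors, stats=(MEAN, MAX, MIN))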
Example #5
    def evaluate_policy(self,
                        eval_episodes=200,
                        greedy=True,
                        prefix='Eval',
                        total_timesteps=0):
        env = self.env

        all_states = []
        all_goal_states = []
        all_actions = []
        final_dist_vec = np.zeros(eval_episodes)
        success_vec = np.zeros(eval_episodes)

        for index in tqdm.trange(eval_episodes, leave=True):
            states, actions, goal_state = self.sample_trajectory(noise=0,
                                                                 greedy=greedy)
            all_actions.extend(actions)
            all_states.append(states)
            all_goal_states.append(goal_state)
            final_dist = env.goal_distance(states[-1], goal_state)

            final_dist_vec[index] = final_dist
            success_vec[index] = (final_dist < self.goal_threshold)

        all_states = np.stack(all_states)
        all_goal_states = np.stack(all_goal_states)

        logger.record_tabular('%s num episodes' % prefix, eval_episodes)
        logger.record_tabular('%s avg final dist' % prefix,
                              np.mean(final_dist_vec))
        logger.record_tabular('%s success ratio' % prefix,
                              np.mean(success_vec))
        if self.summary_writer:
            self.summary_writer.add_scalar('%s/avg final dist' % prefix,
                                           np.mean(final_dist_vec),
                                           total_timesteps)
            self.summary_writer.add_scalar('%s/success ratio' % prefix,
                                           np.mean(success_vec),
                                           total_timesteps)
        diagnostics = env.get_diagnostics(all_states, all_goal_states)
        for key, value in diagnostics.items():
            logger.record_tabular('%s %s' % (prefix, key), value)

        return all_states, all_goal_states
Example #6
    def update(self, step=-1):
        start_time = time.time()
        # backup
        with log_utils.timer('compute_backup'):
            self.all_target_q_np = q_iteration_cy.softq_iteration(
                self.env,
                num_itrs=self.n_steps,
                warmstart_q=self.current_q,
                discount=self.discount,
                ent_wt=self.ent_wt)
            # smooth
            if self.smooth_target_tau < 1.0:
                self.all_target_q_np = self.smooth_target_tau * self.all_target_q_np + (
                    1 - self.smooth_target_tau) * self.current_q
            self.all_target_q = ptu.tensor(self.all_target_q_np)

        # project
        with log_utils.timer('pre_project'):
            self.pre_project()

        stopped_mode, critic_loss, k = self.project()

        if isinstance(stopped_mode, stopping.ValidationLoss):
            self.current_q = ptu.to_numpy(stopped_mode.best_validation_qs)
            logger.record_tabular('validation_stop_step',
                                  stopped_mode.validation_k)
        else:
            self.current_q = ptu.to_numpy(self.network(self.all_states))
        self.current_q = np.minimum(self.current_q,
                                    self.max_q)  # clip when diverging
        self.post_project()
        with log_utils.timer('eval_policy'):
            returns = self.eval_policy()

        logger.record_tabular('project_loss', ptu.to_numpy(critic_loss))
        logger.record_tabular('fit_steps', k)
        if step >= 0:
            logger.record_tabular('step', step)

        # Logging
        logger.record_tabular('fit_q_value_mean', np.mean(self.current_q))
        logger.record_tabular('target_q_value_mean',
                              np.mean(self.all_target_q_np))
        logger.record_tabular('returns_expert', self.expert_returns)
        logger.record_tabular('returns_random', self.random_returns)
        logger.record_tabular('returns', returns)
        log_utils.record_tabular_moving('returns', returns, n=50)
        logger.record_tabular('returns_normalized',
                              self.normalize_returns(returns))
        log_utils.record_tabular_moving('returns_normalized',
                                        self.normalize_returns(returns),
                                        n=50)

        # measure contraction errors
        diff_tq_qstar = weighted_q_diff(self.all_target_q_np,
                                        self.ground_truth_q,
                                        self.valid_weights)
        abs_diff_tq_qstar = np.abs(diff_tq_qstar)
        log_utils.record_tabular_stats('tq_q*_diff', diff_tq_qstar)
        log_utils.record_tabular_stats('tq_q*_diff_abs', abs_diff_tq_qstar)

        if self.log_proj_qstar:
            diff = weighted_q_diff(self.current_q, self.ground_truth_q_proj,
                                   self.valid_weights)
            abs_diff = np.abs(diff)
            log_utils.record_tabular_stats('q*_proj_diff', diff)
            log_utils.record_tabular_stats('q*_proj_diff_abs', abs_diff)
            log_utils.record_tabular_stats('ground_truth_error',
                                           self.qstar_abs_diff)

        logger.record_tabular('iteration_time', time.time() - start_time)

        logger.dump_tabular()
Example #7
def pre_project(self):
    super(ReplayBufferFQI, self).pre_project()
    # add samples to replay buffer
    self.replay_buffer.add_all(self.batch_s, self.batch_a, self.batch_ns,
                               self.batch_r)
    logger.record_tabular('replay_buffer_len', len(self.replay_buffer))
Example #8
    def train(self):
        start_time = time.time()
        last_time = start_time

        # Evaluate untrained policy
        total_timesteps = 0
        timesteps_since_train = 0
        timesteps_since_eval = 0
        timesteps_since_reset = 0

        iteration = 0
        running_loss = None
        running_validation_loss = None

        if logger.get_snapshot_dir() and self.log_tensorboard:
            self.summary_writer = SummaryWriter(
                osp.join(logger.get_snapshot_dir(), 'tensorboard'))

        # Evaluation Code
        self.policy.eval()
        self.evaluate_policy(self.eval_episodes,
                             total_timesteps=0,
                             greedy=True,
                             prefix='Eval')
        logger.record_tabular('policy loss', 0)
        logger.record_tabular('timesteps', total_timesteps)
        logger.record_tabular('epoch time (s)', time.time() - last_time)
        logger.record_tabular('total time (s)', time.time() - start_time)
        last_time = time.time()
        logger.dump_tabular()
        # End Evaluation Code

        with tqdm.tqdm(total=self.eval_freq, smoothing=0) as ranger:
            while total_timesteps < self.max_timesteps:

                # Interact in the environment according to the exploration strategy.
                if total_timesteps < self.explore_timesteps:
                    states, actions, goal_state = self.sample_trajectory(
                        noise=1)
                else:
                    states, actions, goal_state = self.sample_trajectory(
                        greedy=True, noise=self.expl_noise)

                # With some probability, put this new trajectory into the validation buffer
                if self.validation_buffer is not None and np.random.rand(
                ) < 0.2:
                    self.validation_buffer.add_trajectory(
                        states, actions, goal_state)
                else:
                    self.replay_buffer.add_trajectory(states, actions,
                                                      goal_state)

                total_timesteps += self.max_path_length
                timesteps_since_train += self.max_path_length
                timesteps_since_eval += self.max_path_length

                ranger.update(self.max_path_length)

                # Take training steps
                if timesteps_since_train >= self.train_policy_freq and total_timesteps > self.start_policy_timesteps:
                    timesteps_since_train %= self.train_policy_freq
                    self.policy.train()
                    for _ in range(
                            int(self.policy_updates_per_step *
                                self.train_policy_freq)):
                        loss = self.take_policy_step()
                        validation_loss = self.validation_loss()
                        if running_loss is None:
                            running_loss = loss
                        else:
                            running_loss = 0.9 * running_loss + 0.1 * loss

                        if running_validation_loss is None:
                            running_validation_loss = validation_loss
                        else:
                            running_validation_loss = 0.9 * running_validation_loss + 0.1 * validation_loss

                    self.policy.eval()
                    ranger.set_description(
                        'Loss: %s Validation Loss: %s' %
                        (running_loss, running_validation_loss))

                    if self.summary_writer:
                        self.summary_writer.add_scalar('Losses/Train',
                                                       running_loss,
                                                       total_timesteps)
                        self.summary_writer.add_scalar(
                            'Losses/Validation', running_validation_loss,
                            total_timesteps)

                # Evaluate, log, and save to disk
                if timesteps_since_eval >= self.eval_freq:
                    timesteps_since_eval %= self.eval_freq
                    iteration += 1
                    # Evaluation Code
                    self.policy.eval()
                    self.evaluate_policy(self.eval_episodes,
                                         total_timesteps=total_timesteps,
                                         greedy=True,
                                         prefix='Eval')
                    logger.record_tabular('policy loss', running_loss
                                          or 0)  # Handling None case
                    logger.record_tabular('timesteps', total_timesteps)
                    logger.record_tabular('epoch time (s)',
                                          time.time() - last_time)
                    logger.record_tabular('total time (s)',
                                          time.time() - start_time)
                    last_time = time.time()
                    logger.dump_tabular()

                    # Logging Code
                    if logger.get_snapshot_dir():
                        modifier = str(
                            iteration) if self.save_every_iteration else ''
                        torch.save(
                            self.policy.state_dict(),
                            osp.join(logger.get_snapshot_dir(),
                                     'policy%s.pkl' % modifier))
                        if hasattr(self.replay_buffer, 'state_dict'):
                            with open(
                                    osp.join(logger.get_snapshot_dir(),
                                             'buffer%s.pkl' % modifier),
                                    'wb') as f:
                                pickle.dump(self.replay_buffer.state_dict(), f)

                        full_dict = dict(env=self.env, policy=self.policy)
                        with open(
                                osp.join(logger.get_snapshot_dir(),
                                         'params%s.pkl' % modifier),
                                'wb') as f:
                            pickle.dump(full_dict, f)

                    ranger.reset()
Example #9
    def post_project(self):
        #raise NotImplementedError("TODO: measure distributional shift - loss under next and ")
        if not self.weight_states_only:
            prev_loss = np.sum(self.prev_weights *
                               (self.prev_q_target - self.prev_q_value)**2)
            shift_loss = np.sum(self.weights *
                                (self.prev_q_target - self.prev_q_value)**2)
            logger.record_tabular('distributional_shift_old_loss', prev_loss)
            logger.record_tabular('distributional_shift_new_loss', shift_loss)
            logger.record_tabular('distributional_shift_diff_loss',
                                  shift_loss - prev_loss)
            logger.record_tabular('distributional_shift_abs_diff_loss',
                                  np.abs(shift_loss - prev_loss))
            logger.record_tabular(
                'distributional_shift_tv',
                0.5 * np.sum(np.abs(self.weights - self.prev_weights)))
            logger.record_tabular('fit_qvalue_weighted_mean',
                                  np.sum(self.weights * self.current_q))

            # update
            self.prev_weights = self.weights
            self.prev_q_target = self.all_target_q_np
            self.prev_q_value = self.current_q
Example #10
    def get_sample_states(self, itr=0):
        if itr % 5 == 0:  # compute weights
            weights = None
            if self.wscheme == 'uniform':
                weights = np.ones((self.env.num_states, self.env.num_actions))
            elif self.wscheme == 'buffer_infinite':
                weights = self.buffer_sa
            elif self.wscheme == 'buffer10':
                weights = self.buffer_sa
            elif self.wscheme == 'pi*':
                weights = self.visit_sa
            elif self.wscheme == 'pi*proj':
                assert self.log_proj_qstar
                weights = self.opt_proj_visit_sa
            elif self.wscheme == 'random':
                weights = self.pi_visit_sa
            elif self.wscheme == 'pi':
                weights = self.pi_visit_sa
            elif self.wscheme == 'online':
                q_vals = ptu.to_numpy(
                    self.evaluate_qvalues(np.arange(0, self.env.num_states),
                                          None))
                visit_sa = q_iteration_py.compute_visitation(
                    self.env,
                    q_vals,
                    ent_wt=self.ent_wt,
                    discount=self.discount,
                    env_time_limit=self.time_limit)
                weights = visit_sa
            elif self.wscheme == 'robust_prioritized':
                q_vals = ptu.to_numpy(
                    self.evaluate_qvalues(np.arange(0, self.env.num_states),
                                          None))
                errors = np.abs(q_vals - self.all_target_q_np)
                weights = errors
            elif self.wscheme == 'robust_adversarial':
                # solve for max_pi [bellman error]
                # compute bellman errors
                q_vals = ptu.to_numpy(
                    self.evaluate_qvalues(np.arange(0, self.env.num_states),
                                          None))
                errors = np.abs(q_vals - self.all_target_q_np)
                # pick adversarial distribution - reward is bellman error
                adversarial_qs = q_iteration.softq_iteration_custom_reward(
                    self.env,
                    reward=errors,
                    num_itrs=self.time_limit,
                    discount=self.discount,
                    ent_wt=self.ent_wt,
                    warmstart_q=self.warmstart_adversarial_q,
                    atol=1e-5)
                self.warmstart_adversarial_q = adversarial_qs
                visit_sa = q_iteration_py.compute_visitation(
                    self.env,
                    adversarial_qs,
                    ent_wt=self.ent_wt,
                    discount=self.discount,
                    env_time_limit=self.time_limit)
                weights = visit_sa
            else:
                raise ValueError("Unknown weighting scheme: %s" % self.wscheme)

            if self.weight_states_only:
                weights = np.sum(weights, axis=1)
                weights = np.repeat(weights[:, np.newaxis],
                                    self.env.num_actions,
                                    axis=-1)
            self.weights = (weights / np.sum(weights))  # normalize
        if itr == 0:
            entropy = -np.sum(self.weights * np.log(self.weights + 1e-6))
            logger.record_tabular('weight_entropy', entropy)
            unif = np.ones_like(self.weights) / float(self.weights.size)
            max_entropy = -np.sum(unif * np.log(unif))
            logger.record_tabular('weight_entropy_normalized',
                                  entropy / max_entropy)
        return np.arange(0,
                         self.env.num_states), None, None, None, self.weights
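The weight_entropy_normalized diagnostic above divides the entropy of the sampling distribution by the entropy of a uniform distribution over all state-action pairs, which equals log(|S| * |A|). A standalone sketch of that computation, with placeholder shapes and data:

import numpy as np

num_states, num_actions = 32, 4  # placeholder sizes
weights = np.random.rand(num_states, num_actions)
weights = weights / np.sum(weights)  # normalize to a distribution over (s, a)

entropy = -np.sum(weights * np.log(weights + 1e-6))
max_entropy = np.log(weights.size)  # entropy of the uniform distribution
print('weight_entropy_normalized:', entropy / max_entropy)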