def post_project(self):
    # Log validation loss
    expected_loss = np.sum(self.validation_sa_weights *
                           (self.current_q - self.all_target_q_np)**2)
    logger.record_tabular('validation_loss_reweighted', expected_loss)
    expected_loss = np.sum(self.sample_visit_sa *
                           (self.current_q - self.all_target_q_np)**2)
    logger.record_tabular('validation_loss_sampling', expected_loss)
def record_tabular_moving(key, value, n=100, fill_value=0.0):
    vals = KEY_TO_VALUES[key]
    if len(vals) == 0:
        vals.extend([fill_value] * n)
    vals.append(value)
    vals = vals[-n:]
    KEY_TO_VALUES[key] = vals
    rllablogger.record_tabular(key + '_%d_step_mean' % n, np.mean(vals))
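# Note: record_tabular_moving above relies on a module-level KEY_TO_VALUES
# store mapping each key to a list of its recent values. A minimal sketch of
# that assumption (the actual definition is not shown in this excerpt):
#
#   import collections
#   KEY_TO_VALUES = collections.defaultdict(list)
#
# Hypothetical usage: record_tabular_moving('returns', episode_return, n=50)
# logs the mean of the last 50 recorded values as 'returns_50_step_mean'.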
def pre_project(self):
    if self.sampling_policy == 'adversarial':
        q_vals = ptu.to_numpy(
            self.evaluate_qvalues(np.arange(0, self.env.num_states),
                                  None,
                                  mode=fqi.MULTIPLE_HEADS))
        errors = np.abs(q_vals - self.all_target_q_np)**0.5
        # pick adversarial distribution - reward is bellman error
        adversarial_qs = q_iteration.softq_iteration_custom_reward(
            self.env,
            reward=errors,
            num_itrs=self.time_limit,
            discount=self.discount,
            ent_wt=self.ent_wt,
            atol=1e-5)
        self.adversarial_qs = adversarial_qs
    self.batch_s, self.batch_a, self.batch_ns, self.batch_r = self.collect_samples()
    self._total_samples += len(self.batch_s)
    logger.record_tabular('total_samples', self._total_samples)
def record_tabular_stats(key, array, stats=(MEAN, MAX, MIN)):
    if MEAN in stats:
        rllablogger.record_tabular(key + '_mean', np.mean(array))
    if MAX in stats:
        rllablogger.record_tabular(key + '_max', np.max(array))
    if MIN in stats:
        rllablogger.record_tabular(key + '_min', np.min(array))
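# Note: MEAN, MAX, and MIN above are assumed to be module-level constants used
# as flags selecting which statistics to log. A minimal sketch of one possible
# definition (the actual values are not shown in this excerpt):
#
#   MEAN, MAX, MIN = 'mean', 'max', 'min'
#
# Hypothetical usage: record_tabular_stats('q*_proj_diff', diff, stats=(MEAN, MAX))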
def evaluate_policy(self, eval_episodes=200, greedy=True, prefix='Eval',
                    total_timesteps=0):
    env = self.env
    all_states = []
    all_goal_states = []
    all_actions = []
    final_dist_vec = np.zeros(eval_episodes)
    success_vec = np.zeros(eval_episodes)

    for index in tqdm.trange(eval_episodes, leave=True):
        states, actions, goal_state = self.sample_trajectory(noise=0, greedy=greedy)
        all_actions.extend(actions)
        all_states.append(states)
        all_goal_states.append(goal_state)
        final_dist = env.goal_distance(states[-1], goal_state)
        final_dist_vec[index] = final_dist
        success_vec[index] = (final_dist < self.goal_threshold)

    all_states = np.stack(all_states)
    all_goal_states = np.stack(all_goal_states)

    logger.record_tabular('%s num episodes' % prefix, eval_episodes)
    logger.record_tabular('%s avg final dist' % prefix, np.mean(final_dist_vec))
    logger.record_tabular('%s success ratio' % prefix, np.mean(success_vec))
    if self.summary_writer:
        self.summary_writer.add_scalar('%s/avg final dist' % prefix,
                                       np.mean(final_dist_vec), total_timesteps)
        self.summary_writer.add_scalar('%s/success ratio' % prefix,
                                       np.mean(success_vec), total_timesteps)

    diagnostics = env.get_diagnostics(all_states, all_goal_states)
    for key, value in diagnostics.items():
        logger.record_tabular('%s %s' % (prefix, key), value)

    return all_states, all_goal_states
def update(self, step=-1):
    start_time = time.time()

    # backup
    with log_utils.timer('compute_backup'):
        self.all_target_q_np = q_iteration_cy.softq_iteration(
            self.env,
            num_itrs=self.n_steps,
            warmstart_q=self.current_q,
            discount=self.discount,
            ent_wt=self.ent_wt)
    # smooth
    if self.smooth_target_tau < 1.0:
        self.all_target_q_np = self.smooth_target_tau * self.all_target_q_np + (
            1 - self.smooth_target_tau) * self.current_q
    self.all_target_q = ptu.tensor(self.all_target_q_np)

    # project
    with log_utils.timer('pre_project'):
        self.pre_project()
    stopped_mode, critic_loss, k = self.project()
    if isinstance(stopped_mode, stopping.ValidationLoss):
        self.current_q = ptu.to_numpy(stopped_mode.best_validation_qs)
        logger.record_tabular('validation_stop_step', stopped_mode.validation_k)
    else:
        self.current_q = ptu.to_numpy(self.network(self.all_states))
    self.current_q = np.minimum(self.current_q, self.max_q)  # clip when diverging
    self.post_project()

    with log_utils.timer('eval_policy'):
        returns = self.eval_policy()

    logger.record_tabular('project_loss', ptu.to_numpy(critic_loss))
    logger.record_tabular('fit_steps', k)
    if step >= 0:
        logger.record_tabular('step', step)

    # Logging
    logger.record_tabular('fit_q_value_mean', np.mean(self.current_q))
    logger.record_tabular('target_q_value_mean', np.mean(self.all_target_q_np))
    logger.record_tabular('returns_expert', self.expert_returns)
    logger.record_tabular('returns_random', self.random_returns)
    logger.record_tabular('returns', returns)
    log_utils.record_tabular_moving('returns', returns, n=50)
    logger.record_tabular('returns_normalized', self.normalize_returns(returns))
    log_utils.record_tabular_moving('returns_normalized',
                                    self.normalize_returns(returns), n=50)

    # measure contraction errors
    diff_tq_qstar = weighted_q_diff(self.all_target_q_np, self.ground_truth_q,
                                    self.valid_weights)
    abs_diff_tq_qstar = np.abs(diff_tq_qstar)
    log_utils.record_tabular_stats('tq_q*_diff', diff_tq_qstar)
    log_utils.record_tabular_stats('tq_q*_diff_abs', abs_diff_tq_qstar)
    if self.log_proj_qstar:
        diff = weighted_q_diff(self.current_q, self.ground_truth_q_proj,
                               self.valid_weights)
        abs_diff = np.abs(diff)
        log_utils.record_tabular_stats('q*_proj_diff', diff)
        log_utils.record_tabular_stats('q*_proj_diff_abs', abs_diff)
        log_utils.record_tabular_stats('ground_truth_error', self.qstar_abs_diff)

    logger.record_tabular('iteration_time', time.time() - start_time)
    logger.dump_tabular()
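# Note: weighted_q_diff used in update() above is not defined in this excerpt.
# A minimal sketch of what it is assumed to compute (an elementwise difference
# between two Q-value tables, scaled by a weighting over state-action pairs);
# the repository's actual implementation may differ:
def weighted_q_diff(q_values, q_star, weights):
    # q_values, q_star: (num_states, num_actions) Q-value tables.
    # weights: nonnegative weights broadcastable against the Q tables.
    return weights * (q_values - q_star)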
def pre_project(self):
    super(ReplayBufferFQI, self).pre_project()
    # add samples to replay buffer
    self.replay_buffer.add_all(self.batch_s, self.batch_a, self.batch_ns,
                               self.batch_r)
    logger.record_tabular('replay_buffer_len', len(self.replay_buffer))
def train(self):
    start_time = time.time()
    last_time = start_time

    # Evaluate untrained policy
    total_timesteps = 0
    timesteps_since_train = 0
    timesteps_since_eval = 0
    timesteps_since_reset = 0

    iteration = 0
    running_loss = None
    running_validation_loss = None

    if logger.get_snapshot_dir() and self.log_tensorboard:
        self.summary_writer = SummaryWriter(
            osp.join(logger.get_snapshot_dir(), 'tensorboard'))

    # Evaluation Code
    self.policy.eval()
    self.evaluate_policy(self.eval_episodes,
                         total_timesteps=0,
                         greedy=True,
                         prefix='Eval')
    logger.record_tabular('policy loss', 0)
    logger.record_tabular('timesteps', total_timesteps)
    logger.record_tabular('epoch time (s)', time.time() - last_time)
    logger.record_tabular('total time (s)', time.time() - start_time)
    last_time = time.time()
    logger.dump_tabular()
    # End Evaluation Code

    with tqdm.tqdm(total=self.eval_freq, smoothing=0) as ranger:
        while total_timesteps < self.max_timesteps:
            # Interact in the environment according to the exploration strategy.
            if total_timesteps < self.explore_timesteps:
                states, actions, goal_state = self.sample_trajectory(noise=1)
            else:
                states, actions, goal_state = self.sample_trajectory(
                    greedy=True, noise=self.expl_noise)

            # With some probability, put this new trajectory into the validation buffer
            if self.validation_buffer is not None and np.random.rand() < 0.2:
                self.validation_buffer.add_trajectory(states, actions, goal_state)
            else:
                self.replay_buffer.add_trajectory(states, actions, goal_state)

            total_timesteps += self.max_path_length
            timesteps_since_train += self.max_path_length
            timesteps_since_eval += self.max_path_length
            ranger.update(self.max_path_length)

            # Take training steps
            if timesteps_since_train >= self.train_policy_freq and total_timesteps > self.start_policy_timesteps:
                timesteps_since_train %= self.train_policy_freq
                self.policy.train()
                for _ in range(int(self.policy_updates_per_step * self.train_policy_freq)):
                    loss = self.take_policy_step()
                    validation_loss = self.validation_loss()

                    if running_loss is None:
                        running_loss = loss
                    else:
                        running_loss = 0.9 * running_loss + 0.1 * loss

                    if running_validation_loss is None:
                        running_validation_loss = validation_loss
                    else:
                        running_validation_loss = 0.9 * running_validation_loss + 0.1 * validation_loss

                self.policy.eval()
                ranger.set_description('Loss: %s Validation Loss: %s' %
                                       (running_loss, running_validation_loss))

                if self.summary_writer:
                    self.summary_writer.add_scalar('Losses/Train', running_loss,
                                                   total_timesteps)
                    self.summary_writer.add_scalar('Losses/Validation',
                                                   running_validation_loss,
                                                   total_timesteps)

            # Evaluate, log, and save to disk
            if timesteps_since_eval >= self.eval_freq:
                timesteps_since_eval %= self.eval_freq
                iteration += 1

                # Evaluation Code
                self.policy.eval()
                self.evaluate_policy(self.eval_episodes,
                                     total_timesteps=total_timesteps,
                                     greedy=True,
                                     prefix='Eval')
                logger.record_tabular('policy loss', running_loss or 0)  # handle None before first update
                logger.record_tabular('timesteps', total_timesteps)
                logger.record_tabular('epoch time (s)', time.time() - last_time)
                logger.record_tabular('total time (s)', time.time() - start_time)
                last_time = time.time()
                logger.dump_tabular()

                # Logging Code
                if logger.get_snapshot_dir():
                    modifier = str(iteration) if self.save_every_iteration else ''
                    torch.save(
                        self.policy.state_dict(),
                        osp.join(logger.get_snapshot_dir(),
                                 'policy%s.pkl' % modifier))
                    if hasattr(self.replay_buffer, 'state_dict'):
                        with open(
                                osp.join(logger.get_snapshot_dir(),
                                         'buffer%s.pkl' % modifier), 'wb') as f:
                            pickle.dump(self.replay_buffer.state_dict(), f)

                    full_dict = dict(env=self.env, policy=self.policy)
                    with open(
                            osp.join(logger.get_snapshot_dir(),
                                     'params%s.pkl' % modifier), 'wb') as f:
                        pickle.dump(full_dict, f)

                ranger.reset()
def post_project(self):
    #raise NotImplementedError("TODO: measure distributional shift - loss under next and ")
    if not self.weight_states_only:
        prev_loss = np.sum(self.prev_weights *
                           (self.prev_q_target - self.prev_q_value)**2)
        shift_loss = np.sum(self.weights *
                            (self.prev_q_target - self.prev_q_value)**2)
        logger.record_tabular('distributional_shift_old_loss', prev_loss)
        logger.record_tabular('distributional_shift_new_loss', shift_loss)
        logger.record_tabular('distributional_shift_diff_loss',
                              shift_loss - prev_loss)
        logger.record_tabular('distributional_shift_abs_diff_loss',
                              np.abs(shift_loss - prev_loss))
        logger.record_tabular(
            'distributional_shift_tv',
            0.5 * np.sum(np.abs(self.weights - self.prev_weights)))
    logger.record_tabular('fit_qvalue_weighted_mean',
                          np.sum(self.weights * self.current_q))

    # update
    self.prev_weights = self.weights
    self.prev_q_target = self.all_target_q_np
    self.prev_q_value = self.current_q
def get_sample_states(self, itr=0):
    if itr % 5 == 0:
        # compute weights
        weights = None
        if self.wscheme == 'uniform':
            weights = np.ones((self.env.num_states, self.env.num_actions))
        elif self.wscheme == 'buffer_infinite':
            weights = self.buffer_sa
        elif self.wscheme == 'buffer10':
            weights = self.buffer_sa
        elif self.wscheme == 'pi*':
            weights = self.visit_sa
        elif self.wscheme == 'pi*proj':
            assert self.log_proj_qstar
            weights = self.opt_proj_visit_sa
        elif self.wscheme == 'random':
            weights = self.pi_visit_sa
        elif self.wscheme == 'pi':
            weights = self.pi_visit_sa
        elif self.wscheme == 'online':
            q_vals = ptu.to_numpy(
                self.evaluate_qvalues(np.arange(0, self.env.num_states), None))
            visit_sa = q_iteration_py.compute_visitation(
                self.env,
                q_vals,
                ent_wt=self.ent_wt,
                discount=self.discount,
                env_time_limit=self.time_limit)
            weights = visit_sa
        elif self.wscheme == 'robust_prioritized':
            q_vals = ptu.to_numpy(
                self.evaluate_qvalues(np.arange(0, self.env.num_states), None))
            errors = np.abs(q_vals - self.all_target_q_np)
            weights = errors
        elif self.wscheme == 'robust_adversarial':
            # solve for max_pi [bellman error]
            # compute bellman errors
            q_vals = ptu.to_numpy(
                self.evaluate_qvalues(np.arange(0, self.env.num_states), None))
            errors = np.abs(q_vals - self.all_target_q_np)
            # pick adversarial distribution - reward is bellman error
            adversarial_qs = q_iteration.softq_iteration_custom_reward(
                self.env,
                reward=errors,
                num_itrs=self.time_limit,
                discount=self.discount,
                ent_wt=self.ent_wt,
                warmstart_q=self.warmstart_adversarial_q,
                atol=1e-5)
            self.warmstart_adversarial_q = adversarial_qs
            visit_sa = q_iteration_py.compute_visitation(
                self.env,
                adversarial_qs,
                ent_wt=self.ent_wt,
                discount=self.discount,
                env_time_limit=self.time_limit)
            weights = visit_sa
        else:
            raise ValueError("Unknown weighting scheme: %s" % self.wscheme)

        if self.weight_states_only:
            weights = np.sum(weights, axis=1)
            weights = np.repeat(weights[:, np.newaxis],
                                self.env.num_actions,
                                axis=-1)
        self.weights = (weights / np.sum(weights))  # normalize

    if itr == 0:
        entropy = -np.sum(self.weights * np.log(self.weights + 1e-6))
        logger.record_tabular('weight_entropy', entropy)
        unif = np.ones_like(self.weights) / float(self.weights.size)
        max_entropy = -np.sum(unif * np.log(unif))
        logger.record_tabular('weight_entropy_normalized', entropy / max_entropy)
    return np.arange(0, self.env.num_states), None, None, None, self.weights