def eval_alg(policy, env, max_path_length, num_eval_rollouts, env_seed, eval_deterministic=False):
    if eval_deterministic:
        policy = MakeDeterministic(policy)
    env.seed(env_seed)

    eval_sampler = InPlacePathSampler(
        env=env,
        policy=policy,
        max_samples=max_path_length * (num_eval_rollouts + 1),
        max_path_length=max_path_length,
        policy_uses_pixels=False,
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False
    )
    test_paths = eval_sampler.obtain_samples()
    path_trajs = [np.array([d['xy_pos'] for d in path["env_infos"]]) for path in test_paths]
    return {'path_trajs': path_trajs}
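# --- Illustration (not part of the original code) ---
# A minimal, hypothetical sketch of how the {'path_trajs': ...} dict returned by the
# eval_alg above might be consumed, e.g. to summarize where each evaluation rollout
# ended up. Only numpy is assumed; `summarize_trajectories` and `result` are made-up names.
import numpy as np

def summarize_trajectories(result):
    """Report the endpoint of every evaluated rollout.

    result['path_trajs'] is a list of (T, 2) arrays, one per rollout, holding the
    'xy_pos' values logged in env_infos at every step.
    """
    for i, traj in enumerate(result['path_trajs']):
        print("rollout {}: start={}, end={}, steps={}".format(i, traj[0], traj[-1], len(traj)))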
def _try_to_eval(self, epoch):
    logger.save_extra_data(self.get_extra_data_to_save(epoch))
    if self._can_evaluate():
        if self.environment_farming:
            # Create a new eval_sampler for each evaluation in order to avoid
            # sampling from an environment that has already been released.
            env_for_eval_sampler = self.farmer.force_acq_env()
            print(env_for_eval_sampler)
            self.eval_sampler = InPlacePathSampler(
                env=env_for_eval_sampler,
                policy=self.eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
            )

        self.evaluate(epoch)

        if self.environment_farming:
            # Add the environment back to the free_env list
            self.farmer.add_free_env(env_for_eval_sampler)

        params = self.get_epoch_snapshot(epoch)
        logger.save_itr_params(epoch, params)
        table_keys = logger.get_table_key_set()
        if self._old_table_keys is not None:
            assert table_keys == self._old_table_keys, (
                "Table keys cannot change from iteration to iteration."
            )
        self._old_table_keys = table_keys

        logger.record_tabular(
            "Number of train steps total",
            self._n_train_steps_total,
        )
        logger.record_tabular(
            "Number of env steps total",
            self._n_env_steps_total,
        )
        logger.record_tabular(
            "Number of rollouts total",
            self._n_rollouts_total,
        )

        times_itrs = gt.get_times().stamps.itrs
        train_time = times_itrs['train'][-1]
        sample_time = times_itrs['sample'][-1]
        eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
        epoch_time = train_time + sample_time + eval_time
        total_time = gt.get_times().total

        logger.record_tabular('Train Time (s)', train_time)
        logger.record_tabular('(Previous) Eval Time (s)', eval_time)
        logger.record_tabular('Sample Time (s)', sample_time)
        logger.record_tabular('Epoch Time (s)', epoch_time)
        logger.record_tabular('Total Train Time (s)', total_time)

        logger.record_tabular("Epoch", epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)
    else:
        logger.log("Skipping eval for now.")
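# --- Illustration (not part of the original code) ---
# The timing block in _try_to_eval above reads per-iteration durations out of gtimer.
# A minimal sketch (with placeholder loop bodies) of the stamp pattern that produces
# the 'sample'/'train'/'eval' entries it looks up:
import gtimer as gt

gt.reset()
gt.set_def_unique(False)
for it_ in gt.timed_for(range(3), save_itrs=True):
    # ... collect data here ...
    gt.stamp('sample')
    # ... run gradient updates here ...
    gt.stamp('train')
    # ... evaluate here ...
    gt.stamp('eval')
times_itrs = gt.get_times().stamps.itrs
print(times_itrs['train'][-1])  # duration of the 'train' segment in the last iteration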
def eval_alg(policy, env, num_eval_rollouts, eval_deterministic=False, max_path_length=1000):
    if eval_deterministic:
        policy = MakeDeterministic(policy)

    eval_sampler = InPlacePathSampler(
        env=env,
        policy=policy,
        max_samples=max_path_length * (num_eval_rollouts + 1),
        max_path_length=max_path_length,
        policy_uses_pixels=False,
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False
    )
    test_paths = eval_sampler.obtain_samples()
    average_returns = get_average_returns(test_paths)
    return average_returns
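# --- Illustration (not part of the original code) ---
# This eval_alg delegates to get_average_returns from the rlkit-style eval utilities.
# As a rough sketch (an assumption about that helper, not its verbatim implementation),
# the reported quantity is the undiscounted return of each path, averaged over paths:
import numpy as np

def average_returns_sketch(paths):
    # each path is a dict whose 'rewards' entry is an array with one reward per step
    returns = [np.sum(path['rewards']) for path in paths]
    return np.mean(returns)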
def __init__(
        self,
        env,
        qf1,
        qf2,
        policy,
        replay_buffer1,
        replay_buffer2,
        num_epochs=1000,
        num_steps_per_epoch=1000,
        policy_learning_rate=1e-4,
        batch_size=128,
        num_steps_per_eval=3000,
        max_path_length=300,
        discount=0.99,
):
    super().__init__()
    self.env = env
    self.qf1 = qf1
    self.qf2 = qf2
    self.policy = policy
    self.replay_buffer1 = replay_buffer1
    self.replay_buffer2 = replay_buffer2
    self.num_steps_per_epoch = num_steps_per_epoch
    self.num_epochs = num_epochs
    self.policy_learning_rate = policy_learning_rate
    self.batch_size = batch_size
    self.discount = discount
    self.eval_sampler = InPlacePathSampler(
        env=env,
        policy=self.policy,
        max_samples=num_steps_per_eval,
        max_path_length=max_path_length,
    )
    self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.policy_learning_rate)
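# --- Illustration (not part of the original code) ---
# The constructor above stores two Q-functions but only builds an optimizer for the policy,
# which is the usual twin-Q setup. A hypothetical sketch of what one policy improvement step
# could look like under that assumption (this is not the class's actual update, and all
# argument names here are made up):
import torch

def policy_step_sketch(policy, qf1, qf2, policy_optimizer, obs_batch):
    actions = policy(obs_batch)                    # actions proposed by the current policy
    q_values = torch.min(qf1(obs_batch, actions),  # pessimistic estimate from the twin critics
                         qf2(obs_batch, actions))
    policy_loss = -q_values.mean()                 # ascend the smaller Q estimate
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()
    return policy_loss.item()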
class MetaRLAlgorithm(metaclass=abc.ABCMeta):
    def __init__(
            self,
            env,
            agent,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_initial_steps=100,
            num_tasks_sample=100,
            num_steps_prior=100,
            num_steps_posterior=100,
            num_extra_rl_steps_posterior=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            num_exp_traj_eval=1,
            update_post_train=1,
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            render_eval_paths=False,
            dump_eval_paths=False,
            plotter=None,
    ):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.exploration_agent = agent  # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter
        # self.alpha = alpha

        self.sampler = InPlacePathSampler(
            env=env,
            policy=agent,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        self.replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )
        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []

    def make_exploration_policy(self, policy):
        return policy

    def make_eval_policy(self, policy):
        return policy

    def sample_task(self, is_eval=False):
        '''
        sample task randomly
        '''
        if is_eval:
            idx = np.random.randint(len(self.eval_tasks))
        else:
            idx = np.random.randint(len(self.train_tasks))
        return idx

    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            print("\nIteration:{}".format(it_ + 1))
            if it_ == 0:
                # First step of the algorithm: initialize each task's buffer.
                print('\nCollecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    # Before training starts, collect an initial pool of transitions for every task.
                    self.task_idx = idx       # switch the current task index
                    self.env.reset_task(idx)  # reset the task
                    # Collect num_initial_steps transitions of context c and update self.z with q(z|c).
                    self.collect_data(self.num_initial_steps, 1, np.inf)
                print("\nFinished collecting initial pool of data")

            # Sample data from train tasks: pick tasks at random and add
            # num_steps_prior + num_extra_rl_steps_posterior transitions to each picked
            # task's replay buffer, and num_steps_prior transitions to its encoder buffer.
            print("\nSampling data from train tasks for Meta-training")
            for i in range(self.num_tasks_sample):
                print("\nSample data, round {}".format(i + 1))
                idx = np.random.randint(len(self.train_tasks))  # pick a random task from train_tasks
                self.task_idx = idx
                self.env.reset_task(idx)  # reset the task
                self.enc_replay_buffer.task_buffers[idx].clear()  # clear the corresponding encoder buffer

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    print("\ncollect some trajectories with z ~ prior")
                    # collect num_steps_prior transitions using the prior over z
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    print("\ncollect some trajectories with z ~ posterior")
                    # collect trajectories using the posterior over z
                    self.collect_data(self.num_steps_posterior, 1, self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    print("\ncollect some trajectories for policy update only")
                    # collect num_extra_rl_steps_posterior transitions with the posterior z, used only for the policy
                    self.collect_data(self.num_extra_rl_steps_posterior, 1, self.update_post_train,
                                      add_to_enc_buffer=False)
            print("\nFinished sampling data from train tasks")

            # Sample train tasks and compute gradient updates on parameters.
            print("\nStarting Meta-training, Episode {}".format(it_))
            for train_step in range(self.num_train_steps_per_itr):
                # num_train_steps_per_itr gradient steps per iteration (e.g. 500 x 2000 = 1,000,000 total)
                # sample the RL batch of tasks b ~ B: meta_batch tasks drawn at random from train_tasks
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                if (train_step + 1) % 500 == 0:
                    print("\nTraining step {}".format(train_step + 1))
                    print("Indices: {}".format(indices))
                    print("alpha:{}".format(self.alpha))  # alpha is expected to be set by the concrete algorithm
                self._do_training(indices)  # one gradient step
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()

    def pretrain(self):
        """
        Do anything before the main training phase.
        """
        pass

    def collect_data(self, num_samples, resample_z_rate, update_posterior_rate, add_to_enc_buffer=True):
        '''
        get trajectories from current env in batch mode with given policy
        collect complete trajectories until the number of collected transitions >= num_samples

        :param agent: policy to rollout
        :param num_samples: total number of transitions to sample
        :param resample_z_rate: how often to resample latent context z (in units of trajectories), i.e. after how many trajectories z is re-drawn from q(z|c) by a forward pass
        :param update_posterior_rate: how often to update q(z | c) from which z is sampled (in units of trajectories)
        :param add_to_enc_buffer: whether to add collected data to encoder replay buffer
        '''
        # start from the prior
        self.agent.clear_z()

        num_transitions = 0
        while num_transitions < num_samples:
            # obtain_samples returns the collected paths and the total number of steps taken
            paths, n_samples = self.sampler.obtain_samples(
                max_samples=num_samples - num_transitions,  # maximum number of remaining steps
                max_trajs=update_posterior_rate,            # maximum number of trajectories
                accum_context=False,
                resample=resample_z_rate)                   # how often z is resampled from c
            num_transitions += n_samples
            # add the trajectories collected for this task to its replay buffer
            self.replay_buffer.add_paths(self.task_idx, paths)
            print("\n buffer", self.task_idx, "size:", self.replay_buffer.task_buffers[self.task_idx].size())
            if add_to_enc_buffer:  # optionally also add the data to the encoder buffer
                self.enc_replay_buffer.add_paths(self.task_idx, paths)
                print("enc_buffer", self.task_idx, "size:", self.enc_replay_buffer.task_buffers[self.task_idx].size())
            if update_posterior_rate != np.inf:
                # update the posterior over z from the collected context
                context = self.prepare_context(self.task_idx)
                self.agent.infer_posterior(context)
        self._n_env_steps_total += num_transitions
        gt.stamp('sample')

    def _try_to_eval(self, epoch):
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if self._can_evaluate():
            self.evaluate(epoch)

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            table_keys = logger.get_table_key_set()
            if self._old_table_keys is not None:
                assert table_keys == self._old_table_keys, (
                    "Table keys cannot change from iteration to iteration."
                )
            self._old_table_keys = table_keys

            logger.record_tabular(
                "Number of train steps total",
                self._n_train_steps_total,
            )
            logger.record_tabular(
                "Number of env steps total",
                self._n_env_steps_total,
            )
            logger.record_tabular(
                "Number of rollouts total",
                self._n_rollouts_total,
            )

            times_itrs = gt.get_times().stamps.itrs
            train_time = times_itrs['train'][-1]
            sample_time = times_itrs['sample'][-1]
            eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
            epoch_time = train_time + sample_time + eval_time
            total_time = gt.get_times().total

            logger.record_tabular('Train Time (s)', train_time)
            logger.record_tabular('(Previous) Eval Time (s)', eval_time)
            logger.record_tabular('Sample Time (s)', sample_time)
            logger.record_tabular('Epoch Time (s)', epoch_time)
            logger.record_tabular('Total Train Time (s)', total_time)

            logger.record_tabular("Epoch", epoch)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
        else:
            logger.log("Skipping eval for now.")

    def _can_evaluate(self):
        """
        One annoying thing about the logger table is that the keys at each
        iteration need to be the exact same. So unless you can compute
        everything, skip evaluation.

        A common example for why you might want to skip evaluation is that at
        the beginning of training, you may not have enough data for a
        validation and training set.

        :return:
        """
        # eval collects its own context, so can eval any time
        return True

    def _can_train(self):
        return all([self.replay_buffer.num_steps_can_sample(idx) >= self.batch_size for idx in self.train_tasks])

    def _get_action_and_info(self, agent, observation):
        """
        Get an action to take in the environment.
        :param observation:
        :return:
        """
        agent.set_num_steps_total(self._n_env_steps_total)
        return agent.get_action(observation)

    def _start_epoch(self, epoch):
        self._epoch_start_time = time.time()
        self._exploration_paths = []
        self._do_train_time = 0
        logger.push_prefix('Iteration #%d | ' % epoch)

    def _end_epoch(self):
        logger.log("Epoch Duration: {0}".format(
            time.time() - self._epoch_start_time
        ))
        logger.log("Started Training: {0}".format(self._can_train()))
        logger.pop_prefix()

    ##### Snapshotting utils #####
    def get_epoch_snapshot(self, epoch):
        data_to_save = dict(
            epoch=epoch,
            exploration_policy=self.exploration_policy,
        )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        return data_to_save

    def get_extra_data_to_save(self, epoch):
        """
        Save things that shouldn't be saved every snapshot but rather
        overwritten every time.
        :param epoch:
        :return:
        """
        if self.render:
            self.training_env.render(close=True)
        data_to_save = dict(
            epoch=epoch,
        )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        if self.save_replay_buffer:
            data_to_save['replay_buffer'] = self.replay_buffer
        if self.save_algorithm:
            data_to_save['algorithm'] = self
        return data_to_save

    def collect_paths(self, idx, epoch, run):
        self.task_idx = idx
        self.env.reset_task(idx)

        self.agent.clear_z()
        paths = []
        num_transitions = 0
        num_trajs = 0
        while num_transitions < self.num_steps_per_eval:
            path, num = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.num_steps_per_eval - num_transitions,
                max_trajs=1,
                accum_context=True)
            paths += path
            num_transitions += num
            num_trajs += 1
            if num_trajs >= self.num_exp_traj_eval:
                # update self.z from the context collected so far
                self.agent.infer_posterior(self.agent.context)

        if self.sparse_rewards:
            for p in paths:
                sparse_rewards = np.stack([e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards

        goal = self.env._goal
        for path in paths:
            path['goal'] = goal  # goal

        # save the paths for visualization, only useful for point mass
        if self.dump_eval_paths:
            logger.save_extra_data(paths, path='eval_trajectories/task{}-epoch{}-run{}'.format(idx, epoch, run))

        return paths

    def _do_eval(self, indices, epoch):
        final_returns = []
        online_returns = []
        for idx in indices:
            all_rets = []
            for r in range(self.num_evals):
                paths = self.collect_paths(idx, epoch, r)
                all_rets.append([eval_util.get_average_returns([p]) for p in paths])
            final_returns.append(np.mean([a[-1] for a in all_rets]))
            # record online returns for the first n trajectories
            n = min([len(a) for a in all_rets])
            all_rets = [a[:n] for a in all_rets]
            all_rets = np.mean(np.stack(all_rets), axis=0)  # avg return per nth rollout
            online_returns.append(all_rets)
        n = min([len(t) for t in online_returns])
        online_returns = [t[:n] for t in online_returns]
        return final_returns, online_returns

    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1)
            logger.save_extra_data(prior_paths, path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed:
        # indices is a set of task ids, len(eval_tasks) of them drawn from train_tasks
        indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
        eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))
        print('\nevaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:  # for every task to be evaluated
            self.task_idx = idx
            self.env.reset_task(idx)
            paths = []
            # how many paths are used for the evaluation in total (e.g. 600 // 200)
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                context = self.prepare_context(idx)  # c ~ Sc(B)
                self.agent.infer_posterior(context)  # z ~ q(z|c)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample=np.inf)
                paths += p  # accumulate the evaluation paths

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack([e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            # average return over the paths collected for this task
            train_returns.append(eval_util.get_average_returns(paths))
        train_returns = np.mean(train_returns)  # average over the evaluated train tasks

        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns = self._do_eval(indices, epoch)
        print("train_final_returns:{}".format(train_final_returns))
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        print('\nevaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
        print("test_final_returns:{}".format(test_final_returns))
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(paths)

        avg_train_return = np.mean(train_final_returns)
        avg_test_return = np.mean(test_final_returns)
        print("\navg_train_return:{}".format(avg_train_return))
        print("avg_test_return:{}".format(avg_test_return))
        time.sleep(5)
        avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics['AverageTrainReturn_all_train_tasks'] = train_returns
        self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        logger.save_extra_data(avg_train_online_return, path='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return, path='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass

    @abc.abstractmethod
    def _do_training(self):
        """
        Perform some update, e.g. perform one gradient step.
        :return:
        """
        pass
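# --- Illustration (not part of the original code) ---
# _do_eval above aggregates "online" returns across self.num_evals independent runs of the
# same task by trimming every run to the length of the shortest one before averaging.
# A small numpy sketch of that aggregation; all_rets is made-up example data:
import numpy as np

all_rets = [[1.0, 2.0, 3.0],        # per-rollout returns from run 0
            [1.5, 2.5, 3.5, 4.0]]   # per-rollout returns from run 1 (one extra rollout)
final_return = np.mean([a[-1] for a in all_rets])                      # mean of each run's last-rollout return
n = min(len(a) for a in all_rets)                                      # shortest run sets the common length
online_returns = np.mean(np.stack([a[:n] for a in all_rets]), axis=0)  # avg return of the n-th rollout
print(final_return, online_returns)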
def __init__( self, env, exploration_policy: ExplorationPolicy, expert_replay_buffer, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_steps_between_updates=1000, min_steps_before_training=1000, max_path_length=1000, discount=0.99, replay_buffer_size=10000, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, save_best=False, save_best_starting_from_epoch=0, eval_sampler=None, eval_policy=None, replay_buffer=None, policy_uses_pixels=False, wrap_absorbing=False, freq_saving=1, # some environment like halfcheetah_v2 have a timelimit that defines the terminal # this is used as a minor hack to turn off time limits no_terminal=False, policy_uses_task_params=False, concat_task_params_to_policy_obs=False ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.expert_replay_buffer = expert_replay_buffer self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_steps_between_updates = num_steps_between_updates self.min_steps_before_training = min_steps_before_training self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.save_best = save_best self.save_best_starting_from_epoch = save_best_starting_from_epoch self.policy_uses_pixels = policy_uses_pixels self.policy_uses_task_params = policy_uses_task_params self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels, policy_uses_task_params=policy_uses_task_params, concat_task_params_to_policy_obs=concat_task_params_to_policy_obs ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, policy_uses_pixels=self.policy_uses_pixels, policy_uses_task_params=self.policy_uses_task_params, concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() 
self._exploration_paths = [] self.wrap_absorbing = wrap_absorbing self.freq_saving = freq_saving self.no_terminal = no_terminal
def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, eval_sampler=None, eval_policy=None, replay_buffer=None, demo_path=None, action_skip=1, experiment_name="default", mix_demo=False, ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ ### TODO: look at NormalizedBoxEnv, do we need it? ### # self.training_env = training_env or gym.make("HalfCheetah-v2") self.training_env = training_env or MujocoManipEnv( env.env.__class__.__name__) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self.demo_sampler = None self.mix_demo = mix_demo if demo_path is not None: self.demo_sampler = DemoSampler( demo_path=demo_path, observation_dim=self.obs_space.shape[0], action_dim=self.action_space.shape[0], preload=True) self.action_skip = action_skip self.action_skip_count = 0 self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] t_now = time.time() time_str = datetime.datetime.fromtimestamp(t_now).strftime( '%Y%m%d%H%M%S') os.makedirs(os.path.join(LOCAL_EXP_PATH, experiment_name, time_str)) self._writer = SummaryWriter( os.path.join(LOCAL_EXP_PATH, experiment_name, time_str))
def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, num_updates_per_epoch=None, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, min_num_steps_before_training=None, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=True, eval_sampler=None, eval_policy=None, replay_buffer=None, collection_mode='online', ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made for training, so that training and evaluation are completely independent. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param min_num_steps_before_training: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: :param collection_mode: String determining how training happens - 'online': Train after every step taken in the environment. - 'batch': Train after every epoch. """ assert collection_mode in ['online', 'batch'] if collection_mode == 'batch': assert num_updates_per_epoch is not None self.training_env = training_env #or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval if collection_mode == 'online': self.num_updates_per_train_call = num_updates_per_env_step else: self.num_updates_per_train_call = num_updates_per_epoch self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.collection_mode = collection_mode self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if min_num_steps_before_training is None: min_num_steps_before_training = self.num_env_steps_per_epoch self.min_num_steps_before_training = min_num_steps_before_training if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.eval_statistics = OrderedDict() self.need_to_update_eval_statistics = True self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.post_epoch_funcs = []
class MetaRLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, agent, train_tasks, eval_tasks, meta_batch=64, num_iterations=100, num_train_steps_per_itr=1000, num_initial_steps=100, num_tasks_sample=100, num_steps_prior=100, num_steps_posterior=100, num_extra_rl_steps_posterior=100, num_evals=10, num_steps_per_eval=1000, batch_size=1024, embedding_batch_size=1024, embedding_mini_batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, num_exp_traj_eval=1, update_post_train=1, eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, render_eval_paths=False, dump_eval_paths=False, plotter=None, dyna=False, dyna_num_train_itr=50, dyna_num_train_steps_per_itr=50, dyna_tandem_train=True, dyna_n_layers=3, dyna_hidden_layer_size=64, dyna_learning_rate=1e-3, ): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.exploration_agent = agent # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.meta_batch = meta_batch self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.num_initial_steps = num_initial_steps self.num_tasks_sample = num_tasks_sample self.num_steps_prior = num_steps_prior self.num_steps_posterior = num_steps_posterior self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.batch_size = batch_size self.embedding_batch_size = embedding_batch_size self.embedding_mini_batch_size = embedding_mini_batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.update_post_train = update_post_train self.num_exp_traj_eval = num_exp_traj_eval self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.eval_statistics = None self.render_eval_paths = render_eval_paths self.dump_eval_paths = dump_eval_paths self.plotter = plotter self.dyna = dyna self.dyna_num_train_itr = dyna_num_train_itr self.dyna_num_train_steps_per_itr = dyna_num_train_steps_per_itr self.dyna_tandem_train = dyna_tandem_train self.dyna_n_layers = dyna_n_layers self.dyna_hidden_layer_size = dyna_hidden_layer_size self.dyna_learning_rate = dyna_learning_rate if dyna: self.sampler = DynamicsSampler( env=env, policy=agent, max_path_length=self.max_path_length, num_train_itr=dyna_num_train_itr, num_train_steps_per_itr=dyna_num_train_steps_per_itr, tandem_train=dyna_tandem_train, n_layers=dyna_n_layers, hidden_layer_size=dyna_hidden_layer_size, learning_rate=dyna_learning_rate, ) else: self.sampler = InPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update self.replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.train_tasks, ) self.enc_replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.train_tasks, 
) self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] def make_exploration_policy(self, policy): return policy def make_eval_policy(self, policy): return policy def sample_task(self, is_eval=False): ''' sample task randomly ''' if is_eval: idx = np.random.randint(len(self.eval_tasks)) else: idx = np.random.randint(len(self.train_tasks)) return idx def train(self): ''' meta-training loop ''' self.pretrain() params = self.get_epoch_snapshot(-1) logger.save_itr_params(-1, params) gt.reset() gt.set_def_unique(False) self._current_path_builder = PathBuilder() # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate for it_ in gt.timed_for( range(self.num_iterations), save_itrs=True, ): self._start_epoch(it_) self.training_mode(True) if it_ == 0: print('collecting initial pool of data for train and eval') # temp for evaluating for idx in self.train_tasks: self.task_idx = idx self.env.reset_task(idx) self.collect_data(self.num_initial_steps, 1, np.inf) # Sample data from train tasks. for i in range(self.num_tasks_sample): idx = np.random.randint(len(self.train_tasks)) self.task_idx = idx self.env.reset_task(idx) self.enc_replay_buffer.task_buffers[idx].clear() # collect some trajectories with z ~ prior if self.num_steps_prior > 0: self.collect_data(self.num_steps_prior, 1, np.inf) # collect some trajectories with z ~ posterior if self.num_steps_posterior > 0: self.collect_data(self.num_steps_posterior, 1, self.update_post_train) # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior if self.num_extra_rl_steps_posterior > 0: self.collect_data(self.num_extra_rl_steps_posterior, 1, self.update_post_train, add_to_enc_buffer=False) # Sample train tasks and compute gradient updates on parameters. for train_step in range(self.num_train_steps_per_itr): indices = np.random.choice(self.train_tasks, self.meta_batch) self._do_training(indices) self._n_train_steps_total += 1 gt.stamp('train') self.training_mode(False) # eval self._try_to_eval(it_) gt.stamp('eval') self._end_epoch() def pretrain(self): """ Do anything before the main training phase. 
""" pass def collect_data(self, num_samples, resample_z_rate, update_posterior_rate, add_to_enc_buffer=True): ''' get trajectories from current env in batch mode with given policy collect complete trajectories until the number of collected transitions >= num_samples :param agent: policy to rollout :param num_samples: total number of transitions to sample :param resample_z_rate: how often to resample latent context z (in units of trajectories) :param update_posterior_rate: how often to update q(z | c) from which z is sampled (in units of trajectories) :param add_to_enc_buffer: whether to add collected data to encoder replay buffer ''' # start from the prior self.agent.clear_z() num_transitions = 0 while num_transitions < num_samples: paths, n_samples = self.sampler.obtain_samples( max_samples=num_samples - num_transitions, max_trajs=update_posterior_rate, accum_context=False, resample=resample_z_rate, testing=False) num_transitions += n_samples self.replay_buffer.add_paths(self.task_idx, paths) if add_to_enc_buffer: self.enc_replay_buffer.add_paths(self.task_idx, paths) if update_posterior_rate != np.inf: context = self.sample_context(self.task_idx) self.agent.infer_posterior(context) self._n_env_steps_total += num_transitions gt.stamp('sample') def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ # eval collects its own context, so can eval any time return True def _can_train(self): return all([ self.replay_buffer.num_steps_can_sample(idx) >= self.batch_size for idx in self.train_tasks ]) def _get_action_and_info(self, agent, observation): """ Get an action to take in the environment. 
:param observation: :return: """ agent.set_num_steps_total(self._n_env_steps_total) return agent.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() ##### Snapshotting utils ##### def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.exploration_policy, ) if self.save_environment: data_to_save['env'] = self.training_env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.training_env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_environment: data_to_save['env'] = self.training_env if self.save_replay_buffer: data_to_save['replay_buffer'] = self.replay_buffer if self.save_algorithm: data_to_save['algorithm'] = self return data_to_save def collect_paths(self, idx, epoch, run): self.task_idx = idx self.env.reset_task(idx) self.agent.clear_z() paths = [] num_transitions = 0 num_trajs = 0 while num_transitions < self.num_steps_per_eval: path, num = self.sampler.obtain_samples( deterministic=self.eval_deterministic, max_samples=self.num_steps_per_eval - num_transitions, max_trajs=1, accum_context=True, testing=True) paths += path num_transitions += num num_trajs += 1 if num_trajs >= self.num_exp_traj_eval: self.agent.infer_posterior(self.agent.context) if self.sparse_rewards: for p in paths: sparse_rewards = np.stack( e['sparse_reward'] for e in p['env_infos']).reshape(-1, 1) p['rewards'] = sparse_rewards goal = self.env._goal for path in paths: path['goal'] = goal # goal # save the paths for visualization, only useful for point mass if self.dump_eval_paths: logger.save_extra_data( paths, path='eval_trajectories/task{}-epoch{}-run{}'.format( idx, epoch, run)) return paths def _do_eval(self, indices, epoch): final_returns = [] online_returns = [] for idx in indices: all_rets = [] for r in range(self.num_evals): paths = self.collect_paths(idx, epoch, r) all_rets.append( [eval_util.get_average_returns([p]) for p in paths]) final_returns.append(np.mean([a[-1] for a in all_rets])) # record online returns for the first n trajectories n = min([len(a) for a in all_rets]) all_rets = [a[:n] for a in all_rets] all_rets = np.mean(np.stack(all_rets), axis=0) # avg return per nth rollout online_returns.append(all_rets) n = min([len(t) for t in online_returns]) online_returns = [t[:n] for t in online_returns] return final_returns, online_returns def evaluate(self, epoch): if self.eval_statistics is None: self.eval_statistics = OrderedDict() ### sample trajectories from prior for debugging / visualization if self.dump_eval_paths: # 100 arbitrarily chosen for visualizations of point_robot trajectories # just want stochasticity of z, not the policy self.agent.clear_z() prior_paths, _ = self.sampler.obtain_samples( deterministic=self.eval_deterministic, max_samples=self.max_path_length * 20, accum_context=False, resample=1, testing=True) logger.save_extra_data( prior_paths, path='eval_trajectories/prior-epoch{}'.format(epoch)) ### train tasks # eval on a subset of train tasks for speed indices = np.random.choice(self.train_tasks, len(self.eval_tasks)) eval_util.dprint('evaluating on {} 
train tasks'.format(len(indices))) ### eval train tasks with posterior sampled from the training replay buffer train_returns = [] for idx in indices: self.task_idx = idx self.env.reset_task(idx) paths = [] for _ in range(self.num_steps_per_eval // self.max_path_length): context = self.sample_context(idx) self.agent.infer_posterior(context) p, _ = self.sampler.obtain_samples( deterministic=self.eval_deterministic, max_samples=self.max_path_length, accum_context=False, max_trajs=1, resample=np.inf, testing=True) paths += p if self.sparse_rewards: for p in paths: sparse_rewards = np.stack(e['sparse_reward'] for e in p['env_infos']).reshape( -1, 1) p['rewards'] = sparse_rewards train_returns.append(eval_util.get_average_returns(paths)) train_returns = np.mean(train_returns) ### eval train tasks with on-policy data to match eval of test tasks train_final_returns, train_online_returns = self._do_eval( indices, epoch) eval_util.dprint('train online returns') eval_util.dprint(train_online_returns) ### test tasks eval_util.dprint('evaluating on {} test tasks'.format( len(self.eval_tasks))) test_final_returns, test_online_returns = self._do_eval( self.eval_tasks, epoch) eval_util.dprint('test online returns') eval_util.dprint(test_online_returns) # save the final posterior self.agent.log_diagnostics(self.eval_statistics) if hasattr(self.env, "log_diagnostics"): self.env.log_diagnostics(paths, prefix=None) avg_train_return = np.mean(train_final_returns) avg_test_return = np.mean(test_final_returns) avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0) avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0) self.eval_statistics[ 'AverageTrainReturn_all_train_tasks'] = train_returns self.eval_statistics[ 'AverageReturn_all_train_tasks'] = avg_train_return self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return logger.save_extra_data(avg_train_online_return, path='online-train-epoch{}'.format(epoch)) logger.save_extra_data(avg_test_online_return, path='online-test-epoch{}'.format(epoch)) for key, value in self.eval_statistics.items(): logger.record_tabular(key, value) self.eval_statistics = None if self.render_eval_paths: self.env.render_paths(paths) if self.plotter: self.plotter.draw() @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def _do_training(self): """ Perform some update, e.g. perform one gradient step. :return: """ pass
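# --- Illustration (not part of the original code) ---
# When sparse_rewards is enabled, collect_paths/evaluate above overwrite each path's dense
# rewards with the 'sparse_reward' values logged in env_infos. A small numpy sketch of that
# relabeling; note that np.stack is given a list here, since recent numpy versions reject
# the bare generator expression used in the code above:
import numpy as np

def relabel_with_sparse_rewards(path):
    sparse = np.stack([e['sparse_reward'] for e in path['env_infos']]).reshape(-1, 1)
    path['rewards'] = sparse
    return path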
def __init__( self, env, policy, train_tasks, eval_tasks, meta_batch=64, num_iterations=100, num_train_steps_per_itr=1000, num_tasks_sample=100, num_steps_per_task=100, num_evals=10, num_steps_per_eval=1000, batch_size=1024, embedding_batch_size=1024, embedding_mini_batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, #1000000, reward_scale=1, train_embedding_source='posterior_only', eval_embedding_source='initial_pool', eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, obs_emb_dim=0): """ Base class for Meta RL Algorithms :param env: training env :param policy: policy that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval :param meta_batch: number of tasks used for meta-update :param num_iterations: number of meta-updates taken :param num_train_steps_per_itr: number of meta-updates performed per iteration :param num_tasks_sample: number of train tasks to sample to collect data for :param num_steps_per_task: number of transitions to collect per task :param num_evals: number of independent evaluation runs, with separate task encodings :param num_steps_per_eval: number of transitions to sample for evaluation :param batch_size: size of batches used to compute RL update :param embedding_batch_size: size of batches used to compute embedding :param embedding_mini_batch_size: size of batch used for encoder update :param max_path_length: max episode length :param discount: :param replay_buffer_size: max replay buffer size :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: """ self.env = env self.policy = policy self.exploration_policy = policy # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.meta_batch = meta_batch self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.num_tasks_sample = num_tasks_sample self.num_steps_per_task = num_steps_per_task self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.batch_size = batch_size self.embedding_batch_size = embedding_batch_size self.embedding_mini_batch_size = embedding_mini_batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = min( int(replay_buffer_size / (len(train_tasks))), 1000) self.reward_scale = reward_scale self.train_embedding_source = train_embedding_source self.eval_embedding_source = eval_embedding_source # TODO: add options for computing embeddings on train tasks too self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.eval_sampler = InPlacePathSampler( env=env, policy=policy, max_samples=self.num_steps_per_eval, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update # - testing encoder self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, state_dim=obs_emb_dim) self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, state_dim=obs_emb_dim) self.eval_enc_replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.eval_tasks, 
state_dim=obs_emb_dim) self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
def __init__( self, env_sampler, exploration_policy: ExplorationPolicy, neural_process, train_neural_process=False, latent_repr_mode='concat_params', # OR concat_samples num_latent_samples=5, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, eval_sampler=None, eval_policy=None, replay_buffer=None, epoch_to_start_training=0): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ assert not train_neural_process, 'Have not implemented it yet! Remember to set it to train mode when training' self.neural_process = neural_process self.neural_process.set_mode('eval') self.latent_repr_mode = latent_repr_mode self.num_latent_samples = num_latent_samples self.env_sampler = env_sampler env, env_specs = env_sampler() self.training_env, _ = env_sampler(env_specs) # self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.epoch_to_start_training = epoch_to_start_training if self.latent_repr_mode == 'concat_params': def get_latent_repr(posterior_state): z_mean, z_cov = self.neural_process.get_posterior_params( posterior_state) return np.concatenate([z_mean, z_cov]) self.extra_obs_dim = 2 * self.neural_process.z_dim else: def get_latent_repr(posterior_state): z_mean, z_cov = self.neural_process.get_posterior_params( posterior_state) samples = np.random.multivariate_normal( z_mean, np.diag(z_cov), self.num_latent_samples) samples = samples.flatten() return samples self.extra_obs_dim = self.num_latent_samples * self.neural_process.z_dim self.get_latent_repr = get_latent_repr if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, neural_process=neural_process, latent_repr_fn=get_latent_repr, reward_scale=reward_scale) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env obs_space_dim = gym_get_dim(self.obs_space) act_space_dim = 
gym_get_dim(self.action_space) if replay_buffer is None: replay_buffer = SimpleReplayBuffer( self.replay_buffer_size, obs_space_dim + self.extra_obs_dim, act_space_dim, discrete_action_dim=isinstance(self.action_space, Discrete)) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
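# --- Illustration (not part of the original code) ---
# The two get_latent_repr variants defined in the constructor above either concatenate the
# posterior parameters or flatten several samples drawn from the posterior. A self-contained
# numpy sketch of the same two representations, with z_dim and the posterior parameters
# made up for the example:
import numpy as np

z_mean = np.zeros(3)   # hypothetical posterior mean (z_dim = 3)
z_cov = np.ones(3)     # hypothetical diagonal covariance
num_latent_samples = 5

concat_params_repr = np.concatenate([z_mean, z_cov])   # 'concat_params': shape (2 * z_dim,)
samples = np.random.multivariate_normal(z_mean, np.diag(z_cov), num_latent_samples)
concat_samples_repr = samples.flatten()                 # 'concat_samples': shape (num_latent_samples * z_dim,)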
def __init__( self, env, agent, train_goals, wd_goals, ood_goals, replay_buffers, meta_batch_size=64, num_iterations=100, num_train_steps_per_itr=1000, num_tasks=100, num_steps_prior=100, num_steps_posterior=100, num_extra_rl_steps_posterior=100, num_evals=10, num_steps_per_eval=1000, max_path_length=1000, discount=0.99, reward_scale=1, num_exp_traj_eval=1, eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, render_eval_paths=False, dump_eval_paths=False, plotter=None, use_same_context=True, recurrent=False, ): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.train_goals = train_goals self.wd_goals = wd_goals self.ood_goals = ood_goals self.replay_buffers = replay_buffers self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.meta_batch_size = meta_batch_size self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.max_path_length = max_path_length self.discount = discount self.reward_scale = reward_scale self.num_exp_traj_eval = num_exp_traj_eval self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.use_same_context = use_same_context self.recurrent = recurrent self.eval_statistics = None self.render_eval_paths = render_eval_paths self.dump_eval_paths = dump_eval_paths self.plotter = plotter self.sampler = InPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
class MetaRLAlgorithm(metaclass=abc.ABCMeta): def __init__( self, env, agent, train_goals, wd_goals, ood_goals, replay_buffers, meta_batch_size=64, num_iterations=100, num_train_steps_per_itr=1000, num_tasks=100, num_steps_prior=100, num_steps_posterior=100, num_extra_rl_steps_posterior=100, num_evals=10, num_steps_per_eval=1000, max_path_length=1000, discount=0.99, reward_scale=1, num_exp_traj_eval=1, eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, render_eval_paths=False, dump_eval_paths=False, plotter=None, use_same_context=True, recurrent=False, ): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.train_goals = train_goals self.wd_goals = wd_goals self.ood_goals = ood_goals self.replay_buffers = replay_buffers self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.meta_batch_size = meta_batch_size self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.max_path_length = max_path_length self.discount = discount self.reward_scale = reward_scale self.num_exp_traj_eval = num_exp_traj_eval self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.use_same_context = use_same_context self.recurrent = recurrent self.eval_statistics = None self.render_eval_paths = render_eval_paths self.dump_eval_paths = dump_eval_paths self.plotter = plotter self.sampler = InPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] def make_exploration_policy(self, policy): return policy def make_eval_policy(self, policy): return policy def train(self): ''' meta-training loop ''' self.pretrain() gt.reset() gt.set_def_unique(False) self._current_path_builder = PathBuilder() # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate for it_ in gt.timed_for( range(self.num_iterations), save_itrs=True, ): self._start_epoch(it_) self.training_mode(True) # Sample train tasks and compute gradient updates on parameters. batch_idxes = np.random.randint(0, len(self.train_goals), size=self.meta_batch_size) train_batch_obj_id = self.replay_buffers.sample_training_data( batch_idxes, self.use_same_context) for _ in range(self.num_train_steps_per_itr): train_raw_batch = ray.get(train_batch_obj_id) gt.stamp('sample_training_data', unique=False) batch_idxes = np.random.randint(0, len(self.train_goals), size=self.meta_batch_size) # In this way, we can start the data sampling job for the # next training while doing training for the current loop. 
train_batch_obj_id = self.replay_buffers.sample_training_data( batch_idxes, self.use_same_context) gt.stamp('set_up_sampling', unique=False) train_data = self.construct_training_batch(train_raw_batch) gt.stamp('construct_training_batch', unique=False) self._do_training(train_data) self._n_train_steps_total += 1 gt.stamp('train') self.training_mode(False) # eval self._try_to_eval(it_) gt.stamp('eval') self._end_epoch() if it_ == self.num_iterations: logger.save_itr_params(it_, self.agent.get_snapshot()) def construct_training_batch(self, raw_batch): ''' Construct training batch from raw batch''' state = np.concatenate([rb[0] for rb in raw_batch], axis=0) next_state = np.concatenate([rb[1] for rb in raw_batch], axis=0) actions = np.concatenate([rb[2] for rb in raw_batch], axis=0) rewards = np.concatenate([rb[3] for rb in raw_batch], axis=0) dones = np.concatenate([rb[4] for rb in raw_batch], axis=0) contexts = np.concatenate([rb[5] for rb in raw_batch], axis=0) return [state, next_state, actions, rewards, dones, contexts] def pretrain(self): """ Do anything before the main training phase. """ pass def _try_to_eval(self, epoch): logger.save_extra_data(self.get_extra_data_to_save(epoch)) if self._can_evaluate(): self.evaluate(epoch) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) table_keys = logger.get_table_key_set() if self._old_table_keys is not None: assert table_keys == self._old_table_keys, ( "Table keys cannot change from iteration to iteration.") self._old_table_keys = table_keys logger.record_tabular( "Number of train steps total", self._n_train_steps_total, ) logger.record_tabular( "Number of env steps total", self._n_env_steps_total, ) logger.record_tabular( "Number of rollouts total", self._n_rollouts_total, ) times_itrs = gt.get_times().stamps.itrs train_time = times_itrs['train'][-1] sample_time = times_itrs['sample_training_data'][-1] eval_time = times_itrs['eval'][-1] if epoch > 0 else 0 epoch_time = train_time + sample_time + eval_time total_time = gt.get_times().total logger.record_tabular('Train Time (s)', train_time) logger.record_tabular('(Previous) Eval Time (s)', eval_time) logger.record_tabular('Sample Time (s)', sample_time) logger.record_tabular('Epoch Time (s)', epoch_time) logger.record_tabular('Total Train Time (s)', total_time) logger.record_tabular("Epoch", epoch) logger.dump_tabular(with_prefix=False, with_timestamp=False) else: logger.log("Skipping eval for now.") def _can_evaluate(self): """ One annoying thing about the logger table is that the keys at each iteration need to be the exact same. So unless you can compute everything, skip evaluation. A common example for why you might want to skip evaluation is that at the beginning of training, you may not have enough data for a validation and training set. :return: """ # eval collects its own context, so can eval any time return True def _can_train(self): return True def _get_action_and_info(self, agent, observation): """ Get an action to take in the environment. 
:param observation: :return: """ agent.set_num_steps_total(self._n_env_steps_total) return agent.get_action(observation, ) def _start_epoch(self, epoch): self._epoch_start_time = time.time() self._exploration_paths = [] self._do_train_time = 0 logger.push_prefix('Iteration #%d | ' % epoch) def _end_epoch(self): logger.log("Epoch Duration: {0}".format(time.time() - self._epoch_start_time)) logger.log("Started Training: {0}".format(self._can_train())) logger.pop_prefix() ##### Snapshotting utils ##### def get_epoch_snapshot(self, epoch): data_to_save = dict( epoch=epoch, exploration_policy=self.agent, ) if self.save_environment: data_to_save['env'] = self.env return data_to_save def get_extra_data_to_save(self, epoch): """ Save things that shouldn't be saved every snapshot but rather overwritten every time. :param epoch: :return: """ if self.render: self.env.render(close=True) data_to_save = dict(epoch=epoch, ) if self.save_algorithm: data_to_save['algorithm'] = self if epoch == self.num_iterations - 1: data_to_save['algorithm'] = self return data_to_save def collect_paths(self, goal, epoch, run): self.env.set_goal(goal) self.agent.clear_z() paths = [] num_transitions = 0 num_trajs = 0 while num_transitions < self.num_steps_per_eval: path, num = self.sampler.obtain_samples( deterministic=self.eval_deterministic, max_samples=self.num_steps_per_eval - num_transitions, max_trajs=1, accum_context=True) paths += path num_transitions += num num_trajs += 1 if num_trajs >= self.num_exp_traj_eval: self.agent.infer_posterior(self.agent.context) if self.sparse_rewards: for p in paths: sparse_rewards = np.stack( [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1) p['rewards'] = sparse_rewards goal = self.env._goal for path in paths: path['goal'] = goal # goal # save the paths for visualization, only useful for point mass if self.dump_eval_paths: logger.save_extra_data( paths, path='eval_trajectories/eval_goal{}-epoch{}-run{}'.format( goal, epoch, run)) return paths def _do_eval(self, goal_set, epoch): final_returns = [] final_achieved = [] for goal in goal_set: all_rets = [] all_achieved = [] for r in range(self.num_evals): paths = self.collect_paths(goal, epoch, r) all_rets.append( [eval_util.get_average_returns([p]) for p in paths]) all_achieved.append( [eval_util.get_average_achieved([p]) for p in paths]) final_returns.append(np.mean([a[-1] for a in all_rets])) final_achieved.append(np.mean([a[-1] for a in all_achieved])) return final_returns, final_achieved def evaluate(self, epoch): if self.eval_statistics is None: self.eval_statistics = OrderedDict() ### sample trajectories from prior for debugging / visualization if self.dump_eval_paths: # collect 20 * max_path_length steps from the prior for visualizations of point_robot trajectories # just want stochasticity of z, not the policy self.agent.clear_z() prior_paths, _ = self.sampler.obtain_samples( deterministic=self.eval_deterministic, max_samples=self.max_path_length * 20, accum_context=False, resample=1) logger.save_extra_data( prior_paths, path='eval_trajectories/prior-epoch{}'.format(epoch)) ### train tasks # eval on a subset of train tasks for speed eval_util.dprint('evaluating on {} train tasks'.format( len(self.train_goals))) ### eval train tasks with on-policy data to match eval of test tasks train_final_returns, train_final_achieved = self._do_eval( self.train_goals, epoch) # Comment this line for walker-param # train_final_achieved_pair = [(train_final_achieved[i], goal) for i, goal in enumerate(self.train_goals)] train_final_achieved_pair = 
[(train_final_achieved[i], -1) for i, goal in enumerate(self.train_goals) ] eval_util.dprint('train final achieved') eval_util.dprint(train_final_achieved_pair) ### WD tasks eval_util.dprint('evaluating on {} wd tasks'.format(len( self.wd_goals))) wd_final_returns, wd_final_achieved = self._do_eval( self.wd_goals, epoch) # Comment this line for walker-param # wd_final_achieved_pair = [(wd_final_achieved[i], goal) for i, goal in enumerate(self.wd_goals)] wd_final_achieved_pair = [(wd_final_achieved[i], -1) for i, goal in enumerate(self.wd_goals)] eval_util.dprint('WD test final achieved') eval_util.dprint(wd_final_achieved_pair) # ### OOD tasks # eval_util.dprint('evaluating on {} wd tasks'.format(len(self.ood_goals))) # ood_final_returns, ood_final_achieved = self._do_eval(self.ood_goals, epoch) # # Comment this line for walker-param # # ood_final_achieved_pair = [(ood_final_achieved[i], goal) for i, goal in enumerate(self.ood_goals)] # ood_final_achieved_pair = [(ood_final_achieved[i], -1) for i, goal in enumerate(self.ood_goals)] # eval_util.dprint('OOD test final achieved') # eval_util.dprint(ood_final_achieved_pair) # # save the final posterior # self.agent.log_diagnostics(self.eval_statistics) avg_train_return = np.mean(train_final_returns) avg_wd_return = np.mean(wd_final_returns) # avg_ood_return = np.mean(ood_final_returns) self.eval_statistics[ 'AverageReturn_all_train_tasks'] = avg_train_return self.eval_statistics['AverageReturn_all_wd_tasks'] = avg_wd_return # self.eval_statistics['AverageReturn_all_ood_tasks'] = avg_ood_return self.eval_statistics['Return_all_train_tasks'] = train_final_returns self.eval_statistics['Return_all_wd_tasks'] = wd_final_returns # self.eval_statistics['Return_all_ood_tasks'] = ood_final_returns self.eval_statistics[ 'Achieved_all_train_tasks'] = train_final_achieved_pair self.eval_statistics['Achieved_all_wd_tasks'] = wd_final_achieved_pair # self.eval_statistics['Achieved_all_ood_tasks'] = ood_final_achieved_pair for key, value in self.eval_statistics.items(): logger.record_tabular(key, value) self.eval_statistics = None if self.plotter: self.plotter.draw() @abc.abstractmethod def training_mode(self, mode): """ Set training mode to `mode`. :param mode: If True, training will happen (e.g. set the dropout probabilities to not all ones). """ pass @abc.abstractmethod def _do_training(self, train_data): """ Perform some update, e.g. perform one gradient step. :return: """ pass
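# Illustrative sketch (an assumption about intent, not code from the source): _do_eval
# above records, for each goal, the mean over eval runs of the return of the *last*
# trajectory in each run, i.e. the return obtained after the agent has adapted its
# posterior from the earlier exploration trajectories.
import numpy as np

def final_return_per_goal(returns_per_run):
    # returns_per_run: one list of per-trajectory returns per eval run
    return float(np.mean([run[-1] for run in returns_per_run]))

# Two runs of three trajectories each; only the last trajectory of each run is scored.
assert final_return_per_goal([[1.0, 2.0, 5.0], [0.0, 1.0, 3.0]]) == 4.0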
def __init__( self, env, agent, train_tasks, eval_tasks, meta_batch=64, num_iterations=100, num_train_steps_per_itr=1000, num_initial_steps=100, num_tasks_sample=100, num_steps_prior=100, num_steps_posterior=100, num_extra_rl_steps_posterior=100, num_evals=10, num_steps_per_eval=1000, batch_size=1024, low_batch_size=2048, #TODO: Tune this batch size embedding_batch_size=1024, embedding_mini_batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, num_exp_traj_eval=1, update_post_train=1, eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, render_eval_paths=False, dump_eval_paths=False, plotter=None, use_goals=False): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.use_goals = use_goals assert (agent.use_goals == self.use_goals) self.exploration_agent = agent # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.meta_batch = meta_batch self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.num_initial_steps = num_initial_steps self.num_tasks_sample = num_tasks_sample self.num_steps_prior = num_steps_prior self.num_steps_posterior = num_steps_posterior self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.batch_size = batch_size self.embedding_batch_size = embedding_batch_size self.embedding_mini_batch_size = embedding_mini_batch_size self.low_batch_size = low_batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.update_post_train = update_post_train self.num_exp_traj_eval = num_exp_traj_eval self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.eval_statistics = None self.render_eval_paths = render_eval_paths self.dump_eval_paths = dump_eval_paths self.plotter = plotter obs_dim = int(np.prod(env.observation_space.shape)) action_dim = int(np.prod(env.action_space.shape)) self.sampler = InPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update self.enc_replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.train_tasks, ) if self.use_goals: self.high_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks) #Hacky method for changing the obs and action dimensions for the internal #buffers since they're not the same as the original environment internal_buffers = dict([ (idx, SimpleReplayBuffer( max_replay_buffer_size=self.replay_buffer_size, observation_dim=obs_dim, action_dim=obs_dim, )) for idx in self.train_tasks ]) self.high_buffer.task_buffers = internal_buffers self.low_buffer = SimpleReplayBuffer( max_replay_buffer_size=replay_buffer_size, observation_dim=2 * obs_dim, action_dim=action_dim, ) else: self.replay_buffer = MultiTaskReplayBuffer( 
self.replay_buffer_size, env, self.train_tasks, ) self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
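# Illustrative sketch of the buffer shapes configured above when use_goals=True; the
# obs_dim/action_dim values are assumptions for a toy environment. The high-level
# buffer stores a subgoal (same size as an observation) in place of an action, while
# the low-level buffer stores [obs, goal] as its observation and the raw env action.
import numpy as np

obs_dim, action_dim = 4, 2
high_level_transition = {
    'observation': np.zeros(obs_dim),
    'action': np.zeros(obs_dim),       # subgoal proposed by the high-level policy
}
low_level_transition = {
    'observation': np.zeros(2 * obs_dim),  # observation concatenated with the goal
    'action': np.zeros(action_dim),        # raw environment action
}
assert high_level_transition['action'].shape == (obs_dim,)
assert low_level_transition['observation'].shape == (2 * obs_dim,)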
def __init__(self, env, agent, train_tasks, eval_tasks, goal_radius, eval_deterministic=True, render=False, render_eval_paths=False, plotter=None, **kwargs): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval :param goal_radius: reward threshold for defining sparse rewards see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.goal_radius = goal_radius self.meta_batch = kwargs['meta_batch'] self.batch_size = kwargs['batch_size'] self.num_iterations = kwargs['num_iterations'] self.num_train_steps_per_itr = kwargs['num_train_steps_per_itr'] self.num_initial_steps = kwargs['num_initial_steps'] self.num_tasks_sample = kwargs['num_tasks_sample'] self.num_steps_prior = kwargs['num_steps_prior'] self.num_steps_posterior = kwargs['num_steps_posterior'] self.num_extra_rl_steps_posterior = kwargs[ 'num_extra_rl_steps_posterior'] self.num_evals = kwargs['num_evals'] self.num_steps_per_eval = kwargs['num_steps_per_eval'] self.embedding_batch_size = kwargs['embedding_batch_size'] self.embedding_mini_batch_size = kwargs['embedding_mini_batch_size'] self.max_path_length = kwargs['max_path_length'] self.discount = kwargs['discount'] self.replay_buffer_size = kwargs['replay_buffer_size'] self.reward_scale = kwargs['reward_scale'] self.update_post_train = kwargs['update_post_train'] self.num_exp_traj_eval = kwargs['num_exp_traj_eval'] self.save_replay_buffer = kwargs['save_replay_buffer'] self.save_algorithm = kwargs['save_algorithm'] self.save_environment = kwargs['save_environment'] self.dump_eval_paths = kwargs['dump_eval_paths'] self.data_dir = kwargs['data_dir'] self.train_epoch = kwargs['train_epoch'] self.eval_epoch = kwargs['eval_epoch'] self.sample = kwargs['sample'] self.n_trj = kwargs['n_trj'] self.allow_eval = kwargs['allow_eval'] self.mb_replace = kwargs['mb_replace'] self.eval_deterministic = eval_deterministic self.render = render self.eval_statistics = None self.render_eval_paths = render_eval_paths self.plotter = plotter self.train_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, self.goal_radius) self.eval_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.eval_tasks, self.goal_radius) self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, self.goal_radius) self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env, self.train_tasks, self.goal_radius) # offline sampler which samples from the train/eval buffer self.offline_sampler = OfflineInPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length) # online sampler for evaluation (if collect on-policy context, for offline context, use self.offline_sampler) self.sampler = InPlacePathSampler(env=env, policy=agent, max_path_length=self.max_path_length) self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.init_buffer()
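# Illustrative sketch: a minimal kwargs dict covering the keys the constructor above
# reads. The keys are taken directly from the code; every value here is a placeholder
# assumption, not the project's actual defaults.
example_kwargs = dict(
    meta_batch=16, batch_size=256, num_iterations=500,
    num_train_steps_per_itr=1000, num_initial_steps=1000, num_tasks_sample=5,
    num_steps_prior=400, num_steps_posterior=0, num_extra_rl_steps_posterior=400,
    num_evals=4, num_steps_per_eval=600,
    embedding_batch_size=256, embedding_mini_batch_size=256,
    max_path_length=200, discount=0.99, replay_buffer_size=1000000,
    reward_scale=5.0, update_post_train=1, num_exp_traj_eval=1,
    save_replay_buffer=False, save_algorithm=False, save_environment=False,
    dump_eval_paths=False, data_dir='./data', train_epoch=0, eval_epoch=0,
    sample=True, n_trj=50, allow_eval=True, mb_replace=False,
)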
} # set up the policy # policy = joblib.load(POLICY_SAVE_PATH)['exploration_policy'] policy = joblib.load(POLICY_SAVE_PATH) # set up the env # if env_specs['train_test_env']: # _, training_env = get_env(env_specs) # else: # training_env, _ = get_env(env_specs) # training_env = DebugFetchReachAndLiftEnv() training_env = WrappedRotatedFetchReachAnywhereEnv() # build an eval sampler that also renders eval_sampler = InPlacePathSampler( env=training_env, policy=policy, max_samples=max_samples, max_path_length=max_path_length, policy_uses_pixels=policy_specs['policy_uses_pixels'], policy_uses_task_params=policy_specs['policy_uses_task_params'], concat_task_params_to_policy_obs=policy_specs['concat_task_params_to_policy_obs'], animated=True ) eval_sampler.obtain_samples() training_env.close() eval_sampler = None
def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, max_num_episodes=None, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, save_best=False, save_best_starting_from_epoch=0, eval_sampler=None, eval_policy=None, replay_buffer=None, # for compatibility with deepmind control suite # Right now the semantics is that if observations is not a dictionary # then it means the policy just uses that. If it's a dictionary, it # checks whether policy_uses_pixels to see if it's true or false and # based on that it decides whether the policy takes 'pixels' or 'obs' # from the dictionary policy_uses_pixels=False, freq_saving=1, # for meta-learning policy_uses_task_params=False, # whether the policy uses the task parameters concat_task_params_to_policy_obs=False, # how the policy sees the task parameters # this is useful when you want to generate trajectories from the expert using the # exploration policy do_not_train=False, # some environment like halfcheetah_v2 have a timelimit that defines the terminal # this is used as a minor hack to turn off time limits no_terminal=False, **kwargs ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. 
:param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) # self.training_env = training_env or deepcopy(env) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.save_best = save_best self.save_best_starting_from_epoch = save_best_starting_from_epoch self.policy_uses_pixels = policy_uses_pixels self.policy_uses_task_params = policy_uses_task_params self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs self.freq_saving = freq_saving if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels, policy_uses_task_params=policy_uses_task_params, concat_task_params_to_policy_obs=concat_task_params_to_policy_obs ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, policy_uses_pixels=self.policy_uses_pixels, policy_uses_task_params=self.policy_uses_task_params, concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = [] self.do_not_train = do_not_train self.num_episodes = 0 self.max_num_episodes = max_num_episodes if max_num_episodes is not None else float('inf') self.no_terminal = no_terminal
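# Illustrative sketch (an assumption about how the no_terminal flag above is commonly
# used, not code from the source): when no_terminal=True, a time-limit "done" is not
# treated as a true terminal, so the TD target keeps bootstrapping from the next state.
def td_target(reward, next_value, done, discount=0.99, no_terminal=False):
    terminal = 0.0 if no_terminal else float(done)
    return reward + (1.0 - terminal) * discount * next_value

# A time-limit "done" with no_terminal=True still bootstraps; with False it does not.
assert td_target(1.0, 10.0, done=True, no_terminal=True) == 1.0 + 0.99 * 10.0
assert td_target(1.0, 10.0, done=True, no_terminal=False) == 1.0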
def experiment(log_dir, variant_overwrite, cpu=False): if not cpu: ptu.set_gpu_mode(True) # optionally set the GPU (default=False) # Load experiment from file. env, _, data, variant = load_experiment(log_dir, variant_overwrite) # assert all([a == b for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])]) # Set log directory. exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format( variant['algo_kwargs']['num_episodes'], variant['algo_kwargs']['max_path_length'], ','.join(variant_overwrite['env_kwargs']['shaped_rewards']), variant['algo_kwargs']['reward_scale'], variant['historical_policies_kwargs']['num_historical_policies'], ) exp_id = create_exp_name(exp_id) out_dir = os.path.join(log_dir, exp_id) print('Logging to:', out_dir) setup_logger( log_dir=out_dir, variant=variant, snapshot_mode='none', snapshot_gap=50, ) # Load trained model from file. policy = data['policy'] vf = data['vf'] qf = data['qf'] algorithm = SoftActorCritic( env=env, training_env=env, # can't clone box2d env cause of swig save_environment=False, # can't save box2d env cause of swig policy=policy, qf=qf, vf=vf, **variant['algo_kwargs'], ) # Overwrite algorithm for p(z) adaptation (if model is SMM). if variant['intrinsic_reward'] == 'smm': discriminator = data['discriminator'] density_model = data['density_model'] SMMHook(base_algorithm=algorithm, discriminator=discriminator, density_model=density_model, **variant['smm_kwargs']) # Overwrite algorithm for historical averaging. if variant['historical_policies_kwargs']['num_historical_policies'] > 0: HistoricalPoliciesHook( base_algorithm=algorithm, log_dir=log_dir, **variant['historical_policies_kwargs'], ) algorithm.to(ptu.device) #algorithm.train() samples = algorithm.get_eval_paths() #for path in samples: # print(path['observations']) #plt.figure() #plt.plot(samples[0]['observations'][:, 0], samples[0]['observations'][:, 1]) #plt.plot(3, 2) #plt.show() print(env.reset()) print(samples[0]['observations']) i = 0 for path in samples: np.save('./outtem/out%i.npy' % i, path['observations']) i = i + 1 #print(algorithm.policy.get_action(np.array([0,0]))) from rlkit.samplers.util import rollout from rlkit.samplers.in_place import InPlacePathSampler #path=rollout(env,algorithm.eval_policy,50) eval_sampler = InPlacePathSampler( env=env, policy=algorithm.eval_policy, max_samples=100, max_path_length=50, ) path = eval_sampler.obtain_samples() print(path[0]['observations'])
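# Illustrative alternative (assumes the same 'observations' key used above) to the
# manual counter loop in the script: enumerate keeps the file index and the path
# together and the output directory is created if missing.
import os
import numpy as np

def dump_observations(paths, out_dir='./outtem'):
    os.makedirs(out_dir, exist_ok=True)
    for i, path in enumerate(paths):
        np.save(os.path.join(out_dir, 'out%d.npy' % i), path['observations'])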
def __init__( self, env, exploration_policy: ExplorationPolicy, training_env=None, num_epochs=100, num_steps_per_epoch=10000, num_steps_per_eval=1000, num_updates_per_env_step=1, batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=True, eval_sampler=None, eval_policy=None, replay_buffer=None, ): """ Base class for RL Algorithms :param env: Environment used to evaluate. :param exploration_policy: Policy used to explore :param training_env: Environment used by the algorithm. By default, a copy of `env` will be made. :param num_epochs: :param num_steps_per_epoch: :param num_steps_per_eval: :param num_updates_per_env_step: Used by online training mode. :param num_updates_per_epoch: Used by batch training mode. :param batch_size: :param max_path_length: :param discount: :param replay_buffer_size: :param reward_scale: :param render: :param save_replay_buffer: :param save_algorithm: :param save_environment: :param eval_sampler: :param eval_policy: Policy to evaluate with. :param replay_buffer: """ self.training_env = training_env or pickle.loads(pickle.dumps(env)) self.exploration_policy = exploration_policy self.num_epochs = num_epochs self.num_env_steps_per_epoch = num_steps_per_epoch self.num_steps_per_eval = num_steps_per_eval self.num_updates_per_train_call = num_updates_per_env_step self.batch_size = batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment if eval_sampler is None: if eval_policy is None: eval_policy = exploration_policy eval_sampler = InPlacePathSampler( env=env, policy=eval_policy, max_samples=self.num_steps_per_eval + self.max_path_length, max_path_length=self.max_path_length, ) self.eval_policy = eval_policy self.eval_sampler = eval_sampler self.action_space = env.action_space self.obs_space = env.observation_space self.env = env if replay_buffer is None: replay_buffer = EnvReplayBuffer( self.replay_buffer_size, self.env, ) self.replay_buffer = replay_buffer self._n_env_steps_total = 0 self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
def __init__( self, env, agent, train_tasks, eval_tasks, meta_batch=64, num_iterations=100, num_train_steps_per_itr=1000, num_initial_steps=100, num_tasks_sample=100, num_steps_prior=100, num_steps_posterior=100, num_extra_rl_steps_posterior=100, num_evals=10, num_steps_per_eval=1000, batch_size=1024, embedding_batch_size=1024, embedding_mini_batch_size=1024, max_path_length=1000, discount=0.99, replay_buffer_size=1000000, reward_scale=1, num_exp_traj_eval=1, update_post_train=1, eval_deterministic=True, render=False, save_replay_buffer=False, save_algorithm=False, save_environment=False, render_eval_paths=False, dump_eval_paths=False, plotter=None, dyna=False, dyna_num_train_itr=50, dyna_num_train_steps_per_itr=50, dyna_tandem_train=True, dyna_n_layers=3, dyna_hidden_layer_size=64, dyna_learning_rate=1e-3, ): """ :param env: training env :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in :param train_tasks: list of tasks used for training :param eval_tasks: list of tasks used for eval see default experiment config file for descriptions of the rest of the arguments """ self.env = env self.agent = agent self.exploration_agent = agent # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used self.train_tasks = train_tasks self.eval_tasks = eval_tasks self.meta_batch = meta_batch self.num_iterations = num_iterations self.num_train_steps_per_itr = num_train_steps_per_itr self.num_initial_steps = num_initial_steps self.num_tasks_sample = num_tasks_sample self.num_steps_prior = num_steps_prior self.num_steps_posterior = num_steps_posterior self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior self.num_evals = num_evals self.num_steps_per_eval = num_steps_per_eval self.batch_size = batch_size self.embedding_batch_size = embedding_batch_size self.embedding_mini_batch_size = embedding_mini_batch_size self.max_path_length = max_path_length self.discount = discount self.replay_buffer_size = replay_buffer_size self.reward_scale = reward_scale self.update_post_train = update_post_train self.num_exp_traj_eval = num_exp_traj_eval self.eval_deterministic = eval_deterministic self.render = render self.save_replay_buffer = save_replay_buffer self.save_algorithm = save_algorithm self.save_environment = save_environment self.eval_statistics = None self.render_eval_paths = render_eval_paths self.dump_eval_paths = dump_eval_paths self.plotter = plotter self.dyna = dyna self.dyna_num_train_itr = dyna_num_train_itr self.dyna_num_train_steps_per_itr = dyna_num_train_steps_per_itr self.dyna_tandem_train = dyna_tandem_train self.dyna_n_layers = dyna_n_layers self.dyna_hidden_layer_size = dyna_hidden_layer_size self.dyna_learning_rate = dyna_learning_rate if dyna: self.sampler = DynamicsSampler( env=env, policy=agent, max_path_length=self.max_path_length, num_train_itr=dyna_num_train_itr, num_train_steps_per_itr=dyna_num_train_steps_per_itr, tandem_train=dyna_tandem_train, n_layers=dyna_n_layers, hidden_layer_size=dyna_hidden_layer_size, learning_rate=dyna_learning_rate, ) else: self.sampler = InPlacePathSampler( env=env, policy=agent, max_path_length=self.max_path_length, ) # separate replay buffers for # - training RL update # - training encoder update self.replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.train_tasks, ) self.enc_replay_buffer = MultiTaskReplayBuffer( self.replay_buffer_size, env, self.train_tasks, ) self._n_env_steps_total = 0 
self._n_train_steps_total = 0 self._n_rollouts_total = 0 self._do_train_time = 0 self._epoch_start_time = None self._algo_start_time = None self._old_table_keys = None self._current_path_builder = PathBuilder() self._exploration_paths = []
class DdpgQfCombiner(object): def __init__( self, env, qf1, qf2, policy, replay_buffer1, replay_buffer2, num_epochs=1000, num_steps_per_epoch=1000, policy_learning_rate=1e-4, batch_size=128, num_steps_per_eval=3000, max_path_length=300, discount=0.99, ): super().__init__() self.env = env self.qf1 = qf1 self.qf2 = qf2 self.policy = policy self.replay_buffer1 = replay_buffer1 self.replay_buffer2 = replay_buffer2 self.num_steps_per_epoch = num_steps_per_epoch self.num_epochs = num_epochs self.policy_learning_rate = policy_learning_rate self.batch_size = batch_size self.discount = discount self.eval_sampler = InPlacePathSampler( env=env, policy=self.policy, max_samples=num_steps_per_eval, max_path_length=max_path_length, ) self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.policy_learning_rate) def train(self): for epoch in range(self.num_epochs): logger.push_prefix('Iteration #%d | ' % epoch) start_time = time.time() for _ in range(self.num_steps_per_epoch): batch = self.get_batch() train_dict = self.get_train_dict(batch) self.policy_optimizer.zero_grad() policy_loss = train_dict['Policy Loss'] policy_loss.backward() self.policy_optimizer.step() logger.log("Train time: {}".format(time.time() - start_time)) start_time = time.time() self.evaluate(epoch) logger.log("Eval time: {}".format(time.time() - start_time)) params = self.get_epoch_snapshot(epoch) logger.save_itr_params(epoch, params) logger.pop_prefix() def to(self, device=ptu.device): self.policy.to(device) self.qf1.to(device) self.qf2.to(device) def get_batch(self): sample_size = self.batch_size // 2 batch1 = self.replay_buffer1().random_batch(sample_size) batch2 = self.replay_buffer2().random_batch(sample_size) new_batch = {} for k, v in batch1.items(): new_batch[k] = np.concatenate( ( v, batch2[k] ), axis=0, ) return np_to_pytorch_batch(new_batch) def get_train_dict(self, batch): obs = batch['observations'] policy_actions = self.policy(obs) q_output = self.qf1(obs, policy_actions) + self.qf2(obs, policy_actions) policy_loss = - q_output.mean() return OrderedDict([ ('Policy Actions', policy_actions), ('Policy Loss', policy_loss), ('QF Outputs', q_output), ]) def evaluate(self, epoch): """ Perform evaluation for this algorithm. :param epoch: The epoch number. 
""" statistics = OrderedDict() train_batch = self.get_batch() statistics.update(self._statistics_from_batch(train_batch, "Train")) logger.log("Collecting samples for evaluation") test_paths = self._sample_eval_paths() statistics.update(get_generic_path_information( test_paths, stat_prefix="Test", )) statistics.update(self._statistics_from_paths(test_paths, "Test")) average_returns = get_average_returns(test_paths) statistics['AverageReturn'] = average_returns statistics['Epoch'] = epoch for key, value in statistics.items(): logger.record_tabular(key, value) self.env.log_diagnostics(test_paths) logger.dump_tabular(with_prefix=False, with_timestamp=False) def _statistics_from_paths(self, paths, stat_prefix): rewards, terminals, obs, actions, next_obs = split_paths(paths) np_batch = dict( rewards=rewards, terminals=terminals, observations=obs, actions=actions, next_observations=next_obs, ) batch = np_to_pytorch_batch(np_batch) statistics = self._statistics_from_batch(batch, stat_prefix) statistics.update(create_stats_ordered_dict( 'Num Paths', len(paths), stat_prefix=stat_prefix )) return statistics def _statistics_from_batch(self, batch, stat_prefix): statistics = OrderedDict() train_dict = self.get_train_dict(batch) for name in [ 'Policy Loss', ]: tensor = train_dict[name] statistics_name = "{} {} Mean".format(stat_prefix, name) statistics[statistics_name] = np.mean(ptu.get_numpy(tensor)) for name in [ 'QF Outputs', 'Policy Actions', ]: tensor = train_dict[name] statistics.update(create_stats_ordered_dict( '{} {}'.format(stat_prefix, name), ptu.get_numpy(tensor) )) statistics.update(create_stats_ordered_dict( "{} Env Actions".format(stat_prefix), ptu.get_numpy(batch['actions']) )) return statistics def _sample_eval_paths(self): return self.eval_sampler.obtain_samples() def get_epoch_snapshot(self, epoch): return dict( epoch=epoch, policy=self.policy, env=self.env, algo=self, )