def __init__(self, args):
    """ Seeds everything. Initialises: logger, environments, policy (+storage +optimiser). """
    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)

    # initialise environment
    self.env = make_env(self.args.env_name,
                        self.args.max_rollouts_per_task,
                        seed=self.args.seed,
                        n_tasks=self.args.num_tasks)

    # unwrapped env to get some info about the environment
    unwrapped_env = self.env.unwrapped

    # split to train/eval tasks
    shuffled_tasks = np.random.permutation(unwrapped_env.get_all_task_idx())
    self.train_tasks = shuffled_tasks[:self.args.num_train_tasks]
    if self.args.num_eval_tasks > 0:
        self.eval_tasks = shuffled_tasks[-self.args.num_eval_tasks:]
    else:
        self.eval_tasks = []

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task
    self.args.max_trajectory_len = args.max_trajectory_len

    # get action / observation dimensions
    if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.env.action_space.shape[0]
    self.args.obs_dim = self.env.observation_space.shape[0]
    self.args.num_states = unwrapped_env.num_states if hasattr(unwrapped_env, 'num_states') else None
    self.args.act_space = self.env.action_space

    # initialize policy
    self.initialize_policy()

    # initialize buffer for RL updates
    self.policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(self.args.policy_buffer_size),
        obs_dim=self._get_augmented_obs_dim(),
        action_space=self.env.action_space,
        tasks=self.train_tasks,
        trajectory_len=args.max_trajectory_len,
    )

    self.current_experience_storage = None

    self.args.belief_reward = False  # initialize arg to not use belief rewards
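# Standalone sketch of the train/eval task split performed above, with hypothetical
# sizes (not part of the class). As long as num_train_tasks + num_eval_tasks does not
# exceed the size of the task pool, the two slices are disjoint.
import numpy as np

rng = np.random.default_rng(0)
shuffled_tasks = rng.permutation(np.arange(6))  # e.g. a pool of 6 task indices

num_train_tasks, num_eval_tasks = 4, 2
train_tasks = shuffled_tasks[:num_train_tasks]
eval_tasks = shuffled_tasks[-num_eval_tasks:] if num_eval_tasks > 0 else []
print(train_tasks, eval_tasks)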
def __init__(self, args):
    self.args = args
    utl.seed(self.args.seed, self.args.deterministic_execution)

    # calculate number of updates and keep count of frames/iterations
    self.num_updates = int(args.num_frames) // args.policy_num_steps // args.num_processes
    self.frames = 0
    self.iter_idx = 0

    # initialise tensorboard logger
    self.logger = TBLogger(self.args, self.args.exp_label)

    # initialise environments
    self.envs = make_vec_envs(
        env_name=args.env_name,
        seed=args.seed,
        num_processes=args.num_processes,
        gamma=args.policy_gamma,
        device=device,
        episodes_per_task=self.args.max_rollouts_per_task,
        normalise_rew=args.norm_rew_for_policy,
        ret_rms=None,
    )

    # calculate what the maximum length of the trajectories is
    self.args.max_trajectory_len = self.envs._max_episode_steps
    self.args.max_trajectory_len *= self.args.max_rollouts_per_task

    # get policy input dimensions
    self.args.state_dim = self.envs.observation_space.shape[0]
    self.args.task_dim = self.envs.task_dim
    self.args.belief_dim = self.envs.belief_dim
    self.args.num_states = self.envs.num_states

    # get policy output (action) dimensions
    self.args.action_space = self.envs.action_space
    if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    elif isinstance(self.envs.action_space, gym.spaces.multi_discrete.MultiDiscrete):
        self.args.action_dim = self.envs.action_space.nvec[0]
    else:
        self.args.action_dim = self.envs.action_space.shape[0]

    # initialise VAE and policy
    self.vae = VaribadVAE(self.args, self.logger, lambda: self.iter_idx)
    self.policy_storage = self.initialise_policy_storage()
    self.policy = self.initialise_policy()
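# Standalone sketch of the action-dimension branching above, using hypothetical gym
# spaces (not part of the class): 1 for Discrete, nvec[0] for MultiDiscrete, and
# otherwise the length of the continuous (Box) action vector.
import numpy as np
import gym.spaces as spaces

def infer_action_dim(action_space):
    if isinstance(action_space, spaces.Discrete):
        return 1
    elif isinstance(action_space, spaces.MultiDiscrete):
        return int(action_space.nvec[0])
    else:
        return action_space.shape[0]

print(infer_action_dim(spaces.Discrete(5)))            # -> 1
print(infer_action_dim(spaces.MultiDiscrete([3, 3])))  # -> 3
print(infer_action_dim(spaces.Box(low=-1.0, high=1.0, shape=(6,), dtype=np.float32)))  # -> 6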
def __init__(self, args):
    """ Seeds everything. Initialises: logger, environments, policy (+storage +optimiser). """
    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)

    self.args, env = off_utl.expand_args(self.args, include_act_space=True)
    if self.args.act_space.__class__.__name__ == "Discrete":
        self.args.policy = 'dqn'
    else:
        self.args.policy = 'sac'

    # load buffers with data
    if 'load_data' not in self.args or self.args.load_data:
        # env is input just for possible relabelling option
        goals, augmented_obs_dim = self.load_buffer(env)
        self.args.augmented_obs_dim = augmented_obs_dim
        self.goals = goals

    # initialize policy
    self.initialize_policy()

    # load vae for inference in evaluation
    self.load_vae()

    # create environment for evaluation
    self.env = make_env(
        args.env_name,
        args.max_rollouts_per_task,
        presampled_tasks=args.presampled_tasks,
        seed=args.seed,
    )  # n_tasks=self.args.num_eval_tasks

    if self.args.env_name == 'GridNavi-v2':
        self.env.unwrapped.goals = [tuple(goal.astype(int)) for goal in self.goals]
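# Standalone sketch of the policy-type dispatch above, with hypothetical gym spaces
# (not part of the class): a DQN-style learner for discrete actions, SAC otherwise.
import gym.spaces as spaces

for space in (spaces.Discrete(4), spaces.Box(low=-1.0, high=1.0, shape=(2,))):
    policy = 'dqn' if space.__class__.__name__ == 'Discrete' else 'sac'
    print(space.__class__.__name__, '->', policy)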
def __init__(self, args):
    self.args = args
    utl.seed(self.args.seed, self.args.deterministic_execution)

    # count number of frames and number of meta-iterations
    self.frames = 0
    self.iter_idx = 0

    # initialise tensorboard logger
    self.logger = TBLogger(self.args, self.args.exp_label)

    # initialise environments
    self.envs = make_vec_envs(
        env_name=args.env_name,
        seed=args.seed,
        num_processes=args.num_processes,
        gamma=args.policy_gamma,
        log_dir=args.agent_log_dir,
        device=device,
        allow_early_resets=False,
        episodes_per_task=self.args.max_rollouts_per_task,
        obs_rms=None,
        ret_rms=None,
    )

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = self.envs._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task

    # calculate number of meta updates
    self.args.num_updates = int(args.num_frames) // args.policy_num_steps // args.num_processes

    # get action / observation dimensions
    if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.envs.action_space.shape[0]
    self.args.obs_dim = self.envs.observation_space.shape[0]
    self.args.num_states = self.envs.num_states if str.startswith(self.args.env_name, 'Grid') else None
    self.args.act_space = self.envs.action_space

    self.vae = VaribadVAE(self.args, self.logger, lambda: self.iter_idx)

    self.initialise_policy()
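# Standalone sketch of the meta-update budget computed above, with hypothetical
# numbers (not part of the class): the total frame budget is divided by the steps
# collected per update and by the number of parallel processes.
num_frames = int(1e7)      # total environment frames
policy_num_steps = 400     # env steps collected per process per update
num_processes = 16         # parallel environments

num_updates = num_frames // policy_num_steps // num_processes
print(num_updates)  # -> 1562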
def __init__(self, args):
    self.args = args
    utl.seed(self.args.seed, self.args.deterministic_execution)

    # calculate number of updates and keep count of frames/iterations
    self.num_updates = int(args.num_frames) // args.policy_num_steps // args.num_processes
    self.frames = 0
    self.iter_idx = -1

    # initialise tensorboard logger
    self.logger = TBLogger(self.args, self.args.exp_label)

    # initialise environments
    self.envs = make_vec_envs(
        env_name=args.env_name,
        seed=args.seed,
        num_processes=args.num_processes,
        gamma=args.policy_gamma,
        device=device,
        episodes_per_task=self.args.max_rollouts_per_task,
        normalise_rew=args.norm_rew_for_policy,
        ret_rms=None,
        tasks=None,
    )

    if self.args.single_task_mode:
        # get the current tasks (which will be num_process many different tasks)
        self.train_tasks = self.envs.get_task()
        # set the tasks to the first task (i.e. just a random task)
        self.train_tasks[1:] = self.train_tasks[0]
        # make it a list
        self.train_tasks = [t for t in self.train_tasks]
        # re-initialise environments with those tasks
        self.envs = make_vec_envs(
            env_name=args.env_name,
            seed=args.seed,
            num_processes=args.num_processes,
            gamma=args.policy_gamma,
            device=device,
            episodes_per_task=self.args.max_rollouts_per_task,
            normalise_rew=args.norm_rew_for_policy,
            ret_rms=None,
            tasks=self.train_tasks,
        )
        # save the training tasks so we can evaluate on the same envs later
        utl.save_obj(self.train_tasks, self.logger.full_output_folder, "train_tasks")
    else:
        self.train_tasks = None

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = self.envs._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task

    # get policy input dimensions
    self.args.state_dim = self.envs.observation_space.shape[0]
    self.args.task_dim = self.envs.task_dim
    self.args.belief_dim = self.envs.belief_dim
    self.args.num_states = self.envs.num_states

    # get policy output (action) dimensions
    self.args.action_space = self.envs.action_space
    if isinstance(self.envs.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.envs.action_space.shape[0]

    # initialise policy
    self.policy_storage = self.initialise_policy_storage()
    self.policy = self.initialise_policy()
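# Standalone sketch of the single-task-mode trick above, with a hypothetical task
# array (not part of the class): the first row is broadcast over all other rows, so
# every parallel process trains on the same randomly drawn task.
import numpy as np

train_tasks = np.array([[0.1, 0.2],   # one 2-D task parameter per process
                        [0.3, 0.4],
                        [0.5, 0.6],
                        [0.7, 0.8]])
train_tasks[1:] = train_tasks[0]      # duplicate the first task across all processes
train_tasks = [t for t in train_tasks]
print(train_tasks)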
def __init__(self, args):
    """ Seeds everything. Initialises: logger, environments, policy (+storage +optimiser). """
    self.args = args

    # make sure everything has the same seed
    utl.seed(self.args.seed)

    # initialise environment
    self.env = make_env(
        self.args.env_name,
        self.args.max_rollouts_per_task,
        seed=self.args.seed,
        n_tasks=1,
        modify_init_state_dist=self.args.modify_init_state_dist if 'modify_init_state_dist' in self.args else False,
        on_circle_init_state=self.args.on_circle_init_state if 'on_circle_init_state' in self.args else True,
    )

    # saving buffer with task in name folder
    if hasattr(self.args, 'save_buffer') and self.args.save_buffer:
        env_dir = os.path.join(self.args.main_save_dir, '{}'.format(self.args.env_name))
        goal = self.env.unwrapped._goal
        self.output_dir = os.path.join(
            env_dir, self.args.save_dir,
            'seed_{}_'.format(self.args.seed) + off_utl.create_goal_path_ext_from_goal(goal))
    if self.args.save_models or self.args.save_buffer:
        os.makedirs(self.output_dir, exist_ok=True)
        config_utl.save_config_file(args, self.output_dir)

    # initialize tensorboard logger
    if self.args.log_tensorboard:
        self.tb_logger = TBLogger(self.args)
    # if not self.args.log_tensorboard:
    #     self.save_config_json_file()

    # unwrapped env to get some info about the environment
    unwrapped_env = self.env.unwrapped

    # calculate what the maximum length of the trajectories is
    args.max_trajectory_len = unwrapped_env._max_episode_steps
    args.max_trajectory_len *= self.args.max_rollouts_per_task
    self.args.max_trajectory_len = args.max_trajectory_len

    # get action / observation dimensions
    if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
        self.args.action_dim = 1
    else:
        self.args.action_dim = self.env.action_space.shape[0]
    self.args.obs_dim = self.env.observation_space.shape[0]
    self.args.num_states = unwrapped_env.num_states if hasattr(unwrapped_env, 'num_states') else None
    self.args.act_space = self.env.action_space

    # simulate env step to get reward types
    _, _, _, info = unwrapped_env.step(unwrapped_env.action_space.sample())
    reward_types = [reward_type for reward_type in list(info.keys()) if reward_type.startswith('reward')]

    # support dense rewards training (if exists)
    self.args.dense_train_sparse_test = self.args.dense_train_sparse_test \
        if 'dense_train_sparse_test' in self.args else False

    # initialize policy
    self.initialize_policy()

    # initialize buffer for RL updates
    self.policy_storage = MultiTaskPolicyStorage(
        max_replay_buffer_size=int(self.args.policy_buffer_size),
        obs_dim=self.args.obs_dim,
        action_space=self.env.action_space,
        tasks=[0],
        trajectory_len=args.max_trajectory_len,
        num_reward_arrays=len(reward_types) if reward_types and self.args.dense_train_sparse_test else 1,
        reward_types=reward_types,
    )

    self.args.belief_reward = False  # initialize arg to not use belief rewards
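# Standalone sketch of the reward-type discovery above, with a hypothetical info dict
# (not part of the class): every key starting with 'reward' is treated as a separate
# reward stream for the replay buffer.
info = {'reward_forward': 1.3, 'reward_ctrl': -0.1, 'reward_goal': 0.0, 'is_success': False}
reward_types = [k for k in info if k.startswith('reward')]
print(reward_types)  # -> ['reward_forward', 'reward_ctrl', 'reward_goal']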