class TwoValueHeadsBaseGeneral(ABC):
    """The base class for RL algorithms with two value heads (extrinsic and intrinsic)."""

    def __init__(self, envs, acmodel, num_frames_per_proc, discount, gae_lambda,
                 entropy_coef, value_loss_coef, max_grad_norm, recurrence,
                 preprocess_obss, reshape_reward, exp_used_pred,
                 min_stats_ep_batch=16):
        """
        Initializes a `TwoValueHeadsBaseGeneral` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        exp_used_pred : float
            the proportion of experience used for training the predictor
        min_stats_ep_batch : int
            the minimum number of finished episodes to accumulate before
            reporting interaction statistics
        """

        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.exp_used_pred = exp_used_pred

        # Initialize episode statistics values
        self._finished_episodes = 0
        self._ep_statistics = []
        self._min_stats_ep_batch = min_stats_ep_batch

        # Store helpers values

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = sum(map(len, envs)) if isinstance(envs[0], list) else len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None] * (shape[0])
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1], self.acmodel.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape, self.acmodel.memory_size,
                                        device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values_ext = torch.zeros(*shape, device=self.device)
        self.values_int = torch.zeros(*shape, device=self.device)
        self.rewards_ext = torch.zeros(*shape, device=self.device)
        self.rewards_int = torch.zeros(*shape, device=self.device)
        self.advantages_ext = torch.zeros(*shape, device=self.device)
        self.advantages_int = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)
        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)
            action = dist.sample()

            obs, reward, done, info = self.env.step(action.cpu().numpy())
            self.collect_interactions(info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values_ext[i] = value[0]
            self.values_int[i] = value[1]
            if self.reshape_reward is not None:
                self.rewards_ext[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ], device=self.device)
            else:
                self.rewards_ext[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            # Update log values

            self.log_episode_return += torch.tensor(reward, device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards_ext[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # ==========================================================================================
        # Define experiences (observations first):
        # the whole experience is the concatenation of the experience
        # of each process.
        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)
        exps.action = self.actions.transpose(0, 1).reshape(-1)

        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # Add other data to experience buffer
        self.add_extra_experience(exps)

        # ==========================================================================================
        # -- Calculate intrinsic return
        self.rewards_int = self.calculate_intrinsic_reward(
            exps, self.rewards_int)

        # Add advantage and return to experiences
        # don't use the end-of-episode signal for intrinsic rewards

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        # Calculate intrinsic rewards and advantages
        for i in reversed(range(self.num_frames_per_proc)):
            next_value_int = self.values_int[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value[1]
            next_advantage_int = self.advantages_int[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = (self.rewards_int[i]
                     + self.discount * next_value_int
                     - self.values_int[i])
            self.advantages_int[i] = (delta + self.discount * self.gae_lambda
                                      * next_advantage_int)

        # Calculate extrinsic rewards and advantages
        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value_ext = self.values_ext[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value[0]
            next_advantage_ext = self.advantages_ext[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = (self.rewards_ext[i]
                     + self.discount * next_value_ext * next_mask
                     - self.values_ext[i])
            self.advantages_ext[i] = (delta + self.discount * self.gae_lambda
                                      * next_advantage_ext * next_mask)

        # ==========================================================================================
        # Continue defining experiences:
        # the whole experience is the concatenation of the experience
        # of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.
        # for all tensors below, T x P -> P x T -> P * T
        exps.value_ext = self.values_ext.transpose(0, 1).reshape(-1)
        exps.value_int = self.values_int.transpose(0, 1).reshape(-1)
        exps.reward_ext = self.rewards_ext.transpose(0, 1).reshape(-1)
        exps.reward_int = self.rewards_int.transpose(0, 1).reshape(-1)
        exps.advantage_ext = self.advantages_ext.transpose(0, 1).reshape(-1)
        exps.advantage_int = self.advantages_int.transpose(0, 1).reshape(-1)
        exps.returnn_ext = exps.value_ext + exps.advantage_ext
        exps.returnn_int = exps.value_int + exps.advantage_int
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        aux_logs = self.process_interactions()
        # add extra logs with agent interactions
        for k in aux_logs:
            log[k] = aux_logs[k]

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    def collect_interactions(self, info):
        # collect all end-of-episode statistics about the environment
        for env_info in info:
            if len(env_info) > 0:
                self._finished_episodes += 1
                self._ep_statistics.append(deepcopy(env_info))

    def process_interactions(self):
        # process statistics about the agent's behaviour in the environment
        if self._finished_episodes < self._min_stats_ep_batch:
            return get_interactions_stats([])
        else:
            logs = get_interactions_stats(self._ep_statistics)

            # reset statistics
            self._finished_episodes = 0
            self._ep_statistics = []
            return logs

    @abstractmethod
    def update_parameters(self):
        raise NotImplementedError

    @abstractmethod
    def get_save_data(self):
        raise NotImplementedError

    @abstractmethod
    def calculate_intrinsic_reward(self, exps: DictList,
                                   dst_intrinsic_r: torch.Tensor):
        raise NotImplementedError

    def add_extra_experience(self, exps: DictList):
        return

    def evaluate(self):
        return None
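
# Illustrative sketch (not part of the original code): one way a concrete
# subclass could fill in `calculate_intrinsic_reward`, using an RND-style
# prediction error as the intrinsic signal. The `random_target` and
# `predictor` networks, the `TwoValueHeadsExample` name, and the use of image
# observations via `exps.obs.image` are hypothetical placeholders; the real
# algorithms in this repository define their own predictors.
class TwoValueHeadsExample(TwoValueHeadsBaseGeneral):
    def __init__(self, *args, random_target=None, predictor=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.random_target = random_target  # frozen, randomly initialized net
        self.predictor = predictor          # trained to match the target

    def calculate_intrinsic_reward(self, exps, dst_intrinsic_r):
        # Intrinsic reward = per-transition prediction error of the predictor
        # against the fixed random target.
        with torch.no_grad():
            target_feat = self.random_target(exps.obs.image)
            pred_feat = self.predictor(exps.obs.image)
            error = (target_feat - pred_feat).pow(2).mean(dim=-1)
        # exps is laid out process-major ((P * T,)), while dst_intrinsic_r is
        # time-major (T, P), hence the reshape followed by a transpose.
        dst_intrinsic_r.copy_(
            error.reshape(self.num_procs, self.num_frames_per_proc).t())
        return dst_intrinsic_r

    def update_parameters(self):
        raise NotImplementedError

    def get_save_data(self):
        raise NotImplementedError
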
class BaseAlgo(ABC): """The base class for RL algorithms.""" def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward): """ Initializes a `BaseAlgo` instance. Parameters: ---------- envs : list a list of environments that will be run in parallel acmodel : torch.Module the model num_frames_per_proc : int the number of frames collected by every process for an update discount : float the discount for future rewards lr : float the learning rate for optimizers gae_lambda : float the lambda coefficient in the GAE formula ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438)) entropy_coef : float the weight of the entropy cost in the final objective value_loss_coef : float the weight of the value loss in the final objective max_grad_norm : float gradient will be clipped to be at most this value recurrence : int the number of steps the gradient is propagated back in time preprocess_obss : function a function that takes observations returned by the environment and converts them into the format that the model can handle reshape_reward : function a function that shapes the reward, takes an (observation, action, reward, done) tuple as an input """ # Store parameters self.env = ParallelEnv(envs) self.acmodel = acmodel self.acmodel.train() self.num_frames_per_proc = num_frames_per_proc self.discount = discount self.lr = lr self.gae_lambda = gae_lambda self.entropy_coef = entropy_coef self.value_loss_coef = value_loss_coef self.max_grad_norm = max_grad_norm self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward # Store helpers values self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.num_procs = len(envs) self.num_frames = self.num_frames_per_proc * self.num_procs # Control parameters if not (self.acmodel.recurrent): self.recurrence = 1 assert self.num_frames_per_proc % self.recurrence == 0 # Initialize experience values shape = (self.num_frames_per_proc, self.num_procs) self.obs = self.env.reset() self.obss = [None] * (shape[0]) if self.acmodel.recurrent: self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device) self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device) self.mask = torch.ones(shape[1], device=self.device) self.masks = torch.zeros(*shape, device=self.device) self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values = torch.zeros(*shape, device=self.device) self.rewards = torch.zeros(*shape, device=self.device) self.advantages = torch.zeros(*shape, device=self.device) self.log_probs = torch.zeros(*shape, device=self.device) # Initialize log values self.log_episode_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device) self.log_done_counter = 0 self.log_return = [0] * self.num_procs self.log_reshaped_return = [0] * self.num_procs self.log_num_frames = [0] * self.num_procs def collect_experiences(self): """Collects rollouts and computes advantages. Runs several environments concurrently. The next actions are computed in a batch mode for all environments at the same time. The rollouts and advantages from all environments are concatenated together. Returns ------- exps : DictList Contains actions, rewards, advantages etc as attributes. 
Each attribute, e.g. `exps.reward` has a shape (self.num_frames_per_proc * num_envs, ...). k-th block of consecutive `self.num_frames_per_proc` frames contains data obtained from the k-th environment. Be careful not to mix data from different environments! logs : dict Useful stats about the training process, including the average reward, policy loss, value loss, etc. """ for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): if self.acmodel.recurrent: dist, value, memory = self.acmodel( preprocessed_obs, self.memory * self.mask.unsqueeze(1)) else: dist, value = self.acmodel(preprocessed_obs) action = dist.sample() obs, reward, done, _ = self.env.step(action.cpu().numpy()) # Update experiences values self.obss[i] = self.obs self.obs = obs if self.acmodel.recurrent: self.memories[i] = self.memory self.memory = memory self.masks[i] = self.mask self.mask = 1 - torch.tensor( done, device=self.device, dtype=torch.float) self.actions[i] = action self.values[i] = value if self.reshape_reward is not None: self.rewards[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip( obs, action, reward, done) ], device=self.device) else: self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) self.log_episode_reshaped_return += self.rewards[i] self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) for i, done_ in enumerate(done): if done_: self.log_done_counter += 1 self.log_return.append(self.log_episode_return[i].item()) self.log_reshaped_return.append( self.log_episode_reshaped_return[i].item()) self.log_num_frames.append( self.log_episode_num_frames[i].item()) self.log_episode_return *= self.mask self.log_episode_reshaped_return *= self.mask self.log_episode_num_frames *= self.mask # Add advantage and return to experiences preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): if self.acmodel.recurrent: _, next_value, _ = self.acmodel( preprocessed_obs, self.memory * self.mask.unsqueeze(1)) else: _, next_value = self.acmodel(preprocessed_obs) for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[ i + 1] if i < self.num_frames_per_proc - 1 else self.mask next_value = self.values[ i + 1] if i < self.num_frames_per_proc - 1 else next_value next_advantage = self.advantages[ i + 1] if i < self.num_frames_per_proc - 1 else 0 delta = self.rewards[ i] + self.discount * next_value * next_mask - self.values[i] self.advantages[ i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask # Define experiences: # the whole experience is the concatenation of the experience # of each process. # In comments below: # - T is self.num_frames_per_proc, # - P is self.num_procs, # - D is the dimensionality. 
exps = DictList() exps.obs = [ self.obss[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc) ] if self.acmodel.recurrent: # T x P x D -> P x T x D -> (P * T) x D exps.memory = self.memories.transpose(0, 1).reshape( -1, *self.memories.shape[2:]) # T x P -> P x T -> (P * T) x 1 exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1) # for all tensors below, T x P -> P x T -> P * T exps.action = self.actions.transpose(0, 1).reshape(-1) exps.value = self.values.transpose(0, 1).reshape(-1) exps.reward = self.rewards.transpose(0, 1).reshape(-1) exps.advantage = self.advantages.transpose(0, 1).reshape(-1) exps.returnn = exps.value + exps.advantage exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1) # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) # Log some values keep = max(self.log_done_counter, self.num_procs) log = { "return_per_episode": self.log_return[-keep:], "reshaped_return_per_episode": self.log_reshaped_return[-keep:], "num_frames_per_episode": self.log_num_frames[-keep:], "num_frames": self.num_frames } self.log_done_counter = 0 self.log_return = self.log_return[-self.num_procs:] self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] self.log_num_frames = self.log_num_frames[-self.num_procs:] return exps, log @abstractmethod def update_parameters(self): pass
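
# Illustrative sketch (not part of the original code): how a concrete
# BaseAlgo subclass is typically driven from a training script. Because the
# abstract `update_parameters` takes no arguments here, the subclass is
# assumed to collect (or cache) the experience it needs internally; `algo`
# and `total_frames` are placeholders supplied by the caller.
def _example_training_loop(algo, total_frames):
    num_frames = 0
    while num_frames < total_frames:
        # One update: the subclass gathers a batch of rollouts (via
        # collect_experiences) and takes one or more optimization steps on it.
        algo.update_parameters()
        num_frames += algo.num_frames  # frames gathered per update (T * P)
    return num_frames
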
class BaseAlgo(ABC): def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr, gae_tau, entropy_coef, value_loss_coef, max_grad_norm, recurrence, preprocess_obss, reshape_reward): # Store parameters self.env = ParallelEnv(envs) self.acmodel = acmodel self.acmodel.train() self.num_frames_per_proc = num_frames_per_proc self.discount = discount self.lr = lr self.gae_tau = gae_tau self.entropy_coef = entropy_coef self.value_loss_coef = value_loss_coef self.max_grad_norm = max_grad_norm self.recurrence = recurrence self.preprocess_obss = preprocess_obss or default_preprocess_obss self.reshape_reward = reshape_reward # Store helpers values self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.num_procs = len(envs) self.num_frames = self.num_frames_per_proc * self.num_procs # Control parameters if not (self.acmodel.recurrent): self.recurrence = 1 assert self.num_frames_per_proc % self.recurrence == 0 # Store experiences values shape = (self.num_frames_per_proc, self.num_procs) self.obs = self.env.reset() self.obss = [None] * (shape[0]) if self.acmodel.recurrent: self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device) self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device) self.mask = torch.ones(shape[1], device=self.device) self.masks = torch.zeros(*shape, device=self.device) self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int) self.values = torch.zeros(*shape, device=self.device) self.rewards = torch.zeros(*shape, device=self.device) self.advantages = torch.zeros(*shape, device=self.device) self.log_probs = torch.zeros(*shape, device=self.device) # Store log values self.log_episode_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device) self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device) self.log_done_counter = 0 self.log_return = [0] * self.num_procs self.log_reshaped_return = [0] * self.num_procs self.log_num_frames = [0] * self.num_procs def collect_experiences(self): for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): if self.acmodel.recurrent: dist, value, memory = self.acmodel( preprocessed_obs, self.memory * self.mask.unsqueeze(1)) else: dist, value = self.acmodel(preprocessed_obs) action = dist.sample() obs, reward, done, _ = self.env.step(action.cpu().numpy()) # Update experiences values self.obss[i] = self.obs self.obs = obs if self.acmodel.recurrent: self.memories[i] = self.memory self.memory = memory self.masks[i] = self.mask self.mask = 1 - torch.tensor( done, device=self.device, dtype=torch.float) self.actions[i] = action self.values[i] = value if self.reshape_reward is not None: self.rewards[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip( obs, action, reward, done) ], device=self.device) else: self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) self.log_episode_reshaped_return += self.rewards[i] self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) for i, done_ in enumerate(done): if done_: self.log_done_counter += 1 self.log_return.append(self.log_episode_return[i].item()) self.log_reshaped_return.append( 
self.log_episode_reshaped_return[i].item()) self.log_num_frames.append( self.log_episode_num_frames[i].item()) self.log_episode_return *= self.mask self.log_episode_reshaped_return *= self.mask self.log_episode_num_frames *= self.mask # Add advantage and return to experiences preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): if self.acmodel.recurrent: _, next_value, _ = self.acmodel( preprocessed_obs, self.memory * self.mask.unsqueeze(1)) else: _, next_value = self.acmodel(preprocessed_obs) for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[ i + 1] if i < self.num_frames_per_proc - 1 else self.mask next_value = self.values[ i + 1] if i < self.num_frames_per_proc - 1 else next_value next_advantage = self.advantages[ i + 1] if i < self.num_frames_per_proc - 1 else 0 delta = self.rewards[ i] + self.discount * next_value * next_mask - self.values[i] self.advantages[ i] = delta + self.discount * self.gae_tau * next_advantage * next_mask # Defines experiences exps = DictList() exps.obs = [obs for obss in self.obss for obs in obss] if self.acmodel.recurrent: exps.memory = self.memories.view(-1, *self.memories.shape[2:]) exps.mask = self.masks.view(-1, *self.masks.shape[2:]).unsqueeze(1) exps.action = self.actions.view(-1, *self.actions.shape[2:]) exps.value = self.values.view(-1, *self.values.shape[2:]) exps.reward = self.rewards.view(-1, *self.rewards.shape[2:]) exps.advantage = self.advantages.view(-1, *self.advantages.shape[2:]) exps.returnn = exps.value + exps.advantage exps.log_prob = self.log_probs.view(-1, *self.log_probs.shape[2:]) # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) # Log some values keep = max(self.log_done_counter, self.num_procs) log = { "return_per_episode": self.log_return[-keep:], "reshaped_return_per_episode": self.log_reshaped_return[-keep:], "num_frames_per_episode": self.log_num_frames[-keep:], "num_frames": self.num_frames } self.log_done_counter = 0 self.log_return = self.log_return[-self.num_procs:] self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] self.log_num_frames = self.log_num_frames[-self.num_procs:] return exps, log @abstractmethod def update_parameters(self): pass
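
# Note on flattening order (illustrative sketch, not part of the original
# code): this older BaseAlgo flattens (T, P) rollout tensors with
# `.view(-1, ...)`, which keeps the data time-major, whereas the other
# BaseAlgo variant above uses `.transpose(0, 1).reshape(-1)`, which makes
# each process's T frames contiguous. Consumers of `exps` must match the
# chosen layout; the tiny demo below shows the difference.
def _flattening_order_demo():
    T, P = 3, 2
    x = torch.arange(T * P).reshape(T, P)          # x[t, p]
    time_major = x.view(-1)                        # [0, 1, 2, 3, 4, 5]
    process_major = x.transpose(0, 1).reshape(-1)  # [0, 2, 4, 1, 3, 5]
    return time_major, process_major
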
def train_environment_model(environment_class, agent_name, n_environments=16,
                            seed=0, learning_rate=5e-4, batch_per_environment=4,
                            observation_weight=1, reward_weight=1, note=None,
                            tensorboard=True, train_for_n_frames=None,
                            log_interval=1, store_interval=10):
    saved_arguments = locals()
    date_suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    note = note + "_" if note else ""
    model_name = "EM_{}{}_s{}_{}".format(note,
                                         environment_name(environment_class),
                                         seed, date_suffix)
    model_directory = utils.get_model_dir(model_name)

    logger = utils.get_logger(model_directory)
    csv_file, csv_writer = utils.get_csv_writer(model_directory)
    logger.info("{}\n".format(saved_arguments))

    if tensorboard:
        from tensorboardX import SummaryWriter
        tensorboard_writer = SummaryWriter(model_directory)

    total_start_time = time.time()
    utils.seed(seed)

    agent_model = utils.load_model(utils.get_model_dir(agent_name))
    environment_model = EnvironmentModel(environment_class)
    optimizer = torch.optim.Adam(environment_model.parameters(),
                                 lr=learning_rate)

    logger.info("Using pre-trained agent model: {}\n".format(agent_name))
    logger.info("{}\n".format(agent_model))
    logger.info("Environment model architecture:\n")
    logger.info("{}\n".format(environment_model))

    environments = []
    for i in range(n_environments):
        environment = instantiate_environment(environment_class)
        environment.seed(seed + 10000 * i)
        environments.append(environment)
    observation_preprocessor = MyObssPreprocessor(
        environments[0].observation_space)
    environments = ParallelEnv(environments)

    n_updates = 0
    n_frames = 0

    last_observations = environments.reset()
    while train_for_n_frames is None or n_frames < train_for_n_frames:
        batch_start_time = time.time()

        old_observation_batch = torch.Tensor()
        action_batch = torch.LongTensor()
        new_observation_batch = torch.Tensor()
        reward_batch = torch.Tensor()

        for batch in range(batch_per_environment):
            distributions, _, _ = agent_model(
                observation_preprocessor(last_observations), memory=None)
            actions = distributions.sample()
            new_observations, rewards, _, _ = environments.step(
                actions.numpy())

            old_observation_batch = torch.cat(
                (old_observation_batch,
                 observation_preprocessor(last_observations).image))
            action_batch = torch.cat((action_batch, actions))
            new_observation_batch = torch.cat(
                (new_observation_batch,
                 observation_preprocessor(new_observations).image))
            reward_batch = torch.cat(
                (reward_batch, torch.tensor(rewards, dtype=torch.float)))

            # advance the stored observations so the next action is chosen
            # from the state the environments are actually in
            last_observations = new_observations

        optimizer.zero_grad()
        predicted_observations, predicted_rewards = environment_model(
            old_observation_batch, action_batch)
        transposed_new_observation_batch = torch.transpose(
            torch.transpose(new_observation_batch, 1, 3), 2, 3)
        observation_loss = nn.functional.mse_loss(
            predicted_observations, transposed_new_observation_batch)
        reward_loss = nn.functional.mse_loss(predicted_rewards.squeeze(),
                                             reward_batch)
        total_loss = (observation_loss * observation_weight
                      + reward_loss * reward_weight)
        total_loss.backward()
        optimizer.step()

        additional_frames = n_environments * batch_per_environment
        n_frames += additional_frames
        n_updates += 1

        if n_updates % log_interval == 0:
            batch_end_time = time.time()
            fps = additional_frames / (batch_end_time - batch_start_time)
            duration = int(time.time() - total_start_time)

            header = ["update", "frames", "FPS", "duration"]
            data = [n_updates, n_frames, fps, duration]
            header += ["observation_loss", "reward_loss", "total_loss"]
            data += [
                observation_loss.item(),
                reward_loss.item(),
                total_loss.item()
            ]

            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | "
                "obsL {:.3f} | rewL {:.3f} | L {:.3f}".format(*data))

            if n_frames == additional_frames:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if tensorboard:
                for field, value in zip(header, data):
                    tensorboard_writer.add_scalar(field, value, n_frames)

        if n_updates % store_interval == 0:
            utils.save_model(environment_model, model_directory)
            logger.info("Model successfully saved")
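
# Illustrative sketch (assumption, not the repository's actual class): the
# interface `train_environment_model` expects from `EnvironmentModel`. The
# loss above compares predictions against observations transposed from
# N x H x W x C to N x C x H x W, so the model is assumed to return
# channel-first images plus one scalar reward per transition. The layer
# shapes and the `EnvironmentModelSketch` name are placeholders.
class EnvironmentModelSketch(nn.Module):
    def __init__(self, n_actions, n_channels=3):
        super().__init__()
        self.encoder = nn.Conv2d(n_channels, 16, kernel_size=3, padding=1)
        self.action_embedding = nn.Embedding(n_actions, 16)
        self.obs_head = nn.Conv2d(16, n_channels, kernel_size=3, padding=1)
        self.reward_head = nn.Linear(16, 1)

    def forward(self, observations, actions):
        # observations: N x H x W x C (as produced by the preprocessor),
        # converted here to N x C x H x W for the convolutions.
        x = observations.permute(0, 3, 1, 2)
        h = torch.relu(self.encoder(x))
        a = self.action_embedding(actions)          # N x 16
        h = h + a[:, :, None, None]                 # broadcast over H and W
        predicted_observations = self.obs_head(h)   # N x C x H x W
        predicted_rewards = self.reward_head(h.mean(dim=(2, 3)))  # N x 1
        return predicted_observations, predicted_rewards
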
class BaseAlgo(ABC):
    """The base class for RL algorithms with an old (teacher) and a train (student) policy."""

    def __init__(self, envs, pi_old, pi_train, num_frames_per_proc, discount, lr,
                 gae_lambda, entropy_coef, policy_reg_coef, value_reg_coef,
                 value_loss_coef, max_grad_norm, preprocess_obss, reshape_reward,
                 iter_type):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        pi_old : torch.Module
            the old model (= teacher)
        pi_train : torch.Module
            the new model (= student). During 'normal' RL training, we execute
            this model, not the 'old' one.
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        iter_type : string
            which type of ITER to use: "none", "distill" (the normal one) or
            "kickstarting" (the student is executed during distillation)
        """

        # Store parameters

        self.env = ParallelEnv(envs)
        self.pi_train = pi_train
        self.pi_train.train()
        self.pi_old = pi_old
        if self.pi_old is not None:
            self.pi_old.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.policy_reg_coef = policy_reg_coef
        self.value_reg_coef = value_reg_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.iter_type = iter_type

        # Store helpers values

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.reset_env()
        self.obss = [None] * (shape[0])
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values_old = torch.zeros(*shape, device=self.device)
        self.values_train = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages_old = torch.zeros(*shape, device=self.device)
        self.advantages_train = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def switch_models(self, new_pi):
        self.pi_old = self.pi_train
        self.pi_train = new_pi
        self.pi_train.train()
        # if self.pi_old is not None:
        #     self.pi_old.eval()
        parameters = list(self.pi_train.parameters())
        if exp_config.also_update_old_policy:
            parameters += list(self.pi_old.parameters())
        self.optimizer = torch.optim.Adam(parameters, self.lr,
                                          eps=self.adam_eps)

    def execute_model(self, alpha, **kwargs):
        """Execute the model(s).

        Args:
            alpha: float between 0 and 1. If it is 0, we are not distilling
                and the old policy does not need to be executed.
            **kwargs: Other arguments for the `compute` function of the model.
                Should at least contain `obs`.

        Returns:
            dict containing 'dist' and 'value' for 'old' and 'train', as well
            as 'dist_execute', which can be either, depending on whether
            iter_type == 'distill' or iter_type == 'kickstarting'.
        """
        if alpha == 0:
            # If alpha == 0, we're not currently distilling -> no need to
            # execute the old policy
            dist_train, value_train = self.pi_train.compute(**kwargs)
            dist_old, value_old = None, None
            dist_execute = dist_train
        else:
            dist_old, value_old = self.pi_old.compute(**kwargs)
            dist_train, value_train = self.pi_train.compute(**kwargs)
            assert self.iter_type in ["kickstarting", "distill"]
            dist_execute = dist_train if (self.iter_type == "kickstarting"
                                          or self.pi_old is None) else dist_old

        # Return the executed distribution together with the old and train
        # distributions and values
        return {"dist_execute": dist_execute,
                "dist_old": dist_old,
                "value_old": value_old,
                "dist_train": dist_train,
                "value_train": value_train}

    def reset_env(self):
        self.obs = self.env.reset()
        self.mask = torch.ones(self.num_procs, device=self.device)

    def collect_experiences(self, alpha):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Args
        ------
        alpha: float between 0 and 1
            used to determine which policy to execute, based on whether we're
            currently distilling or not and what iter_type is

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
""" for i in range(self.num_frames_per_proc): # Do one agent-environment interaction preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): model_results = self.execute_model(alpha=alpha, obs=preprocessed_obs) dist = model_results['dist_execute'] value_old = model_results['value_old'] value_train = model_results['value_train'] action = dist.sample() obs, reward, done, _ = self.env.step(action.cpu().numpy()) # Update experiences values self.obss[i] = self.obs self.obs = obs self.masks[i] = self.mask self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float) self.actions[i] = action self.values_train[i] = value_train if alpha > 0: self.values_old[i] = value_old if self.reshape_reward is not None: self.rewards[i] = torch.tensor([ self.reshape_reward(obs_, action_, reward_, done_) for obs_, action_, reward_, done_ in zip(obs, action, reward, done) ], device=self.device) else: self.rewards[i] = torch.tensor(reward, device=self.device) self.log_probs[i] = dist.log_prob(action) # Update log values self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float) self.log_episode_reshaped_return += self.rewards[i] self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device) for i, done_ in enumerate(done): if done_: self.log_done_counter += 1 self.log_return.append(self.log_episode_return[i].item()) self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item()) self.log_num_frames.append(self.log_episode_num_frames[i].item()) self.log_episode_return *= self.mask self.log_episode_reshaped_return *= self.mask self.log_episode_num_frames *= self.mask # Add advantage and return to experiences preprocessed_obs = self.preprocess_obss(self.obs, device=self.device) with torch.no_grad(): model_results = self.execute_model(alpha=alpha, obs=preprocessed_obs) next_value_old = model_results['value_old'] next_value_train = model_results['value_train'] # For self.advantages_old for i in reversed(range(self.num_frames_per_proc)): next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask if alpha > 0: next_value_old = self.values_old[i+1] if i < self.num_frames_per_proc - 1 else next_value_old next_advantage_old = self.advantages_old[i+1] if i < self.num_frames_per_proc - 1 else 0 delta_old = self.rewards[i] + self.discount * next_value_old * next_mask - self.values_old[i] self.advantages_old[i] = delta_old + self.discount * self.gae_lambda * next_advantage_old * next_mask next_value_train = self.values_train[i+1] if i < self.num_frames_per_proc - 1 else next_value_train next_advantage_train = self.advantages_train[i+1] if i < self.num_frames_per_proc - 1 else 0 delta_train = self.rewards[i] + self.discount * next_value_train * next_mask - self.values_train[i] self.advantages_train[i] = delta_train + self.discount * self.gae_lambda * next_advantage_train * next_mask # Define experiences: # the whole experience is the concatenation of the experience # of each process. # In comments below: # - T is self.num_frames_per_proc, # - P is self.num_procs, # - D is the dimensionality. 
exps = DictList() exps.obs = [self.obss[i][j] for j in range(self.num_procs) for i in range(self.num_frames_per_proc)] # for all tensors below, T x P -> P x T -> P * T exps.action = self.actions.transpose(0, 1).reshape(-1) exps.advantage_old = self.advantages_old.transpose(0, 1).reshape(-1) exps.advantage_train = self.advantages_train.transpose(0, 1).reshape(-1) if alpha > 0: exps.value_old = self.values_old.transpose(0, 1).reshape(-1) exps.returnn_old = exps.value_old + exps.advantage_old exps.value_train = self.values_train.transpose(0, 1).reshape(-1) exps.returnn_train = exps.value_train + exps.advantage_train exps.reward = self.rewards.transpose(0, 1).reshape(-1) exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1) # Preprocess experiences exps.obs = self.preprocess_obss(exps.obs, device=self.device) # Log some values keep = max(self.log_done_counter, self.num_procs) log = { "return_per_episode": self.log_return[-keep:], "reshaped_return_per_episode": self.log_reshaped_return[-keep:], "num_frames_per_episode": self.log_num_frames[-keep:], "num_frames": self.num_frames } self.log_done_counter = 0 self.log_return = self.log_return[-self.num_procs:] self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:] self.log_num_frames = self.log_num_frames[-self.num_procs:] return exps, log @abstractmethod def update_parameters(self): pass
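
# Illustrative sketch (assumption, not the repository's actual update rule):
# one way a subclass's `update_parameters` could combine the usual
# actor-critic terms on the student policy with the distillation terms this
# base class prepares. `algo` is a concrete BaseAlgo subclass, `sb` a
# mini-batch slice of the `exps` DictList returned by `collect_experiences`,
# and `alpha` the distillation weight; all are supplied by the caller.
def _distillation_loss_sketch(algo, sb, alpha):
    results = algo.execute_model(alpha=alpha, obs=sb.obs)
    dist_train = results["dist_train"]
    value_train = results["value_train"]

    # Standard policy-gradient / value / entropy terms on the student.
    policy_loss = -(dist_train.log_prob(sb.action) * sb.advantage_train).mean()
    value_loss = (value_train - sb.returnn_train).pow(2).mean()
    entropy = dist_train.entropy().mean()
    loss = (policy_loss - algo.entropy_coef * entropy
            + algo.value_loss_coef * value_loss)

    if alpha > 0:
        # Distillation: pull the student's policy and value towards the teacher.
        kl = torch.distributions.kl_divergence(
            results["dist_old"], dist_train).mean()
        value_reg = (value_train - results["value_old"].detach()).pow(2).mean()
        loss = loss + alpha * (algo.policy_reg_coef * kl
                               + algo.value_reg_coef * value_reg)
    return loss
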