                    default=10, help="The number of worst episodes to show")
args = parser.parse_args()

# Set seed for all randomness sources

utils.seed(args.seed)

# Generate environment

envs = []
for i in range(args.procs):
    env = gym.make(args.env)
    env.seed(args.seed + 10000 * i)
    envs.append(env)
env = ParallelEnv(envs)

# Define agent

model_dir = utils.get_model_dir(args.model)
agent = utils.Agent(args.env, env.observation_space, model_dir,
                    args.argmax, args.procs)
print("CUDA available: {}\n".format(torch.cuda.is_available()))

# Initialize logs

logs = {"num_frames_per_episode": [], "return_per_episode": []}

# Run the agent

start_time = time.time()
# NOTE: import paths below assume the torch-ac project layout;
# adjust to this repository's actual module structure if it differs.
from abc import ABC, abstractmethod

import torch

from torch_ac.format import default_preprocess_obss
from torch_ac.utils import DictList, ParallelEnv


class BaseAlgoV0(ABC):
    """The base class for RL algorithms."""

    def __init__(self, envs, acmodel, num_frames_per_proc, discount,
                 gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgoV0` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.nn.Module
            the actor-critic model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradients will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward; takes an
            (observation, action, reward, done) tuple as input
        """

        # Store parameters

        # self.env = ParallelEnvChunks(envs)
        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward

        # Store helper values

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None] * shape[0]
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1], self.acmodel.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape, self.acmodel.memory_size,
                                        device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.steer_actions = torch.zeros(*shape, device=self.device,
                                         dtype=torch.int)
        self.acc_actions = torch.zeros(*shape, device=self.device,
                                       dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.steer_log_probs = torch.zeros(*shape, device=self.device)
        self.acc_log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs,
                                              device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently.
        The next actions are computed in batch mode for all environments
        at the same time. The rollouts and advantages from all environments
        are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages, etc. as attributes.
            Each attribute, e.g. `exps.reward`, has shape
            (self.num_frames_per_proc * num_envs, ...). The k-th block of
            `self.num_frames_per_proc` consecutive frames contains data
            obtained from the k-th environment. Be careful not to mix data
            from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)
            steer_dist, acc_dist = dist
            steer_action = steer_dist.sample()
            acc_action = acc_dist.sample()
            actions = list(
                zip(steer_action.cpu().numpy(), acc_action.cpu().numpy()))

            obs, reward, done, _ = self.env.step(actions)

            # Update experience values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(done, device=self.device,
                                         dtype=torch.float)
            self.steer_actions[i] = steer_action
            self.acc_actions[i] = acc_action
            self.values[i] = value
            if self.reshape_reward is not None:
                # Pair each process's (steer, acc) action pair with its own
                # observation, reward, and done flag.
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, actions, reward, done)
                ], device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.steer_log_probs[i] = steer_dist.log_prob(steer_action)
            self.acc_log_probs[i] = acc_dist.log_prob(acc_action)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for j, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[j].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[j].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[j].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        # GAE: delta_t = r_t + gamma * V_{t+1} * mask_{t+1} - V_t,
        #      A_t = delta_t + gamma * lambda * A_{t+1} * mask_{t+1}
        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = (self.masks[i + 1]
                         if i < self.num_frames_per_proc - 1 else self.mask)
            next_value = (self.values[i + 1]
                          if i < self.num_frames_per_proc - 1 else next_value)
            next_advantage = (self.advantages[i + 1]
                              if i < self.num_frames_per_proc - 1 else 0)

            delta = (self.rewards[i]
                     + self.discount * next_value * next_mask
                     - self.values[i])
            self.advantages[i] = (delta + self.discount * self.gae_lambda
                                  * next_advantage * next_mask)

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j]
            for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # For all tensors below, T x P -> P x T -> P * T
        exps.steer_action = self.steer_actions.transpose(0, 1).reshape(-1)
        exps.acc_action = self.acc_actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.steer_log_prob = self.steer_log_probs.transpose(0, 1).reshape(-1)
        exps.acc_log_prob = self.acc_log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        pass

    def evaluate(self):
        pass
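

# The sketch below shows how `BaseAlgoV0` is meant to be subclassed: an
# algorithm implements `update_parameters` by drawing a batch from
# `collect_experiences` and optimizing the actor-critic objective. It is a
# minimal, hypothetical A2C-style example, not code from this repository;
# the class name, learning rate, optimizer choice, and the assumption of a
# non-recurrent model (recurrence == 1) are all illustrative.
class SimpleA2CAlgo(BaseAlgoV0):
    def __init__(self, envs, acmodel, lr=1e-3, **kwargs):
        super().__init__(envs, acmodel, **kwargs)
        # RMSprop is an assumption; any torch optimizer would work here.
        self.optimizer = torch.optim.RMSprop(self.acmodel.parameters(), lr)

    def update_parameters(self):
        exps, logs = self.collect_experiences()

        # exps.obs was already preprocessed in collect_experiences.
        dist, value = self.acmodel(exps.obs)
        steer_dist, acc_dist = dist

        # Policy gradient over both action heads, sharing one advantage.
        log_prob = (steer_dist.log_prob(exps.steer_action)
                    + acc_dist.log_prob(exps.acc_action))
        policy_loss = -(log_prob * exps.advantage).mean()
        value_loss = (value - exps.returnn).pow(2).mean()
        entropy = steer_dist.entropy().mean() + acc_dist.entropy().mean()

        loss = (policy_loss
                - self.entropy_coef * entropy
                + self.value_loss_coef * value_loss)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.acmodel.parameters(),
                                       self.max_grad_norm)
        self.optimizer.step()

        logs.update({"policy_loss": policy_loss.item(),
                     "value_loss": value_loss.item(),
                     "entropy": entropy.item()})
        return logs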
# NOTE: import paths below assume the torch-ac project layout;
# adjust to this repository's actual module structure if it differs.
import time

import gym
import numpy as np
import torch

import utils
from torch_ac.utils import ParallelEnv


def start(model, seed, episodes, size):
    env_name = "MiniGrid-DoorKey-" + str(size) + "x" + str(size) + "-v0"

    utils.seed(seed)

    procs = 10
    argmax = False
    all_data = np.zeros(shape=(size, 8))

    print("Evaluating storage/" + model)

    for _wall in range(2, size - 2):
        # Generate environment

        envs = []
        for i in range(procs):
            env = gym.make(env_name)
            env.setWallID(_wall)
            envs.append(env)
        env = ParallelEnv(envs)

        # Define agent

        save_dir = utils.get_save_dir(model)
        agent = utils.Agent(save_dir, env.observation_space, argmax, procs)
        # print("CUDA available: {}\n".format(torch.cuda.is_available()))

        # Initialize logs

        logs = {"num_frames_per_episode": [], "return_per_episode": []}

        # Run the agent

        start_time = time.time()

        obss = env.reset()

        log_done_counter = 0
        log_episode_return = torch.zeros(procs, device=agent.device)
        log_episode_num_frames = torch.zeros(procs, device=agent.device)

        while log_done_counter < episodes:
            actions = agent.get_actions(obss)
            obss, rewards, dones, _ = env.step(actions)
            agent.analyze_feedbacks(rewards, dones)

            log_episode_return += torch.tensor(rewards, device=agent.device,
                                               dtype=torch.float)
            log_episode_num_frames += torch.ones(procs, device=agent.device)

            for i, done in enumerate(dones):
                if done:
                    log_done_counter += 1
                    logs["return_per_episode"].append(
                        log_episode_return[i].item())
                    logs["num_frames_per_episode"].append(
                        log_episode_num_frames[i].item())

            mask = 1 - torch.tensor(dones, device=agent.device,
                                    dtype=torch.float)
            log_episode_return *= mask
            log_episode_num_frames *= mask

        end_time = time.time()

        # Print logs

        num_frames = sum(logs["num_frames_per_episode"])
        fps = num_frames / (end_time - start_time)
        duration = int(end_time - start_time)
        return_per_episode = utils.synthesize(logs["return_per_episode"])
        num_frames_per_episode = utils.synthesize(
            logs["num_frames_per_episode"])

        print("Wall {:3d} | F {:6.0f} | FPS {:4.0f} | D {:3d} "
              "| R:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} "
              "| F:x̄σmM {:6.1f} {:6.1f} {:6.1f} {:6.1f}".format(
                  _wall, num_frames, fps, duration,
                  *return_per_episode.values(),
                  *num_frames_per_episode.values()))

        all_data[_wall, 0] = return_per_episode["mean"]
        all_data[_wall, 1] = return_per_episode["std"]
        all_data[_wall, 2] = return_per_episode["min"]
        all_data[_wall, 3] = return_per_episode["max"]
        all_data[_wall, 4] = num_frames_per_episode["mean"]
        all_data[_wall, 5] = num_frames_per_episode["std"]
        all_data[_wall, 6] = num_frames_per_episode["min"]
        all_data[_wall, 7] = num_frames_per_episode["max"]

    return all_data
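

# Example driver (illustrative only; the model name, seed, and output path
# below are assumptions, not values taken from this repository):
if __name__ == "__main__":
    data = start(model="DoorKey8x8-model", seed=1, episodes=100, size=8)
    np.save("doorkey_wall_eval.npy", data)  # one row of stats per wall position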