def run_eval_loop(sample_stochastically=True): start_time = time.time() prefix = "stochastic_" if sample_stochastically else "" for i in range(num_episodes): obs = env.reset() video.init(enabled=(i == 0)) done = False episode_reward = 0 while not done: # center crop image if args.encoder_type == "pixel" and "crop" in args.data_augs: obs = utils.center_crop_image(obs, args.image_size) if args.encoder_type == "pixel" and "translate" in args.data_augs: # first crop the center with pre_image_size obs = utils.center_crop_image( obs, args.pre_transform_image_size) # then translate cropped to center obs = utils.center_translate(obs, args.image_size) with utils.eval_mode(agent): if sample_stochastically: action = agent.sample_action(obs / 255.0) else: action = agent.select_action(obs / 255.0) obs, reward, done, _ = env.step(action) video.record(env) episode_reward += reward video.save("%d.mp4" % step) L.log("eval/" + prefix + "episode_reward", episode_reward, step) all_ep_rewards.append(episode_reward) L.log("eval/" + prefix + "eval_time", time.time() - start_time, step) mean_ep_reward = np.mean(all_ep_rewards) best_ep_reward = np.max(all_ep_rewards) std_ep_reward = np.std(all_ep_rewards) L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step) L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step) filename = (args.work_dir + "/" + args.domain_name + "--" + args.task_name + "-" + args.data_augs + "--s" + str(args.seed) + "--eval_scores.npy") key = args.domain_name + "-" + args.task_name + "-" + args.data_augs try: log_data = np.load(filename, allow_pickle=True) log_data = log_data.item() except: log_data = {} if key not in log_data: log_data[key] = {} log_data[key][step] = {} log_data[key][step]["step"] = step log_data[key][step]["mean_ep_reward"] = mean_ep_reward log_data[key][step]["max_ep_reward"] = best_ep_reward log_data[key][step]["std_ep_reward"] = std_ep_reward log_data[key][step]["env_step"] = step * args.action_repeat np.save(filename, log_data)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                obs = utils.center_crop_image(obs, args.image_size)
            if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                # first crop the center with pre_image_size
                obs = utils.center_crop_image(obs, args.pre_transform_image_size)
                # then translate cropped to center
                obs = utils.center_translate(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs / 255.)
                else:
                    action = agent.select_action(obs / 255.)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    std_ep_reward = np.std(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)

    filename = (args.work_dir + '/' + args.domain_name + '--' + args.task_name
                + '-' + args.data_augs + '--s' + str(args.seed)
                + '--eval_scores.npy')
    key = args.domain_name + '-' + args.task_name + '-' + args.data_augs
    try:
        log_data = np.load(filename, allow_pickle=True)
        log_data = log_data.item()
    except FileNotFoundError:
        log_data = {}
    if key not in log_data:
        log_data[key] = {}

    log_data[key][step] = {}
    log_data[key][step]['step'] = step
    log_data[key][step]['mean_ep_reward'] = mean_ep_reward
    log_data[key][step]['max_ep_reward'] = best_ep_reward
    log_data[key][step]['std_ep_reward'] = std_ep_reward
    log_data[key][step]['env_step'] = step * args.action_repeat
    np.save(filename, log_data)

    return log_data[key][step]
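# The run_eval_loop variants above are nested functions: env, agent, video,
# num_episodes, L, step, args, and all_ep_rewards are free variables captured from an
# enclosing evaluate() wrapper. A minimal sketch of that wrapper, following the
# CURL/RAD-style training scripts; the exact signature here is an assumption.
def evaluate(env, agent, video, num_episodes, L, step, args):
    all_ep_rewards = []

    def run_eval_loop(sample_stochastically=True):
        ...  # one of the loops above

    # Evaluate with the deterministic policy and flush the logger.
    run_eval_loop(sample_stochastically=False)
    L.dump(step)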
def sample_action(self, obs, goal_obs):
    if obs.shape[-1] != self.image_size:
        obs = utils.center_crop_image(obs, self.image_size)
        goal_obs = utils.center_crop_image(goal_obs, self.image_size)
    with torch.no_grad():
        obs = torch.FloatTensor(obs).to(self.device)
        obs = obs.unsqueeze(0)
        goal_obs = torch.FloatTensor(goal_obs).to(self.device)
        goal_obs = goal_obs.unsqueeze(0)
        mu, pi, _, _ = self.actor(obs, goal_obs, compute_log_pi=False)
        return pi.cpu().data.numpy().flatten()
def __getitem__(self, idx):  # Nawid - Obtains item from replay buffer
    '''
    Remove the randomness in the dataloading of each sample as the dataloader
    itself should be able to find the different values
    idx = np.random.randint(
        0, self.capacity if self.full else self.idx, size=1
    )
    idx = idx[0]
    '''
    # Need to expand dims so the shape is right for cropping; squeeze afterwards so the
    # batched sample is a 4d tensor rather than 5d when used with the dataloader.
    obses = np.expand_dims(self.obses[idx], 0)
    next_obses = np.expand_dims(self.next_obses[idx], 0)
    pos = obses.copy()

    # obs and next_obs
    if self.rand_crop:
        obses_input = random_crop(obses, self.image_size)  # center_crop_image(obses, self.image_size)
        next_obses_input = random_crop(next_obses, self.image_size)
    else:
        obses_input = center_crop_image(obses, self.image_size)
        next_obses_input = center_crop_image(next_obses, self.image_size)

    # random crop images
    obses_anc = random_crop(obses, self.image_size)
    pos = random_crop(pos, self.image_size)
    # Set anchor for the next observation in order to contrast with the contrastive loss
    next_obses_anc = random_crop(next_obses, self.image_size)

    # Squeeze shape
    obses_input = np.squeeze(obses_input)
    next_obses_input = np.squeeze(next_obses_input)
    obses_anc = np.squeeze(obses_anc)
    pos = np.squeeze(pos)
    next_obses_anc = np.squeeze(next_obses_anc)

    action = self.actions[idx]

    if self.transform:
        obses_input = self.transform(obses_input)
        next_obses_input = self.transform(next_obses_input)
        obses_anc = self.transform(obses_anc)
        pos = self.transform(pos)
        next_obses_anc = self.transform(next_obses_anc)

    # Nawid - Positive example is pos whilst anchor is obses
    cpc_kwargs = dict(obs_anchor=obses_anc, obs_pos=pos, next_obs_anchor=next_obses_anc)

    return obses_input, action, next_obses_input, cpc_kwargs
def sample_action(self, obs):
    if isinstance(obs, list):
        if obs[0].shape[-1] != self.image_size:
            obs = [
                utils.center_crop_image(obs[0], self.image_size),
                obs[1]
            ]
    else:
        if obs.shape[-1] != self.image_size:
            obs = utils.center_crop_image(obs, self.image_size)
    with torch.no_grad():
        obs = self.obs_to_torch(obs)
        mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
        return pi.cpu().data.numpy().flatten()
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward

        all_ep_rewards.append(episode_reward)

    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    logger.log({
        'mean_reward': mean_ep_reward,
        'max_reward': best_ep_reward,
    })
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        while not done:
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    print("sample_stochastically")
                    action = random.randint(0, 11)
                else:
                    print("agent selected")
                    action = agent.select_action(obs)
            obs, reward, done = env.step(action)
            episode_reward += reward

        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
def run_eval_loop2(sample_stochastically=True, cor_func="no_cor", cor_sev=1): cor = Corruptor(cor_func=cor_func, severity=cor_sev) start_time = time.time() prefix = 'stochastic_' if sample_stochastically else '' all_ep_rewards = [] for i in range(num_episodes): obs = env.reset() obs = cor.corrupt_stacked_images( obs, args.frame_stack) # added corruption after env done = False episode_reward = 0 while not done: # center crop image if args.encoder_type == 'pixel' and 'crop' in args.data_augs: obs = utils.center_crop_image(obs, args.image_size) if args.encoder_type == 'pixel' and 'translate' in args.data_augs: # first crop the center with pre_image_size obs = utils.center_crop_image( obs, args.pre_transform_image_size) # then translate cropped to center obs = utils.center_translate(obs, args.image_size) with utils.eval_mode(agent): if sample_stochastically: action = agent.sample_action(obs / 255.) else: action = agent.select_action(obs / 255.) obs, reward, done, _ = env.step(action) obs = cor.corrupt_stacked_images( obs, args.frame_stack) # added corruption after env episode_reward += reward all_ep_rewards.append(episode_reward) mean_ep_reward = np.mean(all_ep_rewards) best_ep_reward = np.max(all_ep_rewards) std_ep_reward = np.std(all_ep_rewards) end_time = time.time() return step, mean_ep_reward, best_ep_reward, std_ep_reward, end_time - start_time
def sample_action(self, obs_tuple):
    [obs, image_obs] = obs_tuple
    if image_obs.shape[-1] != self.image_size:
        image_obs = utils.center_crop_image(image_obs, self.image_size)
    with torch.no_grad():
        image_obs = torch.FloatTensor(image_obs).to(self.device)
        image_obs = image_obs.unsqueeze(0)
        # print("test shape sample_action: ", image_obs.shape, ": ", obs.shape)
        mu, pi, _, _ = self.actor([obs, image_obs], compute_log_pi=False)
        return pi.cpu().data.numpy().flatten()
def sample_action(self, obs):
    if obs['img'].shape[-1] != self.image_size:
        state, img = utils.split_obs(obs)
        img = utils.center_crop_image(img, self.image_size)
        obs = utils.combine_obs(state, img)
    with torch.no_grad():
        obs['img'] = torch.FloatTensor(obs['img']).to(self.device).unsqueeze(0)
        obs['state'] = torch.FloatTensor(obs['state']).to(self.device).unsqueeze(0)
        mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
        return pi.cpu().data.numpy().flatten()
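# split_obs / combine_obs above come from the project's utils module; their bodies are
# not shown here. A minimal sketch of what they presumably do for dict observations
# with 'state' and 'img' keys (an assumption, not the repo's actual implementation):
def split_obs(obs):
    # Separate the proprioceptive state vector from the image stack.
    return obs['state'], obs['img']

def combine_obs(state, img):
    # Reassemble the dict observation after the image has been cropped.
    return {'state': state, 'img': img}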
def sample_cpc(self):  # Nawid - samples images I believe
    start = time.time()
    # Used to randomly sample indices
    idxs = np.random.randint(0, self.capacity if self.full else self.idx,
                             size=self.batch_size)

    obses = self.obses[idxs]  # Nawid - Samples observation
    pos = obses.copy()
    next_obses = self.next_obses[idxs]

    # Random crop or center crop the images
    if self.rand_crop:
        obses_input = random_crop(obses, self.image_size)
        next_obses_input = random_crop(next_obses, self.image_size)
    else:
        obses_input = center_crop_image(obses, self.image_size)
        next_obses_input = center_crop_image(next_obses, self.image_size)

    # Nawid - Crop images randomly
    obses_anc = random_crop(obses, self.image_size)
    pos = random_crop(pos, self.image_size)

    obses_input = np.transpose(obses_input, (0, 3, 1, 2))
    next_obses_input = np.transpose(next_obses_input, (0, 3, 1, 2))
    obses_anc = np.transpose(obses_anc, (0, 3, 1, 2))
    pos = np.transpose(pos, (0, 3, 1, 2))

    obses_input = torch.tensor(obses_input, device=self.device).float() / 255
    actions = torch.as_tensor(self.actions[idxs], device=self.device)
    next_obses_input = torch.tensor(next_obses_input, device=self.device).float() / 255
    # Random color jitter already turns the values into torch tensors
    obses_anc = torch.as_tensor(obses_anc, device=self.device).float() / 255
    pos = torch.as_tensor(pos, device=self.device).float() / 255

    obses_anc = random_color_jitter(obses_anc, batch_size=self.batch_size, frames=self.frames)
    pos = random_color_jitter(pos, batch_size=self.batch_size, frames=self.frames)

    # Nawid - Positive example is pos whilst anchor is obses
    cpc_kwargs = dict(obs_anchor=obses_anc, obs_pos=pos, time_anchor=None, time_pos=None)

    return obses_input, actions, next_obses_input, cpc_kwargs
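# random_crop / center_crop_image above are CURL/RAD-style augmentation helpers.
# A minimal sketch of both, assuming a batch of images in (B, H, W, C) layout as in
# sample_cpc above; the utils.center_crop_image used in the evaluation loops operates
# on a single (C, H, W) observation instead, so the originals may differ in layout.
import numpy as np

def center_crop_image(imgs, output_size):
    # Crop the spatial center of each image in the batch to output_size x output_size.
    h, w = imgs.shape[1], imgs.shape[2]
    top = (h - output_size) // 2
    left = (w - output_size) // 2
    return imgs[:, top:top + output_size, left:left + output_size, :]

def random_crop(imgs, output_size):
    # Crop each image at an independently sampled random location.
    n, h, w = imgs.shape[0], imgs.shape[1], imgs.shape[2]
    tops = np.random.randint(0, h - output_size + 1, size=n)
    lefts = np.random.randint(0, w - output_size + 1, size=n)
    out = np.empty((n, output_size, output_size, imgs.shape[3]), dtype=imgs.dtype)
    for i, (t, l) in enumerate(zip(tops, lefts)):
        out[i] = imgs[i, t:t + output_size, l:l + output_size, :]
    return out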
def run_eval_loop(sample_stochastically=True): start_time = time.time() prefix = "stochastic_" if sample_stochastically else "" for i in tqdm(range(num_episodes), desc='eval', unit='ep'): obs = env.reset() video.init(enabled=(i == 0)) done = False episode_reward = 0 episode_info = defaultdict(int) while not done: # center crop image if args.encoder_type == "mixed": state, img = utils.split_obs(obs) img = utils.center_crop_image(img, args.image_size) obs = utils.combine_obs(state, img) with utils.eval_mode(agent): if sample_stochastically: action = agent.sample_action(obs) else: action = agent.select_action(obs) obs, reward, done, info = env.step(action) for k in keys_to_monitor: episode_info[k] += info[k] video.record(env, yaw=i) episode_reward += reward for k in keys_to_monitor: L.log("eval/" + prefix + k, np.sum(episode_info[k]), step) video.save("%d.mp4" % step) L.log("eval/" + prefix + "episode_reward", episode_reward, step) all_ep_rewards.append(episode_reward) L.log("eval/" + prefix + "eval_time", time.time() - start_time, step) mean_ep_reward = np.mean(all_ep_rewards) best_ep_reward = np.max(all_ep_rewards) L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step) L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    num_successes = 0
    for i in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        episode_success = False
        while not done:
            # center crop image
            if (args.agent == 'curl_sac' and args.encoder_type == 'pixel') or \
               (args.agent == 'rad_sac' and (args.encoder_type == 'pixel'
                                             or 'crop' in args.data_augs
                                             or 'translate' in args.data_augs)):
                if isinstance(obs, list):
                    obs[0] = utils.center_crop_image(obs[0], args.image_size)
                else:
                    obs = utils.center_crop_image(obs, args.image_size)
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs)
                else:
                    action = agent.select_action(obs)
            obs, reward, done, info = env.step(action)
            if info.get('is_success'):
                episode_success = True
            video.record(env)
            episode_reward += reward

        num_successes += episode_success
        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    if num_episodes > 0:
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        success_rate = num_successes / num_episodes
    else:
        mean_ep_reward = 0
        best_ep_reward = 0
        std_ep_reward = 0
        success_rate = 0
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
    L.log('eval/' + prefix + 'success_rate', success_rate, step)

    filename = args.work_dir + '/eval_scores.npy'
    key = args.domain_name + '-' + str(args.task_name) + '-' + args.data_augs
    try:
        log_data = np.load(filename, allow_pickle=True)
        log_data = log_data.item()
    except FileNotFoundError:
        log_data = {}
    if key not in log_data:
        log_data[key] = {}

    log_data[key][step] = {}
    log_data[key][step]['step'] = step
    log_data[key][step]['mean_ep_reward'] = mean_ep_reward
    log_data[key][step]['max_ep_reward'] = best_ep_reward
    log_data[key][step]['success_rate'] = success_rate
    log_data[key][step]['std_ep_reward'] = std_ep_reward
    log_data[key][step]['env_step'] = step * args.action_repeat
    np.save(filename, log_data)
def run_eval_loop(sample_stochastically=True):
    start_time = time.time()
    prefix = 'stochastic_' if sample_stochastically else ''
    for i in range(num_episodes):
        image_log_dir = utils.make_dir(os.path.join(image_dir, str(i)))
        obs = env.reset()
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        episode_step = 0
        while not done:
            # periodically save a rendered frame
            if episode_step % 100 == 0:
                observation = env.render("rgb_array")
                plt.imsave(image_log_dir + "/result_" + str(episode_step) + ".png",
                           observation)
            # center crop image
            if args.encoder_type == 'pixel':
                obs = utils.center_crop_image(obs, args.image_size)
                goal_obs = utils.center_crop_image(goal_sample, args.image_size)
            else:
                goal_obs = goal_sample
            with utils.eval_mode(agent):
                if sample_stochastically:
                    action = agent.sample_action(obs, goal_obs)
                else:
                    action = agent.select_action(obs, goal_obs)
            obs, reward, done, distance = env.step(action)
            if args.reward_type == 'dist':
                reward = agent.dist_reward(obs, goal_sample)
            video.record(env)
            episode_reward += reward
            episode_step += 1
            if done:
                observation = env.render("rgb_array")
                plt.imsave(image_log_dir + "/result_final.png", observation)

        video.save('%d.mp4' % step)
        L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
        all_ep_rewards.append(episode_reward)
        all_ep_distance.append(distance)

    L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
    mean_ep_reward = np.mean(all_ep_rewards)
    best_ep_reward = np.max(all_ep_rewards)
    std_ep_reward = np.std(all_ep_rewards)
    mean_ep_distance = np.mean(all_ep_distance)
    best_ep_distance = np.max(all_ep_distance)
    std_ep_distance = np.std(all_ep_distance)
    L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
    L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
    L.log('eval/' + prefix + 'mean_distance_to_goal', mean_ep_distance, step)
    L.log('eval/' + prefix + 'best_distance_to_goal', best_ep_distance, step)

    # Log to csv.
    log_csv["step"].append(step)
    log_csv["mean_reward"].append(mean_ep_reward)
    log_csv["mean_distance_to_goal"].append(mean_ep_distance)
    log_csv["std_distance_to_goal"].append(std_ep_distance)
    pd.DataFrame(log_csv).to_csv(csv_dir + "/log.csv", index=False)
def main():
    import logging
    from rich.logging import RichHandler
    logging.basicConfig(
        level=logging.INFO,
        handlers=[RichHandler(rich_tracebacks=True, markup=True)])

    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
        print('seed', args.__dict__["seed"])
    print(args)
    utils.set_seed_everywhere(args.seed)

    env = gym.make(args.domain_name, render=args.render)
    print(env)
    # TODO action repeat wrapper?
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == "mixed":
        from apple_gym.env.wrappers import FrameStack, ImageState, PermuteImages
        env = FrameStack(
            PermuteImages(ImageState(env), keys=["img"]),
            n=args.frame_stack,
            keys=["img"],
        )

    if args.load == 'auto':
        load_dirs = Path(args.work_dir).glob('*/model/curl*.pt')
        load_dirs = sorted(set([str(d.parent) for d in load_dirs]))
        print('load_dirs', load_dirs)
        args.load = str(load_dirs[-1])
        print('auto load', load_dirs)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name
    exp_name = (env_name + "-" + ts + "-im" + str(args.image_size) + "-b" +
                str(args.batch_size) + "-s" + str(args.seed) + "-" +
                args.encoder_type)
    args.work_dir = args.work_dir + "/" + exp_name
    print('work_dir', args.work_dir)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, "video"))
    model_dir = utils.make_dir(os.path.join(args.work_dir, "model"))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, "buffer"))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device {device}')

    # shapes
    action_shape = env.action_space.shape
    img = env.observation_space.sample()["img"]
    img_aug = utils.center_crop_image(img, args.image_size)
    obs_shape = {
        "img": img_aug.shape,
        "state": env.observation_space["state"].shape
    }

    replay_buffer = utils.ReplayBuffer(
        obs_space=env.observation_space,
        action_space=env.action_space,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)
    if args.load is not None:
        agent.load_curl(args.load)

    # summarize
    obs = env.observation_space.sample()
    state, img = utils.split_obs(obs)
    img_crop = utils.center_crop_image(img, agent.image_size)
    obs_crop = utils.combine_obs(state, img_crop)
    obs_crop['img'] = torch.FloatTensor(obs_crop['img']).to(agent.device).unsqueeze(0)
    obs_crop['state'] = torch.FloatTensor(obs_crop['state']).to(agent.device).unsqueeze(0)
    action = agent.sample_action(obs)
    action = torch.FloatTensor(action).to(agent.device).unsqueeze(0)
    from torchsummaryX import summary
    with torch.no_grad():
        print(agent.critic)
        summary(agent.critic, obs_crop, action)
        print(agent.actor)
        summary(agent.actor, obs_crop)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    episode_info = defaultdict(int)
    start_time = time.time()
    for step in tqdm(range(args.num_train_steps),
                     desc="train",
                     unit="step",
                     mininterval=360):
        # evaluate agent periodically
        if (step % args.eval_freq == 0) and (step >= args.eval_freq):
            L.log("eval/episode", episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log("train/duration", time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log("train/episode_reward", episode_reward, step)
                for k in keys_to_monitor:
                    L.log("train/episode_info" + k, np.sum(episode_info[k]), step)

            obs = env.reset()
            assert env.observation_space.contains(
                obs
            ), f"obs should be in space. ob={obs} space={env.observation_space}"
            done = False
            episode_reward = 0
            episode_info = defaultdict(int)
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log("train/episode", episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
        assert env.action_space.contains(
            action
        ), f"action should be in space. action={action} space={env.action_space}"

        if step % 10 == 0:
            # run training update
            if step >= args.init_steps:
                num_updates = 1
                for _ in range(num_updates):
                    agent.update(replay_buffer, L, step)

        next_obs, reward, done, info = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)
        for k in keys_to_monitor:
            episode_info[k] += info[k]

        obs = next_obs
        episode_step += 1