Example #1
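An evaluation loop that center-crops or crop-then-translates pixel observations according to args.data_augs, scales them to [0, 1], rolls out the agent, records a video of the first episode, and appends the per-evaluation statistics to an eval_scores.npy dictionary keyed by domain, task, and augmentation.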
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = "stochastic_" if sample_stochastically else ""
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == "pixel" and "crop" in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == "pixel" and "translate" in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.0)
                    else:
                        action = agent.select_action(obs / 255.0)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save("%d.mp4" % step)
            L.log("eval/" + prefix + "episode_reward", episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log("eval/" + prefix + "eval_time", time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step)
        L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)

        filename = (args.work_dir + "/" + args.domain_name + "--" +
                    args.task_name + "-" + args.data_augs + "--s" +
                    str(args.seed) + "--eval_scores.npy")
        key = args.domain_name + "-" + args.task_name + "-" + args.data_augs
        try:
            log_data = np.load(filename, allow_pickle=True)
            log_data = log_data.item()
        except:
            log_data = {}

        if key not in log_data:
            log_data[key] = {}

        log_data[key][step] = {}
        log_data[key][step]["step"] = step
        log_data[key][step]["mean_ep_reward"] = mean_ep_reward
        log_data[key][step]["max_ep_reward"] = best_ep_reward
        log_data[key][step]["std_ep_reward"] = std_ep_reward
        log_data[key][step]["env_step"] = step * args.action_repeat

        np.save(filename, log_data)
Example #2
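Nearly identical to Example #1, but it also returns the dictionary entry logged for the current step.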
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.)
                    else:
                        action = agent.select_action(obs / 255.)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)

        filename = (args.work_dir + '/' + args.domain_name + '--' +
                    args.task_name + '-' + args.data_augs + '--s' +
                    str(args.seed) + '--eval_scores.npy')
        key = args.domain_name + '-' + args.task_name + '-' + args.data_augs
        try:
            log_data = np.load(filename, allow_pickle=True)
            log_data = log_data.item()
        except:
            log_data = {}

        if key not in log_data:
            log_data[key] = {}

        log_data[key][step] = {}
        log_data[key][step]['step'] = step
        log_data[key][step]['mean_ep_reward'] = mean_ep_reward
        log_data[key][step]['max_ep_reward'] = best_ep_reward
        log_data[key][step]['std_ep_reward'] = std_ep_reward
        log_data[key][step]['env_step'] = step * args.action_repeat

        np.save(filename, log_data)
        return log_data[key][step]
Example #3
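A goal-conditioned sample_action that center-crops both the observation and the goal image before sampling an action from the actor.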
    def sample_action(self, obs, goal_obs):
        if obs.shape[-1] != self.image_size:
            obs = utils.center_crop_image(obs, self.image_size)
            goal_obs = utils.center_crop_image(goal_obs, self.image_size)

        with torch.no_grad():
            obs = torch.FloatTensor(obs).to(self.device)
            obs = obs.unsqueeze(0)
            goal_obs = torch.FloatTensor(goal_obs).to(self.device)
            goal_obs = goal_obs.unsqueeze(0)
            mu, pi, _, _ = self.actor(obs, goal_obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()
Example #4
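A dataset __getitem__ that prepares a single replay-buffer transition for contrastive learning: the model inputs are random- or center-cropped, while separate random crops of the same observation serve as the anchor and positive views.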
    def __getitem__(self, idx):  # Nawid - Obtains item from replay buffer
        ''' The random index selection below was removed, since the DataLoader itself is responsible for sampling indices:
        idx = np.random.randint(
            0, self.capacity if self.full else self.idx, size=1
        )
        idx = idx[0]
        '''

        # Add a batch dim so the crop functions see the expected shape; the extra
        # dim is squeezed out below so the dataloader still gets 4D tensors.
        obses = np.expand_dims(self.obses[idx], 0)
        next_obses = np.expand_dims(self.next_obses[idx], 0)
        pos = obses.copy()

        #obs and next_obs
        if self.rand_crop:
            obses_input = random_crop(obses, self.image_size)
            next_obses_input = random_crop(next_obses, self.image_size)
        else:
            obses_input = center_crop_image(obses, self.image_size)
            next_obses_input = center_crop_image(next_obses, self.image_size)

        # random crop images
        obses_anc = random_crop(obses, self.image_size)
        pos = random_crop(pos, self.image_size)
        next_obses_anc = random_crop(
            next_obses, self.image_size
        )  # Anchor for the next observation, used by the contrastive loss

        # Squeeze shape
        obses_input = np.squeeze(obses_input)
        next_obses_input = np.squeeze(next_obses_input)
        obses_anc = np.squeeze(obses_anc)
        pos = np.squeeze(pos)
        next_obses_anc = np.squeeze(next_obses_anc)

        action = self.actions[idx]

        if self.transform:
            obses_input = self.transform(obses_input)
            next_obses_input = self.transform(next_obses_input)
            obses_anc = self.transform(obses_anc)
            pos = self.transform(pos)
            next_obses_anc = self.transform(next_obses_anc)

        cpc_kwargs = dict(
            obs_anchor=obses_anc, obs_pos=pos, next_obs_anchor=next_obses_anc
        )  # Nawid - The positive example is pos, while the anchor is obses
        return obses_input, action, next_obses_input, cpc_kwargs
Example #5
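A sample_action variant that accepts either a plain image observation or an [image, extra] list, center-cropping only the image part before querying the actor.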
    def sample_action(self, obs):
        if isinstance(obs, list):
            if obs[0].shape[-1] != self.image_size:
                obs = [
                    utils.center_crop_image(obs[0], self.image_size), obs[1]
                ]
        else:
            if obs.shape[-1] != self.image_size:
                obs = utils.center_crop_image(obs, self.image_size)

        with torch.no_grad():
            obs = self.obs_to_torch(obs)
            mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()
Example #6
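A minimal evaluation loop that always center-crops pixel observations, records a video of the first episode, and logs the episode, mean, and best rewards.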
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, _ = env.step(action)
                video.record(env)
                episode_reward += reward

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
Example #7
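An evaluation loop without video recording that reports only the mean and maximum episode reward through a generic logger.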
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, _ = env.step(action)
                episode_reward += reward

            all_ep_rewards.append(episode_reward)

        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)

        logger.log({
            'mean_reward': mean_ep_reward,
            'max_reward': best_ep_reward,
        })
Example #8
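An evaluation variant in which stochastic sampling is replaced by a random discrete action; note that its environment's step returns only three values.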
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            obs = env.reset()
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        print("sample_stochastically")
                        action = random.randint(0, 11)
                    else:
                        print("agent selected")
                        action = agent.select_action(obs)
                obs, reward, done = env.step(action)
                episode_reward += reward

            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
Example #9
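run_eval_loop2 applies an image Corruptor to the stacked frames returned by the environment, both after reset and after every step, and returns the aggregated statistics instead of logging them.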
    def run_eval_loop2(sample_stochastically=True,
                       cor_func="no_cor",
                       cor_sev=1):
        cor = Corruptor(cor_func=cor_func, severity=cor_sev)

        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''

        all_ep_rewards = []
        for i in range(num_episodes):
            obs = env.reset()
            obs = cor.corrupt_stacked_images(
                obs, args.frame_stack)  # added corruption after env
            done = False
            episode_reward = 0
            while not done:
                # center crop image
                if args.encoder_type == 'pixel' and 'crop' in args.data_augs:
                    obs = utils.center_crop_image(obs, args.image_size)
                if args.encoder_type == 'pixel' and 'translate' in args.data_augs:
                    # first crop the center with pre_image_size
                    obs = utils.center_crop_image(
                        obs, args.pre_transform_image_size)
                    # then translate cropped to center
                    obs = utils.center_translate(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs / 255.)
                    else:
                        action = agent.select_action(obs / 255.)
                obs, reward, done, _ = env.step(action)
                obs = cor.corrupt_stacked_images(
                    obs, args.frame_stack)  # added corruption after env
                episode_reward += reward

            all_ep_rewards.append(episode_reward)

        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)

        end_time = time.time()

        return step, mean_ep_reward, best_ep_reward, std_ep_reward, end_time - start_time
Example #10
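A sample_action for a two-element observation [obs, image_obs] that center-crops only the image component before calling the actor.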
    def sample_action(self, obs_tuple):
        [obs, image_obs] = obs_tuple
        if image_obs.shape[-1] != self.image_size:
            image_obs = utils.center_crop_image(image_obs, self.image_size)

        with torch.no_grad():
            image_obs = torch.FloatTensor(image_obs).to(self.device)
            image_obs = image_obs.unsqueeze(0)
            #print("test shape sample_action: ", image_obs.shape, ": ", obs.shape)
            mu, pi, _, _ = self.actor([obs, image_obs], compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()
Example #11
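A sample_action for a dict observation with 'img' and 'state' keys, cropping the image and moving both parts to the device as batched tensors.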
    def sample_action(self, obs):
        if obs['img'].shape[-1] != self.image_size:
            state, img = utils.split_obs(obs)
            img = utils.center_crop_image(img, self.image_size)
            obs = utils.combine_obs(state, img)

        with torch.no_grad():
            obs['img'] = torch.FloatTensor(obs['img']).to(
                self.device).unsqueeze(0)
            obs['state'] = torch.FloatTensor(obs['state']).to(
                self.device).unsqueeze(0)
            mu, pi, _, _ = self.actor(obs, compute_log_pi=False)
            return pi.cpu().data.numpy().flatten()
Example #12
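A replay-buffer sample_cpc that crops a batch of observations, converts them to channels-first float tensors in [0, 1], and applies random color jitter to the anchor and positive views passed to the contrastive loss.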
    def sample_cpc(self):  # Nawid - samples images I believe

        start = time.time()
        idxs = np.random.randint(
            0, self.capacity if self.full else self.idx,
            size=self.batch_size)  # Used to randomly sample indices

        obses = self.obses[idxs]  # Nawid - Samples observation
        pos = obses.copy()  # Nawid -
        next_obses = self.next_obses[idxs]

        # Random crop or centre crops the image
        if self.rand_crop:
            obses_input = random_crop(obses, self.image_size)
            next_obses_input = random_crop(next_obses, self.image_size)
        else:
            obses_input = center_crop_image(obses, self.image_size)
            next_obses_input = center_crop_image(next_obses, self.image_size)

        # Nawid - Crop images randomly
        obses_anc = random_crop(obses, self.image_size)
        pos = random_crop(pos, self.image_size)

        obses_input, next_obses_input = np.transpose(
            obses_input, (0, 3, 1, 2)), np.transpose(next_obses_input,
                                                     (0, 3, 1, 2))
        obses_anc, pos = np.transpose(obses_anc, (0, 3, 1, 2)), np.transpose(
            pos, (0, 3, 1, 2))

        obses_input = torch.tensor(obses_input,
                                   device=self.device).float() / 255
        actions = torch.as_tensor(self.actions[idxs], device=self.device)
        next_obses_input = torch.tensor(next_obses_input,
                                        device=self.device).float() / 255
        # Convert to torch tensors here, since random color jitter operates on tensors
        obses_anc = torch.as_tensor(obses_anc,
                                    device=self.device).float() / 255
        pos = torch.as_tensor(pos, device=self.device).float() / 255

        obses_anc = random_color_jitter(obses_anc,
                                        batch_size=self.batch_size,
                                        frames=self.frames)
        pos = random_color_jitter(pos,
                                  batch_size=self.batch_size,
                                  frames=self.frames)

        cpc_kwargs = dict(
            obs_anchor=obses_anc, obs_pos=pos, time_anchor=None, time_pos=None
        )  # Nawid - The positive example is pos, while the anchor is obses

        return obses_input, actions, next_obses_input, cpc_kwargs
Example #13
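An evaluation loop for a 'mixed' state-plus-image encoder that also accumulates selected info keys per episode and logs their totals.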
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = "stochastic_" if sample_stochastically else ""
        for i in tqdm(range(num_episodes), desc='eval', unit='ep'):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            episode_info = defaultdict(int)
            while not done:
                # center crop image
                if args.encoder_type == "mixed":
                    state, img = utils.split_obs(obs)
                    img = utils.center_crop_image(img, args.image_size)
                    obs = utils.combine_obs(state, img)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)

                for k in keys_to_monitor:
                    episode_info[k] += info[k]
                video.record(env, yaw=i)
                episode_reward += reward

            for k in keys_to_monitor:
                L.log("eval/" + prefix + k, np.sum(episode_info[k]), step)
            video.save("%d.mp4" % step)
            L.log("eval/" + prefix + "episode_reward", episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log("eval/" + prefix + "eval_time", time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        L.log("eval/" + prefix + "mean_episode_reward", mean_ep_reward, step)
        L.log("eval/" + prefix + "best_episode_reward", best_ep_reward, step)
Example #14
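An evaluation loop that additionally tracks a success rate from info['is_success'] and saves the mean/max/std reward and success rate to eval_scores.npy.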
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        num_successes = 0
        for i in range(num_episodes):
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            episode_success = False
            while not done:
                # center crop image
                if (args.agent == 'curl_sac' and args.encoder_type == 'pixel') or\
                        (args.agent == 'rad_sac' and (args.encoder_type == 'pixel' or 'crop' in args.data_augs or 'translate' in args.data_augs)):
                    if isinstance(obs, list):
                        obs[0] = utils.center_crop_image(
                            obs[0], args.image_size)
                    else:
                        obs = utils.center_crop_image(obs, args.image_size)
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs)
                    else:
                        action = agent.select_action(obs)
                obs, reward, done, info = env.step(action)
                if info.get('is_success'):
                    episode_success = True
                video.record(env)
                episode_reward += reward
            num_successes += episode_success

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        if num_episodes > 0:
            mean_ep_reward = np.mean(all_ep_rewards)
            best_ep_reward = np.max(all_ep_rewards)
            std_ep_reward = np.std(all_ep_rewards)
            success_rate = num_successes / num_episodes
        else:
            mean_ep_reward = 0
            best_ep_reward = 0
            std_ep_reward = 0
            success_rate = 0
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
        L.log('eval/' + prefix + 'success_rate', success_rate, step)

        filename = args.work_dir + '/eval_scores.npy'
        key = args.domain_name + '-' + str(
            args.task_name) + '-' + args.data_augs
        try:
            log_data = np.load(filename, allow_pickle=True)
            log_data = log_data.item()
        except FileNotFoundError:
            log_data = {}

        if key not in log_data:
            log_data[key] = {}

        log_data[key][step] = {}
        log_data[key][step]['step'] = step
        log_data[key][step]['mean_ep_reward'] = mean_ep_reward
        log_data[key][step]['max_ep_reward'] = best_ep_reward
        log_data[key][step]['success_rate'] = success_rate
        log_data[key][step]['std_ep_reward'] = std_ep_reward
        log_data[key][step]['env_step'] = step * args.action_repeat

        np.save(filename, log_data)
Example #15
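A goal-conditioned evaluation loop that periodically saves rendered frames, optionally recomputes the reward from the agent's distance to the goal sample, and logs distance-to-goal statistics to the logger and a CSV file.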
    def run_eval_loop(sample_stochastically=True):
        start_time = time.time()
        prefix = 'stochastic_' if sample_stochastically else ''
        for i in range(num_episodes):
            image_log_dir = utils.make_dir(os.path.join(image_dir, str(i)))
            obs = env.reset()
            video.init(enabled=(i == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                # center crop image
                if episode_step % 100 == 0:
                    observation = env.render("rgb_array")
                    plt.imsave(
                        image_log_dir + "/result_" + str(episode_step) +
                        ".png", observation)
                if args.encoder_type == 'pixel':
                    obs = utils.center_crop_image(obs, args.image_size)
                    goal_obs = utils.center_crop_image(goal_sample,
                                                       args.image_size)
                else:
                    goal_obs = goal_sample
                with utils.eval_mode(agent):
                    if sample_stochastically:
                        action = agent.sample_action(obs, goal_obs)
                    else:
                        action = agent.select_action(obs, goal_obs)
                obs, reward, done, distance = env.step(action)
                if args.reward_type == 'dist':
                    reward = agent.dist_reward(obs, goal_sample)
                video.record(env)
                episode_reward += reward
                episode_step += 1
                if done:
                    observation = env.render("rgb_array")
                    plt.imsave(image_log_dir + "/result_final.png",
                               observation)

            video.save('%d.mp4' % step)
            L.log('eval/' + prefix + 'episode_reward', episode_reward, step)
            all_ep_rewards.append(episode_reward)
            all_ep_distance.append(distance)

        L.log('eval/' + prefix + 'eval_time', time.time() - start_time, step)
        mean_ep_reward = np.mean(all_ep_rewards)
        best_ep_reward = np.max(all_ep_rewards)
        std_ep_reward = np.std(all_ep_rewards)
        mean_ep_distance = np.mean(all_ep_distance)
        best_ep_distance = np.max(all_ep_distance)
        std_ep_distance = np.std(all_ep_distance)
        L.log('eval/' + prefix + 'mean_episode_reward', mean_ep_reward, step)
        L.log('eval/' + prefix + 'best_episode_reward', best_ep_reward, step)
        L.log('eval/' + prefix + 'mean_distance_to_goal', mean_ep_distance,
              step)
        L.log('eval/' + prefix + 'best_distance_to_goal', best_ep_distance,
              step)

        # Log to csv.
        log_csv["step"].append(step)
        log_csv["mean_reward"].append(mean_ep_reward)
        log_csv["mean_distance_to_goal"].append(mean_ep_distance)
        log_csv["std_distance_to_goal"].append(std_ep_distance)
        pd.DataFrame(log_csv).to_csv(csv_dir + "/log.csv", index=False)
Example #16
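A full training entry point: it seeds everything, builds the environment (with optional frame stacking for the 'mixed' encoder), replay buffer, agent, and logger, then alternates data collection, periodic evaluation, and agent updates.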
def main():
    import logging
    from rich.logging import RichHandler
    logging.basicConfig(
        level=logging.INFO,
        handlers=[RichHandler(rich_tracebacks=True, markup=True)])

    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
        print('seed', args.__dict__["seed"])
    print(args)
    utils.set_seed_everywhere(args.seed)

    env = gym.make(args.domain_name, render=args.render)
    print(env)
    # TODO action repeat wrapper?

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == "mixed":
        from apple_gym.env.wrappers import FrameStack, ImageState, PermuteImages

        env = FrameStack(
            PermuteImages(ImageState(env), keys=["img"]),
            n=args.frame_stack,
            keys=["img"],
        )

    if args.load == 'auto':
        load_dirs = Path(args.work_dir).glob('*/model/curl*.pt')
        load_dirs = sorted(set([str(d.parent) for d in load_dirs]))
        print('load_dirs', load_dirs)
        args.load = str(load_dirs[-1])
        print('auto load', load_dirs)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name
    exp_name = (env_name + "-" + ts + "-im" + str(args.image_size) + "-b" +
                str(args.batch_size) + "-s" + str(args.seed) + "-" +
                args.encoder_type)
    args.work_dir = args.work_dir + "/" + exp_name
    print('work_dir', args.work_dir)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, "video"))
    model_dir = utils.make_dir(os.path.join(args.work_dir, "model"))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, "buffer"))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device {device}')

    # shapes
    action_shape = env.action_space.shape
    img = env.observation_space.sample()["img"]
    img_aug = utils.center_crop_image(img, args.image_size)
    obs_shape = {
        "img": img_aug.shape,
        "state": env.observation_space["state"].shape
    }

    replay_buffer = utils.ReplayBuffer(
        obs_space=env.observation_space,
        action_space=env.action_space,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)
    if args.load is not None:
        agent.load_curl(args.load)

    # summarize
    obs = env.observation_space.sample()
    state, img = utils.split_obs(obs)
    img_crop = utils.center_crop_image(img, agent.image_size)
    obs_crop = utils.combine_obs(state, img_crop)
    obs_crop['img'] = torch.FloatTensor(obs_crop['img']).to(
        agent.device).unsqueeze(0)
    obs_crop['state'] = torch.FloatTensor(obs_crop['state']).to(
        agent.device).unsqueeze(0)
    action = agent.sample_action(obs)
    action = torch.FloatTensor(action).to(agent.device).unsqueeze(0)
    from torchsummaryX import summary
    with torch.no_grad():
        print(agent.critic)
        summary(agent.critic, obs_crop, action)
        print(agent.actor)
        summary(agent.actor, obs_crop)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    episode_info = defaultdict(int)
    start_time = time.time()

    for step in tqdm(range(args.num_train_steps),
                     desc="train",
                     unit="step",
                     mininterval=360):
        # evaluate agent periodically

        if (step % args.eval_freq == 0) and (step >= args.eval_freq):
            L.log("eval/episode", episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log("train/duration", time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log("train/episode_reward", episode_reward, step)

            for k in keys_to_monitor:
                L.log("train/episode_info" + k, np.sum(episode_info[k]), step)

            obs = env.reset()
            assert env.observation_space.contains(
                obs
            ), f"obs should be in space. ob={obs} space={env.observation_space}"
            done = False
            episode_reward = 0
            episode_info = defaultdict(int)
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log("train/episode", episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
        assert env.action_space.contains(
            action
        ), f"action should be in space. action={action} space={env.action_space}"

        if step % 10 == 0:
            # run training update
            if step >= args.init_steps:
                num_updates = 1
                for _ in range(num_updates):
                    agent.update(replay_buffer, L, step)

        next_obs, reward, done, info = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)
        for k in keys_to_monitor:
            episode_info[k] += info[k]

        obs = next_obs
        episode_step += 1
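
All of the examples above rely on a utils.center_crop_image helper to match the agent's expected input size. The exact implementation lives in each project's own utils module; as a point of reference, a minimal sketch consistent with how the helper is called here, assuming a channels-first (C, H, W) NumPy array and a square output size, could look like this:

import numpy as np


def center_crop_image(image, output_size):
    # Return the central output_size x output_size window of a (C, H, W) array.
    h, w = image.shape[1], image.shape[2]
    top = (h - output_size) // 2
    left = (w - output_size) // 2
    return image[:, top:top + output_size, left:left + output_size]


# Example: crop a stack of three 100x100 RGB frames down to 84x84.
obs = np.random.randint(0, 256, size=(9, 100, 100), dtype=np.uint8)
cropped = center_crop_image(obs, 84)
assert cropped.shape == (9, 84, 84)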