Code example #1
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        print(self.env.action_space.shape)
        cfg.agent.params.action_shape = (self.env.action_space.n,)
        cfg.agent.params.action_range = [
            0,
            12
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
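
These Workspace constructors are normally driven from a Hydra-decorated entry point in the same train.py. A minimal sketch of that pattern is shown below; the decorator arguments and config name are assumptions (they depend on the Hydra version and project), not taken from the snippet above.

import hydra


@hydra.main(config_name='config')
def main(cfg):
    # build the workspace from the composed Hydra config and start training
    workspace = Workspace(cfg)
    workspace.run()


if __name__ == '__main__':
    main()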
Code example #2
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #3
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.train_envs, self.test_envs = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.train_envs[0].observation_space.shape[0] + cfg.noise_dims
        cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
        if cfg.agent.name != 'sac':
            cfg.agent.params.num_envs = cfg.num_train_envs
        cfg.agent.params.action_range = [
            float(self.train_envs[0].action_space.low.min()),
            float(self.train_envs[0].action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        self.agent.seq_len = cfg.seq_len

        self.replay_buffer = MultiEnvReplayBuffer(
            (cfg.agent.params.obs_dim,),  # hard coded
            self.train_envs[0].action_space.shape,
            int(cfg.replay_buffer_capacity),
            self.device,
            num_envs=cfg.num_train_envs,
            seq_len=cfg.seq_len)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = [0] * cfg.num_train_envs
Code example #4
File: train.py Project: JadenTravnik/proto
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.model_dir = utils.make_dir(self.work_dir, 'model')
        self.buffer_dir = utils.make_dir(self.work_dir, 'buffer')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             action_repeat=cfg.action_repeat,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                            cfg.seed)
        self.eval_env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                                 cfg.seed + 1)

        obs_spec = self.env.observation_spec()['pixels']
        action_spec = self.env.action_spec()

        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        # exploration agent uses intrinsic reward
        self.expl_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=True)
        # task agent uses extrinsic reward
        self.task_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=False)
        self.task_agent.assign_modules_from(self.expl_agent)

        if cfg.load_pretrained:
            pretrained_path = utils.find_pretrained_agent(
                cfg.pretrained_dir, cfg.env, cfg.seed, cfg.pretrained_step)
            print(f'snapshot is taken from: {pretrained_path}')
            pretrained_agent = utils.load(pretrained_path)
            self.task_agent.assign_modules_from(pretrained_agent)

        # buffer for the task-agnostic phase
        self.expl_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)
        # buffer for task-specific phase
        self.task_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #5
    def __init__(self, cfg):

        self.work_dir = os.getcwd()
        """Hack to adjust action_repeat"""
        adjust_action_repeat_hack(cfg)

        print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")

        self.cfg = cfg
        experiment_name = f"{cfg.full_title}_{cfg.run_id}"

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             save_wb=cfg.log_save_wandb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             cfg=dict(flatten_cfg(cfg)),
                             plot_project="drqtest",
                             experiment=experiment_name)
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
        print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            use_aug=cfg.replay_buffer_augmentation)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #6
File: train.py Project: avandekleut/drq
    def __init__(
            self,
            log_save_tb=True,
            log_frequency_step=10000,
            agent_name='drq',
            # device='cuda',
            device='cpu',
            env='cartpole_swingup',
            seed=1,
            image_size=84,
            action_repeat=8,
            frame_stack=3,
            replay_buffer_capacity=100000,
            image_pad=4,
            save_video=True):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.logger = Logger(self.work_dir,
                             save_tb=log_save_tb,
                             log_frequency=log_frequency_step,
                             agent=agent_name,
                             action_repeat=action_repeat)

        utils.set_seed_everywhere(seed)
        self.device = torch.device(device)
        self.env = make_env(env, seed, image_size, action_repeat, frame_stack)

        self.agent = DRQAgent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            action_range=(float(self.env.action_space.low.min()),
                          float(self.env.action_space.high.max())),
            device=self.device)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          replay_buffer_capacity, image_pad,
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if save_video else None)
        self.step = 0
Code example #7
File: train.py Project: qingfengwuhen/RE3
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f"workspace: {self.work_dir}")

        self.cfg = cfg

        self.logger = Logger(
            self.work_dir,
            save_tb=cfg.log_save_tb,
            log_frequency=cfg.log_frequency_step,
            agent=cfg.agent.name,
            action_repeat=cfg.action_repeat,
        )

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg, eval=False)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max()),
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            self.env.state_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_size,
            self.agent.random_encoder,
            self.cfg.aug_type,
            self.cfg.use_drq,
            self.device,
        )

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #8
File: train.py Project: lukashermann/drq
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg, self.logger)
        self.eval_env = gym.make(cfg.env)
        if "img_only" not in cfg.env:
            self.eval_env = DictToBoxWrapper(DictTransposeImage(self.eval_env))
        else:
            self.eval_env = TransposeImage(self.eval_env)
        # env = utils.FrameStack(env, k=cfg.frame_stack)

        self.eval_env.seed(cfg.seed + 111)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #9
def main(args):
    # Initialize environment
    env = init_env(args)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448,
                          camera_id=args.video_camera_id)

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    device = torch.device(args.device)
    # cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       action_range=[
                           float(env.action_space.low.min()),
                           float(env.action_space.high.max())
                       ],
                       device=device,
                       args=args)
    agent.load(model_dir, args.load_checkpoint)

    # Evaluate agent without PAD
    print(
        f'Evaluating {args.work_dir} for {args.num_eval_episodes} episodes (mode: {args.mode})'
    )
    eval_reward, eval_invs_pred_var = evaluate(env, agent, args, video)
    print('eval reward:', int(eval_reward))
    print('eval inverse predictor variance: ', eval_invs_pred_var)

    # # Evaluate agent with PAD (if applicable)
    # pad_reward = None
    # if args.use_inv or args.use_curl or args.use_rot:
    # 	env = init_env(args)
    # 	print(f'Policy Adaptation during Deployment of {args.work_dir} for {args.pad_num_episodes} episodes '
    # 		  f'(mode: {args.mode})')
    # 	pad_reward = evaluate(env, agent, args, video, adapt=True)
    # 	print('pad reward:', int(pad_reward))

    # Save results
    if args.eval_results:
        results_fp = os.path.join(args.work_dir, '{}.pt'.format(args.mode))
        torch.save(
            {
                'args': args,
                'eval_reward': eval_reward,
                'eval_invs_pred_var': eval_invs_pred_var
            }, results_fp)
        print('Saved results to', results_fp)
Code example #10
File: train.py Project: jinyx728/iGibson
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        gibson_config_filename = os.path.join(
            os.path.dirname(gibson2.__file__),
            '../examples/configs/hand_drawer.yaml')
        self.env = HandDrawerEnv(config_file=gibson_config_filename,
                                 mode='headless')
        self.env = utils.FrameStack(self.env, k=cfg.frame_stack)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #11
File: train.py Project: boxiXia/pytorch_sac
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        setSeedEverywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        # self.env = utils.makeEnv(cfg)
        self.env = hydra.utils.call(cfg.env)

        cfg.agent.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.action_dim = self.env.action_space.shape[0]
        cfg.agent.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        cfg.agent.n_step = cfg.replay_buffer.n_step # n-step experience replay
        self.agent = hydra.utils.instantiate(cfg.agent,_recursive_=False)

        self.replay_buffer = ReplayBuffer(
            capacity=cfg.replay_buffer.capacity,
            obs_shape = self.env.observation_space.shape,
            action_shape = self.env.action_space.shape,
            obs_dtype = self.env.observation_space.dtype,
            action_dtype = self.env.action_space.dtype,
            n_step = cfg.replay_buffer.n_step, # n-step experience replay
            discount=cfg.agent.discount, # per step discount
            device = self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #12
File: train.py Project: HosseinSheikhi/drq
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir + "_" + self.cfg.env +
                             "_eval2k_effective_{}_seed_{}".format(
                                 self.cfg.effective_aug, self.cfg.seed),
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        self.effective_aug = self.cfg.effective_aug
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.effective_aug)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #13
File: train.py Project: denisyarats/metarl
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make_meta(cfg.env, cfg.episode_length, cfg.seed)
        self.eval_env = dmc.make_meta(cfg.env, cfg.episode_length,
                                      cfg.seed + 1)

        obs_spec = self.env.observation_spec()['features']
        action_spec = self.env.action_spec()

        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = MetaReplayBuffer(cfg.train_tasks, obs_spec.shape,
                                              action_spec.shape,
                                              cfg.replay_buffer_capacity,
                                              self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #14
File: train.py Project: trevormcinroe/thesis
    def __init__(self, cfg):
        self.work_dir = '/media/trevor/mariadb/thesis/'
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.cfg.env)

        # obs_shape = (3 * 3, 84, 84)
        # pre_aug_obs_shape = (3 * 3, 100, 100)
        #
        # self.replay_buffer = ReplayBuffer(
        #     obs_shape=pre_aug_obs_shape,
        #     action_shape=self.env.action_space.shape,
        #     capacity=cfg.replay_buffer_capacity,
        #     batch_size=cfg.batch_size,
        #     device=self.device,
        #     image_size=84,
        #     pre_image_size=100,
        # )

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0
Code example #15
    def eval(self):
        average_episode_reward = 0
        average_episode_success = 0

        video_recorder = VideoRecorder()
        video_recorder.init()

        for episode in range(self.num_eval_episodes):

            obs_dict = self.env.reset()
            obs = obs_dict[self.observation_key]
            obs_g = obs_dict[self.desired_goal_key]
            done = False
            episode_reward = 0
            episode_step = 0

            while not done:
                action = self.agent.act(obs, obs_g, sample=True)

                next_obs_dict, reward, done, info = self.env.step(action)

                done = float(done)
                episode_reward += reward

                achieved_goal = next_obs_dict[self.achieved_goal_key]

                obs = next_obs_dict[self.observation_key]
                obs_g = next_obs_dict[self.desired_goal_key]
                episode_step += 1

                video_recorder.record(next_obs_dict)

            average_episode_reward += episode_reward / self.num_eval_episodes
            average_episode_success += float(
                info['is_success']) / self.num_eval_episodes

        video_recorder.save(f'{self.step}.mp4')

        tune.report(
            eval_reward=average_episode_reward,
            eval_is_success=average_episode_success,
            timesteps_this_iter=0,
        )
Code example #16
def main(args):
    # Initialize environment
    env = init_env(args)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448)

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=cropped_obs_shape,
                       action_shape=env.action_space.shape,
                       args=args)
    agent.load(model_dir, args.load_checkpoint)

    # Evaluate agent without PAD
    print(
        f'Evaluating {args.work_dir} for {args.pad_num_episodes} episodes (mode: {args.mode})'
    )
    eval_reward = evaluate(env, agent, args, video)
    print('eval reward:', int(eval_reward))

    # Evaluate agent with PAD (if applicable)
    pad_reward = None
    if args.use_inv or args.use_curl or args.use_rot:
        env = init_env(args)
        print(
            f'Policy Adaptation during Deployment of {args.work_dir} for {args.pad_num_episodes} episodes '
            f'(mode: {args.mode})')
        pad_reward = evaluate(env, agent, args, video, adapt=True)
        print('pad reward:', int(pad_reward))

    # Save results
    results_fp = os.path.join(args.work_dir, f'{args.mode}_pad.pt')
    torch.save(
        {
            'args': args,
            'eval_reward': eval_reward,
            'pad_reward': pad_reward
        }, results_fp)
    print('Saved results to', results_fp)
Code example #17
def main(args):
	# Initialize environment
	utils.set_seed_everywhere(args.seed)
	env = make_pad_env(
		domain_name=args.domain_name,
		task_name=args.task_name,
		seed=args.seed,
		episode_length=args.episode_length,
		action_repeat=args.action_repeat,
		mode=args.mode
	)

	utils.make_dir(args.work_dir)
	model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
	video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
	video = VideoRecorder(video_dir if args.save_video else None)

	# Prepare agent
	assert torch.cuda.is_available(), 'must have cuda enabled'
	replay_buffer = utils.ReplayBuffer(
		obs_shape=env.observation_space.shape,
		action_shape=env.action_space.shape,
		capacity=args.train_steps,
		batch_size=args.batch_size
	)
	cropped_obs_shape = (3*args.frame_stack, 84, 84)
	agent = make_agent(
		obs_shape=cropped_obs_shape,
		action_shape=env.action_space.shape,
		args=args
	)

	L = Logger(args.work_dir, use_tb=False)
	episode, episode_reward, done = 0, 0, True
	start_time = time.time()
	for step in range(args.train_steps+1):
		if done:
			if step > 0:
				L.log('train/duration', time.time() - start_time, step)
				start_time = time.time()
				L.dump(step)

			# Evaluate agent periodically
			if step % args.eval_freq == 0:
				print('Evaluating:', args.work_dir)
				L.log('eval/episode', episode, step)
				evaluate(env, agent, video, args.eval_episodes, L, step)
			
			# Save agent periodically
			if step % args.save_freq == 0 and step > 0:
				if args.save_model:
					agent.save(model_dir, step)

			L.log('train/episode_reward', episode_reward, step)

			obs = env.reset()
			done = False
			episode_reward = 0
			episode_step = 0
			episode += 1

			L.log('train/episode', episode, step)

		# Sample action for data collection
		if step < args.init_steps:
			action = env.action_space.sample()
		else:
			with utils.eval_mode(agent):
				action = agent.sample_action(obs)

		# Run training update
		if step >= args.init_steps:
			num_updates = args.init_steps if step == args.init_steps else 1
			for _ in range(num_updates):
				agent.update(replay_buffer, L, step)

		# Take step
		next_obs, reward, done, _ = env.step(action)
		done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
		replay_buffer.add(obs, action, reward, next_obs, done_bool)
		episode_reward += reward
		obs = next_obs

		episode_step += 1
Code example #18
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.21/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/seed=0/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg
        self.log_eval_dir = self.work_dir + '/eval_standalone'
        # Use a separate eval dir to avoid overwriting training files
        if not os.path.exists(self.log_eval_dir):
            os.makedirs(self.log_eval_dir)
        self.logger = Logger(self.log_eval_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg,
                                            False,
                                            False,
                                            work_dir=self.work_dir)
        experiment_identifier = self.work_dir.split('runs')[1]
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.eval_envs.items():
                name = 'StandAloneEval_Unseen_Environment_' + str(
                    env_idx) + '.png'
                img_path = self.work_dir + name
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(img_path)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None, phase='eval_standalone')

        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval

        self.eval_trials = 100
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.eval_trials):
            print('Episode Trial ', episode)
            self.video_recorder.init(enabled=True)
            eval_env = self.eval_envs[random.sample(list(self.eval_envs),
                                                    1)[0]]
            obs = eval_env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= eval_env._max_episode_steps - 1):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, _, _ = eval_env.step(action)
                self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                self.step += 1
                if done: break
            average_episode_reward += episode_reward
            print('Episode Reward ', episode_reward)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.eval_trials
        self.logger.log('eval_standalone/episode_reward',
                        average_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval_standalone')

    def run(self):
        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)
        else:
            raise ValueError('Could not reload weights!')

        self.evaluate()
Code example #19
File: train.py Project: HosseinSheikhi/drq
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir + "_" + self.cfg.env +
                             "_eval2k_effective_{}_seed_{}".format(
                                 self.cfg.effective_aug, self.cfg.seed),
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        self.effective_aug = self.cfg.effective_aug
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.effective_aug)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap: treat time-limit terminations as non-terminal
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
Code example #20
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.21/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/seed=0/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg
        self.log_eval_dir = self.work_dir + '/eval_standalone'
        # Use a separate eval dir to avoid overwriting training files
        if not os.path.exists(self.log_eval_dir):
            os.makedirs(self.log_eval_dir)
        self.logger = Logger(self.log_eval_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg,
                                            False,
                                            False,
                                            work_dir=self.work_dir)
        experiment_identifier = self.work_dir.split('runs')[1]
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.eval_envs.items():
                name = 'StandAloneEval_Unseen_Environment_' + str(
                    env_idx) + '.png'
                img_path = self.work_dir + name
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(img_path)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None, phase='eval_standalone')

        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval

        self.eval_trials = 100
        self.step = 0
Code example #21
File: train.py Project: uneebjaved/rad
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
Code example #22
def main(args):
    # Set seed
    utils.set_seed_everywhere(args.seed)

    # Initialize environments
    gym.logger.set_level(40)
    env = make_env(domain_name=args.domain_name,
                   task_name=args.task_name,
                   seed=args.seed + 42,
                   episode_length=args.episode_length,
                   action_repeat=args.action_repeat,
                   mode=args.eval_mode)

    # Set working directory
    work_dir = os.path.join(args.log_dir,
                            args.domain_name + '_' + args.task_name,
                            args.algorithm, str(args.seed))
    print('Working directory:', work_dir)
    assert os.path.exists(
        work_dir), 'specified working directory does not exist'
    model_dir = utils.make_dir(os.path.join(work_dir, 'model'))
    video_dir = utils.make_dir(os.path.join(work_dir, 'video'))
    video = VideoRecorder(video_dir if args.save_video else None,
                          height=448,
                          width=448)

    # Check if evaluation has already been run
    results_fp = os.path.join(work_dir, args.eval_mode + '.pt')
    assert not os.path.exists(
        results_fp), f'{args.eval_mode} results already exist for {work_dir}'

    # Prepare agent
    assert torch.cuda.is_available(), 'must have cuda enabled'
    cropped_obs_shape = (3 * args.frame_stack, 84, 84)
    agent = make_agent(obs_shape=cropped_obs_shape,
                       action_shape=env.action_space.shape,
                       args=args)
    agent = torch.load(os.path.join(model_dir, str(args.train_steps) + '.pt'))
    agent.train(False)

    print(
        f'\nEvaluating {work_dir} for {args.eval_episodes} episodes (mode: {args.eval_mode})'
    )
    reward = evaluate(env, agent, video, args.eval_episodes, args.eval_mode)
    print('Reward:', int(reward))

    adapt_reward = None
    if args.algorithm == 'pad':
        env = make_env(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed + 42,
                       episode_length=args.episode_length,
                       action_repeat=args.action_repeat,
                       mode=args.eval_mode)
        adapt_reward = evaluate(env,
                                agent,
                                video,
                                args.eval_episodes,
                                args.eval_mode,
                                adapt=True)
        print('Adapt reward:', int(adapt_reward))

    # Save results
    torch.save({
        'args': args,
        'reward': reward,
        'adapt_reward': adapt_reward
    }, results_fp)
    print('Saved results to', results_fp)
Code example #23
File: train.py Project: avandekleut/drq
class Workspace(object):
    def __init__(
            self,
            log_save_tb=True,
            log_frequency_step=10000,
            agent_name='drq',
            # device='cuda',
            device='cpu',
            env='cartpole_swingup',
            seed=1,
            image_size=84,
            action_repeat=8,
            frame_stack=3,
            replay_buffer_capacity=100000,
            image_pad=4,
            save_video=True):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.logger = Logger(self.work_dir,
                             save_tb=log_save_tb,
                             log_frequency=log_frequency_step,
                             agent=agent_name,
                             action_repeat=action_repeat)

        utils.set_seed_everywhere(seed)
        self.device = torch.device(device)
        self.env = make_env(env, seed, image_size, action_repeat, frame_stack)

        self.agent = DRQAgent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            action_range=(float(self.env.action_space.low.min()),
                          float(self.env.action_space.high.max())),
            device=self.device)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          replay_buffer_capacity, image_pad,
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if save_video else None)
        self.step = 0

    def evaluate(
        self,
        num_eval_episodes=10,
    ):
        average_episode_reward = 0
        for episode in range(num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self,
            num_train_steps=1000000,
            num_train_iters=1,
            num_seed_steps=1000,
            eval_frequency=5000):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step % eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                for _ in range(num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
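
Since this Workspace variant takes plain keyword arguments instead of a Hydra config, it can be constructed and run directly. A minimal usage sketch follows; the specific argument values are illustrative assumptions, not taken from the original project.

if __name__ == '__main__':
    # short smoke run on CPU; values chosen for illustration only
    workspace = Workspace(device='cpu', env='cartpole_swingup', seed=1, save_video=False)
    workspace.run(num_train_steps=10000, num_seed_steps=1000, eval_frequency=5000)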
Code example #24
File: train.py Project: vincentmaye/pytorch_sac_ae
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    # Robot stuff
    action_space = ActionSpace.DELTA_EE_POSE_IMPEDANCE
    blocking_action = True
    env = RobotEnv(name='peg_in_hole',
                   simulation=True,
                   action_space=action_space,
                   isotropic_gains=True,
                   render=False,
                   blocking_action=blocking_action,
                   rotation_axis=(0, 0, 1),
                   observation_type=dict(camera=1, q=0, dq=0, tau=0, x=0,
                                         dx=0))

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    #assert env.action_space.low.min()   >= -1
    #assert env.action_space.high.max()  <=  1

    replay_buffer = utils.ReplayBuffer(
        obs_shape=env.observation_space['camera'],
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device)

    agent = make_agent(obs_shape=env.observation_space['camera'],
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, prev_episode_reward, done = 0, 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0 and step > 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            env.step(np.array([0, 0, 0.1, 0, 0, 0]))  # Prevent getting stuck
            obs = env.reset()
            done, episode_reward, episode_step = False, 0, 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
                print("Raw policy action: {}".format(action))
                # scale the normalized [-1, 1] action to the robot's action bounds
                action = np.multiply(action, env.action_space.high)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)
        print("E: {}   | S: {}   | R: {:.4f} | ER: {:.4f} | A: {}".format(
            episode, step, round(reward, 4), round(episode_reward, 4), action))

        # Reset the environment if the agent gets stuck
        # (no measurable change in episode reward over the last 100 steps)
        if step % 100 == 0 and step > 0:
            if np.abs(prev_episode_reward - episode_reward) < 1e-5:
                env.step(np.array([0, 0, 0.1, 0, 0, 0]))
                obs = env.reset()
            prev_episode_reward = episode_reward

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
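
In this example the policy's action is rescaled with np.multiply(action, env.action_space.high) before it is sent to the robot, which assumes symmetric bounds. A hedged sketch of the more general rescaling from a normalized [-1, 1] action to an arbitrary box (the helper name scale_action is illustrative, not part of this project):

import numpy as np

def scale_action(action, low, high):
    # Map a policy output in [-1, 1] onto the environment's [low, high] box.
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)

# e.g. env_action = scale_action(action, env.action_space.low, env.action_space.high)
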
Code example #25
File: test.py Project: jinyx728/iGibson
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    env = HandDrawerEnv(config_file=gibson_config_filename, mode='headless')

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = (env_name + '-' + ts + '-im' + str(args.image_size) + '-b' +
                str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type)
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (2 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (2 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    step = 0  # this script only evaluates, so log all metrics at step 0
    evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
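
Several of these scripts wrap the environment in utils.FrameStack(env, k=args.frame_stack) so the agent observes the last k frames stacked along the channel axis (hence obs_shape = (2 * args.frame_stack, ...) above). A minimal sketch of such a wrapper, following the common gym pattern; each project's utils.FrameStack may differ in details such as dtype and value range:

from collections import deque

import gym
import numpy as np

class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        super().__init__(env)
        self._k = k
        self._frames = deque([], maxlen=k)
        shp = env.observation_space.shape  # channel-first image, e.g. (C, H, W)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(shp[0] * k,) + shp[1:], dtype=np.uint8)

    def reset(self):
        obs = self.env.reset()
        for _ in range(self._k):
            self._frames.append(obs)
        return np.concatenate(list(self._frames), axis=0)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._frames.append(obs)
        return np.concatenate(list(self._frames), axis=0), reward, done, info
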
Code example #26
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.image_size,
                       width=args.image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    args.work_dir = os.path.join(
        args.work_dir,
        f'{args.domain_name}-{args.task_name}-seed{args.seed}-{datetime.now().strftime("%Y%m%d-%H%M")}'
    )

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device: ", device)

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
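
The training loop above follows a common warm-up recipe: act uniformly at random for the first init_steps environment steps to pre-fill the replay buffer, then sample actions from the policy, and on the first trainable step run init_steps gradient updates in one burst before settling into one update per step. A compact sketch of that schedule (the function names are placeholders, not this project's API):

def choose_action(step, init_steps, env, agent, obs):
    # Uniform random exploration until the buffer holds init_steps transitions.
    if step < init_steps:
        return env.action_space.sample()
    return agent.sample_action(obs)

def num_updates_for(step, init_steps):
    # Catch-up burst when training starts, then one update per environment step.
    if step < init_steps:
        return 0
    return init_steps if step == init_steps else 1
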
Code example #27
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)
    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed)
    env.seed(args.seed)
    
    method = args.agent + " (H="+ str(args.k_step) +")"

    model_kind = "dynode_model" if args.agent == "DyNODE-SAC" else "nn_model"

    # make directory
    env_name = args.domain_name + '-' + args.task_name
    args.work_dir = args.work_dir + '/' + env_name + '/' + method + '/' + str(args.seed)
    
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w+') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape
    obs_shape = env.observation_space.shape

    replay_buffer = utils.ReplayBuffer(obs_shape=obs_shape, action_shape=action_shape, 
                    capacity=args.replay_buffer_capacity, batch_size=args.batch_size, device=device)

    agent = DyNODESacAgent(obs_shape=obs_shape, action_shape=action_shape, device=device, model_kind = model_kind,
            kind=args.kind, step_MVE = args.k_step, hidden_dim=args.hidden_dim, discount=args.discount,
            init_temperature=args.init_temperature, alpha_lr=args.alpha_lr,
            alpha_beta=args.alpha_beta, actor_lr=args.actor_lr, actor_beta=args.actor_beta,
            actor_log_std_min=args.actor_log_std_min, actor_log_std_max=args.actor_log_std_max,
            critic_lr=args.critic_lr, critic_beta=args.critic_beta,
            critic_tau=args.critic_tau, critic_target_update_freq=args.critic_target_update_freq,
            model_lr=args.model_lr, log_interval=args.log_interval)
    
    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_model(model_dir, step)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        if step >= args.model_warm_up:
            for _ in range(args.model_num_updates):
                agent.update_model(replay_buffer, L, step)

        # run training update
        if step >= args.init_steps:
            for _ in range(2):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward        
        replay_buffer.add(obs, action, reward, next_obs, done_bool, done)

        obs = next_obs
        episode_step += 1
Code example #28
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)
        self.obs_shape = self.env.observation_space['observation'].shape
        self.goal_shape = self.env.observation_space['desired_goal'].shape

        cfg.agent.params.obs_dim = self.obs_shape[0]
        cfg.agent.params.goal_dim = self.goal_shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.obs_shape, self.goal_shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            self.video_recorder.save(f'{self.step}.mp4')
            self.logger.log('eval/episode_reward', episode_reward, self.step)
        self.logger.dump(self.step)

    def run_her(self, path_buffer):

        #first_obs = path_buffer[0][0]
        #last_obs = path_buffer[-1][0]
        #first_goal = first_obs['achieved_goal']
        #last_goal = last_obs['achieved_goal']
        #goal_changed = np.mean(last_goal - first_goal)**2 > 1e-6

        #if goal_changed:
        for n, ts in enumerate(path_buffer):
            # select goal id
            if self.cfg.her_strat == 'future':
                i = np.random.randint(n, len(path_buffer))
            elif self.cfg.her_strat == 'last':
                i = -1
            new_goal_obs = path_buffer[i][3]
            new_goal = new_goal_obs['achieved_goal']
            # relabel
            obs, action, reward, next_obs, done, done_no_max = ts
            obs['desired_goal'] = new_goal
            next_obs['desired_goal'] = new_goal
            reward = self.env.compute_reward(next_obs['achieved_goal'],
                                             new_goal, None)
            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        path_buffer = []
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    if self.cfg.save_model:
                        self.agent.save()
                        self.agent.load()
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

                # her
                if self.cfg.her_iters > 0 and len(path_buffer):
                    for k in range(self.cfg.her_iters):
                        self.run_her(path_buffer)
                path_buffer = []

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)
            path_buffer.append(
                [obs, action, reward, next_obs, done, done_no_max])

            obs = next_obs
            episode_step += 1
            self.step += 1
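
run_her above recomputes rewards for relabeled goals via env.compute_reward(achieved_goal, new_goal, None). For the common sparse goal-reaching case that function behaves roughly like the sketch below; the distance threshold and the 0/-1 convention are assumptions about a typical gym robotics environment, not code from this example:

import numpy as np

def compute_sparse_goal_reward(achieved_goal, desired_goal, threshold=0.05):
    # 0 when the achieved goal is within `threshold` of the desired goal, else -1.
    distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
    return -(distance > threshold).astype(np.float32)
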
Code example #29
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    exp_id = str(int(np.random.random() * 100000))
    utils.set_seed_everywhere(args.seed)

    env = env_wrapper.make(domain_name=args.domain_name,
                           task_name=args.task_name,
                           seed=args.seed,
                           visualize_reward=False,
                           from_pixels=(args.observation_type == 'pixel'
                                        or args.observation_type == 'hybrid'),
                           cameras=args.cameras,
                           height=args.pre_transform_image_size,
                           width=args.pre_transform_image_size,
                           frame_skip=args.action_repeat,
                           reward_type=args.reward_type,
                           change_model=args.change_model)

    env.seed(args.seed)
    if args.special_reset is not None:
        env.set_special_reset(args.special_reset)
    if args.demo_special_reset is not None:
        env.set_special_reset(args.demo_special_reset)

    if args.observation_type == 'hybrid':
        env.set_hybrid_obs()

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    if args.task_name is None:
        env_name = args.domain_name
    else:
        env_name = args.domain_name + '-' + args.task_name
    exp_name = args.reward_type + '-' + args.agent + '-' + args.encoder_type + '-' + args.data_augs
    exp_name += '-' + ts + '-' + env_name + '-im' + str(
        args.image_size) + '-b' + str(args.batch_size) + '-nu' + str(
            args.num_updates)
    if args.observation_type == 'hybrid':
        exp_name += '-hybrid'
    if args.change_model:
        exp_name += '-change_model'
    if args.bc_only:
        exp_name += '-bc_only'

    exp_name += '-s' + str(args.seed)

    exp_name += '-id' + exp_id
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    print("Working in directory:", args.work_dir)

    video = VideoRecorder(video_dir if args.save_video else None,
                          camera_id=args.cameras[0])

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        cpf = 3 * len(args.cameras)
        obs_shape = (cpf * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (cpf * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        hybrid_state_shape=env.hybrid_state_shape,
        load_dir=args.replay_buffer_load_dir)

    if args.demo_model_dir is not None:  # collect demonstrations using a state-trained expert
        episode_step, done = 0, True
        state_obs, obs = None, None
        episode_success = False
        original_encoder_type = args.encoder_type
        args.encoder_type = 'identity'

        if isinstance(env, utils.FrameStack):
            original_env = env.env
        else:
            original_env = env

        expert_agent = make_agent(
            obs_shape=original_env.observation_space.shape,
            action_shape=action_shape,
            args=args,
            device=device,
            hybrid_state_shape=env.hybrid_state_shape)
        args.encoder_type = original_encoder_type
        expert_agent.load(args.demo_model_dir, args.demo_model_step)
        print('Collecting expert trajectories...')
        t = 0
        while t < args.demo_samples:
            if done:
                episode_step = 0
                episode_success = False
                if args.demo_special_reset is not None:
                    env.reset(save_special_steps=True)
                    special_steps_dict = env.special_reset_save
                    obs_list = special_steps_dict['obs']
                    act_list = special_steps_dict['act']
                    reward_list = special_steps_dict['reward']
                    for i in range(len(act_list)):
                        replay_buffer.add(obs_list[i], act_list[i],
                                          reward_list[i], obs_list[i + 1],
                                          False)
                    episode_step += len(act_list)
                    t += len(act_list)
                    obs = obs_list[-1]
                    state_obs = original_env._get_state_obs()
                else:
                    obs = env.reset()
                    state_obs = original_env._get_state_obs()

            action = expert_agent.sample_action(state_obs)
            next_obs, reward, done, info = env.step(action)
            if info.get('is_success'):
                episode_success = True
            state_obs = original_env._get_state_obs()

            # allow infinite bootstrap
            done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
                done)

            replay_buffer.add(obs, action, reward, next_obs, done_bool)

            obs = next_obs
            episode_step += 1
            t += 1

            if args.success_demo_only and done and not episode_success:
                t -= episode_step
                replay_buffer.idx -= episode_step

        env.set_special_reset(args.special_reset)

    print('Starting with replay buffer filled to {}.'.format(
        replay_buffer.idx))

    # args.init_steps = max(0, args.init_steps - args.replay_buffer_load_pi_t)  # maybe tune this

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device,
                       hybrid_state_shape=env.hybrid_state_shape)
    if args.model_dir is not None:
        agent.load(args.model_dir, args.model_step)
    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def eval_and_save():
        if args.save_model:
            agent.save_curl(model_dir, step)
        if args.save_buffer:
            replay_buffer.save(buffer_dir)
        if args.save_sac:
            agent.save(model_dir, step)
        L.log('eval/episode', episode, step)
        print('evaluating')
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)

    if args.warmup_cpc:
        print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
        for i in range(args.warmup_cpc):
            agent.update_cpc_only(replay_buffer,
                                  L,
                                  step=0,
                                  ema=args.warmup_cpc_ema)
        print('Warmed up cpc.')

    if args.warmup_offline_sac:
        for i in range(args.warmup_offline_sac):
            agent.update_sac_only(replay_buffer, L, step=0)

    if args.bc_only:
        step = 0
        for i in range(100):
            agent.train_bc(replay_buffer)
            step += 1
        eval_and_save()
        return

    time_computing = 0
    time_acting = 0
    callback_fn = None
    step = 0

    if args.synch_update:
        callback_fn = lambda: lambda: [
            agent.update(replay_buffer,
                         L,
                         step,
                         log_networks=nu == 0 and step % args.log_networks_freq
                         == 0) for nu in range(args.num_updates)
        ] if step >= args.init_steps and not is_eval else 0  # pointers should all work properly, and execute in the proper frame

    if callback_fn is not None:
        env.env._env.env.set_callback(
            callback_fn)  # envwrapper (camera), framestack, timelimit

    # for step in range(args.num_train_steps):
    while step < args.num_train_steps:

        # evaluate agent periodically
        if step % args.eval_freq == 0:
            is_eval = True
            eval_and_save()
            is_eval = False

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            time_tmp = time.time()
            obs = env.reset()
            time_acting += time.time() - time_tmp
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
        if step == args.init_steps and args.demo_samples == 0:
            if args.warmup_cpc:
                print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
                for i in range(args.warmup_cpc):
                    agent.update_cpc_only(replay_buffer, L, step=0)
                print('Warmed up cpc.')

        # run training update
        time_tmp = time.time()

        if step >= args.init_steps and not args.synch_update:
            for nu in range(args.num_updates):
                agent.update(replay_buffer,
                             L,
                             step,
                             log_networks=nu == 0
                             and step % args.log_networks_freq == 0)

        time_computing += time.time() - time_tmp

        time_tmp = time.time()

        next_obs, reward, done, _ = env.step(action)
        time_acting += time.time() - time_tmp

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
        step += 1

    step = args.num_train_steps
    print("time spent computing:", time_computing)
    print("time spent acting:", time_acting)
    eval_and_save()
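
One detail worth flagging in the demonstration-collection phase above: when success_demo_only is set and an expert episode ends without info['is_success'], the script rewinds both the step counter and the replay buffer's write pointer (replay_buffer.idx -= episode_step) so the failed episode gets overwritten. A hedged sketch of that rollback for a ring-buffer replay buffer; the idx and full attributes mirror the common SAC-AE/CURL buffer layout and are assumptions here:

def discard_last_episode(replay_buffer, episode_len):
    # Rewind the write pointer so the failed episode's transitions are
    # overwritten by the next ones. Only safe while the buffer has not
    # wrapped around during this episode.
    if replay_buffer.full or replay_buffer.idx < episode_len:
        raise ValueError("cannot safely rewind past a wrapped buffer")
    replay_buffer.idx -= episode_len
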
Code example #30
class Workspace(object):
    def __init__(self, cfg):

        self.work_dir = os.getcwd()
        """Hack to adjust action_repeat"""
        adjust_action_repeat_hack(cfg)

        print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")

        self.cfg = cfg
        experiment_name = f"{cfg.full_title}_{cfg.run_id}"

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             save_wb=cfg.log_save_wandb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             cfg=dict(flatten_cfg(cfg)),
                             plot_project="drqtest",
                             experiment=experiment_name)
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
        print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            use_aug=cfg.replay_buffer_augmentation)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()

        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1