Example #1
    def __init__(self, config: Config):
        # raise ValueError("class under rewriting")
        self.name = config.name
        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval,
        )
        self.config = config
        self.hyperparameters = config.hyperparameters
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.action_size = self.test_env.action_space.shape[0]
        self.env = None
        self.create_env()

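        # On-policy rollout storage: batch_size equals buffer_size, so each
        # PPO update consumes the entire buffer at once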
        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=config.phi,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        action_size = self.test_env.action_space.shape[0]

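        # Optional Intrinsic Curiosity Module (ICM): learns a forward/inverse
        # model and adds an intrinsic exploration bonus to the reward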
        if self.hyperparameters['use_icm']:
            self._icm: ICM = ICM(state_description=state_shape,
                                 action_size=action_size,
                                 encoded_state_size=6,
                                 device=self.config.device,
                                 batch_size=256,
                                 buffer_size=10**5,
                                 update_per_step=1,
                                 config=config.hyperparameters['icm_config'])
        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            # chain(
            #     self.ac.parameters(),
            #     self._icm.parameters()
            # ) if self.hyperparameters['use_icm'] else self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

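        # ac_old holds the frozen "old" policy used for PPO's probability
        # ratio; it is synced from self.ac via update_old_policy()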
        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
Example #2
    def __init__(self, config: Config):
        self.name = config.name
        self.stat_logger: Logger = Logger(config,
                                          log_interval=config.log_interval)
        self.config: Config = config
        self.hyperparameters = config.hyperparameters
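        # eps_clip is the clipping range for PPO's clipped surrogate objective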
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.action_size = self.test_env.action_space.shape[0]
        self.env = None
        self.create_env()

        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=config.phi,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        action_size = self.test_env.action_space.shape[0]

        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self._wandb_anim_save = 0

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
Example #3
    def __init__(self, config: Config):
        self.name = config.name
        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval,
        )
        self.config = config
        self.hyperparameters = config.hyperparameters
        self.eps_clip = config.hyperparameters['eps_clip']

        self.test_env = config.test_environment_make_function()
        self.env = None
        self.create_env()

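        # Transitions are stored raw here (phi=None, convert_to_torch=False),
        # presumably so image augmentation can be applied before conversion
        # at update time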
        self.memory = Torch_Arbitrary_Replay_Buffer(
            buffer_size=10**4,
            batch_size=10**4,
            phi=None,
            seed=0,
            device=self.config.device,
            sample_order=[
                'state', 'action', 'reward', 'log_prob', 'done', 'next_state'
            ],
            do_it_auto=False,
            convert_to_torch=False,
        )

        state_shape = config.phi(self.test_env.reset()).shape
        print(f'state shape : {state_shape}')
        action_size = self.test_env.action_space.shape[0]

        self.ac: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.optimizer = torch.optim.Adam(
            self.ac.parameters(),
            lr=config.hyperparameters['lr'],
            betas=config.hyperparameters['betas'],
        )

        self.ac_old: ActorCritic = ActorCritic(
            state_shape=state_shape,
            action_size=action_size,
            hidden_size=128,
            device=self.config.device,
            action_std=0.5,
            double_action_size_on_output=False,
        )
        self.update_old_policy()
        self.mse = nn.MSELoss()

        # self.image_transform = transforms.Compose([
        #     transforms.ToPILImage(),
        #     transforms.RandomCrop(
        #         (84, 84),
        #         padding=self.hyperparameters['drq_padding'],
        #         pad_if_needed=True,
        #         padding_mode='edge',
        #     ),
        #     transforms.ToTensor(),
        # ])
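        # DrQ-style augmentation: replication-pad the image, then take a
        # random 84x84 crop (a kornia equivalent of the torchvision pipeline
        # commented out above)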
        self.image_transform = nn.Sequential(
            nn.ReplicationPad2d(self.hyperparameters['drq_padding']),
            kornia.augmentation.RandomCrop((84, 84)),
        )

        self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self._wandb_anim_save = 0
        self.flush_stats()

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0
Example #4
    def __init__(self, config: Config):
        print('start to init rainbow')
        self.config = config
        self.name = config.name
        self.hyperparameters = config.hyperparameters

        self.stat_logger: Logger = Logger(
            config,
            log_interval=config.log_interval * (
                1 + self.hyperparameters['parallel_env_num']
                * int(self.hyperparameters['use_parallel_envs'])
            ),
        )
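        # Optionally run several environment copies in subprocesses
        # (SubprocVecEnv_tf2); otherwise a single environment is used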
        if self.hyperparameters['use_parallel_envs']:
            self.env = SubprocVecEnv_tf2(
                [
                    config.environment_make_function
                    for _ in range(self.hyperparameters['parallel_env_num'])
                ],
                state_flatter=None,
            )
        else:
            self.env = config.environment_make_function()

        self.test_env = config.test_environment_make_function()

        # function to prepare the raw observation for chainer format
        print(f"rainbow mode : {self.config.mode}")

        n_actions = self.test_env.action_space.n

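        # C51-style categorical value distribution: 51 atoms on [-10, 10]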
        n_atoms = 51
        v_max = 10
        v_min = -10
        q_func = DistributionalDuelingDQN_VectorPicture(
            config.phi(self.test_env.reset()).shape,
            n_actions,
            n_atoms,
            v_min,
            v_max,
        )

        # Noisy nets
        links.to_factorized_noisy(
            q_func, sigma_scale=self.hyperparameters['noisy_net_sigma'])
        # Turn off explorer
        explorer = explorers.Greedy()

        # Draw the computational graph and save it in the output directory.
        # chainerrl.misc.draw_computational_graph(
        #     [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        #     os.path.join(args.outdir, 'model'))

        # Use the same hyper parameters as https://arxiv.org/abs/1707.06887
        opt = chainer.optimizers.Adam(self.hyperparameters['lr'],
                                      eps=1.5 * 10**-4)
        opt.setup(q_func)

        # Prioritized Replay
        # Anneal beta from beta0 to 1 throughout training
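        # (beta grows linearly by (1 - beta0) / betasteps per update)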
        update_interval = 4
        betasteps = self.config.env_steps_to_run / update_interval
        rbuf = replay_buffer.PrioritizedReplayBuffer(
            10**6,
            alpha=0.5,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=3,
            normalize_by_max='memory',
        )

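        # Rainbow combination: distributional (C51) double DQN with a dueling
        # head, noisy nets, prioritized replay, and 3-step returns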
        self.agent = agents.CategoricalDoubleDQN(
            q_func,
            opt,
            rbuf,
            gpu=self.config.rainbow_gpu,
            gamma=0.99,
            explorer=explorer,
            minibatch_size=32,
            replay_start_size=self.hyperparameters['replay_start_size'],
            target_update_interval=16000,
            update_interval=update_interval,
            batch_accumulator='mean',
            phi=config.phi,
        )

        # self.folder_save_path = os.path.join('model_saves', 'Rainbow', self.name)
        self.episode_number = 0
        self.global_step_number = 0
        self.batch_step_number = 0
        self._total_grad_steps = 0
        self.current_game_stats = None
        self.flush_stats()
        # self.tf_writer = config.tf_writer

        self.accumulated_reward_mean = None
        self.accumulated_reward_std = None

        self._exp_moving_track_progress = 0.0