def __init__(self, config: Config):
    self.config = config
    self.hyperparameters = config.hyperparameters
    self.env = config.environment_make_function()
    self.memory = TorchReplayBuffer(
        size=10 ** 6,
        phi=config.phi,
        device=self.config.device,
    )
    state_shape = config.phi(self.env.reset()).shape
    self.action_size = self.env.action_space.shape[0]

    self.actor = StateAdaptiveActor(
        state_shape, self.action_size, self.config.device,
    ).to(self.config.device)
    self.actor_target: nn.Module = copy.deepcopy(self.actor)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.hyperparameters['lr'])

    self.critic = DoubleStateAdaptiveCritic(
        state_shape, self.action_size, self.config.device,
    ).to(self.config.device)
    self.critic_target: nn.Module = copy.deepcopy(self.critic)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.hyperparameters['lr'])

    self.total_it = 0
    self.stat_logger: Logger = Logger(config, log_interval=self.config.log_interval)

    self.episode_number = 0
    self.global_step_number = 0
    self._total_grad_steps = 0
    self.current_game_stats = None
    self.flush_stats()

    self.accumulated_reward_mean = None
    self.accumulated_reward_std = None
    self._exp_moving_track_progress = 0.0
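
# A minimal sketch (not from the original source) of the Polyak soft update that the
# deepcopied actor_target / critic_target above are usually refreshed with after each
# gradient step; the function name and the default tau value are assumptions, not the repo's API.
import torch
import torch.nn as nn

def soft_update_sketch(local_model: nn.Module, target_model: nn.Module, tau: float = 0.005) -> None:
    # target <- (1 - tau) * target + tau * local, parameter by parameter
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.mul_(1.0 - tau).add_(local_param.data, alpha=tau)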
def make_single_config(agent_config_dict) -> Config:
    config = Config()
    for name, value in agent_config_dict.items():
        # known Config attributes are set directly; everything else goes into hyperparameters
        if hasattr(config, name):
            setattr(config, name, value)
            continue
        config.hyperparameters.update({name: value})
    return config
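
# Usage sketch for make_single_config (illustrative; the keys below are examples, not
# required fields). A key matching an existing Config attribute is set directly on the
# config object; every other key is routed into config.hyperparameters.
def _example_make_single_config() -> Config:
    return make_single_config({
        'name': 'td3_run_01',  # assumed Config attribute -> set via setattr
        'lr': 3e-4,            # unknown to Config -> stored in hyperparameters['lr']
    })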
def __init__(self, config: Config):
    # NOTE: this class is intentionally disabled while it is being rewritten;
    # everything below the raise is currently unreachable.
    raise ValueError("class under rewriting")
    self.name = config.name
    self.stat_logger: Logger = Logger(
        config,
        log_interval=config.log_interval,
    )
    self.config = config
    self.hyperparameters = config.hyperparameters
    self.eps_clip = config.hyperparameters['eps_clip']
    self.test_env = config.test_environment_make_function()
    self.action_size = self.test_env.action_space.shape[0]
    self.env = None
    self.create_env()
    self.memory = Torch_Arbitrary_Replay_Buffer(
        buffer_size=10 ** 4,
        batch_size=10 ** 4,
        phi=config.phi,
        seed=0,
        device=self.config.device,
        sample_order=[
            'state', 'action', 'reward', 'log_prob', 'done', 'next_state',
        ],
        do_it_auto=False,
    )
    state_shape = config.phi(self.test_env.reset()).shape
    action_size = self.test_env.action_space.shape[0]

    if self.hyperparameters['use_icm']:
        # FIXME: 'state_description' is not defined in this scope and 'self.device' is never
        # assigned in this __init__; self.test_env.observation_space and self.config.device
        # look like the intended values, but this branch must be revisited before the raise
        # above is removed.
        self._icm: ICM = ICM(
            state_description=state_description,
            action_size=action_size,
            encoded_state_size=6,
            device=self.device,
            batch_size=256,
            buffer_size=10 ** 5,
            update_per_step=1,
            config=config.hyperparameters['icm_config'],
        )

    self.ac: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.optimizer = torch.optim.Adam(
        self.ac.parameters(),
        # chain(
        #     self.ac.parameters(),
        #     self._icm.parameters()
        # ) if self.hyperparameters['use_icm'] else self.ac.parameters(),
        lr=config.hyperparameters['lr'],
        betas=config.hyperparameters['betas'],
    )
    self.ac_old: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.update_old_policy()
    self.mse = nn.MSELoss()

    self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
    self.episode_number = 0
    self.global_step_number = 0
    self._total_grad_steps = 0
    self.current_game_stats = None
    self.flush_stats()

    self.accumulated_reward_mean = None
    self.accumulated_reward_std = None
    self._exp_moving_track_progress = 0.0
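
# A minimal sketch (an assumption, not the repo's implementation) of what update_old_policy()
# called above typically does in PPO: copy the current ActorCritic weights into ac_old so the
# next rollouts are evaluated against a frozen "old" policy.
import torch.nn as nn

def update_old_policy_sketch(ac: nn.Module, ac_old: nn.Module) -> None:
    # load_state_dict copies parameters and buffers exactly
    ac_old.load_state_dict(ac.state_dict())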
def __init__(self, config: Config):
    self.name = config.name
    self.stat_logger: Logger = Logger(
        config,
        log_interval=config.log_interval,
    )
    self.config = config
    self.hyperparameters = config.hyperparameters
    self.eps_clip = config.hyperparameters['eps_clip']
    self.test_env = config.test_environment_make_function()
    self.env = None
    self.create_env()
    self.memory = Torch_Arbitrary_Replay_Buffer(
        buffer_size=10 ** 4,
        batch_size=10 ** 4,
        phi=None,
        seed=0,
        device=self.config.device,
        sample_order=[
            'state', 'action', 'reward', 'log_prob', 'done', 'next_state',
        ],
        do_it_auto=False,
        convert_to_torch=False,
    )
    state_shape = config.phi(self.test_env.reset()).shape
    print(f'state shape : {state_shape}')
    action_size = self.test_env.action_space.shape[0]

    self.ac: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.optimizer = torch.optim.Adam(
        self.ac.parameters(),
        lr=config.hyperparameters['lr'],
        betas=config.hyperparameters['betas'],
    )
    self.ac_old: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.update_old_policy()
    self.mse = nn.MSELoss()

    # Previous torchvision-based augmentation, kept for reference:
    # self.image_transform = transforms.Compose([
    #     transforms.ToPILImage(),
    #     transforms.RandomCrop(
    #         (84, 84),
    #         padding=self.hyperparameters['drq_padding'],
    #         pad_if_needed=True,
    #         padding_mode='edge',
    #     ),
    #     transforms.ToTensor(),
    # ])
    # DrQ-style augmentation: replication-pad the frame, then randomly crop back to 84x84.
    self.image_transform = nn.Sequential(
        nn.ReplicationPad2d(self.hyperparameters['drq_padding']),
        kornia.augmentation.RandomCrop((84, 84)),
    )

    self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
    self.episode_number = 0
    self.global_step_number = 0
    self._total_grad_steps = 0
    self.current_game_stats = None
    self._wandb_anim_save = 0
    self.flush_stats()

    self.accumulated_reward_mean = None
    self.accumulated_reward_std = None
    self._exp_moving_track_progress = 0.0
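
# Illustrative sketch (not from the original source) of how an image_transform like the one
# built above is applied in a DrQ-style update: the same pad + random-crop pipeline runs on a
# whole batch of image observations, giving each sample a slightly shifted view.
import torch
import torch.nn as nn
import kornia

def augment_batch_sketch(images: torch.Tensor, padding: int = 4) -> torch.Tensor:
    # images is assumed to be a float tensor of shape (batch, channels, 84, 84)
    transform = nn.Sequential(
        nn.ReplicationPad2d(padding),              # pad the borders so the crop can shift the frame
        kornia.augmentation.RandomCrop((84, 84)),  # crop back to the original 84x84 resolution
    )
    return transform(images)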
def __init__(self, config: Config):
    self.name = config.name
    self.stat_logger: Logger = Logger(config, log_interval=config.log_interval)
    self.config: Config = config
    self.hyperparameters = config.hyperparameters
    self.eps_clip = config.hyperparameters['eps_clip']
    self.test_env = config.test_environment_make_function()
    self.action_size = self.test_env.action_space.shape[0]
    self.env = None
    self.create_env()
    self.memory = Torch_Arbitrary_Replay_Buffer(
        buffer_size=10 ** 4,
        batch_size=10 ** 4,
        phi=config.phi,
        seed=0,
        device=self.config.device,
        sample_order=[
            'state', 'action', 'reward', 'log_prob', 'done', 'next_state',
        ],
        do_it_auto=False,
    )
    state_shape = config.phi(self.test_env.reset()).shape
    action_size = self.test_env.action_space.shape[0]

    self.ac: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.optimizer = torch.optim.Adam(
        self.ac.parameters(),
        lr=config.hyperparameters['lr'],
        betas=config.hyperparameters['betas'],
    )
    self.ac_old: ActorCritic = ActorCritic(
        state_shape=state_shape,
        action_size=action_size,
        hidden_size=128,
        device=self.config.device,
        action_std=0.5,
        double_action_size_on_output=False,
    )
    self.update_old_policy()
    self.mse = nn.MSELoss()

    self.folder_save_path = os.path.join('model_saves', 'PPO', self.name)
    self.episode_number = 0
    self.global_step_number = 0
    self._total_grad_steps = 0
    self._wandb_anim_save = 0

    self.accumulated_reward_mean = None
    self.accumulated_reward_std = None
    self._exp_moving_track_progress = 0.0
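
# A minimal sketch (an assumption, not the repo's exact update) of how the eps_clip stored above
# enters the PPO clipped surrogate objective: the probability ratio between the current policy
# (ac) and the frozen old policy (ac_old) is clamped to [1 - eps_clip, 1 + eps_clip].
import torch

def clipped_surrogate_sketch(
    new_log_prob: torch.Tensor,  # log pi_theta(a | s) from the current policy
    old_log_prob: torch.Tensor,  # log pi_theta_old(a | s) stored at rollout time
    advantage: torch.Tensor,
    eps_clip: float,
) -> torch.Tensor:
    ratio = torch.exp(new_log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) * advantage
    # maximizing the clipped objective == minimizing its negation
    return -torch.min(surr1, surr2).mean()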
def __init__(self, config: Config):
    self.name = config.name
    # self.tf_writer = config.tf_writer
    self.config = config
    self.environment = config.environment_make_function()
    self.action_size = self.environment.action_space.shape[0]
    self.hyperparameters = config.hyperparameters
    self.folder_save_path = os.path.join('model_saves', 'SAC', self.name)

    # twin critics with their target copies, as in standard SAC
    self.critic_local = QNet(
        state_description=self.environment.observation_space,
        action_size=self.action_size,
        hidden_size=256,
        device=self.config.device,
    )
    self.critic_local_2 = QNet(
        state_description=self.environment.observation_space,
        action_size=self.action_size,
        hidden_size=256,
        device=self.config.device,
    )
    self.critic_optimizer = torch.optim.Adam(
        self.critic_local.parameters(),
        lr=self.hyperparameters["Critic"]["learning_rate"],
        eps=1e-4,
    )
    self.critic_optimizer_2 = torch.optim.Adam(
        self.critic_local_2.parameters(),
        lr=self.hyperparameters["Critic"]["learning_rate"],
        eps=1e-4,
    )
    self.critic_target = QNet(
        state_description=self.environment.observation_space,
        action_size=self.action_size,
        hidden_size=256,
        device=self.config.device,
    )
    self.critic_target_2 = QNet(
        state_description=self.environment.observation_space,
        action_size=self.action_size,
        hidden_size=256,
        device=self.config.device,
    )
    SAC.copy_model_over(self.critic_local, self.critic_target)
    SAC.copy_model_over(self.critic_local_2, self.critic_target_2)

    self.memory = Torch_Separated_Replay_Buffer(
        self.hyperparameters["buffer_size"],
        self.hyperparameters["batch_size"],
        self.hyperparameters["seed"],
        device=self.config.device,
        state_extractor=get_state_combiner_by_settings_file(
            self.hyperparameters['env_settings_file_path']),
        state_producer=from_image_vector_to_combined_state,
    )

    self.actor_local = Policy(
        state_description=self.environment.observation_space,
        action_size=self.action_size,
        hidden_size=256,
        device=self.config.device,
    )
    self.actor_optimizer = torch.optim.Adam(
        self.actor_local.parameters(),
        lr=self.hyperparameters["Actor"]["learning_rate"],
        eps=1e-4,
    )

    # automatic entropy tuning; self.config.device is used below because self.device
    # is never assigned in this __init__
    self.target_entropy = -torch.prod(
        torch.Tensor(self.environment.action_space.shape).to(
            self.config.device)).item()  # heuristic value from the paper
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.config.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = torch.optim.Adam(
        [self.log_alpha],
        lr=self.hyperparameters["Actor"]["learning_rate"],
        eps=1e-4,
    )

    self._game_stats = {}
    self._last_episode_save_count = 0
    self._current_run_global_steps = 0
    self.episode_number = 0
    self.global_step_number = 0
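
# Illustrative sketch (not the repo's exact code) of how log_alpha, target_entropy and
# alpha_optim created above fit together: the temperature is tuned so that the policy's
# entropy tracks target_entropy, as in the automatic-temperature variant of SAC.
import torch

def alpha_update_sketch(
    log_alpha: torch.Tensor,             # the learnable scalar created in __init__
    alpha_optim: torch.optim.Optimizer,  # the Adam optimizer over [log_alpha]
    log_pi: torch.Tensor,                # log-probabilities of freshly sampled actions
    target_entropy: float,
) -> torch.Tensor:
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp()  # refreshed alpha, used to weight the entropy term in the actor loss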
def __init__(self, config: Config):
    print('start to init rainbow')
    self.config = config
    self.name = config.name
    self.hyperparameters = config.hyperparameters
    self.stat_logger: Logger = Logger(
        config,
        log_interval=config.log_interval *
        (1 + self.hyperparameters['parallel_env_num'] *
         int(self.hyperparameters['use_parallel_envs'])),
    )
    if self.hyperparameters['use_parallel_envs']:
        self.env = SubprocVecEnv_tf2(
            [
                config.environment_make_function
                for _ in range(self.hyperparameters['parallel_env_num'])
            ],
            state_flatter=None,
        )
    else:
        self.env = config.environment_make_function()
    self.test_env = config.test_environment_make_function()
    # config.phi prepares a raw observation for the chainer network

    print(f"rainbow mode : {self.config.mode}")

    # distributional (C51) dueling network over the test environment's action set
    n_actions = self.test_env.action_space.n
    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN_VectorPicture(
        config.phi(self.test_env.reset()).shape,
        n_actions,
        n_atoms,
        v_min,
        v_max,
    )

    # Noisy nets replace epsilon-greedy exploration, so the explorer is a plain greedy one
    links.to_factorized_noisy(
        q_func, sigma_scale=self.hyperparameters['noisy_net_sigma'])
    explorer = explorers.Greedy()

    # Draw the computational graph and save it in the output directory.
    # chainerrl.misc.draw_computational_graph(
    #     [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
    #     os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as https://arxiv.org/abs/1707.06887
    opt = chainer.optimizers.Adam(self.hyperparameters['lr'], eps=1.5 * 10 ** -4)
    opt.setup(q_func)

    # Prioritized replay; beta is annealed from beta0 to 1 throughout training
    update_interval = 4
    betasteps = self.config.env_steps_to_run / update_interval
    rbuf = replay_buffer.PrioritizedReplayBuffer(
        10 ** 6,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max='memory',
    )

    self.agent = agents.CategoricalDoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=self.config.rainbow_gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=self.hyperparameters['replay_start_size'],
        target_update_interval=16000,
        update_interval=update_interval,
        batch_accumulator='mean',
        phi=config.phi,
    )

    # self.folder_save_path = os.path.join('model_saves', 'Rainbow', self.name)
    self.episode_number = 0
    self.global_step_number = 0
    self.batch_step_number = 0
    self._total_grad_steps = 0
    self.current_game_stats = None
    self.flush_stats()
    # self.tf_writer = config.tf_writer

    self.accumulated_reward_mean = None
    self.accumulated_reward_std = None
    self._exp_moving_track_progress = 0.0
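
# Usage sketch (illustrative; function and variable names are assumptions) of how the chainerrl
# agent built above is usually driven: chainerrl agents expose act_and_train and
# stop_episode_and_train, so the environment loop lives outside the agent object.
def run_one_episode_sketch(agent, env) -> float:
    obs = env.reset()
    reward, done, episode_reward = 0.0, False, 0.0
    while not done:
        # stores the transition internally and may perform a gradient step
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    agent.stop_episode_and_train(obs, reward, done=True)
    return episode_reward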