def __init__(
    self,
    agent: Union[ActorSpec, CriticSpec],
    env: EnvironmentSpec,
    db_server: DBSpec = None,
    exploration_handler: ExplorationHandler = None,
    logdir: str = None,
    id: int = 0,
    mode: str = "infer",
    buffer_size: int = int(1e4),
    weights_sync_period: int = 1,
    seeds: List = None,
    episode_limit: int = None,
    force_store: bool = False,
    gc_period: int = 10,
):
    """Wire up a sampler: device, logger, environment/agent handles,
    the episode runner and the DB-synchronization settings.
    """
    self._device = UtilsFactory.prepare_device()
    self._sampler_id = id
    # per-sampler seed offset so parallel samplers diverge
    self._seed = 42 + id
    is_infer = mode == "infer"
    self._infer = is_infer
    self.seeds = seeds

    # logging
    self._prepare_logger(logdir, mode)

    # environment, model, exploration & action handlers
    self.env = env
    self.agent = agent
    self.exploration_handler = exploration_handler
    self.episode_index = 0
    self.episode_runner = EpisodeRunner(
        env=self.env,
        agent=self.agent,
        device=self._device,
        capacity=buffer_size,
        deterministic=is_infer
    )

    # synchronization configuration
    self.db_server = db_server
    self.weights_sync_period = weights_sync_period
    self.episode_limit = episode_limit or _BIG_NUM
    self._force_store = force_store
    # discrete-action envs ship critic weights, continuous ones the actor
    if env.discrete_actions:
        self._sampler_weight_mode = "critic"
    else:
        self._sampler_weight_mode = "actor"
    self._gc_period = gc_period
def __init__(
    self,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    critic_loss_params: Dict = None,
    critic_optimizer_params: Dict = None,
    critic_scheduler_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    critic_tau: float = 1.0,
    **kwargs
):
    """Critic-only trainer setup: networks, training machinery and
    hyperparameters; subclass-specific work is delegated to ``_init``.
    """
    self._device = UtilsFactory.prepare_device()

    # online critic plus a frozen deep copy used as the bootstrap target
    self.critic = critic.to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # criterion / optimizer / scheduler / grad-clip built from params
    critic_stuff = get_agent_stuff_from_params(
        agent=self.critic,
        loss_params=critic_loss_params,
        optimizer_params=critic_optimizer_params,
        scheduler_params=critic_scheduler_params,
        grad_clip_params=critic_grad_clip_params
    )
    self._critic_loss_params = critic_stuff["loss_params"]
    self.critic_criterion = critic_stuff["criterion"]
    self._critic_optimizer_params = critic_stuff["optimizer_params"]
    self.critic_optimizer = critic_stuff["optimizer"]
    self._critic_scheduler_params = critic_stuff["scheduler_params"]
    self.critic_scheduler = critic_stuff["scheduler"]
    self._critic_grad_clip_params = critic_stuff["grad_clip_params"]
    self.critic_grad_clip_fn = critic_stuff["grad_clip_fn"]

    # training hyperparameters
    self._n_step = n_step
    self._gamma = gamma
    self.critic_tau = critic_tau

    # subclass-specific initialization
    self._init(**kwargs)
def __init__(
    self,
    actor: ActorSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    **kwargs
):
    """Actor-only trainer setup; subclass hooks run via ``_init``."""
    self._device = UtilsFactory.prepare_device()
    self.actor = actor.to(self._device)

    # assemble criterion / optimizer / scheduler / grad-clip for the actor
    stuff = get_agent_stuff_from_params(
        agent=self.actor,
        loss_params=actor_loss_params,
        optimizer_params=actor_optimizer_params,
        scheduler_params=actor_scheduler_params,
        grad_clip_params=actor_grad_clip_params
    )
    self._actor_loss_params = stuff["loss_params"]
    self.actor_criterion = stuff["criterion"]
    self._actor_optimizer_params = stuff["optimizer_params"]
    self.actor_optimizer = stuff["optimizer"]
    self._actor_scheduler_params = stuff["scheduler_params"]
    self.actor_scheduler = stuff["scheduler"]
    self._actor_grad_clip_params = stuff["grad_clip_params"]
    self.actor_grad_clip_fn = stuff["grad_clip_fn"]

    # training hyperparameters
    self._n_step = n_step
    self._gamma = gamma

    # subclass-specific initialization
    self._init(**kwargs)
def __init__(self,
             actor,
             env,
             id,
             logdir=None,
             redis_server=None,
             redis_prefix=None,
             buffer_size=int(1e4),
             history_len=1,
             weights_sync_period=1,
             mode="infer",
             resume=None,
             action_noise_prob=0,
             action_noise_t=1,
             random_process=None,
             param_noise_prob=0,
             param_noise_d=0.2,
             param_noise_steps=1000,
             seeds=None,
             action_clip=(-1, 1),
             episode_limit=None,
             force_store=False,
             min_episode_steps=None,
             min_episode_reward=None):
    """Redis-backed trajectory sampler.

    Runs a deep copy of ``actor`` in ``env``, optionally perturbing
    actions with action/param noise (disabled in "infer" mode), and
    collects transitions into a ``SamplerBuffer``.

    Args:
        actor: policy network; deep-copied onto the local device.
        env: environment exposing ``observation_shape``/``action_shape``.
        id: sampler index; also offsets the global random seed (42 + id).
        logdir: if given, a TensorBoard ``SummaryWriter`` is created at
            ``{logdir}/sampler-{mode}-{id}-{timestamp}``.
        min_episode_steps / min_episode_reward: optional filters for
            storing episodes; incompatible with a fixed ``seeds`` list.
    """
    self._seed = 42 + id
    set_global_seeds(self._seed)
    self._sampler_id = id

    self._device = UtilsFactory.prepare_device()
    self.actor = copy.deepcopy(actor).to(self._device)
    self.env = env
    self.redis_server = redis_server
    self.redis_prefix = redis_prefix or ""
    self.resume = resume
    self.episode_limit = episode_limit or int(2**32 - 2)
    self.force_store = force_store
    self.hard_seeds = set()

    # episode filters cannot be combined with a fixed seed list
    min_episode_flag_ = \
        min_episode_steps is None and min_episode_reward is None
    assert min_episode_flag_ or seeds is None
    # BUGFIX: use explicit None-checks instead of ``or`` so that an
    # explicit threshold of 0 is not silently replaced by the sentinel
    self.min_episode_steps = \
        -int(1e6) if min_episode_steps is None else min_episode_steps
    self.min_episode_reward = \
        -int(1e6) if min_episode_reward is None else min_episode_reward

    self.history_len = history_len
    self.buffer_size = buffer_size
    self.weights_sync_period = weights_sync_period
    self.episode_index = 0
    self.action_clip = action_clip
    self.infer = mode == "infer"
    self.seeds = seeds

    # exploration noise configuration; forced off in inference mode
    self.action_noise_prob = action_noise_prob
    self.action_noise_t = action_noise_t
    self.random_process = random_process or RandomProcess()
    self.param_noise_prob = param_noise_prob
    self.param_noise_d = param_noise_d
    self.param_noise_steps = param_noise_steps
    if self.infer:
        self.action_noise_prob = 0
        self.param_noise_prob = 0

    if logdir is not None:
        # BUGFIX: the original format repeated "%M" (minutes) after
        # seconds; microseconds ("%f") alone were clearly intended
        current_date = datetime.now().strftime("%y-%m-%d-%H-%M-%S-%f")
        logpath = f"{logdir}/sampler-{mode}-{id}-{current_date}"
        os.makedirs(logpath, exist_ok=True)
        self.logger = SummaryWriter(logpath)
    else:
        self.logger = None

    self.buffer = SamplerBuffer(
        capacity=self.buffer_size,
        observation_shape=self.env.observation_shape,
        action_shape=self.env.action_shape
    )
def __init__(
    self,
    actor: ActorSpec,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    critic_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    critic_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    critic_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    actor_tau=1.0,
    critic_tau=1.0,
    action_boundaries=None,
    **kwargs
):
    """Actor-critic trainer setup with target networks and soft-update
    taus; subclass hooks run via ``_init``.
    """
    self._device = UtilsFactory.prepare_device()

    # online networks and their frozen target copies
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)
    self.target_actor = copy.deepcopy(actor).to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # build criterion / optimizer / scheduler / grad-clip for each network
    for role, net, loss_p, opt_p, sched_p, clip_p in (
        ("actor", self.actor, actor_loss_params, actor_optimizer_params,
         actor_scheduler_params, actor_grad_clip_params),
        ("critic", self.critic, critic_loss_params, critic_optimizer_params,
         critic_scheduler_params, critic_grad_clip_params),
    ):
        stuff = get_agent_stuff_from_params(
            agent=net,
            loss_params=loss_p,
            optimizer_params=opt_p,
            scheduler_params=sched_p,
            grad_clip_params=clip_p
        )
        setattr(self, f"_{role}_loss_params", stuff["loss_params"])
        setattr(self, f"{role}_criterion", stuff["criterion"])
        setattr(self, f"_{role}_optimizer_params", stuff["optimizer_params"])
        setattr(self, f"{role}_optimizer", stuff["optimizer"])
        setattr(self, f"_{role}_scheduler_params", stuff["scheduler_params"])
        setattr(self, f"{role}_scheduler", stuff["scheduler"])
        setattr(self, f"_{role}_grad_clip_params", stuff["grad_clip_params"])
        setattr(self, f"{role}_grad_clip_fn", stuff["grad_clip_fn"])

    # training hyperparameters
    self._n_step = n_step
    self._gamma = gamma
    self._actor_tau = actor_tau
    self._critic_tau = critic_tau

    # NOTE(review): the attribute is only created when boundaries are
    # passed — downstream code presumably checks for its presence
    if action_boundaries is not None:
        assert len(action_boundaries) == 2, \
            "Should be min and max action boundaries"
        self._action_boundaries = action_boundaries

    # subclass-specific initialization
    self._init(**kwargs)
def __init__(
    self,
    actor: ActorSpec,
    critic: CriticSpec,
    gamma: float,
    n_step: int,
    actor_loss_params: Dict = None,
    critic_loss_params: Dict = None,
    actor_optimizer_params: Dict = None,
    critic_optimizer_params: Dict = None,
    actor_scheduler_params: Dict = None,
    critic_scheduler_params: Dict = None,
    actor_grad_clip_params: Dict = None,
    critic_grad_clip_params: Dict = None,
    **kwargs
):
    """On-policy actor-critic trainer setup (no target networks);
    subclass hooks run via ``_init``.
    """
    self._device = UtilsFactory.prepare_device()
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)

    # build criterion / optimizer / scheduler / grad-clip for each network
    for role, net, loss_p, opt_p, sched_p, clip_p in (
        ("actor", self.actor, actor_loss_params, actor_optimizer_params,
         actor_scheduler_params, actor_grad_clip_params),
        ("critic", self.critic, critic_loss_params, critic_optimizer_params,
         critic_scheduler_params, critic_grad_clip_params),
    ):
        stuff = get_agent_stuff_from_params(
            agent=net,
            loss_params=loss_p,
            optimizer_params=opt_p,
            scheduler_params=sched_p,
            grad_clip_params=clip_p
        )
        setattr(self, f"_{role}_loss_params", stuff["loss_params"])
        setattr(self, f"{role}_criterion", stuff["criterion"])
        setattr(self, f"_{role}_optimizer_params", stuff["optimizer_params"])
        setattr(self, f"{role}_optimizer", stuff["optimizer"])
        setattr(self, f"_{role}_scheduler_params", stuff["scheduler_params"])
        setattr(self, f"{role}_scheduler", stuff["scheduler"])
        setattr(self, f"_{role}_grad_clip_params", stuff["grad_clip_params"])
        setattr(self, f"{role}_grad_clip_fn", stuff["grad_clip_fn"])

    # training hyperparameters
    assert n_step == 1, "For now, on-policy setup works only with n-step=1"
    self._n_step = n_step
    self._gamma = gamma

    # subclass-specific initialization
    self._init(**kwargs)
def __init__(
    self,
    actor,
    critic,
    gamma,
    n_step,
    actor_optimizer_params,
    critic_optimizer_params,
    actor_grad_clip_params=None,
    critic_grad_clip_params=None,
    actor_loss_params=None,
    critic_loss_params=None,
    actor_scheduler_params=None,
    critic_scheduler_params=None,
    resume=None,
    load_optimizer=True,
    actor_tau=1.0,
    critic_tau=1.0,
    min_action=-1.0,
    max_action=1.0,
    **kwargs
):
    """Registry-driven actor-critic trainer setup with target networks;
    optionally restores a checkpoint at the end.
    """
    self._device = UtilsFactory.prepare_device()

    # online networks and their frozen target copies
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)
    self.target_actor = copy.deepcopy(actor).to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # optimizers
    self.actor_optimizer = OPTIMIZERS.get_from_params(
        **actor_optimizer_params,
        params=prepare_optimizable_params(self.actor)
    )
    self.critic_optimizer = OPTIMIZERS.get_from_params(
        **critic_optimizer_params,
        params=prepare_optimizable_params(self.critic)
    )
    self.actor_optimizer_params = actor_optimizer_params
    self.critic_optimizer_params = critic_optimizer_params

    # schedulers (empty params mean "no scheduler" per registry contract)
    actor_scheduler_params = actor_scheduler_params or {}
    critic_scheduler_params = critic_scheduler_params or {}
    self.actor_scheduler = SCHEDULERS.get_from_params(
        **actor_scheduler_params, optimizer=self.actor_optimizer
    )
    self.critic_scheduler = SCHEDULERS.get_from_params(
        **critic_scheduler_params, optimizer=self.critic_optimizer
    )
    self.actor_scheduler_params = actor_scheduler_params
    self.critic_scheduler_params = critic_scheduler_params

    # training hyperparameters
    self.n_step = n_step
    self.gamma = gamma

    # gradient clipping
    actor_grad_clip_params = actor_grad_clip_params or {}
    critic_grad_clip_params = critic_grad_clip_params or {}
    self.actor_grad_clip_fn = \
        GRAD_CLIPPERS.get_from_params(**actor_grad_clip_params)
    self.critic_grad_clip_fn = \
        GRAD_CLIPPERS.get_from_params(**critic_grad_clip_params)
    self.actor_grad_clip_params = actor_grad_clip_params
    self.critic_grad_clip_params = critic_grad_clip_params

    # criterions
    self.actor_criterion = CRITERIONS.get_from_params(
        **(actor_loss_params or {})
    )
    self.critic_criterion = CRITERIONS.get_from_params(
        **(critic_loss_params or {})
    )
    self.actor_loss_params = actor_loss_params
    self.critic_loss_params = critic_loss_params

    # soft-update taus and action range
    self.actor_tau = actor_tau
    self.critic_tau = critic_tau
    self.min_action = min_action
    self.max_action = max_action

    # subclass-specific initialization, then optional checkpoint restore
    self._init(**kwargs)
    if resume is not None:
        self.load_checkpoint(resume, load_optimizer=load_optimizer)
def __init__(
    self,
    actor,
    critic,
    gamma,
    n_step,
    actor_optimizer_params,
    critic_optimizer_params,
    actor_grad_clip_params=None,
    critic_grad_clip_params=None,
    actor_loss_params=None,
    critic_loss_params=None,
    actor_scheduler_params=None,
    critic_scheduler_params=None,
    resume=None,
    load_optimizer=True,
    actor_tau=1.0,
    critic_tau=1.0,
    min_action=-1.0,
    max_action=1.0,
    **kwargs
):
    """Actor-critic trainer setup via the catalyst Registry, with target
    networks; optionally restores a checkpoint at the end.
    """
    # local import: avoids a circular dependency with catalyst.contrib
    from catalyst.contrib.registry import Registry

    self._device = UtilsFactory.prepare_device()

    # online networks and their frozen target copies
    self.actor = actor.to(self._device)
    self.critic = critic.to(self._device)
    self.target_actor = copy.deepcopy(actor).to(self._device)
    self.target_critic = copy.deepcopy(critic).to(self._device)

    # optimizers
    self.actor_optimizer = Registry.get_optimizer(
        self.actor, **actor_optimizer_params
    )
    self.critic_optimizer = Registry.get_optimizer(
        self.critic, **critic_optimizer_params
    )
    self.actor_optimizer_params = actor_optimizer_params
    self.critic_optimizer_params = critic_optimizer_params

    # schedulers (empty params mean "no scheduler")
    actor_scheduler_params = actor_scheduler_params or {}
    critic_scheduler_params = critic_scheduler_params or {}
    self.actor_scheduler = Registry.get_scheduler(
        self.actor_optimizer, **actor_scheduler_params
    )
    self.critic_scheduler = Registry.get_scheduler(
        self.critic_optimizer, **critic_scheduler_params
    )
    self.actor_scheduler_params = actor_scheduler_params
    self.critic_scheduler_params = critic_scheduler_params

    # training hyperparameters
    self.n_step = n_step
    self.gamma = gamma

    # gradient clipping
    actor_grad_clip_params = actor_grad_clip_params or {}
    critic_grad_clip_params = critic_grad_clip_params or {}
    self.actor_grad_clip_fn = Registry.get_grad_clip_fn(
        **actor_grad_clip_params
    )
    self.critic_grad_clip_fn = Registry.get_grad_clip_fn(
        **critic_grad_clip_params
    )
    self.actor_grad_clip_params = actor_grad_clip_params
    self.critic_grad_clip_params = critic_grad_clip_params

    # criterions
    self.actor_criterion = Registry.get_criterion(
        **(actor_loss_params or {})
    )
    self.critic_criterion = Registry.get_criterion(
        **(critic_loss_params or {})
    )
    self.actor_loss_params = actor_loss_params
    self.critic_loss_params = critic_loss_params

    # soft-update taus and action range
    self.actor_tau = actor_tau
    self.critic_tau = critic_tau
    self.min_action = min_action
    self.max_action = max_action

    # subclass-specific initialization, then optional checkpoint restore
    self._init(**kwargs)
    if resume is not None:
        self.load_checkpoint(resume, load_optimizer=load_optimizer)