def __init__(self, in_features: FeatureType, action_size: int, **kwargs):
    """
    Parameters:
        hidden_layers: (default: (128, 128)) Shape of the hidden layers that are fully connected networks.
        gamma: (default: 0.99) Discount value.
        tau: (default: 0.02) Soft copy fraction.
        batch_size: (default: 64) Number of samples in a batch.
        buffer_size: (default: 1e6) Size of the prioritized experience replay buffer.
        warm_up: (default: 0) Number of samples that need to be observed before starting to learn.
        update_freq: (default: 1) Number of samples between policy updates.
        number_updates: (default: 1) Number of batch sampling/training iterations per `update_freq`.
        alpha: (default: 0.2) Weight of log probs in the value function.
        alpha_lr: (default: None) If provided, alpha is added as a trainable parameter and `alpha_lr` is its learning rate.
        action_scale: (default: 1.) Scale for returned action values.
        max_grad_norm_alpha: (default: 1.) Gradient clipping for alpha.
        max_grad_norm_actor: (default: 10.) Gradient clipping for the actor.
        max_grad_norm_critic: (default: 10.) Gradient clipping for the critic.
        device: Defaults to CUDA if available.

    """
    super().__init__(**kwargs)
    self.device = kwargs.get("device", DEVICE)
    self.in_features: Tuple[int] = (in_features,) if isinstance(in_features, int) else tuple(in_features)
    self.state_size: int = in_features if isinstance(in_features, int) else reduce(operator.mul, in_features)
    self.action_size = action_size

    self.gamma: float = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau: float = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size: int = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size: int = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.memory = PERBuffer(self.batch_size, self.buffer_size)

    self.action_min = self._register_param(kwargs, 'action_min', -1)
    self.action_max = self._register_param(kwargs, 'action_max', 1)
    self.action_scale = self._register_param(kwargs, 'action_scale', 1)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))
    self.actor_number_updates = int(self._register_param(kwargs, 'actor_number_updates', 1))
    self.critic_number_updates = int(self._register_param(kwargs, 'critic_number_updates', 1))

    # Reason sequence initiation.
    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
    actor_hidden_layers = to_numbers_seq(self._register_param(kwargs, 'actor_hidden_layers', hidden_layers))
    critic_hidden_layers = to_numbers_seq(self._register_param(kwargs, 'critic_hidden_layers', hidden_layers))

    self.simple_policy = bool(self._register_param(kwargs, "simple_policy", False))
    if self.simple_policy:
        self.policy = MultivariateGaussianPolicySimple(self.action_size, **kwargs)
        self.actor = ActorBody(self.state_size, self.policy.param_dim * self.action_size,
                               hidden_layers=actor_hidden_layers, device=self.device)
    else:
        self.policy = GaussianPolicy(actor_hidden_layers[-1], self.action_size,
                                     out_scale=self.action_scale, device=self.device)
        self.actor = ActorBody(self.state_size, actor_hidden_layers[-1],
                               hidden_layers=actor_hidden_layers[:-1], device=self.device)

    self.double_critic = DoubleCritic(self.in_features, self.action_size, CriticBody,
                                      hidden_layers=critic_hidden_layers, device=self.device)
    self.target_double_critic = DoubleCritic(self.in_features, self.action_size, CriticBody,
                                             hidden_layers=critic_hidden_layers, device=self.device)

    # Target sequence initiation
    hard_update(self.target_double_critic, self.double_critic)

    # Optimization sequence initiation.
    self.target_entropy = -self.action_size
    alpha_lr = self._register_param(kwargs, "alpha_lr")
    self.alpha_lr = float(alpha_lr) if alpha_lr else None
    alpha_init = float(self._register_param(kwargs, "alpha", 0.2))
    self.log_alpha = torch.tensor(np.log(alpha_init), device=self.device, requires_grad=True)

    actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))

    self.actor_params = list(self.actor.parameters()) + list(self.policy.parameters())
    self.critic_params = list(self.double_critic.parameters())
    self.actor_optimizer = optim.Adam(self.actor_params, lr=actor_lr)
    self.critic_optimizer = optim.Adam(self.critic_params, lr=critic_lr)
    if self.alpha_lr is not None:
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=self.alpha_lr)

    self.max_grad_norm_alpha = float(self._register_param(kwargs, "max_grad_norm_alpha", 1.0))
    self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 10.0))
    self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 10.0))

    # Breath, my child.
    self.iteration = 0
    self._loss_actor = float('inf')
    self._loss_critic = float('inf')
    self._metrics: Dict[str, Union[float, Dict[str, float]]] = {}
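# A minimal sketch (not the library's code) of the entropy-temperature update that the
# trainable `log_alpha` and `alpha_optimizer` above enable. `log_prob` stands in for the
# policy's log-probabilities; names and sizes here are illustrative assumptions.
import torch

log_alpha = torch.tensor(0.0, requires_grad=True)  # log of the entropy weight alpha
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -4.0                              # -action_size, as set in the constructor
log_prob = torch.randn(64)                         # stand-in for a batch of policy log-probs

# Standard SAC temperature loss: raise alpha while entropy is below target, lower it otherwise.
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp()  # the weight actually used in the value/actor losses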
def __init__(self, state_size: int, action_size: int,
             hidden_layers: Sequence[int] = (128, 128), **kwargs):
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)
    self.state_size = state_size
    self.action_size = action_size
    self.num_atoms = int(self._register_param(kwargs, 'num_atoms', 51))
    v_min = float(self._register_param(kwargs, 'v_min', -10))
    v_max = float(self._register_param(kwargs, 'v_max', 10))

    # Reason sequence initiation.
    self.action_min = float(self._register_param(kwargs, 'action_min', -1))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size: int = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size: int = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = PERBuffer(self.batch_size, self.buffer_size)
    self.n_steps = int(self._register_param(kwargs, "n_steps", 3))
    self.n_buffer = NStepBuffer(n_steps=self.n_steps, gamma=self.gamma)

    self.warm_up: int = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq: int = int(self._register_param(kwargs, 'update_freq', 1))

    if kwargs.get("simple_policy", False):
        std_init = kwargs.get("std_init", 1.0)
        std_max = kwargs.get("std_max", 1.5)
        std_min = kwargs.get("std_min", 0.25)
        self.policy = MultivariateGaussianPolicySimple(
            self.action_size, std_init=std_init, std_min=std_min, std_max=std_max,
            device=self.device)
    else:
        self.policy = MultivariateGaussianPolicy(self.action_size, device=self.device)

    self.actor_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'actor_hidden_layers', hidden_layers))
    self.critic_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'critic_hidden_layers', hidden_layers))

    # This looks messy but it's not that bad. Actor, critic_net and Critic(critic_net).
    # Then the same for `target_`.
    self.actor = ActorBody(state_size, self.policy.param_dim * action_size,
                           hidden_layers=self.actor_hidden_layers,
                           gate_out=torch.tanh, device=self.device)
    critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                            hidden_layers=self.critic_hidden_layers, device=self.device)
    self.critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                 net=critic_net, device=self.device)

    self.target_actor = ActorBody(state_size, self.policy.param_dim * action_size,
                                  hidden_layers=self.actor_hidden_layers,
                                  gate_out=torch.tanh, device=self.device)
    target_critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                                   hidden_layers=self.critic_hidden_layers, device=self.device)
    self.target_critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                        net=target_critic_net, device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
    self.value_loss_func = nn.BCELoss(reduction='none')

    self.actor_params = list(self.actor.parameters()) + list(self.policy.parameters())
    self.actor_optimizer = Adam(self.actor_params, lr=self.actor_lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
    self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 50.0))
    self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 50.0))

    # Breath, my child.
    self.iteration = 0
    self._loss_actor = float('nan')
    self._loss_critic = float('nan')
    self._display_dist = torch.zeros(self.critic.z_atoms.shape)
    self._metric_batch_error = torch.zeros(self.batch_size)
    self._metric_batch_value_dist = torch.zeros(self.batch_size)
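# A minimal sketch of the support (atoms) a categorical value head such as `CategoricalNet`
# is typically built on; `num_atoms`, `v_min` and `v_max` mirror the parameters registered
# above, and `z_atoms` presumably matches the `self.critic.z_atoms` used by the metrics.
import torch

num_atoms, v_min, v_max = 51, -10.0, 10.0
z_atoms = torch.linspace(v_min, v_max, num_atoms)  # evenly spaced candidate return values
z_delta = (v_max - v_min) / (num_atoms - 1)        # spacing between adjacent atoms

# Given probabilities `p` over the atoms, the scalar value estimate is their weighted sum.
p = torch.softmax(torch.randn(num_atoms), dim=0)
expected_value = (p * z_atoms).sum()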
def __init__(self, state_size: int, action_size: int,
             actor_lr: float = 2e-3, critic_lr: float = 2e-3,
             noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)
    self.state_size = state_size
    self.action_size = action_size

    # Reason sequence initiation.
    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
    self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers,
                           gate_out=torch.tanh).to(self.device)
    self.critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
    self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers,
                                  gate_out=torch.tanh).to(self.device)
    self.target_critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)

    # Noise sequence initiation
    self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma,
                               scale=noise_scale, device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    self.actor_lr = float(self._register_param(kwargs, 'actor_lr', actor_lr))
    self.critic_lr = float(self._register_param(kwargs, 'critic_lr', critic_lr))
    self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
    self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 10.0))
    self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 10.0))

    self.action_min = float(self._register_param(kwargs, 'action_min', -1))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

    # Breath, my child.
    self.reset_agent()
    self.iteration = 0
    self._loss_actor = 0.
    self._loss_critic = 0.
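# A minimal sketch (assumed, not the library's definition) of the soft target update that
# `tau` parameterizes: target <- tau * source + (1 - tau) * target, applied parameter-wise.
import torch.nn as nn

def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)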
def reset_agent(self) -> None:
    self.actor.reset_parameters()
    self.policy.reset_parameters()
    self.double_critic.reset_parameters()
    hard_update(self.target_double_critic, self.double_critic)
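# For contrast, a minimal sketch (assumed) of the `hard_update` helper used in the reset:
# an exact parameter copy, i.e. a soft update with tau = 1.
import torch.nn as nn

def hard_update(target: nn.Module, source: nn.Module) -> None:
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)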
def __init__(self, state_size: int, action_size: int,
             hidden_layers: Sequence[int] = (128, 128), **kwargs):
    """
    Parameters:
        state_size (int): Number of input dimensions.
        action_size (int): Number of output dimensions.
        hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (128, 128).

    Keyword parameters:
        gamma (float): Discount value. Default: 0.99.
        tau (float): Soft-copy factor. Default: 0.02.
        actor_lr (float): Learning rate for the actor (policy). Default: 0.0003.
        critic_lr (float): Learning rate for the critic (value function). Default: 0.0003.
        actor_hidden_layers (tuple of ints): Shape of network for actor. Default: `hidden_layers`.
        critic_hidden_layers (tuple of ints): Shape of network for critic. Default: `hidden_layers`.
        max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 100.
        max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 100.
        num_atoms (int): Number of discrete values for the value distribution. Default: 51.
        v_min (float): Value distribution minimum (left-most) value. Default: -10.
        v_max (float): Value distribution maximum (right-most) value. Default: 10.
        n_steps (int): Number of steps (N-steps) for the TD. Default: 3.
        batch_size (int): Number of samples used in learning. Default: 64.
        buffer_size (int): Maximum number of samples to store. Default: 1e6.
        warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
        update_freq (int): Number of steps between each learning step. Default: 1.
        action_min (float): Minimum returned action value. Default: -1.
        action_max (float): Maximum returned action value. Default: 1.
        action_scale (float): Multiplier value for action. Default: 1.

    """
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)
    self.state_size = state_size
    self.action_size = action_size
    self.num_atoms = int(self._register_param(kwargs, 'num_atoms', 51))
    v_min = float(self._register_param(kwargs, 'v_min', -10))
    v_max = float(self._register_param(kwargs, 'v_max', 10))

    # Reason sequence initiation.
    self.action_min = float(self._register_param(kwargs, 'action_min', -1))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size: int = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size: int = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = PERBuffer(self.batch_size, self.buffer_size)
    self.n_steps = int(self._register_param(kwargs, "n_steps", 3))
    self.n_buffer = NStepBuffer(n_steps=self.n_steps, gamma=self.gamma)

    self.warm_up: int = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq: int = int(self._register_param(kwargs, 'update_freq', 1))

    if kwargs.get("simple_policy", False):
        std_init = kwargs.get("std_init", 1.0)
        std_max = kwargs.get("std_max", 1.5)
        std_min = kwargs.get("std_min", 0.25)
        self.policy = MultivariateGaussianPolicySimple(
            self.action_size, std_init=std_init, std_min=std_min, std_max=std_max,
            device=self.device)
    else:
        self.policy = MultivariateGaussianPolicy(self.action_size, device=self.device)

    self.actor_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'actor_hidden_layers', hidden_layers))
    self.critic_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'critic_hidden_layers', hidden_layers))

    # This looks messy but it's not that bad. Actor, critic_net and Critic(critic_net).
    # Then the same for `target_`.
    self.actor = ActorBody(state_size, self.policy.param_dim * action_size,
                           hidden_layers=self.actor_hidden_layers,
                           gate_out=torch.tanh, device=self.device)
    critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                            hidden_layers=self.critic_hidden_layers, device=self.device)
    self.critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                 net=critic_net, device=self.device)

    self.target_actor = ActorBody(state_size, self.policy.param_dim * action_size,
                                  hidden_layers=self.actor_hidden_layers,
                                  gate_out=torch.tanh, device=self.device)
    target_critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                                   hidden_layers=self.critic_hidden_layers, device=self.device)
    self.target_critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                        net=target_critic_net, device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
    self.value_loss_func = nn.BCELoss(reduction='none')

    self.actor_params = list(self.actor.parameters()) + list(self.policy.parameters())
    self.actor_optimizer = Adam(self.actor_params, lr=self.actor_lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
    self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 100))
    self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 100))

    # Breath, my child.
    self.iteration = 0
    self._loss_actor = float('nan')
    self._loss_critic = float('nan')
    self._display_dist = torch.zeros(self.critic.z_atoms.shape)
    self._metric_batch_error = torch.zeros(self.batch_size)
    self._metric_batch_value_dist = torch.zeros(self.batch_size)
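# A minimal sketch of why the value loss above is built with `reduction='none'`: with a
# prioritized replay buffer, each sample's loss is scaled by its importance-sampling weight
# before reduction. All names and shapes here are illustrative assumptions.
import torch
import torch.nn as nn

loss_fn = nn.BCELoss(reduction='none')
pred = torch.rand(64, 51)                  # predicted atom probabilities, batch of 64
target = torch.rand(64, 51)                # projected target distribution, values in [0, 1]
is_weights = torch.rand(64, 1)             # importance-sampling weights from the PER buffer

per_sample = loss_fn(pred, target).mean(dim=-1)      # one loss value per sample
loss = (is_weights.squeeze(-1) * per_sample).mean()  # weighted reduction to a scalar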
def __init__(self, state_size: int, action_size: int,
             noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
    """
    Parameters:
        state_size (int): Number of input dimensions.
        action_size (int): Number of output dimensions.
        noise_scale (float): Added noise amplitude. Default: 0.2.
        noise_sigma (float): Added noise variance. Default: 0.1.

    Keyword parameters:
        hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (128, 128).
        actor_lr (float): Learning rate for the actor (policy). Default: 0.003.
        critic_lr (float): Learning rate for the critic (value function). Default: 0.003.
        gamma (float): Discount value. Default: 0.99.
        tau (float): Soft-copy factor. Default: 0.02.
        actor_hidden_layers (tuple of ints): Shape of network for actor. Default: `hidden_layers`.
        critic_hidden_layers (tuple of ints): Shape of network for critic. Default: `hidden_layers`.
        max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 100.
        max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 100.
        batch_size (int): Number of samples used in learning. Default: 64.
        buffer_size (int): Maximum number of samples to store. Default: 1e6.
        warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
        update_freq (int): Number of steps between each learning step. Default: 1.
        number_updates (int): How many learning steps to run per learning phase. Default: 1.
        action_min (float): Minimum returned action value. Default: -1.
        action_max (float): Maximum returned action value. Default: 1.
        action_scale (float): Multiplier value for action. Default: 1.

    """
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)  # Default device is CUDA if available

    # Reason sequence initiation.
    self.state_size = state_size
    self.action_size = action_size
    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
    self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
    self.critic = DoubleCritic(state_size, action_size, CriticBody,
                               hidden_layers=hidden_layers).to(self.device)
    self.target_actor = ActorBody(state_size, action_size,
                                  hidden_layers=hidden_layers).to(self.device)
    self.target_critic = DoubleCritic(state_size, action_size, CriticBody,
                                      hidden_layers=hidden_layers).to(self.device)

    # Noise sequence initiation
    self.noise = OUProcess(shape=action_size, scale=noise_scale, sigma=noise_sigma,
                           device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-3))
    critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-3))
    self.actor_optimizer = AdamW(self.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = AdamW(self.critic.parameters(), lr=critic_lr)
    self.max_grad_norm_actor: float = float(kwargs.get("max_grad_norm_actor", 100))
    self.max_grad_norm_critic: float = float(kwargs.get("max_grad_norm_critic", 100))

    self.action_min = float(self._register_param(kwargs, 'action_min', -1.))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1.))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1.))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.update_policy_freq = int(self._register_param(kwargs, 'update_policy_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))
    self.noise_reset_freq = int(self._register_param(kwargs, 'noise_reset_freq', 10000))

    # Breath, my child.
    self.reset_agent()
    self.iteration = 0
    self._loss_actor = 0.
    self._loss_critic = 0.
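# A minimal, self-contained sketch of an Ornstein-Uhlenbeck process like the `OUProcess`
# noise used above. The update rule follows the textbook discretization; the class and its
# defaults (e.g. `theta`) are illustrative, not the library's implementation.
import torch

class OUNoise:
    def __init__(self, size: int, mu: float = 0.0, theta: float = 0.15,
                 sigma: float = 0.1, scale: float = 0.2):
        self.mu, self.theta, self.sigma, self.scale = mu, theta, sigma, scale
        self.state = torch.full((size,), mu)

    def reset(self) -> None:
        # Periodic resets (cf. `noise_reset_freq`) pull the process back to its mean.
        self.state = torch.full_like(self.state, self.mu)

    def sample(self) -> torch.Tensor:
        # Mean-reverting drift plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) + self.sigma * torch.randn_like(self.state)
        self.state = self.state + dx
        return self.scale * self.state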
def __init__(self, state_size: int, action_size: int, num_agents: int, **kwargs):
    """Initiation of the Multi Agent DDPG.

    All keywords are also passed to DDPG agents.

    Parameters:
        state_size (int): Dimensionality of the state.
        action_size (int): Dimensionality of the action.
        num_agents (int): Number of agents.

    Keyword Arguments:
        hidden_layers (tuple of ints): Shape for fully connected hidden layers.
        noise_scale (float): Default: 0.5. Noise amplitude.
        noise_sigma (float): Default: 1.0. Noise variance.
        actor_lr (float): Default: 0.0003. Learning rate for actor network.
        critic_lr (float): Default: 0.0003. Learning rate for critic network.
        gamma (float): Default: 0.99. Discount value.
        tau (float): Default: 0.02. Soft copy value.
        gradient_clip (optional float): Max norm for learning gradient. If None then no clip.
        batch_size (int): Number of samples per learning.
        buffer_size (int): Number of previous samples to remember.
        warm_up (int): Number of samples to see before learning starts.
        update_freq (int): How many samples between learning sessions.
        number_updates (int): How many learning cycles per learning session.

    """
    self.device = self._register_param(kwargs, "device", DEVICE, update=True)
    self.state_size: int = state_size
    self.action_size = action_size
    self.num_agents: int = num_agents
    self.agent_names: List[str] = list(kwargs.get("agent_names", map(str, range(self.num_agents))))

    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (100, 100), update=True))
    noise_scale = float(self._register_param(kwargs, 'noise_scale', 0.5))
    noise_sigma = float(self._register_param(kwargs, 'noise_sigma', 1.0))
    actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))

    self.agents: Dict[str, DDPGAgent] = OrderedDict({
        agent_name: DDPGAgent(
            state_size, action_size, actor_lr=actor_lr, critic_lr=critic_lr,
            noise_scale=noise_scale, noise_sigma=noise_sigma, **kwargs,
        ) for agent_name in self.agent_names
    })

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.gradient_clip: Optional[float] = self._register_param(kwargs, 'gradient_clip')

    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

    self.critic = CriticBody(num_agents * state_size, num_agents * action_size,
                             hidden_layers=hidden_layers).to(self.device)
    self.target_critic = CriticBody(num_agents * state_size, num_agents * action_size,
                                    hidden_layers=hidden_layers).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
    hard_update(self.target_critic, self.critic)

    self._step_data = {}
    self._loss_critic: float = float('inf')
    self._loss_actor: Dict[str, float] = {name: float('inf') for name in self.agent_names}
    self.reset()
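# A minimal sketch of how a centralized MADDPG critic consumes joint observations: per-agent
# states and actions are flattened into single vectors, matching the `num_agents * state_size`
# and `num_agents * action_size` critic input dimensions above. Shapes are illustrative.
import torch

num_agents, state_size, action_size, batch = 2, 4, 2, 32
states = torch.randn(batch, num_agents, state_size)
actions = torch.randn(batch, num_agents, action_size)

joint_states = states.reshape(batch, num_agents * state_size)     # -> (32, 8)
joint_actions = actions.reshape(batch, num_agents * action_size)  # -> (32, 4)
# q = self.critic(joint_states, joint_actions)  # one value per joint transition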
def __init__(self, state_size: int, action_size: int,
             actor_lr: float = 1e-3, critic_lr: float = 1e-3,
             noise_scale: float = 0.2, noise_sigma: float = 0.1,
             device=None, **kwargs):
    super().__init__(**kwargs)
    self.device = device if device is not None else DEVICE

    # Reason sequence initiation.
    self.state_size = state_size
    self.action_size = action_size
    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
    self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
    self.critic = DoubleCritic(state_size, action_size, CriticBody,
                               hidden_layers=hidden_layers).to(self.device)
    self.target_actor = ActorBody(state_size, action_size,
                                  hidden_layers=hidden_layers).to(self.device)
    self.target_critic = DoubleCritic(state_size, action_size, CriticBody,
                                      hidden_layers=hidden_layers).to(self.device)

    # Noise sequence initiation
    self.noise = OUProcess(shape=action_size, scale=noise_scale, sigma=noise_sigma,
                           device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    self.actor_optimizer = AdamW(self.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = AdamW(self.critic.parameters(), lr=critic_lr)
    self.max_grad_norm_actor: float = float(kwargs.get("max_grad_norm_actor", 10.0))
    self.max_grad_norm_critic: float = float(kwargs.get("max_grad_norm_critic", 10.0))

    self.action_min = float(self._register_param(kwargs, 'action_min', -1.))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1.))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1.))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e5)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.update_policy_freq = int(self._register_param(kwargs, 'update_policy_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))
    self.noise_reset_freq = int(self._register_param(kwargs, 'noise_reset_freq', 10000))

    # Breath, my child.
    self.reset_agent()
    self.iteration = 0
    self._loss_actor = 0.
    self._loss_critic = 0.
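# A minimal sketch of the clipped double-Q target that a `DoubleCritic` enables (the
# standard TD3 formulation; tensors and shapes here are illustrative stand-ins).
import torch

q1 = torch.randn(64, 1)      # first critic's estimate for (next_state, next_action)
q2 = torch.randn(64, 1)      # second critic's estimate for the same pair
rewards = torch.randn(64, 1)
dones = torch.zeros(64, 1)
gamma = 0.99

q_next = torch.min(q1, q2)   # the pessimistic estimate curbs overestimation bias
q_target = rewards + gamma * (1.0 - dones) * q_next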
def __init__(self, state_size: int, action_size: int,
             hidden_layers: Sequence[int] = (128, 128), **kwargs):
    """
    Parameters:
        num_atoms: Number of discrete values for the value distribution. Default: 51.
        v_min: Value distribution minimum (left-most) value. Default: -10.
        v_max: Value distribution maximum (right-most) value. Default: 10.
        n_steps: Number of steps (N-steps) for the TD. Default: 3.
        num_workers: Number of workers that will use this agent. Default: 1.

    """
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)
    self.state_size = state_size
    self.action_size = action_size
    self.num_atoms = int(self._register_param(kwargs, 'num_atoms', 51))
    v_min = float(self._register_param(kwargs, 'v_min', -10))
    v_max = float(self._register_param(kwargs, 'v_max', 10))

    # Reason sequence initiation.
    self.action_min = float(self._register_param(kwargs, 'action_min', -1))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = PERBuffer(self.batch_size, self.buffer_size)
    self.n_steps = int(self._register_param(kwargs, "n_steps", 3))
    self.n_buffer = NStepBuffer(n_steps=self.n_steps, gamma=self.gamma)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))

    self.actor_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'actor_hidden_layers', hidden_layers))
    self.critic_hidden_layers = to_numbers_seq(
        self._register_param(kwargs, 'critic_hidden_layers', hidden_layers))

    if kwargs.get("simple_policy", False):
        std_init = float(self._register_param(kwargs, "std_init", 1.0))
        std_max = float(self._register_param(kwargs, "std_max", 2.0))
        std_min = float(self._register_param(kwargs, "std_min", 0.05))
        self.policy = MultivariateGaussianPolicySimple(
            self.action_size, std_init=std_init, std_min=std_min, std_max=std_max,
            device=self.device)
    else:
        self.policy = MultivariateGaussianPolicy(self.action_size, device=self.device)

    # This looks messy but it's not that bad. Actor, critic_net and Critic(critic_net).
    # Then the same for `target_`.
    self.actor = ActorBody(state_size, self.policy.param_dim * action_size,
                           hidden_layers=self.actor_hidden_layers,
                           gate_out=torch.tanh, device=self.device)
    critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                            hidden_layers=self.critic_hidden_layers, device=self.device)
    self.critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                 net=critic_net, device=self.device)

    self.target_actor = ActorBody(state_size, self.policy.param_dim * action_size,
                                  hidden_layers=self.actor_hidden_layers,
                                  gate_out=torch.tanh, device=self.device)
    target_critic_net = CriticBody(state_size, action_size, out_features=self.num_atoms,
                                   hidden_layers=self.critic_hidden_layers, device=self.device)
    self.target_critic = CategoricalNet(num_atoms=self.num_atoms, v_min=v_min, v_max=v_max,
                                        net=target_critic_net, device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
    self.value_loss_func = nn.BCELoss(reduction='none')

    self.actor_params = list(self.actor.parameters()) + list(self.policy.parameters())
    self.actor_optimizer = Adam(self.actor_params, lr=self.actor_lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
    self.max_grad_norm_actor: float = float(self._register_param(kwargs, "max_grad_norm_actor", 50.0))
    self.max_grad_norm_critic: float = float(self._register_param(kwargs, "max_grad_norm_critic", 50.0))

    self.num_workers = int(self._register_param(kwargs, "num_workers", 1))

    # Breath, my child.
    self.iteration = 0
    self._loss_actor = float('nan')
    self._loss_critic = float('nan')
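# A minimal sketch of the n-step discounted return that an `NStepBuffer` with n_steps=3
# accumulates before handing a transition to the replay buffer; rewards are illustrative.
n_steps, gamma = 3, 0.99
rewards = [1.0, 0.5, -0.2]   # rewards observed over the next n steps

g = 0.0
for k, r in enumerate(rewards[:n_steps]):
    g += (gamma ** k) * r    # G = r_0 + gamma * r_1 + gamma^2 * r_2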