def __init__(
    self,
    state_shape: Tuple[int, ...],
    action_size: int,
    model: BaseModel,
    policy: Policy,
    memory: PrioritizedMemory,
    lr_scheduler: _LRScheduler,
    optimizer: torch.optim.Optimizer,
    batch_size: int = 32,
    gamma: float = 0.95,
    tau: float = 1e-3,
    update_frequency: int = 5,
    seed: int = None,
    action_repeats: int = 1,
    gradient_clip: float = 1,
):
    """Initialize an Agent object.

    Args:
        state_shape (Tuple[int, ...]): Shape of the state
        action_size (int): Number of possible integer actions
        model (BaseModel): Model producing actions from state
        policy (Policy): Policy used to select actions from the model output
        memory (PrioritizedMemory): Replay memory used to store and sample experience
        lr_scheduler (_LRScheduler): Learning-rate scheduler for the optimizer
        optimizer (torch.optim.Optimizer): Optimizer for the online Q-network
        batch_size (int): Size of each training minibatch
        gamma (float): Discount factor
        tau (float): Interpolation factor for soft updates of the target network
        update_frequency (int): Number of steps between learning updates
        seed (int): Random seed for reproducibility
        action_repeats (int): Number of times each selected action is repeated
        gradient_clip (float): Maximum gradient magnitude used for clipping
    """
    super().__init__(action_size=action_size, state_shape=state_shape)
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.update_frequency = update_frequency
    self.gradient_clip = gradient_clip
    self.previous_action: Optional[Action] = None
    self.action_repeats = action_repeats

    # Double DQN: the online network is trained, the target network is a
    # slowly updated copy used to evaluate bootstrapped targets
    self.online_qnetwork = model.to(device)
    self.target_qnetwork = deepcopy(model).to(device).eval()

    self.memory = memory
    self.losses = []

    self.policy: Policy = policy
    self.optimizer: torch.optim.Optimizer = optimizer
    self.lr_scheduler: _LRScheduler = lr_scheduler

    if seed:
        set_seed(seed)
        self.online_qnetwork.set_seed(seed)
        self.target_qnetwork.set_seed(seed)
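# Illustrative sketch (not part of the agent above): how a `tau` like the one
# stored in this constructor is typically used for the Double DQN soft update,
# target <- tau * online + (1 - tau) * target. The function name and
# signature here are assumptions for the example only.
def soft_update(online_qnetwork: torch.nn.Module,
                target_qnetwork: torch.nn.Module,
                tau: float) -> None:
    """Blend the online parameters into the target network in place."""
    with torch.no_grad():
        for target_param, online_param in zip(target_qnetwork.parameters(),
                                              online_qnetwork.parameters()):
            target_param.copy_(tau * online_param + (1.0 - tau) * target_param)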
def __init__(self, stream_ids: List[str], capacity, seed=None):
    self.streams: Dict[str, Memory] = {}
    if seed:
        set_seed(seed)
    # One independent replay memory per stream
    for s in stream_ids:
        self.streams[s] = Memory(capacity, seed)
def __init__(self, task_name: str, env: UnityEnvironment, seed: int):
    set_seed(seed)
    self.env = env
    self.task_name = task_name
    self.env_info = None
    self.training_scores = None
    self.evaluation_scores = None
def __init__(self, actor_model, critic_model, action_size, continuous_actions: bool,
             initial_std=0.2, continuous_action_range_clip: Optional[tuple] = (-1, 1),
             seed=None):
    super(MAPPO_Actor_Critic, self).__init__()
    if seed is not None:
        set_seed(seed)

    self.actor = actor_model
    self.critic = critic_model
    self.action_size = action_size
    self.continuous_actions = continuous_actions
    # Learnable standard deviation for the continuous-action distribution
    self.std = nn.Parameter(torch.ones(1, action_size) * initial_std)
    self.continuous_action_range_clip = continuous_action_range_clip
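# Illustrative sketch (assumed usage, not taken from the source): how a
# learnable `std` and `continuous_action_range_clip` like the ones above are
# typically combined when sampling continuous actions, by drawing from a
# Normal distribution centred on the actor's output and clipping the result.
from torch.distributions import Normal

def sample_continuous_action(mean: torch.Tensor, std: torch.Tensor,
                             action_range_clip: Optional[tuple] = (-1, 1)) -> torch.Tensor:
    """Sample an action from Normal(mean, std), clipped to the allowed range."""
    dist = Normal(mean, std)
    action = dist.sample()
    if action_range_clip is not None:
        action = action.clamp(*action_range_clip)
    return action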
def __init__(self, stream_ids: List[str], capacity, state_shape, beta_scheduler,
             alpha_scheduler, min_priority: Optional[float] = None,
             num_stacked_frames=1, seed=None, continuous_actions=False):
    self.streams: Dict[str, PrioritizedMemory] = {}
    if seed:
        set_seed(seed)
    # One independent prioritized memory per stream
    for s in stream_ids:
        self.streams[s] = ExtendedPrioritizedMemory(
            capacity,
            state_shape,
            beta_scheduler,
            alpha_scheduler,
            min_priority=min_priority,
            num_stacked_frames=num_stacked_frames,
            seed=seed,
            continuous_actions=continuous_actions,
        )
def __init__(self, state_shape, action_size, seed, map_agent_to_state_slice,
             map_agent_to_action_slice):
    """Initialize an Agent object.

    Params
    ======
        state_shape (Tuple[int, ...]): shape of each state
        action_size (int): dimension of each action
        seed (int): random seed
        map_agent_to_state_slice: mapping from agent id to its slice of the joint state
        map_agent_to_action_slice: mapping from agent id to its slice of the joint action
    """
    super().__init__(action_size=action_size, state_shape=state_shape)
    if seed is not None:
        set_seed(seed)

    # Placeholder actors returning uniform random integer actions in [0, action_size)
    self.target_actor = lambda x: torch.randint(0, self.action_size, (len(x), 1)).to(device)
    self.online_actor = lambda x: torch.randint(0, self.action_size, (len(x), 1)).to(device)
    self.online_critic = {}

    self.map_agent_to_state_slice = map_agent_to_state_slice
    self.map_agent_to_action_slice = map_agent_to_action_slice
def __init__(self,
             capacity: int,
             state_shape: tuple,
             beta_scheduler: ParameterScheduler,
             alpha_scheduler: ParameterScheduler,
             min_priority: float = 1e-3,
             seed: int = None,
             continuous_actions: bool = False,
             ):
    self.capacity = capacity
    self.state_shape = state_shape
    self.curr_write_idx = 0
    self.available_samples = 0

    # Memory buffer and priority sum-tree
    self.buffer = ReplayBuffer(state_shape, capacity)
    self.sum_tree = SumTree([0 for _ in range(self.capacity)])

    self.beta_scheduler = beta_scheduler
    self.alpha_scheduler = alpha_scheduler
    self.beta = beta_scheduler.initial
    self.alpha = alpha_scheduler.initial

    self.min_priority = min_priority
    self.continuous_actions = continuous_actions

    if seed:
        set_seed(seed)
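# Illustrative sketch (assumptions, not from the source): how `alpha`, `beta`,
# and `min_priority` are typically combined in prioritized experience replay.
# Priorities are exponentiated by alpha before being written to the sum-tree,
# and sampled transitions are reweighted by an importance-sampling weight
# controlled by beta.
def adjusted_priority(td_error: float, alpha: float, min_priority: float = 1e-3) -> float:
    """Priority written into the sum-tree for one transition."""
    return (abs(td_error) + min_priority) ** alpha

def importance_sampling_weight(sample_prob: float, num_samples: int, beta: float) -> float:
    """Weight correcting the bias introduced by non-uniform sampling."""
    return (num_samples * sample_prob) ** -beta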
def set_seed(self, seed: int):
    """Delegate to the global seeding utility."""
    set_seed(seed)
def __init__(self, seed):
    """Seed the random number generators and initialize an empty memory buffer."""
    set_seed(seed)
    self.memory = []
def set_seed(self, seed: int):
    """Set seed of model for consistency"""
    set_seed(seed)
def set_seed(self, seed):
    if seed:
        set_seed(seed)
def __init__(
    self,
    state_size: int,
    action_size: int,
    seed: int,
    actor_critic_factory: Callable[[], PPO_Actor_Critic],
    optimizer_factory: Callable[[torch.nn.Module.parameters], torch.optim.Optimizer],
    grad_clip: float = 1.,
    gamma: float = 0.99,
    batch_size: int = 1024,
    gae_factor: float = 0.95,
    epsilon: float = 0.2,
    beta_scheduler: ParameterScheduler = ParameterScheduler(
        initial=0.02, lambda_fn=lambda i: 0.02 * 0.995**i, final=1e-4),
    std_scale_scheduler: ParameterScheduler = ParameterScheduler(
        initial=0.5, lambda_fn=lambda i: 0.5 * 0.995**i, final=0.2),
    continuous_actions: bool = False,
    continuous_action_range_clip: tuple = (-1, 1),
    min_batches_for_training: int = 32,
    num_learning_updates: int = 4,
):
    """
    :param state_size: The state size of the agent
    :param action_size: The action size of the agent
    :param seed: Seed for reproducibility
    :param actor_critic_factory: Function returning the actor-critic model
    :param optimizer_factory: Function returning the optimizer for the actor-critic model
    :param grad_clip: Clip the absolute value of the gradient above this value
    :param gamma: Discount factor
    :param batch_size: SGD minibatch size
    :param gae_factor: Factor used to down-weight rewards, presented as lambda in the GAE paper
    :param epsilon: Small constant used to clip the surrogate objective
    :param beta_scheduler: Scheduler for the parameter beta, the coefficient of the entropy term
    :param std_scale_scheduler: Scheduler for the std of the normal distribution used to sample
        actions in the policy network. Only used for continuous actions
    :param continuous_actions: Whether the action space is continuous or discrete
    :param continuous_action_range_clip: The range to clip continuous actions to. Only used for
        continuous actions
    :param min_batches_for_training: Minimum number of batches to accumulate before performing training
    :param num_learning_updates: Number of epochs to train over before discarding samples
    """
    super().__init__(state_size, action_size)
    if seed is not None:
        set_seed(seed)

    self.online_actor_critic = actor_critic_factory().to(device)
    self.target_actor_critic = actor_critic_factory().to(device).eval()
    self.target_actor_critic.load_state_dict(self.online_actor_critic.state_dict())
    self.optimizer = optimizer_factory(self.online_actor_critic.parameters())

    self.current_trajectory_memory = Trajectories(seed)

    self.grad_clip = grad_clip
    self.gamma = gamma
    self.batch_size = batch_size
    self.gae_factor = gae_factor
    self.beta_scheduler = beta_scheduler
    self.epsilon = epsilon
    self.beta = self.beta_scheduler.initial
    self.std_scale_scheduler = std_scale_scheduler
    self.std_scale = self.std_scale_scheduler.initial
    self.previous_std_scale = None

    self.continuous_actions = continuous_actions
    self.continuous_action_range_clip = continuous_action_range_clip

    self.min_batches_for_training = min_batches_for_training
    self.num_learning_updates = num_learning_updates

    self.warmup = False
    self.current_trajectory = []
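# Illustrative sketch (assumed, not from the source): the clipped PPO surrogate
# loss that `epsilon` and the entropy coefficient `beta` above typically feed
# into. `ratio` is pi_new(a|s) / pi_old(a|s) and `advantage` comes from GAE.
# The function name and signature are assumptions for the example only.
def ppo_clipped_loss(ratio: torch.Tensor, advantage: torch.Tensor,
                     entropy: torch.Tensor, epsilon: float, beta: float) -> torch.Tensor:
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    # Maximise the surrogate plus an entropy bonus, so negate it to obtain a loss
    return -(torch.min(unclipped, clipped) + beta * entropy).mean()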
def __init__(self, agent_id, policy, state_shape, action_size, seed,
             critic_factory: Callable,
             actor_factory: Callable,
             critic_optimizer_factory: Callable,
             actor_optimizer_factory: Callable,
             memory_factory: Callable,
             num_learning_updates=10,
             tau: float = 1e-2,
             batch_size: int = 512,
             update_frequency: int = 20,
             critic_grad_norm_clip: int = 1,
             policy_update_frequency: int = 2,
             homogeneous_agents: bool = False):
    super().__init__(action_size=action_size, state_shape=state_shape)
    if seed is not None:
        set_seed(seed)
        self.n_seed = np.random.seed(seed)

    self.num_learning_updates = num_learning_updates
    self.tau = tau

    self.agent_id = agent_id
    self.batch_size = batch_size
    self.update_frequency = update_frequency
    self.critic_grad_norm_clip = critic_grad_norm_clip
    self.policy_update_frequency = policy_update_frequency
    self.policy = policy
    self.homogeneous_agents = homogeneous_agents

    if self.homogeneous_agents:
        # Homogeneous agents share a single set of class-level networks and
        # optimizers; create them only for the first agent
        if MADDPGAgent.online_critic is None:
            # critic local and target network (Q-Learning)
            MADDPGAgent.online_critic = critic_factory().to(device).float()
            MADDPGAgent.target_critic = critic_factory().to(device).float()
            MADDPGAgent.target_critic.load_state_dict(MADDPGAgent.online_critic.state_dict())

            # actor local and target network (Policy gradient)
            MADDPGAgent.online_actor = actor_factory().to(device).float()
            MADDPGAgent.target_actor = actor_factory().to(device).float()
            MADDPGAgent.target_actor.load_state_dict(MADDPGAgent.online_actor.state_dict())

            # optimizer for critic and actor network
            MADDPGAgent.critic_optimizer = critic_optimizer_factory(MADDPGAgent.online_critic.parameters())
            MADDPGAgent.actor_optimizer = actor_optimizer_factory(MADDPGAgent.online_actor.parameters())

        self.online_critic = MADDPGAgent.online_critic
        self.target_critic = MADDPGAgent.target_critic

        # actor local and target network (Policy gradient)
        self.online_actor = MADDPGAgent.online_actor
        self.target_actor = MADDPGAgent.target_actor

        # optimizer for critic and actor network
        self.critic_optimizer = MADDPGAgent.critic_optimizer
        self.actor_optimizer = MADDPGAgent.actor_optimizer
    else:
        # critic local and target network (Q-Learning)
        self.online_critic = critic_factory().to(device).float()
        self.target_critic = critic_factory().to(device).float()
        self.target_critic.load_state_dict(self.online_critic.state_dict())

        # actor local and target network (Policy gradient)
        self.online_actor = actor_factory().to(device).float()
        self.target_actor = actor_factory().to(device).float()
        self.target_actor.load_state_dict(self.online_actor.state_dict())

        # optimizer for critic and actor network
        self.critic_optimizer = critic_optimizer_factory(self.online_critic.parameters())
        self.actor_optimizer = actor_optimizer_factory(self.online_actor.parameters())

    self.memory = memory_factory()