class AgentResourcePool():
    def __init__(self, state_size, action_size, random_seed):
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
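# The snippets in this collection come from separate DDPG projects and assume the usual
# module-level imports and hyper-parameters (device, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU,
# LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, ...) defined in their original files. They also all
# construct a ReplayBuffer that is not shown here; the sketch below is a minimal, assumed
# implementation of the interface they rely on (add, sample, __len__) and may differ from
# the real project files.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBufferSketch:
    """Fixed-size buffer storing experience tuples (assumed interface)."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(values):
            # stack into a (batch_size, dim) float tensor on the configured device
            return torch.from_numpy(np.vstack(values).astype(np.float32)).to(self.device)

        states = stack([e.state for e in batch])
        actions = stack([e.action for e in batch])
        rewards = stack([e.reward for e in batch])
        next_states = stack([e.next_state for e in batch])
        dones = stack([float(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)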
class Agent():
    """Interacts with and learns from the environment."""

    memory = None
    actor_local = None
    actor_target = None
    actor_optimizer = None
    critic_local = None
    critic_target = None
    critic_optimizer = None
    instances = []

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Initialize class-level Actor Network
        # if Agent.actor_local is None:
        #     Agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
        # if Agent.actor_target is None:
        #     Agent.actor_target = Actor(state_size, action_size, random_seed).to(device)
        # if Agent.actor_optimizer is None:
        #     Agent.actor_optimizer = optim.Adam(Agent.actor_local.parameters(), lr=LR_ACTOR)
        # self.actor_local = Agent.actor_local
        # self.actor_target = Agent.actor_target
        # self.actor_optimizer = Agent.actor_optimizer

        # Critic Network (w/ Target Network)
        # self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                    weight_decay=WEIGHT_DECAY)

        # Initialise class-level Critic Network
        if Agent.critic_local is None:
            Agent.critic_local = Critic(state_size, action_size, random_seed).to(device)
        if Agent.critic_target is None:
            Agent.critic_target = Critic(state_size, action_size, random_seed).to(device)
        if Agent.critic_optimizer is None:
            Agent.critic_optimizer = optim.Adam(Agent.critic_local.parameters(), lr=LR_CRITIC,
                                                weight_decay=WEIGHT_DECAY)
        self.critic_local = Agent.critic_local
        self.critic_target = Agent.critic_target
        self.critic_optimizer = Agent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory - only initialise once per class
        if Agent.memory is None:
            print("Initialising ReplayBuffer")
            Agent.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # else:
        #     print("Sharing ReplayBuffer %s", Agent.memory)

        # Add this instance - we need to access all agent states whilst learning
        self.agent_num = len(Agent.instances)
        Agent.instances.append(self)
        print("Appended to Agent.instances agent {}".format(self.agent_num))

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        Agent.memory.add(state, action, reward, next_state, done)

        # Only learn every N_TIME_STEPS
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn, if enough samples are available in memory
        if len(Agent.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                experiences = Agent.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, noise_amplitude=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * noise_amplitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
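# Every agent in this collection draws exploration noise from an OUNoise class that is
# not included here. The sketch below is a conventional Ornstein-Uhlenbeck process
# matching the constructor signatures used above (size, seed, mu, theta, sigma); the
# default parameter values are assumptions, not the original projects' settings.
import copy

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process (assumed implementation)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # `size` may be an int or a (num_agents, action_size) tuple
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the OU process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state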
class DDPG_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
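# The DDPG_trainer, DDPG and DDPGPolicy classes below call module-level helpers
# (hard_update, soft_update, to_tensor, to_numpy) that are not part of this collection.
# These sketches follow the (target, source) argument order used at those call sites and
# mirror the soft_update methods defined above; treat them as assumed implementations.
import torch


def hard_update_sketch(target, source):
    """Copy source parameters into target so both networks start identical."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update_sketch(target, source, tau):
    """Polyak averaging: θ_target ← τ·θ_source + (1 − τ)·θ_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)


def to_tensor_sketch(ndarray, device="cpu", dtype=torch.float32):
    """Convert a NumPy array to a torch tensor (assumed helper)."""
    return torch.as_tensor(ndarray, dtype=dtype, device=device)


def to_numpy_sketch(tensor):
    """Convert a torch tensor back to a NumPy array (assumed helper)."""
    return tensor.detach().cpu().numpy()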
class DDPG_trainer(object):
    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action, theta=OU_THETA,
                                                       mu=OU_MU, sigma=OU_SIGMA)
        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA:
            self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        self.start_state = observation
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Help warm-up: wait until enough transitions have been collected
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        # np.float has been removed from recent NumPy releases; use np.float32 instead
        target_q_batch = to_tensor(reward_batch) + \
            DISCOUNT * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
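# A minimal sketch of how DDPG_trainer above is typically driven. The environment name,
# the older gym step API (4-tuple return), the episode counts and the action range are
# illustrative assumptions, not taken from the original project.
import gym


def run_ddpg_trainer_sketch(trainer, env_name="Pendulum-v1", episodes=10, max_steps=200):
    env = gym.make(env_name)
    for _ in range(episodes):
        obs = env.reset()
        trainer.s_t = obs          # seed the "most recent state" used by observe()
        trainer.reset(obs)         # reset the OU noise process
        for _ in range(max_steps):
            action = trainer.select_action(obs)     # noisy action clipped to [-1, 1]
            obs, reward, done, _ = env.step(action)
            trainer.observe(reward, obs, done)      # store transition, advance s_t
            trainer.update_all()                    # one critic/actor update after warm-up
            if done:
                break
    env.close()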
class DDPGAgent():
    """DDPG agent that interacts with and learns from the environment.

    The agent's model is implemented in 'ddpg_model.py'. It consists of two neural
    networks: one for the actor, and one for the critic. The DDPGAgent class makes
    use of two other classes: ReplayBuffer, OUNoise.
    """

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Arguments:
            state_size (int) -- dimension of each state
            action_size (int) -- dimension of each action
            num_agents (int) -- number of agents (brains)
            random_seed (int) -- random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        ### Make neural networks (local and target) for both actor and critic, and set optimizers
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Initialize replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience in memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i])

        # Learn every UPDATE_EVERY time steps
        if timestep % UPDATE_EVERY == 0:
            # If we have collected enough experience in our memory, i.e. more than
            # the mini-batch size, then call the self.learn() function
            if len(self.memory) > BATCH_SIZE:
                # Number of updates per timestep
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Arguments:
            state -- Current state
            add_noise {bool} -- Add noise (exploration) to the actions (default: {True})

        Returns:
            [float] -- Actions
        """
        # Convert the 'state' numpy array to a pytorch tensor on the current device (GPU or CPU)
        state = torch.from_numpy(state).float().to(device)
        # Set the module in evaluation mode
        self.actor_local.eval()
        with torch.no_grad():
            # Evaluate the network with the current state
            action = self.actor_local(state).cpu().data.numpy()
        # Set the module back in training mode
        self.actor_local.train()
        if add_noise:
            # Add noise to the actions to encourage exploration
            action += self.noise.sample()
        # Return the clipped actions
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Arguments:
            experiences {Tuple[torch.Tensor]} -- tuple of (s, a, r, s', done) tuples
            gamma {float} -- discount factor
        """
        # Experiences: a mini-batch of 128 samples
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- Update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        # Take one step with the optimizer
        self.critic_optimizer.step()

        # ---------------------------- Update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- Update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model -- PyTorch model (weights will be copied from)
            target_model -- PyTorch model (weights will be copied to)
            tau (float) -- interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
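# Sketch of one training episode for the multi-agent DDPGAgent above. The environment
# here is a stand-in returning a (num_agents, state_size) observation array plus
# per-agent rewards and dones, which is what step() above iterates over; the env API
# and names are illustrative assumptions only.
import numpy as np


def multi_agent_episode_sketch(agent, env, max_t=1000):
    states = env.reset()                                  # shape (num_agents, state_size)
    scores = np.zeros(agent.num_agents)
    agent.reset()
    for t in range(max_t):
        actions = agent.act(states)                       # one row of actions per agent
        next_states, rewards, dones = env.step(actions)   # assumed env API
        agent.step(states, actions, rewards, next_states, dones, t)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    return scores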
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        # self.cuda = USE_CUDA  # args.cuda
        self.cuda = args.cuda
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Init models
        # actor_kwargs = {'n_inp': self.nb_states, 'n_feature_list': [args.hidden1, args.hidden2],
        #                 'n_class': self.nb_actions}
        # self.actor = MLP(**actor_kwargs)
        # self.actor_target = MLP(**actor_kwargs)
        # self.critic = MLP(**actor_kwargs)  # TODO: actor and critic have the same structure for now.
        # self.critic_target = MLP(**actor_kwargs)
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)

        self.criterion = nn.MSELoss()

        if self.cuda:
            self.actor = self.actor.cuda()  # torch.nn.DataParallel(self.model).cuda()  # TODO: DataParallel not working
            self.critic = self.critic.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic_target = self.critic_target.cuda()
            self.criterion = self.criterion.cuda()

        # Set optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=args.prate)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=args.rate)

        # Loss function ('size_average' is deprecated; 'reduction' is the current argument)
        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                       mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = \
            self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch
        # (the original used the removed Variable 'volatile' flag; torch.no_grad() replaces it)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = self.criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch), self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_wts(self, modelfile):
        if os.path.isfile(modelfile + 'model.pth.tar'):
            # load the same file whose existence was checked above
            checkpoint = torch.load(modelfile + 'model.pth.tar')
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.actor_optim.load_state_dict(checkpoint['actor_optim'])
            self.critic_optim.load_state_dict(checkpoint['critic_optim'])
            return checkpoint['step']
        else:
            return 0

    def save_wts(self, savefile, step):
        saveme = {  # TODO: save other stuff too, like epoch etc.
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optim': self.actor_optim.state_dict(),
            'critic_optim': self.critic_optim.state_dict(),
            'step': step
        }
        torch.save(saveme, savefile + 'model.pth.tar')
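# Sketch of how the save_wts / load_wts methods above could be used for resumable
# training. The directory name and step counter are illustrative assumptions.
import os


def checkpoint_roundtrip_sketch(agent, save_dir="./checkpoints/"):
    os.makedirs(save_dir, exist_ok=True)
    agent.save_wts(save_dir, step=1000)       # writes <save_dir>model.pth.tar
    resumed_step = agent.load_wts(save_dir)   # returns the stored step, or 0 if no checkpoint
    return resumed_step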
class DDPGPolicy(object):
    def __init__(self, env_name, policy_config, device='cpu'):
        self.device = device
        self.env = gym.make(env_name)  # only used to read the observation/action spaces
        self.obs_dim = self.env.observation_space.shape[0]
        if isinstance(self.env.action_space, gym.spaces.Box):
            self.action_dim = self.env.action_space.shape[0]
        elif isinstance(self.env.action_space, gym.spaces.Discrete):
            raise TypeError('Unsupported action type')
        else:
            raise ValueError('Unsupported action space ', type(self.env.action_space))
        self.action_limit = self.env.action_space.high[0]
        self.lr = policy_config['lr']

        self.actor = Actor(self.obs_dim, self.action_dim).to(device)
        self.critic = Critic(self.obs_dim, self.action_dim).to(device)
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)
        hard_update(self.actor_target, self.actor)  # Make sure target starts with the same weights
        hard_update(self.critic_target, self.critic)

        self.actor_optim = torch.optim.Adam(params=self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(params=self.critic.parameters(), lr=0.001)

        self.discount_factor = policy_config['discount_factor']
        self.tau = 0.005

    def train_on_batch(self, rollouts_batch):
        # loss = r + q(s_t) - q(s_t+1); minimise the loss
        obs, acs, next_obs, dones, r, un_r, summed_r = convert_listofrollouts(paths=rollouts_batch)
        acs = torch.tensor(acs).float().to(self.device)
        obs = torch.FloatTensor(obs).to(self.device)
        next_obs = torch.FloatTensor(next_obs).to(self.device)
        # acs_one_hot = torch.eye(2).to(self.device).index_select(0, acs)  # one-hot for discrete action spaces
        dones = torch.IntTensor(dones).to(self.device)
        r = torch.FloatTensor(r).to(self.device)

        # update critic
        self.critic_optim.zero_grad()
        act_target = self.actor_target(next_obs).to(self.device)
        # detach the target so no gradient flows back into the target networks
        q_target = (r + self.discount_factor * self.critic_target(next_obs, act_target) * (1 - dones)).detach()
        q_pred = self.critic(obs, acs)
        critic_loss = torch.nn.functional.mse_loss(q_pred, q_target)
        critic_loss.backward()
        self.critic_optim.step()

        # update actor
        self.actor_optim.zero_grad()
        actor_loss = -torch.mean(self.critic(obs, self.actor(obs)))
        actor_loss.backward()
        self.actor_optim.step()

        info = {
            'loss': actor_loss.cpu().detach().numpy(),  # scalar
            'model_out': q_target,  # torch.tensor [sum(batch), ac_dim]
        }
        return info

    def update_target_network(self):
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def get_weights(self):
        # TODO: return actor and critic parameters
        return {k: v for k, v in self.actor.state_dict().items()}

    def set_weights(self, weights):
        self.actor.load_state_dict(weights)

    def compute_actions(self, obs, noise_scale):
        # use noise_scale to control exploration; during evaluation noise_scale is 0
        obs = obs.to(self.device)
        actions = self.actor(obs).cpu().detach().numpy()
        actions += noise_scale * np.random.rand(self.action_dim)
        actions = np.clip(actions, -self.action_limit, self.action_limit)
        return actions

    def reset(self):
        # reset the random process whenever the environment is reset
        # (note: this class never assigns self.random_process, so this call assumes
        #  a noise process is attached elsewhere)
        self.random_process.reset_states()
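# Sketch of one training iteration for DDPGPolicy above: collect rollouts with
# exploration noise, run one batched critic/actor update, then move the target networks.
# The rollout-collection helper is assumed to exist elsewhere in the project.
def ddpg_policy_iteration_sketch(policy, collect_rollouts, noise_scale=0.1):
    rollouts = collect_rollouts(policy, noise_scale)  # assumed helper returning a list of paths
    info = policy.train_on_batch(rollouts)            # one critic + actor gradient step
    policy.update_target_network()                    # Polyak-average the target networks
    return info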
class DDPGAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, memory, device='cpu', params=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): memory buffer to sample
            device (str): device string between cuda:0 and cpu
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0], params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0], params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, params['seed'],
                             theta=params['noise_theta'], sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic.

        Params
        ======
            filenames (list): filenames to store weights of actor and critic
                filenames[0] = actor weights
                filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic.

        Params
        ======
            filenames (list): filenames to load weights of actor and critic
                filenames[0] = actor weights
                filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
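# The DDPGAgent above reads its hyper-parameters from a dict. The keys below are exactly
# the ones its __init__ accesses; the values are illustrative defaults, not the original
# project's settings.
example_params = {
    'update_every': 4,            # learn every N environment steps
    'gamma': 0.99,                # discount factor
    'tau': 1e-3,                  # soft-update interpolation
    'seed': 0,
    'actor_units': (400, 300),    # hidden layer sizes for the actor
    'critic_units': (400, 300),   # hidden layer sizes for the critic
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'noise_theta': 0.15,          # OU noise parameters
    'noise_sigma': 0.2,
}
# agent = DDPGAgent(state_size, action_size, memory, device='cpu', params=example_params)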
class Agent():
    def __init__(self, n_state, n_action, n_agents, random_seed, device="cpu"):
        """Initialize an Agent object.

        Params
        ------
        n_state : int
            dimension of each state
        n_action : int
            dimension of each action
        n_agents : int
            number of agents
        random_seed : int
            random seed
        device : str
            which device is used, cpu or cuda
        """
        self.n_state = n_state
        self.n_action = n_action
        self.n_agents = n_agents
        self.random_seed = np.random.seed(random_seed)
        self.device = device

        # Networks for the first agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local1 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_local1.apply(initialize_weights)
        self.critic_local1 = Critic(self.n_state * self.n_agents, self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local1.apply(initialize_weights)
        self.actor_target1 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_target1.apply(initialize_weights)
        self.actor_target1.eval()
        self.critic_target1 = Critic(self.n_state * self.n_agents, self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target1.apply(initialize_weights)
        self.critic_target1.eval()

        # Networks for the second agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local2 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_local2.apply(initialize_weights)
        self.critic_local2 = Critic(self.n_state * self.n_agents, self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local2.apply(initialize_weights)
        self.actor_target2 = Actor(self.n_state, self.n_action, self.random_seed).to(self.device)
        self.actor_target2.apply(initialize_weights)
        self.actor_target2.eval()
        self.critic_target2 = Critic(self.n_state * self.n_agents, self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        # Fixed: the original re-initialized actor_target2 here instead of critic_target2
        self.critic_target2.apply(initialize_weights)
        self.critic_target2.eval()

        # Optimizers
        self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(), lr=LR_ACTOR)
        self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(), lr=LR_ACTOR)
        self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(), lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)
        self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(), lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(n_action * 2, random_seed + 1, mu=0., theta=THETA, sigma=SIGMA)

        # Replay Buffer
        self.memory = ReplayBuffer(n_action, BUFFER_SIZE, BATCH_SIZE, random_seed + 2, self.device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARNING):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state0 = torch.from_numpy(state[0]).unsqueeze(dim=0).float().to(self.device)
        state1 = torch.from_numpy(state[1]).unsqueeze(dim=0).float().to(self.device)
        self.actor_local1.eval()
        self.actor_local2.eval()
        with torch.no_grad():
            action0 = self.actor_local1(state0).cpu().data.numpy()
            action1 = self.actor_local2(state1).cpu().data.numpy()
            action = np.vstack([action0, action1])
        self.actor_local1.train()
        self.actor_local2.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next1 = self.actor_target1(next_states[:, 0:24])
            actions_next2 = self.actor_target2(next_states[:, 24:])
            actions_next = torch.cat((actions_next1, actions_next2), dim=1)
            Q_targets_next1 = self.critic_target1(next_states, actions_next)
            Q_targets_next2 = self.critic_target2(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets1 = rewards[:, 0].unsqueeze(dim=1) + \
            (gamma * Q_targets_next1 * (1 - dones[:, 0].unsqueeze(dim=1)))
        Q_targets2 = rewards[:, 1].unsqueeze(dim=1) + \
            (gamma * Q_targets_next2 * (1 - dones[:, 1].unsqueeze(dim=1)))

        # Compute critic loss
        Q_expected1 = self.critic_local1(states, actions)
        Q_expected2 = self.critic_local2(states, actions)
        critic_loss1 = F.mse_loss(Q_expected1, Q_targets1.detach())
        critic_loss2 = F.mse_loss(Q_expected2, Q_targets2.detach())

        # Minimize the loss
        self.critic_optimizer1.zero_grad()
        critic_loss1.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local1.parameters(), 1)
        self.critic_optimizer1.step()

        self.critic_optimizer2.zero_grad()
        critic_loss2.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local2.parameters(), 1)
        self.critic_optimizer2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred1 = self.actor_local1(states[:, 0:24])
        actions_pred2 = self.actor_local2(states[:, 24:])
        actions_pred = torch.cat((actions_pred1, actions_pred2), dim=1)

        actor_loss1 = -self.critic_local1(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer1.zero_grad()
        actor_loss1.backward(retain_graph=True)
        self.actor_optimizer1.step()

        actor_loss2 = -self.critic_local2(states, actions_pred).mean()
        self.actor_optimizer2.zero_grad()
        actor_loss2.backward(retain_graph=True)
        self.actor_optimizer2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local1, self.critic_target1, TAU)
        self.soft_update(self.actor_local1, self.actor_target1, TAU)
        self.soft_update(self.critic_local2, self.critic_target2, TAU)
        self.soft_update(self.actor_local2, self.actor_target2, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    '''Interact with and learn from environment.'''

    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Prioritized replay memory
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)  # shape (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)
        elif mode == 'train':  # in training mode, add OU noise to the action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # add new experience to memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                for _ in range(10):  # update 10 times per learning
                    idxes, experiences, is_weights = self.prioritized_memory.sample(device)
                    self.learn(experiences, GAMMA, is_weights=is_weights, leaf_idxes=idxes)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Policy loss = -(1/n) * Σ Q_local(s, μ(s))  (deterministic policy, so no log-prob term)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes used to update priorities in the SumTree
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TODO: rewards are clipped to be in [-1, 1]
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        td_errors = (Q_targets - Q_expected).tanh()  # TD-errors are squashed into [-1, 1]
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(leaf_idxes, abs_errors)  # update priorities in SumTree

        # weight the squared TD errors by the importance-sampling weights
        c_loss = (is_weights * (td_errors ** 2)).mean()
        self.running_c_loss += float(c_loss.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        c_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # clip gradient norm to 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        a_loss = self.critic_local(states, actions_pred)
        a_loss = -a_loss.mean()
        self.running_a_loss += float(a_loss.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient norm to 1
        self.actor_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
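# The prioritized-replay agent above relies on a PrioritizedMemory/SumTree class that is
# not shown here. This sketch only illustrates the importance-sampling weights its
# sample() is assumed to return: w_i = (N * P(i))^(-beta), normalised by the largest
# weight, following the standard prioritized experience replay formulation.
import numpy as np


def is_weights_sketch(sampled_priorities, total_priority, buffer_size, beta=0.4):
    """Compute normalised importance-sampling weights for a sampled mini-batch."""
    probs = np.asarray(sampled_priorities, dtype=np.float64) / total_priority  # P(i)
    weights = (buffer_size * probs) ** (-beta)                                 # bias correction
    return weights / weights.max()                                             # max weight = 1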
class Agent():
    '''Interact with and learn from environment.'''

    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps

        self.TAU = 1e-2
        self.gamma = 0.99
        self.BUFFER_SIZE = int(1e6)
        self.BATCH_SIZE = 1024
        self.LR_CRITIC = 1e-3
        self.LR_ACTOR = 1e-3
        self.WEIGHT_DECAY = 0.0
        self.EPSILON = 1.0
        self.EPSILON_DECAY = 0.99

        # Actor network (w/ target network)
        self.actor_local = Actor(self.state_size, self.action_size, seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(self.state_size, self.action_size, seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)

    def act(self, state, add_noise=True):
        """Given a state, choose an action.

        Params
        ======
            state (float ndarray): state of the environment
        """
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        # eval mode only affects certain modules (Dropout, BatchNorm, etc.)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().squeeze(0).data.numpy()
        self.actor_local.train()  # set network back to train mode
        if add_noise:
            action += self.noise.sample() * self.EPSILON
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Policy loss = -(1/n) * Σ Q_local(s, μ(s))  (deterministic policy, so no log-prob term)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        self.EPSILON *= self.EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    '''Interact with and learn from environment.'''

    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): train or test
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)  # shape (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)
        elif mode == 'train':  # in training mode, add OU noise to the action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += float(critic_loss.cpu().data.numpy())
        self.training_cnt += 1
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += float(actor_loss.cpu().data.numpy())
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, random_seed, buffer_size, batch_size,
                 gamma, TAU, lr_actor, lr_critic, weight_decay, a_hidden_sizes, c_hidden_sizes):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.ACTOR_HL_SIZE = a_hidden_sizes
        self.CRITIC_HL_SIZE = c_hidden_sizes
        self.num_agents = num_agents

        # Actor Network (w/ Target Network) for agent 1
        self.actor_local_1 = Actor(state_size, action_size, random_seed, self.ACTOR_HL_SIZE).to(device)
        self.actor_target_1 = Actor(state_size, action_size, random_seed, self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_1 = optim.Adam(self.actor_local_1.parameters(), lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network) for agent 1
        self.critic_local_1 = Critic(state_size, action_size, random_seed, self.CRITIC_HL_SIZE).to(device)
        self.critic_target_1 = Critic(state_size, action_size, random_seed, self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_local_1.parameters(), lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Actor Network (w/ Target Network) for agent 2
        self.actor_local_2 = Actor(state_size, action_size, random_seed, self.ACTOR_HL_SIZE).to(device)
        self.actor_target_2 = Actor(state_size, action_size, random_seed, self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_2 = optim.Adam(self.actor_local_2.parameters(), lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network) for agent 2
        self.critic_local_2 = Critic(state_size, action_size, random_seed, self.CRITIC_HL_SIZE).to(device)
        self.critic_target_2 = Critic(state_size, action_size, random_seed, self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(), lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(states.shape[0]):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, self.GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local_1.eval()
        self.actor_local_2.eval()
        # One row of actions per agent (the original built a plain Python list here,
        # which breaks when noise is added; a pre-allocated array is used instead)
        action_values = np.zeros((states.shape[0], self.action_size))
        with torch.no_grad():
            action_values[0] = self.actor_local_1(states[0]).cpu().data.numpy()
            action_values[1] = self.actor_local_2(states[1]).cpu().data.numpy()
        self.actor_local_1.train()
        self.actor_local_2.train()
        # print(action_values)
        if add_noise:
            action_values += self.noise.sample()
        # print(action_values)
        # print(np.clip(action_values, -1, 1))
        return np.clip(action_values, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next_1 = self.actor_target_1(next_states)
        actions_next_2 = self.actor_target_2(next_states)
        Q_targets_next_1 = self.critic_target_1(next_states, actions_next_1.detach())
        Q_targets_next_2 = self.critic_target_2(next_states, actions_next_2.detach())
        # Compute Q targets for current states (y_i)
        Q_targets_1 = rewards + (gamma * Q_targets_next_1 * (1 - dones))
        Q_targets_2 = rewards + (gamma * Q_targets_next_2 * (1 - dones))
        # Compute critic loss
        Q_expected_1 = self.critic_local_1(states, actions)
        Q_expected_2 = self.critic_local_2(states, actions)
        critic_loss_1 = F.mse_loss(Q_expected_1, Q_targets_1.detach())
        critic_loss_2 = F.mse_loss(Q_expected_2, Q_targets_2.detach())
        # Minimize the loss
        self.critic_optimizer_1.zero_grad()
        self.critic_optimizer_2.zero_grad()
        critic_loss_1.backward()
        critic_loss_2.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # gradient clipping to stabilize learning
        self.critic_optimizer_1.step()
        self.critic_optimizer_2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred_1 = self.actor_local_1(states)
        actions_pred_2 = self.actor_local_2(states)
        actor_loss_1 = -self.critic_local_1(states, actions_pred_1).mean()
        actor_loss_2 = -self.critic_local_2(states, actions_pred_2).mean()
        # Minimize the loss
        self.actor_optimizer_1.zero_grad()
        self.actor_optimizer_2.zero_grad()
        actor_loss_1.backward()
        actor_loss_2.backward()
        self.actor_optimizer_1.step()
        self.actor_optimizer_2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local_1, self.critic_target_1, self.TAU)
        self.soft_update(self.critic_local_2, self.critic_target_2, self.TAU)
        self.soft_update(self.actor_local_1, self.actor_target_1, self.TAU)
        self.soft_update(self.actor_local_2, self.actor_target_2, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    def __init__(self, state_size=OBS_DIM, action_size=ACT_DIM, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of the observation space
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state):
        """Save an experience in the replay buffer and use random samples from the buffer to learn."""
        self.memory.add(state, action, reward, next_state)
        if len(self.memory) > BATCH_SIZE:  # start learning once the buffer holds at least one full batch
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = state[None, :]  # add a batch dimension
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q_value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s') tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # ----------------- update critic network weights ----------------- #
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (no done mask: this buffer stores (s, a, r, s') only)
        q_targets = rewards + gamma * q_targets_next
        # Compute critic loss
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ----------------- update actor network weights ----------------- #
        # Compute the loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------- update target networks ----------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------- update noise ----------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ * θ_local + (1 - τ) * θ_target

        Params
        ======
            local_model: network whose weights are copied from
            target_model: network whose weights are copied to
            tau (float): interpolation parameter
        """
        for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def restore(self, save_path):
        actor_checkpoint = torch.load(save_path + '/checkpoint_actor.pth')
        self.actor_local.load_state_dict(actor_checkpoint)
        critic_checkpoint = torch.load(save_path + '/checkpoint_critic.pth')
        self.critic_local.load_state_dict(critic_checkpoint)
        print('Successfully loaded network weights!')
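# The Agent above assumes a ReplayBuffer that stores (s, a, r, s') tuples without a done
# flag; that buffer is not defined in this file. The sketch below is one minimal way it
# could look, inferred only from the calls Agent makes (add, sample, __len__). The class
# name, field names, and the module-level `device`, `random`, `np`, and `torch` are
# assumptions about the surrounding module, not the original implementation.
from collections import deque, namedtuple


class ReplayBufferSketch:
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)  # fixed-size FIFO storage
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state"])
        random.seed(seed)

    def add(self, state, action, reward, next_state):
        """Append a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state))

    def sample(self):
        """Draw a random mini-batch and return it as float tensors on the training device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        return (states, actions, rewards, next_states)

    def __len__(self):
        return len(self.memory)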
class DDPG_Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, brain_name, seed, params=default_params, device=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            brain_name (str): name of the environment brain this agent controls
            seed (int): random seed
            params (dict): hyperparameters; missing keys fall back to default_params
            device: torch device; auto-detected when None
        """
        params = self._fill_params(params)

        # Implementation and identity
        self.device = device if device is not None else torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.name = params['name']
        self.brain_name = brain_name

        # Set environment information
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed,
                                 fc1_units=params['layers_actor'][0],
                                 fc2_units=params['layers_actor'][1]).to(self.device)
        self.actor_target = Actor(state_size, action_size, seed,
                                  fc1_units=params['layers_actor'][0],
                                  fc2_units=params['layers_actor'][1]).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed,
                                   fcs1_units=params['layers_critic'][0],
                                   fc2_units=params['layers_critic'][1]).to(self.device)
        self.critic_target = Critic(state_size, action_size, seed,
                                    fcs1_units=params['layers_critic'][0],
                                    fc2_units=params['layers_critic'][1]).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, params['buffer_size'], params['batch_size'], seed, device=self.device)

        # Save params
        self.params = params

    def _fill_params(self, src_params):
        params = {
            'name': self._get_param_or_default('name', src_params, default_params),
            'buffer_size': self._get_param_or_default('buffer_size', src_params, default_params),
            'batch_size': self._get_param_or_default('batch_size', src_params, default_params),
            'layers_actor': self._get_param_or_default('layers_actor', src_params, default_params),
            'layers_critic': self._get_param_or_default('layers_critic', src_params, default_params),
            'lr_actor': self._get_param_or_default('lr_actor', src_params, default_params),
            'lr_critic': self._get_param_or_default('lr_critic', src_params, default_params),
            'gamma': self._get_param_or_default('gamma', src_params, default_params),
            'tau': self._get_param_or_default('tau', src_params, default_params),
            'weight_decay': self._get_param_or_default('weight_decay', src_params, default_params),
        }
        return params

    def display_params(self, force_print=False):
        if force_print:
            print(self.params)
        return self.params

    def _get_param_or_default(self, key, src_params, default_params):
        if key in src_params:
            return src_params[key]
        return default_params[key]

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory; learning is triggered separately via start_learn."""
        self.memory.add(state, action, reward, next_state, done)

    def start_learn(self):
        """Learn, if enough samples are available in memory.

        Decoupled from the step method to allow multiple environment steps per learning pass.
        """
        if len(self.memory) > self.params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['gamma'])

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target, self.params['tau'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
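# Sketch of how the decoupled step()/start_learn() API of DDPG_Agent can be driven from a
# Unity ML-Agents style environment, assuming the old BrainInfo interface (vector_observations,
# rewards, local_done). The function name and the parameters env, n_episodes, max_t and
# learn_every are hypothetical illustration names; only the agent calls (reset, act, step,
# start_learn) come from the class above.
def run_ddpg_agent_sketch(env, agent, n_episodes=200, max_t=1000, learn_every=20):
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[agent.brain_name]
        states = env_info.vector_observations
        agent.reset()
        scores = np.zeros(len(env_info.agents))
        for t in range(max_t):
            actions = agent.act(states)  # batched forward pass over all parallel agents
            env_info = env.step(actions)[agent.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            # Store every transition, but only run a learning pass every `learn_every` steps.
            for s, a, r, ns, d in zip(states, actions, rewards, next_states, dones):
                agent.step(s, a, r, ns, d)
            if t % learn_every == 0:
                agent.start_learn()
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        print('Episode {}: average score {:.2f}'.format(i_episode, scores.mean()))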
class DDPGAgent:
    def __init__(self, action_size=4, state_size=33, num_agents=20, max_steps=1000, seed=0, train_mode=True):
        self.train_mode = train_mode
        self.action_size = action_size
        self.state_size = state_size
        self.num_agents = num_agents
        self.max_steps = max_steps

        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

        # Exploration noise and shared replay memory
        self.noise = OUNoise(self.action_size, seed)
        self.memory = AgentMemory(batch_size=BATCH_SIZE, buffer_size=MEMORY_BUFFER, seed=seed)

        # Local and target actor/critic networks
        self.actor = Actor(self.state_size, self.action_size, seed)
        self.critic = Critic(self.state_size, self.action_size, seed)
        self.target_actor = Actor(self.state_size, self.action_size, seed)
        self.target_critic = Critic(self.state_size, self.action_size, seed)

        self.actor_opt = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Start the target networks from the same weights as the local networks
        hard_update(self.actor, self.target_actor)
        hard_update(self.critic, self.target_critic)

    def reset(self):
        self.noise.reset()
        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

    def step(self):
        """Record the current transition for all parallel agents and learn from replayed samples."""
        self.scores += np.array(self.rewards)
        self.step_count += 1
        self.memory.add(self.states, self.actions, self.rewards, self.next_states, self.dones)

        if self.memory.has_enough_memory():
            for i in range(UPDATE_FREQUENCY_PER_STEP):
                states, actions, rewards, next_states, dones = self.memory.sample()
                self.learn(states, actions, rewards, next_states, dones)
            self.soft_update()

    def act(self, add_noise=True):
        """Return clipped actions for the currently stored states as per the local policy."""
        states = array_to_tensor(self.states)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)
            actions = actions.cpu().data.numpy()
        self.actor.train()

        if add_noise:
            noise = self.noise.sample()
            actions += noise
        actions = np.clip(actions, -1, 1)
        return actions

    def learn(self, states, actions, rewards, next_states, dones):
        # Update critic
        self.critic_opt.zero_grad()
        critic_loss = ddpg_compute_critic_loss(states, actions, rewards, next_states, dones,
                                               self.target_actor, self.target_critic, self.critic)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_opt.step()

        # Update actor
        self.actor_opt.zero_grad()
        actor_loss = ddpg_compute_actor_loss(states, self.actor, self.critic)
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_opt.step()

        # Update target nets
        self.soft_update()

    def soft_update(self):
        soft_update(self.actor, self.target_actor, TAU)
        soft_update(self.critic, self.target_critic, TAU)
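# DDPGAgent above relies on several module-level helpers that are not defined in this file:
# hard_update, soft_update, ddpg_compute_critic_loss and ddpg_compute_actor_loss. The sketches
# below show one plausible implementation of each, following the same DDPG update equations
# used by the other agents in this file (Q_target = r + γ·Q'(s', π'(s'))·(1 - done),
# actor loss = -Q(s, π(s))). Treating GAMMA as a module-level hyperparameter is an assumption.
def hard_update(source_model, target_model):
    """Copy weights from source_model into target_model (used to initialise the targets)."""
    for source_param, target_param in zip(source_model.parameters(), target_model.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(source_model, target_model, tau):
    """θ_target = τ·θ_source + (1 - τ)·θ_target."""
    for source_param, target_param in zip(source_model.parameters(), target_model.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)


def ddpg_compute_critic_loss(states, actions, rewards, next_states, dones,
                             target_actor, target_critic, critic, gamma=GAMMA):
    """MSE between the critic's Q estimate and the bootstrapped TD target."""
    with torch.no_grad():
        next_actions = target_actor(next_states)
        q_targets_next = target_critic(next_states, next_actions)
        q_targets = rewards + gamma * q_targets_next * (1 - dones)
    q_expected = critic(states, actions)
    return F.mse_loss(q_expected, q_targets)


def ddpg_compute_actor_loss(states, actor, critic):
    """Deterministic policy gradient loss: maximise Q(s, π(s)) by minimising its negative."""
    actions_pred = actor(states)
    return -critic(states, actions_pred).mean()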