class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss (standard DQN update)
        # Max predicted Q values for the next states, from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Q targets for the current states: r + gamma * max_a' Q_target(s', a') for non-terminal s'
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Expected Q values from the local model for the actions actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
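# A minimal usage sketch for the Agent above: a hedged example of the training loop that
# is assumed to drive agent.act()/agent.step(). It presumes a classic Gym-style environment
# API (reset() returning a state array, step() returning a 4-tuple) and the module-level
# constants defined in this file; the episode count and epsilon schedule are illustrative
# assumptions, not values taken from the original code.
def _dqn_training_sketch(env, agent, n_episodes=1000, max_t=1000,
                         eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)                       # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)       # classic Gym step API (assumed)
            agent.step(state, action, reward, next_state, done)  # store transition, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                      # decay exploration between episodes
    return scores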
class soft_actor_critic_agent(object):
    def __init__(self, num_inputs, action_space,
                 device, hidden_size, seed, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.device = device
        self.seed = seed

        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        # torch.cuda.manual_seed_all(seed)
        # torch.backends.cudnn.deterministic = True

        self.critic = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0], hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)

        self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0],
                                     hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if eval is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            Q1_next_target, Q2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_q_next_target = torch.min(Q1_next_target, Q2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_q_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        Q1, Q2 = self.critic(state_batch, action_batch)
        Q1_loss = F.mse_loss(Q1, next_q_value)
        Q2_loss = F.mse_loss(Q2, next_q_value)

        action_batch_pi, log_pi, _ = self.policy.sample(state_batch)
        Q1_pi, Q2_pi = self.critic(state_batch, action_batch_pi)
        min_q_pi = torch.min(Q1_pi, Q2_pi)
        policy_loss = ((self.alpha * log_pi) - min_q_pi).mean()

        self.critic_optim.zero_grad()
        Q1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        Q2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # Temperature (alpha) update
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()
        alpha_tlogs = self.alpha.clone()  # For TensorboardX logs

        soft_update(self.critic_target, self.critic, self.tau)
class AbstractAgent(metaclass=ABCMeta):
    """Abstract Base Agent"""

    def __init__(self, state_size, action_size, memory, seed, configs):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory: replay buffer shared by the agent
            seed (int): random seed
            configs (dict): hyperparameters (LR, UPDATE_EVERY, BATCH_SIZE, GAMMA, TAU)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.lr = configs['LR']
        self.update_every = configs['UPDATE_EVERY']
        self.batch_size = configs['BATCH_SIZE']
        self.gamma = configs['GAMMA']
        self.tau = configs['TAU']

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        # ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.memory = memory
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        # Loss
        self.criterion = nn.MSELoss()

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self._learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    @abstractmethod
    def _learn(self, experiences, gamma):
        raise NotImplementedError
class SAC(object):
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0],
                                              args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.critic_optim.zero_grad()
        qf1_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        qf2_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
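# The SAC agents in this file call module-level hard_update() and soft_update() helpers
# that are not shown in this excerpt. A minimal sketch of what they are assumed to do,
# based on the call sites (target network first, source second) and the soft-update rule
# θ_target = τ*θ_local + (1 - τ)*θ_target used by the DQN agents here:
def hard_update(target, source):
    """Copy the source network weights directly into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak-average the source weights into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)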
class SAC(object):
    def __init__(self, input_space, action_space, args):
        self.use_expert = args.use_expert
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [action_space.low, action_space.high]

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        # self.device = torch.device("cuda" if args.cuda else "cpu")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
        else:
            raise ValueError("Only the Gaussian policy type is supported.")

        # SAC_V variant (kept for reference)
        # self.value = ValueNetwork(input_space).to(device=self.device)
        # self.value_target = ValueNetwork(input_space).to(self.device)
        # self.value_optim = Adam(self.value.parameters(), lr=args.lr)
        # hard_update(self.value_target, self.value)
        # self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        # self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, eval=False):
        # The state is a dictionary of observation components; batch each one and move it to the device.
        state = {key: torch.FloatTensor(state[key]).to(self.device).unsqueeze(0)
                 for key in ('map', 'lidar', 'goal', 'plan_len', 'robot_info')}
        if eval is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()[0]
        # return self.rescale_action(action)
        return action

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
               (self.action_range[1] + self.action_range[0]) / 2.0

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        if not self.use_expert:
            state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
        else:
            state_batch, action_batch, reward_batch, next_state_batch, mask_batch, s_e_batch, a_e_batch = \
                memory.sample(batch_size=batch_size, use_expert=True)

        # States arrive as lists of dictionaries like [{"map": ..., "lidar": ..., "goal": ...}, ...];
        # convert each list of dicts into a dict of batched device tensors.
        keys = ('map', 'lidar', 'goal', 'plan_len', 'robot_info')
        _state_batch = {k: torch.FloatTensor([s[k] for s in state_batch]).to(self.device) for k in keys}
        _next_state_batch = {k: torch.FloatTensor([s[k] for s in next_state_batch]).to(self.device) for k in keys}
        if self.use_expert:
            _state_expert_batch = {k: torch.FloatTensor([s[k] for s in s_e_batch]).to(self.device) for k in keys}
            _action_expert_batch = torch.FloatTensor(a_e_batch).to(self.device)

        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        # SAC_V variant (kept for reference)
        # with torch.no_grad():
        #     vf_next_target = self.value_target(_new_next_state_batch)
        #     next_q_value = reward_batch + mask_batch * self.gamma * (vf_next_target)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(_next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(_next_state_batch, next_state_action)
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(_state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Update Policy
        if not self.use_expert:
            pi, log_pi, _ = self.policy.sample(_state_batch)
            qf1_pi, qf2_pi = self.critic(_state_batch, pi)
        else:
            pi, log_pi, _ = self.policy.sample(_state_expert_batch)
            qf1_pi, qf2_pi = self.critic(_state_expert_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()

        # SAC_V variant (kept for reference)
        # # Regularization Loss
        # reg_loss = 0.001 * (mean.pow(2).mean() + log_std.pow(2).mean())
        # policy_loss += reg_loss
        # self.policy_optim.zero_grad()
        # policy_loss.backward()
        # self.policy_optim.step()
        # # Update Value
        # if not self.use_expert:
        #     vf = self.value(_new_state_batch)
        # else:
        #     vf = self.value(_new_s_e_batch)
        # with torch.no_grad():
        #     vf_target = min_qf_pi - (self.alpha * log_pi)
        # vf_loss = F.mse_loss(vf, vf_target)  # JV = 𝔼(st)~D[0.5(V(st) - (𝔼at~π[Q(st,at) - α * logπ(at|st)]))^2]
        # self.value_optim.zero_grad()
        # vf_loss.backward()
        # self.value_optim.step()
        # if updates % self.target_update_interval == 0:
        #     soft_update(self.value_target, self.value, self.tau)
        # return vf_loss.item(), qf1_loss.item(), qf2_loss.item(), policy_loss.item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to\n {}\n, {}\n'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from\n {}\n, {}\n'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, compute_weights=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.compute_weights = compute_weights

        # Algorithms to enable during training
        self.PrioritizedReplayBuffer = True  # Use False to enable uniform sampling
        self.HardTargetUpdate = True         # Use False to enable soft target update

        # Build the policy and target Q-networks for the agent; the target Q-network
        # is kept frozen between updates to avoid training-instability issues.
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)   # main policy network
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)  # target network
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.criterion = nn.MSELoss()

        # Replay memory: the experience replay buffer used to avoid training-instability issues
        # Below: prioritized experience replay (PER)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, seed, compute_weights)
        # Below: uniform sampling by the method defined in this script
        # self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                sampling = self.memory.sample()
                self.learn(sampling, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, state, eps=0.):
        """Select an action with an epsilon-greedy policy: with probability epsilon the agent
        selects a random action, and with probability 1-epsilon it selects the action with
        the highest Q-value as predicted by the neural network.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Calculate the action values (Q-values)
        self.qnetwork_local.eval()   # eval() deactivates norm, dropout, etc. layers, as expected at inference
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # train() sets the network modules back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, sampling, gamma):
        """Update value parameters using given batch of experience tuples; this trains
        the neural network by updating the weights of the local network.

        Params
        ======
            sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, weights, indices = sampling

        # Max Q values for the next states from the target Q-network
        q_target = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # TD targets for the current states
        expected_values = rewards + gamma * q_target * (1 - dones)
        # Predictions from the local Q-network for the actions actually taken
        output = self.qnetwork_local(states).gather(1, actions)

        # Compute the loss (Loss Function: Mean Square Error)
        loss = F.mse_loss(output, expected_values)
        if self.compute_weights:
            with torch.no_grad():
                weight = sum(np.multiply(weights, loss.data.cpu().numpy()))
            loss *= weight

        # Minimize the loss with the optimizer
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # ------------------- update priorities ------------------- #
        delta = abs(expected_values - output.detach()).cpu().numpy()
        # print("delta", delta)
        self.memory.update_priorities(delta, indices)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self):
        """Directly copy the local network weights into the target network."""
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def load_models(self, policy_net_filename, target_net_filename):
        """Load the parameters of the policy and target models."""
        print('Loading model...')
        self.qnetwork_local.load_model(policy_net_filename)
        self.qnetwork_target.load_model(target_net_filename)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Use the helper calc_loss function, not this directly
        self.criterion = nn.MSELoss()

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # First unsqueeze to make it a batch with one sample in it
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        # Set the mode back to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            # The model output holds Q values for actions 0..(n-1), and output[ind_x] corresponds to action_x
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    # Helper for learn():
    # y_j does not depend on the weight parameters that gradient descent will be training
    def calc_y_j(self, r_j, dones, gamma, target_out):
        # 1-or-0 flag: if the episode doesn't terminate at j+1 (done == False), the y_j product
        # already includes the gamma multiplication factor.
        # Use [[x] for x in y]-style list comprehensions because these need to be batch_size-by-1, like r_j.
        # Use .to(device) to move to the GPU (not just setting the device arg when creating a tensor).
        dones_flags = torch.Tensor([[0] if done == True else [gamma] for done in dones]).float().to(device)
        max_q_target_out = torch.Tensor([[torch.max(q_for_all_actions)] for q_for_all_actions in target_out]).float().to(device)
        # RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.
        # dones_flags = torch.from_numpy(np.vstack([0 if done == True else gamma for done in dones])).float().to(device)
        # max_q_target_out = torch.from_numpy(np.vstack([torch.max(q_for_all_actions) for q_for_all_actions in target_out])).float().to(device)
        y_j = r_j + dones_flags * max_q_target_out
        return y_j

    # Helper for learn()
    def calc_loss(self, y_j, pred_out, actions):
        # pred_out_actions_taken must be a tensor built by concatenating other tensors to maintain gradients.
        # actions is batch_size-by-1, so only iterate through rows.
        for i in range(actions.shape[0]):
            # The action taken is an index for which column to look at in pred_out
            # (pred_out is batch_size by n_actions).
            action_ind = actions[i, 0].item()
            if i == 0:
                # Take the entry from 0-dimensional to 2-dimensional
                pred_out_actions_taken = pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)
            else:
                # Concatenate along dim 0 -> vertically stack rows
                pred_out_actions_taken = torch.cat((pred_out_actions_taken, pred_out[i, action_ind].unsqueeze(0).unsqueeze(0)), dim=0)
        # Loss is MSE between pred_out_actions_taken (input) and y_j (target)
        return self.criterion(pred_out_actions_taken, y_j)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Make sure to zero the gradients
        self.optimizer.zero_grad()

        # qnetwork_local model output from a forward pass
        pred_out = self.qnetwork_local(states)
        target_out = self.qnetwork_target(next_states)

        # Compute the loss for qnetwork_local vs qnetwork_target
        y_j = self.calc_y_j(rewards, dones, gamma, target_out)

        # Calculate the gradient and take a step down the gradient
        loss = self.calc_loss(y_j, pred_out, actions)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    # Save sample (error, <s, a, r, s'>) to the replay memory
    def _append_sample(self, state, action, reward, next_state, done):
        state_on_device = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_on_device)
            expected_Q = action_values[(0, action)]
        self.qnetwork_local.train()

        next_state_on_device = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        self.qnetwork_target.eval()
        with torch.no_grad():
            predicted_Q = self.qnetwork_target(next_state_on_device)
            max_predicted_Q = predicted_Q.max(1)[0]
        self.qnetwork_target.train()

        target_Q = reward + (GAMMA * max_predicted_Q * (1 - done))
        error = expected_Q - target_Q
        error = abs(error.cpu().data.numpy()[0])

        self.memory.add(state, action, reward, next_state, done)

        # Simplified Prioritized Experience Replay:
        # add additional copies of the sample if the error is large
        if error > 1.:
            error = pow(error, 0.6)
            count = int(round(error, 0))
            for _ in range(count):
                self.memory.add(state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        if USE_SPER:
            self._append_sample(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get the expected Q values using the local model
        expected_Q = self.qnetwork_local(states).gather(1, actions)

        if USE_DDQN:
            # ----- DDQN
            local_next_state_max_action = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
            max_predicted_Q = self.qnetwork_target(next_states).detach().gather(1, local_next_state_max_action)
        else:
            # ---- Vanilla DQN
            max_predicted_Q = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Get the target Q using the current state
        target_Q = rewards + (gamma * max_predicted_Q * (1 - dones))

        # Get the loss
        loss = F.mse_loss(expected_Q, target_Q)

        # Minimise the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64,
                 gamma=0.99, tau=1e-3, lr=5e-4, double_dqn=True, huber_loss=False, update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn

        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every

        if huber_loss:
            self.loss_function = F.smooth_l1_loss
        else:
            self.loss_function = F.mse_loss

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        if self.double_dqn:
            # Get predicted Q values from the target model using the argmax predicted by the local model
            argmax_actions = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, argmax_actions)
        else:
            # Get max predicted Q values (for next states) from the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = self.loss_function(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
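# The double-DQN branch in learn() above decouples action selection (local network) from
# action evaluation (target network). A standalone sketch of that target computation,
# assuming batched tensors shaped like the replay-buffer samples used in this file:
def double_dqn_targets(qnetwork_local, qnetwork_target, rewards, next_states, dones, gamma):
    """Return r + gamma * Q_target(s', argmax_a Q_local(s', a)) for non-terminal s'."""
    with torch.no_grad():
        best_actions = qnetwork_local(next_states).argmax(1, keepdim=True)  # select with the local net
        q_next = qnetwork_target(next_states).gather(1, best_actions)       # evaluate with the target net
    return rewards + gamma * q_next * (1 - dones)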
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())  # exploitation
        else:
            return random.choice(np.arange(self.action_size))   # exploration

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Earlier per-sample version, kept for reference:
        # for i in range(BATCH_SIZE):
        #     if not dones[i]:
        #         max_val = self.qnetwork_target(next_states[i])
        #         best_val = max_val.argmax()
        #         target = rewards[i] + gamma*(max_val[best_val])
        #     else:
        #         target = rewards[i]
        #     current = self.qnetwork_local(states[i])[actions[i]]
        #     # current = self.qnetwork_local(states).gather(-1, actions.reshape(actions.size()[0], 1))
        #     self.loss = F.mse_loss(target, current)
        #     # self.loss.requires_grad = True
        #     self.optimizer.zero_grad()
        #     self.loss.backward()
        #     self.optimizer.step()

        # Vectorised update over the whole batch
        max_val = self.qnetwork_target(next_states)
        best_val = max_val.argmax(dim=-1)
        max_val = max_val.gather(-1, best_val.reshape(max_val.size()[0], 1))

        target = rewards + gamma * max_val * (1 - dones)
        current = self.qnetwork_local(states).gather(-1, actions.reshape(actions.size()[0], 1))

        self.loss = F.mse_loss(current, target)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from Replay Buffer Class (defined below)
    """

    def __init__(self, state_size, action_size, dqn_type='DQN', replay_memory_size=1e5,
                 batch_size=64, gamma=0.99, learning_rate=1e-3, target_tau=2e-3,
                 update_rate=4, seed=0):
        """
        DQN Agent Parameters
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): either 'DQN' for vanilla DQN learning (default) or 'DDQN' for double DQN
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): parameter for setting the discounted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learning (typically 1e-4 to 1e-3)
            seed (int): random seed for initializing the training point
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)

        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the main network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    ########################################################
    # ACT() method
    #
    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    ########################################################
    # LEARN() method
    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get Q values for the taken actions (s, a) from the main network
        Qsa = self.network(states).gather(1, actions)

        if (self.dqn_type == 'DDQN'):
            # Double DQN
            # ************************
            # Select the best next actions with the main network, evaluate them with the target network
            Qsa_prime_actions = self.network(next_states).detach().max(1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(next_states).detach().gather(1, Qsa_prime_actions)
        else:
            # Regular (Vanilla) DQN
            # ************************
            # Get max Q values for (s',a') from the target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """
    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """ Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """ Initialize an Agent object.

        INPUTS:
        ------------
            state_size - (int) dimension of each state
            action_size - (int) dimension of each action
            seed - (int) random seed

        OUTPUTS:
        ------------
            no direct
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        INPUTS:
        ------------
            state - (array_like) the previous state of the environment (8,)
            action - (int) the agent's previous choice of action
            reward - (float) last reward received
            next_state - (torch tensor) the current state of the environment
            done - (bool) whether the episode is complete (True or False)

        OUTPUTS:
        ------------
            no direct
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """ Returns actions for given state as per current policy.

        INPUTS:
        ------------
            state - (numpy array_like) current state
            eps - (float) epsilon, for epsilon-greedy action selection

        OUTPUTS:
        ------------
            act_select - (int) next epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            act_select = np.argmax(action_values.cpu().data.numpy())
            return act_select
        else:
            act_select = random.choice(np.arange(self.action_size))
            return act_select

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.

        INPUTS:
        ------------
            experiences - (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples
            gamma - (float) discount factor

        OUTPUTS:
        ------------
            no direct
        """
        states, actions, rewards, next_states, dones = experiences

        ## Compute and minimize the loss
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        INPUTS:
        ------------
            local_model - (PyTorch model) weights will be copied from
            target_model - (PyTorch model) weights will be copied to
            tau - (float) interpolation parameter

        OUTPUTS:
        ------------
            no direct
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, random_seed)
        self.qnetwork_target = QNetwork(state_size, action_size, random_seed)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Move to GPU if CUDA is available
        if train_on_gpu:
            self.qnetwork_local.cuda()
            self.qnetwork_target.cuda()

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        if train_on_gpu:
            state = state.cuda()
        self.qnetwork_local.eval()
        # torch.no_grad() replaces the deprecated Variable(..., volatile=True) inference mode
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action probabilities
        max_action = np.argmax(action_values.cpu().data.numpy())
        policy_s = np.ones(self.action_size) * eps / self.action_size
        policy_s[max_action] = 1 - eps + (eps / self.action_size)
        action = np.random.choice(np.arange(self.action_size), p=policy_s)
        return action

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----------------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Double_DQNAgent(object): """Interacts with and learns from the environment.""" def __init__(self, args, state_size, action_size): """Initialize an Agent object. Args: param1 (args) : command line arguments param2 (int) : state size environment param3 (int) : dimension of each action """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(args.seed) self.batch_size = args.batch_size self.tau = args.tau self.update_every = args.update_every self.memory_size = args.buffer_size self.gamma = args.discount self.lr = args.lr self.device = args.device # Q-Network if torch.cuda.is_available(): self.qnetwork_local = QNetwork(state_size, action_size, args.hidden_size_1, args.hidden_size_2, args.seed) self.qnetwork_target = QNetwork(state_size, action_size, args.hidden_size_1, args.hidden_size_2, args.seed) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def act_e_greedy(self, state, epsilon=0.001): """ acts with epsilon greedy policy epsilon exploration vs exploitation traide off Args: param1(int): state param2(float): epsilon Return : action int number between 0 and 4 """ return random.choice(np.arange( self.action_size)) if np.random.random() < epsilon else self.act( state) def act(self, state): """ acts greedy(max) based on a single state Args: param1 (int) : state """ self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() return np.argmax(action_values.cpu().data.numpy()) def learn(self, mem): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ idxs, states, actions, rewards, next_states, nonterminals, weights = mem.sample( self.batch_size) states = states.squeeze(1) q_values = self.qnetwork_local(states) next_q_values = self.qnetwork_target(next_states) next_q_values = next_q_values.squeeze(1) q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1) next_q_value = next_q_values.max(1)[0] nonterminals = nonterminals.squeeze(1) expected_q_value = rewards + (self.gamma * next_q_value * nonterminals) loss = F.mse_loss(q_value, expected_q_value) self.optimizer.zero_grad() loss.backward() self.optimizer.step() loss = (q_value - expected_q_value.detach()).pow(2) * weights prios = loss + 1e-5 mem.update_priorities(idxs, prios.detach().cpu().numpy() ) # Update priorities of sampled transitions # ------------------- update target network ------------------- # self.soft_update() def soft_update(self, tau=1e-3): """ swaps the network weights from the online to the target Args: param1 (float): tau """ for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def update_target_net(self): """ copy the model weights from the online to the target network """ self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict()) def save(self, path): """ save the model weights to a file Args: param1 (string): pathname """ torch.save(self.qnetwork_local.state_dict(), os.path.join(path, 'model.pth')) def train(self): """ activates the backprob. 
layers for the online network """ self.qnetwork_local.train() def eval(self): """ put the online network into eval mode; layers such as dropout then behave as in inference rather than as in training """ self.qnetwork_local.eval()
class Agent(): def __init__(self, state_size, action_size, behavior_name, index_player, replay_memory_size=1e4, batch_size=100, gamma=0.99, learning_rate=1e-4, target_tau=1e-3, update_rate=10, seed=0): self.state_size = state_size self.current_state = [] self.action_size = action_size self.buffer_size = int(replay_memory_size) self.batch_size = batch_size self.gamma = gamma self.learn_rate = learning_rate self.tau = target_tau self.update_rate = update_rate self.seed = random.seed(seed) self.behavior_name = behavior_name self.index_player = index_player self.close_ball_reward = 0 self.touch_ball_reward = 0 """ Now we define two models: (a) one netwoek will be updated every (step % update_rate == 0), (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate. """ self.network = QNetwork(state_size, action_size, seed).to(device) self.target_network = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) # Initialize time step ( for updating every UPDATE_EVERY steps) self.t_step = 0 def load_model(self, path_model, path_target=None): params = torch.load(path_model) #self.network.set_params(params) self.network.load_state_dict(torch.load(path_model)) if path_target != None: self.target_network.load_state_dict(torch.load(path_target)) def choose_action(self, state, eps=0.0): eps = -1 state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.network.eval() with torch.no_grad(): action_values = self.network(state) self.network.train() # epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy() ) # return a number from 0 to action_size else: return random.choice(np.arange( self.action_size)) # return a number from 0 to action_size def Read(self): decision_steps, terminal_steps = env.get_steps(self.behavior_name) try: signal_front = np.array( sensor_front_sig( decision_steps.obs[0][self.index_player, :])) # 3 x 11 x 8 signal_back = np.array( sensor_back_sig( decision_steps.obs[1][self.index_player, :])) # 3 x 3 x 8 #pre_state = [] signal_front = np.array(signal_front) #print(signal_front.shape) #print(signal_back.shape) r = np.concatenate((signal_front, signal_back), axis=1) #print(r.shape) #input('ff') #pre_state.extend(list(np.array(signal_front).flatten())) #pre_state.extend(list(np.array(signal_back).flatten())) #state = np.array(pre_state) self.current_state = r count_close_to_ball = 0 count_touch_ball = 0 count_back_touch = 0 count_back_close = 0 self.rew_d_to_our_post = 0 self.rew_for_ball_dist = -0.1 # Front Observation for i in range(len(signal_front[0])): if signal_front[0][i][0] == 1.0: count_close_to_ball += 1 self.rew_for_ball_dist = max( 0.3 * (1 - signal_front[0][i][7]), self.rew_for_ball_dist) # Kicked the ball at the front if signal_front[0][i][7] <= 0.03: count_touch_ball += 1 if signal_front[0][i][1] == 1.0: self.rew_d_to_our_post = -0.1 if signal_front[0][i][2] == 1.0: self.rew_d_to_our_post = 0.1 # Back observation for i in range(len(signal_back[0])): if signal_back[0][i][0] == 1.0: count_back_close += 1 # Touches the ball at the back if signal_back[0][i][7] <= 0.03: count_back_touch += 0 self.back_touch = 1 if count_back_touch > 0 else 0 self.back_close = 1 if count_back_close > 0 else 0 # add reward if kick the ball self.touch_ball_reward = 3 if count_touch_ball > 0 else 0 # Penalize for back 
touching the ball if count_back_touch > 0: self.touch_ball_reward = -1.5 # Small reward if the ball is in view, small penalty otherwise self.close_ball_reward = 0.1 if count_close_to_ball > 0 else -0.1 # Penalize if the ball is behind the agent if count_back_close > 0: self.close_ball_reward = -0.1 return self.current_state except Exception: self.touch_ball_reward = 0 self.close_ball_reward = 0 return self.current_state
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Initialize learning step for updating beta self.learn_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get prioritized subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA, BETA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma, beta): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor beta (float): initial value for beta, which controls how much importance weights affect learning """ states, actions, rewards, next_states, dones, probabilities, indices = experiences if double_dqn: # Get the Q values for each next_state, action pair from the # local/online/behavior Q network: Q_targets_next_local = self.qnetwork_local( next_states).detach() # Get the corresponding best action for those next_states: _, a_prime = Q_targets_next_local.max(1) # Get the Q values from the target Q network but following a_prime, # which belongs to the local network, not the target network: Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, a_prime.unsqueeze(1)) else: # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute and update new priorities new_priorities = (abs(Q_expected - Q_targets) + EPSILON_PER).detach() self.memory.update_priority(new_priorities, indices) # Update beta parameter (b). 
By default beta will reach 1 after # 25,000 training steps (~325 episodes in the Banana environment): b = min(1.0, beta + self.learn_step * (1.0 - beta) / BETA_ITERS) self.learn_step += 1 # Compute and apply importance sampling weights to TD Errors ISweights = (((1 / len(self.memory)) * (1 / probabilities))**b) max_ISweight = torch.max(ISweights) ISweights /= max_ISweight Q_targets *= ISweights Q_expected *= ISweights # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
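A standalone sketch of the importance-sampling weights computed in learn() above, w_i = ((1/N) * (1/P(i)))**beta normalised by the largest weight; the buffer size, sampled probabilities and beta below are illustrative values only.

import torch

N = 10_000                                            # replay buffer size (illustrative)
probabilities = torch.tensor([0.01, 0.001, 0.0001])   # P(i) of the sampled transitions (illustrative)
beta = 0.6
weights = ((1.0 / N) * (1.0 / probabilities)) ** beta
weights = weights / weights.max()                     # the rarest transition keeps weight 1.0
print(weights)                                        # approx. tensor([0.0631, 0.2512, 1.0000])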
def save_model(model: QNetwork, output_file: Path) -> None: """Save the weights of the trained agent""" torch.save(model.state_dict(), output_file)
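A hypothetical companion to save_model that restores the saved weights into a fresh network; the QNetwork constructor arguments mirror the simple (state_size, action_size, seed) variant used elsewhere in this file and are an assumption here.

def load_model(state_size: int, action_size: int, seed: int, weights_file: Path) -> QNetwork:
    """Rebuild a QNetwork and load weights written by save_model (sketch; constructor signature assumed)."""
    model = QNetwork(state_size, action_size, seed)
    model.load_state_dict(torch.load(weights_file, map_location="cpu"))
    model.eval()  # inference mode
    return model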
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, 64, 64).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, 64, 64).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.loss_fn = torch.nn.MSELoss() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # torch.nn.utils.clip_grad_value_(self.qnetwork_local.parameters(), clip_value = 1) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # # Get max predicted Q values (for next states) from target model # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # # Compute Q targets for current states # Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # # Get expected Q values from local model # Q_expected = self.qnetwork_local(states).gather(1, actions) # # Compute loss # loss = F.mse_loss(Q_expected, Q_targets) # # Minimize the loss # self.optimizer.zero_grad() # loss.backward() # self.optimizer.step() optimizer = self.optimizer loss_fn = self.loss_fn ## this is required as we're not learning qnetwork_targets weights # with torch.no_grad(): # Q_target = rewards + gamma * (torch.max(self.qnetwork_target(next_states), dim=1)[0].view(64,1))*(1 - dones) # Q_target[dones == True] = rewards[dones == True] # Q_pred = torch.max(self.qnetwork_local(states), dim=1)[0].view(64,1) ## Double DQNs #argmax on Target W best_actions_by_local_nn = torch.max( self.qnetwork_local(next_states).detach(), dim=1)[1].unsqueeze(1) action_values_by_target_nn = self.qnetwork_target( next_states).detach().gather(1, best_actions_by_local_nn) Q_target = rewards + gamma * action_values_by_target_nn * (1 - dones) Q_pred = self.qnetwork_local(states).gather(1, actions) optimizer.zero_grad() loss = loss_fn(Q_pred, Q_target) loss.backward() optimizer.step() # print("Loss=", loss.item()) # print("Loss=", loss, # "Local params L2=", torch.norm(self.qnetwork_local.parameters(), 2), # "Local params grad L2=", torch.norm(self.qnetwork_local.parameters().grad, 2)) # with torch.no_grad(): # for param in self.qnetwork_local.parameters(): # param -= learning_rate * param.grad # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDQNAgentPrioExpReplay: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=PARAM.LR) # Replay memory self.memory = PrioritizedReplayBuffer(action_size, 20000, PARAM.BATCH_SIZE, 0, PARAM.PROBABILITY_EXPONENT) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.eps = 1 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % PARAM.UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > PARAM.BATCH_SIZE: experiences, experience_indices, importance_weights = self.memory.sample() self.learn(experiences, experience_indices, importance_weights, PARAM.GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ self.eps = eps state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def get_ddqn_targets(self, next_states, rewards, gamma, dones): # get best action according to online value function approximation q_online = self.qnetwork_local(next_states).detach() q_online = q_online.argmax(1) # get value of the target network at the position of each row's best online action; # gather picks the per-row value (index_select would mix values across the batch) q_target = self.qnetwork_target(next_states).detach() q_target = q_target.gather(1, q_online.unsqueeze(1)) # calculate the bootstrapped q-value given the current reward Q_targets = rewards + (gamma * q_target * (1 - dones)) return Q_targets def learn(self, experiences, experience_indices, importance_weights, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences Q_targets = self.get_ddqn_targets(next_states, rewards, gamma, dones) # Get expected Q values q_exp = self.qnetwork_local(states) # print(q_exp) # gets the q values along dimension 1 according to the actions, which is used as index # >>> t = torch.tensor([[1,2],[3,4]]) # >>> torch.gather(t, 1, torch.tensor([[0],[1]])) # tensor([[ 1], # [ 4]]) q_exp = q_exp.gather(1, actions) # print(q_exp) error = torch.abs(q_exp - Q_targets) with torch.no_grad(): # update priority # we need ".cpu()" here because the values need to be copied to memory before converting them to numpy, # else they are just present in the GPU errors = np.squeeze(error.cpu().data.numpy()) self.memory.set_priorities(experience_indices, errors) # compute loss squared_error = torch.mul(error, error) with torch.no_grad(): w = torch.from_numpy( importance_weights**(1 - self.eps)).float().to(device) w = w.detach() squared_error = torch.squeeze(squared_error) weighted_squared_error = torch.mul(squared_error, w) loss = torch.mean(weighted_squared_error) #loss = F.mse_loss(q_exp, Q_targets) # reset optimizer gradient self.optimizer.zero_grad() # do backpropagation loss.backward() # do optimize step self.optimizer.step() # ------------------- update target network ------------------- # # according to the algorithm in # https://proceedings.neurips.cc/paper/2010/file/091d584fced301b442654dd8c23b3fc9-Paper.pdf # one should update randomly in ether direction #update_direction = np.random.binomial(1, 0.5) #if update_direction == 0: # self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU) #else: # self.soft_update(self.qnetwork_target, self.qnetwork_local, PARAM.TAU) self.soft_update(self.qnetwork_local, self.qnetwork_target, PARAM.TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, double_dqn=False, dueling_dqn=False, prioritized_replay=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): for soft update of target parameters lr (float): learning rate update_every (int): how often to update the network double_dqn (bool) use double Q-network when 'True' dueling_dqn (bool): use dueling Q-network when 'True' prioritized_replay (bool): use prioritized replay buffer when 'True' """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.double_dqn = double_dqn self.dueling_dqn = dueling_dqn self.prioritized_replay = prioritized_replay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, dueling_dqn=dueling_dqn).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, prioritized_replay=prioritized_replay) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for a given state as per the current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using a given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ if self.prioritized_replay: states, actions, rewards, next_states, dones, indices, weights = experiences else: states, actions, rewards, next_states, dones = experiences # Get max predicted Q-values (for next states) from target model if self.double_dqn: # Use local model to choose an action, and target model to evaluate that action Q_local_max = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_local_max) else: Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q-targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q-values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) if self.prioritized_replay: priorities = np.sqrt(loss.detach().cpu().data.numpy()) self.memory.update_priorities(indices, priorities) loss = loss * weights loss = loss.mean() # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): where weights will be copied from target_model (PyTorch model): where weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
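Construction sketch for the configurable Agent above; the state and action sizes are placeholders, and each flag switches on the corresponding extension.

# Placeholder sizes; the flags toggle Double DQN, Dueling DQN and prioritized replay.
vanilla = Agent(state_size=8, action_size=4, seed=0)
double_dueling = Agent(state_size=8, action_size=4, seed=0, double_dqn=True, dueling_dqn=True)
prioritized = Agent(state_size=8, action_size=4, seed=0, prioritized_replay=True)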
class SAC(object): def __init__(self, num_inputs, action_space, args, process_obs=None, opt_level='O1'): self.gamma = args.gamma self.tau = args.tau self.alpha = args.alpha self.device = torch.device("cuda" if args.cuda else "cpu") self.dtype = torch.float self.policy_type = args.policy self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.process_obs = process_obs.to(self.device).to(self.dtype) self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device).to( self.dtype) self.critic_optim = Adam(list(self.critic.parameters()) + list(process_obs.parameters()), lr=args.lr) self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device).to( self.dtype) hard_update(self.critic_target, self.critic) if self.policy_type == "Gaussian": # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper if self.automatic_entropy_tuning is True: self.target_entropy = -torch.prod( torch.Tensor(action_space.shape).to(self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device, dtype=self.dtype) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to( self.device).to(self.dtype) self.policy_optim = Adam(list(self.policy.parameters()) + list(process_obs.parameters()), lr=args.lr) else: self.alpha = 0 self.automatic_entropy_tuning = False self.policy = DeterministicPolicy( num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device).to(self.dtype) self.policy_optim = Adam(list(self.policy.parameters()) + list(process_obs.parameters()), lr=args.lr) if opt_level is not None: model, optimizer = amp.initialize([ self.policy, self.process_obs, self.critic, self.critic_target ], [self.policy_optim, self.critic_optim], opt_level=opt_level) def select_action(self, obs, evaluate=False): with torch.no_grad(): obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0).to( self.dtype) state = self.process_obs(obs) if evaluate is False: action, _, _ = self.policy.sample(state) else: _, _, action = self.policy.sample(state) action = action.detach().cpu().numpy()[0] return action def update_parameters(self, memory, batch_size, updates): # Sample a batch from memory obs_batch, action_batch, reward_batch, next_obs_batch, mask_batch = memory.sample( batch_size=batch_size) obs_batch = torch.FloatTensor(obs_batch).to(self.device).to(self.dtype) next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device).to( self.dtype) action_batch = torch.FloatTensor(action_batch).to(self.device).to( self.dtype) reward_batch = torch.FloatTensor(reward_batch).to( self.device).unsqueeze(1).to(self.dtype) mask_batch = torch.FloatTensor(mask_batch).to( self.device).unsqueeze(1).to(self.dtype) state_batch = self.process_obs(obs_batch) with torch.no_grad(): next_state_batch = self.process_obs(next_obs_batch) next_state_action, next_state_log_pi, _ = self.policy.sample( next_state_batch) qf1_next_target, qf2_next_target = self.critic_target( next_state_batch, next_state_action) min_qf_next_target = torch.min( qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi next_q_value = reward_batch + mask_batch * self.gamma * ( min_qf_next_target) qf1, qf2 = self.critic( state_batch, action_batch ) # Two Q-functions to mitigate positive bias in the policy improvement step qf1_loss = F.mse_loss( qf1, next_q_value ) # JQ = 
𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf2_loss = F.mse_loss( qf2, next_q_value ) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] qf_loss = qf1_loss + qf2_loss self.critic_optim.zero_grad() assert torch.isfinite(qf_loss).all() with amp.scale_loss(qf_loss, self.critic_optim) as qf_loss: qf_loss.backward() self.critic_optim.step() state_batch = self.process_obs(obs_batch) pi, log_pi, _ = self.policy.sample(state_batch) qf1_pi, qf2_pi = self.critic(state_batch.detach(), pi) min_qf_pi = torch.min(qf1_pi, qf2_pi) policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean( ) # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] self.policy_optim.zero_grad() assert torch.isfinite(policy_loss).all() with amp.scale_loss(policy_loss, self.policy_optim) as policy_loss: policy_loss.backward() self.policy_optim.step() if self.automatic_entropy_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_tlogs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.).to(self.device).to(self.dtype) alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs if updates % self.target_update_interval == 0: soft_update(self.critic_target, self.critic, self.tau) return qf1_loss.item(), qf2_loss.item(), policy_loss.item( ), alpha_loss.item(), alpha_tlogs.item() # Save model parameters def save_model(self, actor_path=None, critic_path=None, process_obs_path=None): logger.debug( f'saving models to {actor_path} and {critic_path} and {process_obs_path}' ) torch.save(self.policy.state_dict(), actor_path) torch.save(self.critic.state_dict(), critic_path) torch.save(self.process_obs.state_dict(), process_obs_path) # Load model parameters def load_model(self, actor_path=None, critic_path=None, process_obs_path=None): logger.info( f'Loading models from {actor_path} and {critic_path} and {process_obs_path}' ) if actor_path is not None: self.policy.load_state_dict(torch.load(actor_path)) if critic_path is not None: self.critic.load_state_dict(torch.load(critic_path)) if process_obs_path is not None: self.process_obs.load_state_dict(torch.load(process_obs_path))
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, lr_decay=0.9999): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network if USE_DUELING_NETWORK: self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, [128, 32], [64, 32]).to(device) self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, [128, 32], [64, 32]).to(device) self.qnetwork_target.eval() else: self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_units=128, fc2_units=32).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, fc1_units=128, fc2_units=32).to(device) self.qnetwork_target.eval() self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) self.lr_scheduler = optim.lr_scheduler.ExponentialLR( self.optimizer, lr_decay) # Replay memory if USE_PRIORITIZED_REPLAY_BUFFER: self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device, alpha=0.6, beta=0.4, beta_scheduler=1.0) else: self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, w = experiences if USE_DOUBLE_DQN: self.qnetwork_local.eval() Q_local = self.qnetwork_local(next_states) greedy_actions = torch.argmax(Q_local, dim=1).unsqueeze(1) self.qnetwork_local.train() Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, greedy_actions) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) else: # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss if USE_PRIORITIZED_REPLAY_BUFFER: Q_targets.sub_(Q_expected) Q_targets.squeeze_() Q_targets.pow_(2) with torch.no_grad(): td_error = Q_targets.detach() td_error.pow_(0.5) self.memory.update_priorities(td_error) Q_targets.mul_(w) loss = Q_targets.mean() else: loss = F.mse_loss(Q_expected, Q_targets) # Back-propagation self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.lr_scheduler.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() # this change the local net to eval mode with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train( ) # this just return the local net back to train mode # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # Get max predicted Q values (for next states) from target model target_q_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) """ # disregard action, get best value! # why so many next states? answer: the qnetwork will return each corresponding next states action, the max will pick from each the best action # explanation on detach (https://discuss.pytorch.org/t/detach-no-grad-and-requires-grad/16915/7) """ # Compute Q targets for current states target_q = rewards + (gamma * target_q_next * (1 - dones)) # Get expected Q values from local model expected_q = self.qnetwork_local(states).gather(1, actions) """ this uses gather instead of detach like target since it only give a s*** to action taken # explanation on gather (https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms) """ # Compute loss loss = F.mse_loss(expected_q, target_q) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
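A self-contained sanity check of the Polyak-averaging soft update used throughout these agents; the two Linear layers are stand-ins for qnetwork_local and qnetwork_target.

import torch
import torch.nn as nn

def soft_update(local_model, target_model, tau):
    # Same interpolation as the method above: θ_target = τ*θ_local + (1 - τ)*θ_target
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

local, target = nn.Linear(2, 2), nn.Linear(2, 2)      # stand-ins for the two Q-networks
before = target.weight.data.clone()
soft_update(local, target, tau=1e-3)
expected = 1e-3 * local.weight.data + (1.0 - 1e-3) * before
assert torch.allclose(target.weight.data, expected)   # target moved 0.1% of the way toward local
# tau = 1.0 would reduce to a hard copy of the local network.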
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs net = nn.DataParallel(self.qnetwork_local) if torch.cuda.is_available(): print("using GPUs!") net.cuda() # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" # target net update # Get max predicted Q values (for next states) from target model # double means two w, Compute Q targets for current states double_Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # choose the index of the max Q, get the action of policy best_actions = np.argmax(double_Q_targets_next, dim=1, keepdim=True) # implement best action to obtain Q(S_t+1, Q_argmax(S_t+1, a, w), w^-) Q_target_next = self.qnetwork_local(states).gather( 1, best_actions).detach() Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) #logger.info('mse: {}'.format(delta)) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # evolutionary step - increase survival chances #logger.info('avg reward: {} mse:{}'.format(delta, np.mean(experiences.rewards()))) # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class SAC(object): def __init__(self, num_inputs, action_space, args): self.num_inputs = num_inputs self.action_space = action_space.shape[0] self.gamma = args.gamma self.tau = args.tau self.alpha = args.alpha self.policy_type = args.policy self.target_update_interval = args.target_update_interval self.automatic_entropy_tuning = args.automatic_entropy_tuning self.critic = QNetwork(self.num_inputs, self.action_space, args.hidden_size) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper if self.automatic_entropy_tuning == True: self.target_entropy = -torch.prod(torch.Tensor( action_space.shape)).item() self.log_alpha = torch.zeros(1, requires_grad=True) self.alpha_optim = Adam([self.log_alpha], lr=args.lr) else: pass if self.policy_type == "Gaussian": self.policy = GaussianPolicy(self.num_inputs, self.action_space, args.hidden_size) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.value = ValueNetwork(self.num_inputs, args.hidden_size) self.value_target = ValueNetwork(self.num_inputs, args.hidden_size) self.value_optim = Adam(self.value.parameters(), lr=args.lr) hard_update(self.value_target, self.value) else: self.policy = DeterministicPolicy(self.num_inputs, self.action_space, args.hidden_size) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.critic_target = QNetwork(self.num_inputs, self.action_space, args.hidden_size) hard_update(self.critic_target, self.critic) def select_action(self, state, eval=False): state = torch.FloatTensor(state).unsqueeze(0) if eval == False: self.policy.train() action, _, _, _, _ = self.policy.evaluate(state) else: self.policy.eval() _, _, _, action, _ = self.policy.evaluate(state) #action = torch.tanh(action) action = action.detach().cpu().numpy() return action[0] def update_parameters(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates): state_batch = torch.FloatTensor(state_batch) next_state_batch = torch.FloatTensor(next_state_batch) action_batch = torch.FloatTensor(action_batch) reward_batch = torch.FloatTensor(reward_batch) mask_batch = torch.FloatTensor(np.float32(mask_batch)) reward_batch = reward_batch.unsqueeze( 1) # reward_batch = [batch_size, 1] mask_batch = mask_batch.unsqueeze(1) # mask_batch = [batch_size, 1] """ Use two Q-functions to mitigate positive bias in the policy improvement step that is known to degrade performance of value based methods. Two Q-functions also significantly speed up training, especially on harder task. """ expected_q1_value, expected_q2_value = self.critic( state_batch, action_batch) new_action, log_prob, _, mean, log_std = self.policy.evaluate( state_batch) if self.automatic_entropy_tuning: """ Alpha Loss """ alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() alpha_logs = self.alpha.clone() # For TensorboardX logs else: alpha_loss = torch.tensor(0.) alpha_logs = self.alpha # For TensorboardX logs if self.policy_type == "Gaussian": """ Including a separate function approximator for the soft value can stabilize training. """ expected_value = self.value(state_batch) target_value = self.value_target(next_state_batch) next_q_value = reward_batch + mask_batch * self.gamma * target_value else: """ There is no need in principle to include a separate function approximator for the state value. 
We use a target critic network for deterministic policy and eradicate the value value network completely. """ next_state_action, _, _, _, _, = self.policy.evaluate( next_state_batch) target_critic_1, target_critic_2 = self.critic_target( next_state_batch, next_state_action) target_critic = torch.min(target_critic_1, target_critic_2) next_q_value = reward_batch + mask_batch * self.gamma * target_critic """ Soft Q-function parameters can be trained to minimize the soft Bellman residual JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2] ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1)) """ q1_value_loss = F.mse_loss(expected_q1_value, next_q_value.detach()) q2_value_loss = F.mse_loss(expected_q2_value, next_q_value.detach()) q1_new, q2_new = self.critic(state_batch, new_action) expected_new_q_value = torch.min(q1_new, q2_new) if self.policy_type == "Gaussian": """ Including a separate function approximator for the soft value can stabilize training and is convenient to train simultaneously with the other networks Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error. JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2] ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st))) """ next_value = expected_new_q_value - (self.alpha * log_prob) value_loss = F.mse_loss(expected_value, next_value.detach()) else: pass """ Reparameterization trick is used to get a low variance estimator f(εt;st) = action sampled from the policy εt is an input noise vector, sampled from some fixed distribution Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))] ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st) """ policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean() # Regularization Loss mean_loss = 0.001 * mean.pow(2).mean() std_loss = 0.001 * log_std.pow(2).mean() policy_loss += mean_loss + std_loss self.critic_optim.zero_grad() q1_value_loss.backward() self.critic_optim.step() self.critic_optim.zero_grad() q2_value_loss.backward() self.critic_optim.step() if self.policy_type == "Gaussian": self.value_optim.zero_grad() value_loss.backward() self.value_optim.step() else: value_loss = torch.tensor(0.) 
self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() """ We update the target weights to match the current value function weights periodically Update target parameter after every n(args.target_update_interval) updates """ if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic": soft_update(self.critic_target, self.critic, self.tau) elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian": soft_update(self.value_target, self.value, self.tau) return value_loss.item(), q1_value_loss.item(), q2_value_loss.item( ), policy_loss.item(), alpha_loss.item(), alpha_logs # Save model parameters def save_model(self, env_name, suffix="", actor_path=None, critic_path=None, value_path=None): if not os.path.exists('models/'): os.makedirs('models/') if actor_path is None: actor_path = "models/sac_actor_{}_{}".format(env_name, suffix) if critic_path is None: critic_path = "models/sac_critic_{}_{}".format(env_name, suffix) if value_path is None: value_path = "models/sac_value_{}_{}".format(env_name, suffix) print('Saving models to {}, {} and {}'.format(actor_path, critic_path, value_path)) torch.save(self.value.state_dict(), value_path) torch.save(self.policy.state_dict(), actor_path) torch.save(self.critic.state_dict(), critic_path) # Load model parameters def load_model(self, actor_path, critic_path, value_path): print('Loading models from {}, {} and {}'.format( actor_path, critic_path, value_path)) if actor_path is not None: self.policy.load_state_dict(torch.load(actor_path)) if critic_path is not None: self.critic.load_state_dict(torch.load(critic_path)) if value_path is not None: self.value.load_state_dict(torch.load(value_path))
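A compact sketch of the automatic entropy tuning used in the Gaussian-policy branch above: the target entropy follows the -dim(A) heuristic and the temperature alpha is optimised in log space so it stays positive. The action dimension, learning rate and batch of log-probabilities below are illustrative.

import torch
from torch.optim import Adam

action_dim = 6                                   # e.g. HalfCheetah-v2 -> target entropy of -6 (illustrative)
target_entropy = -float(action_dim)
log_alpha = torch.zeros(1, requires_grad=True)   # alpha starts at exp(0) = 1
alpha_optim = Adam([log_alpha], lr=3e-4)

log_pi = torch.randn(64, 1)                      # stand-in for a batch of policy log-probabilities
alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                          # updated temperature used in the next policy/critic update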
class BEARQL(object): def __init__(self, num_inputs, action_space, args): self.gamma = args.gamma self.tau = args.tau self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device) self.critic_optim = Adam(self.critic.parameters(), lr=args.lr) self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device) hard_update(self.critic_target, self.critic) self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device) self.policy_optim = Adam(self.policy.parameters(), lr=args.lr) self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device) hard_update(self.policy_target, self.policy) # dual_lambda self.dual_lambda = args.init_dual_lambda self.dual_step_size = args.dual_step_size self.cost_epsilon = args.cost_epsilon # coefficient_weight assigned to ensemble variance term self.coefficient_weight = args.coefficient_weight self.dual_grad_times = args.dual_grad_times # used in evaluation def select_action(self, state): # sample multiple policies and perform a greedy maximization of Q over these policies with torch.no_grad(): state = torch.FloatTensor(state.reshape(1, -1)).repeat(10, 1).to(device) # state = torch.FloatTensor(state.reshape(1, -1)).to(device) action, _, mean = self.policy.sample(state) # q1, q2 = self.critic(state, action) q1, q2, q3 = self.critic(state, action) ind = q1.max(0)[1] return action[ind].cpu().data.numpy().flatten() # return action.cpu().data.numpy().flatten() # MMD functions def compute_kernel(self, x, y, sigma): batch_size = x.shape[0] x_size = x.shape[1] y_size = y.shape[1] dim = x.shape[2] tiled_x = x.view(batch_size, x_size, 1, dim).repeat([1, 1, y_size, 1]) tiled_y = y.view(batch_size, 1, y_size, dim).repeat([1, x_size, 1, 1]) return torch.exp(-(tiled_x - tiled_y).pow(2).sum(dim=3) / (2 * sigma)) def compute_mmd(self, x, y, sigma=20.): x_kernel = self.compute_kernel(x, x, sigma) y_kernel = self.compute_kernel(y, y, sigma) xy_kernel = self.compute_kernel(x, y, sigma) square_mmd = x_kernel.mean((1, 2)) + y_kernel.mean((1, 2)) - 2 * xy_kernel.mean((1, 2)) return square_mmd def train(self, prior, memory, batch_size, m=4, n=4): # Sample replay buffer / batch state_np, action_np, reward_np, next_state_np, mask_np = memory.sample(batch_size=batch_size) state_batch = torch.FloatTensor(state_np).to(device) next_state_batch = torch.FloatTensor(next_state_np).to(device) action_batch = torch.FloatTensor(action_np).to(device) reward_batch = torch.FloatTensor(reward_np).to(device).unsqueeze(1) mask_batch = torch.FloatTensor(mask_np).to(device).unsqueeze(1) # Critic Training with torch.no_grad(): # Duplicate state 10 times next_state_rep = torch.FloatTensor(np.repeat(next_state_np, 10, axis=0)).to(device) # Soft Clipped Double Q-learning next_state_action, _, _ = self.policy_target.sample(next_state_rep) target_Q1, target_Q2, target_Q3 = self.critic_target(next_state_rep, next_state_action) target_cat = torch.cat([target_Q1, target_Q2, target_Q3], 1) target_Q = 0.75 * target_cat.min(1)[0] + 0.25 * target_cat.max(1)[0] target_Q = target_Q.view(batch_size, -1).max(1)[0].view(-1, 1) next_q_value = reward_batch + mask_batch * self.gamma * target_Q qf1, qf2, qf3 = self.critic(state_batch, action_batch) # ensemble of k Q-functions q_loss = F.mse_loss(qf1, next_q_value) + F.mse_loss(qf2, next_q_value) + F.mse_loss(qf3, next_q_value) self.critic_optim.zero_grad() q_loss.backward() self.critic_optim.step() # Actor Training with 
        with torch.no_grad():
            state_rep_m = torch.FloatTensor(np.repeat(state_np, m, axis=0)).to(device)
            state_rep_n = torch.FloatTensor(np.repeat(state_np, n, axis=0)).to(device)

        for i in range(self.dual_grad_times):
            prior_a_rep, _, _ = prior.sample(state_rep_n)
            prior_a_rep = prior_a_rep.view(batch_size, n, -1)
            pi_rep, _, _ = self.policy.sample(state_rep_m)
            pi_rep = pi_rep.view(batch_size, m, -1)
            mmd_dist = self.compute_mmd(prior_a_rep, pi_rep)

            pi, _, _ = self.policy.sample(state_batch)
            qf1_pi, qf2_pi, qf3_pi = self.critic(state_batch, pi)
            qf_cat = torch.cat([qf1_pi, qf2_pi, qf3_pi], 1)
            # min_qf_pi = torch.min(qf1_pi, qf2_pi)  # used in TD3
            # use a conservative estimate of Q as in BEAR
            qf_mean = qf_cat.mean(1)
            qf_var = qf_cat.var(1)
            min_qf_pi = qf_mean - self.coefficient_weight * qf_var.sqrt()  # used in BEAR

            policy_loss = -(min_qf_pi - self.dual_lambda * mmd_dist).mean()

            self.policy_optim.zero_grad()
            policy_loss.backward()
            self.policy_optim.step()

            # Dual Lambda Training
            self.dual_gradients = mmd_dist.mean().item() - self.cost_epsilon
            self.dual_lambda += self.dual_step_size * self.dual_gradients
            self.dual_lambda = np.clip(self.dual_lambda, np.power(np.e, -5), np.power(np.e, 10))

        # Update Target Networks
        soft_update(self.critic_target, self.critic, self.tau)
        soft_update(self.policy_target, self.policy, self.tau)

        return q_loss.item(), policy_loss.item(), self.dual_lambda, mmd_dist.mean().item()

    # Save model parameters
    def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if actor_path is None:
            actor_path = "models/BEAR_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/BEAR_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path):
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
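The policy loss above relies on compute_mmd returning one squared-MMD value per batch element, which is then weighted by dual_lambda. The following self-contained sketch mirrors the two MMD methods of BEARQL on random action batches purely to make the tensor shapes concrete; the batch size, sample counts, action dimension, and sigma are illustrative, not the repository's settings.

# Shape check for the batched Gaussian-kernel MMD used in BEARQL.compute_mmd (sketch only).
import torch

def compute_kernel(x, y, sigma):
    # x: [B, Nx, D], y: [B, Ny, D] -> pairwise RBF kernel values of shape [B, Nx, Ny]
    B, Nx, D = x.shape
    Ny = y.shape[1]
    tiled_x = x.view(B, Nx, 1, D).repeat([1, 1, Ny, 1])
    tiled_y = y.view(B, 1, Ny, D).repeat([1, Nx, 1, 1])
    return torch.exp(-(tiled_x - tiled_y).pow(2).sum(dim=3) / (2 * sigma))

def compute_mmd(x, y, sigma=20.):
    # Biased estimate of the squared MMD between the two action samples, per batch element
    return (compute_kernel(x, x, sigma).mean((1, 2))
            + compute_kernel(y, y, sigma).mean((1, 2))
            - 2 * compute_kernel(x, y, sigma).mean((1, 2)))

prior_actions = torch.randn(32, 4, 6)   # [batch_size, n, action_dim], random for illustration
policy_actions = torch.randn(32, 4, 6)  # [batch_size, m, action_dim]
print(compute_mmd(prior_actions, policy_actions).shape)  # torch.Size([32]): one MMD per batch element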
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, checkpoint_path=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            checkpoint_path (str): optional path to a saved state_dict for the local Q-network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)

        # Load the model only when a checkpoint is available
        if checkpoint_path is not None:
            self.qnetwork_local.load_state_dict(torch.load(checkpoint_path, map_location=device))
            print("Checkpoint loaded successfully")

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.optimizer.zero_grad()

        # Target
        with torch.no_grad():
            # Double DQN: select the next action with the local network,
            # evaluate it with the target network
            ddqn_max_indices = self.qnetwork_local(next_states).max(dim=1)[1]
            target_op = self.qnetwork_target(next_states)
            target_op = target_op.gather(1, ddqn_max_indices.view(-1, 1))
            '''
            # DQN
            target_op = self.qnetwork_target(next_states).max(dim=1)[0].view(-1, 1)
            '''
        targets = rewards + target_op * (1 - dones) * gamma

        predictions = self.qnetwork_local(states)
        predictions = predictions.gather(1, actions.view(-1, 1))

        loss = torch.nn.MSELoss()(predictions, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
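The commented-out block inside learn toggles between the vanilla DQN target and the Double DQN target. The difference is easiest to see side by side; the snippet below is a self-contained illustration using small stand-in linear networks and a random batch (all names and sizes are placeholders, not the repository's objects).

# Side-by-side sketch of the two bootstrapped targets toggled inside Agent.learn above.
import torch
import torch.nn as nn

state_dim, action_dim, batch = 8, 4, 32          # illustrative sizes
qnetwork_local = nn.Linear(state_dim, action_dim)   # stand-in for the local Q-network
qnetwork_target = nn.Linear(state_dim, action_dim)  # stand-in for the target Q-network
next_states = torch.randn(batch, state_dim)
rewards = torch.rand(batch, 1)
dones = torch.zeros(batch, 1)
gamma = 0.99

with torch.no_grad():
    # Vanilla DQN: the target network both selects and evaluates the next action
    dqn_next = qnetwork_target(next_states).max(dim=1)[0].view(-1, 1)

    # Double DQN: the local network selects the action, the target network evaluates it
    best_actions = qnetwork_local(next_states).max(dim=1)[1].view(-1, 1)
    ddqn_next = qnetwork_target(next_states).gather(1, best_actions)

    # Either estimate is then bootstrapped the same way as in learn above
    targets = rewards + gamma * ddqn_next * (1 - dones)

print(targets.shape)  # torch.Size([32, 1])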
class Agent():

    def __init__(self, state_size, action_size, hidden_layers, buffer_size=int(1e6),
                 batch_size=32, gamma=.99, tau=1, lr=2.5e-4, update_local=4,
                 update_target=10000, ddqn=False, seed=1):
        """Initialize Agent object

        Params
        ======
            state_size (int): Dimension of states
            action_size (int): Dimension of actions
            hidden_layers (list of ints): number of nodes in the hidden layers
            buffer_size (int): size of replay buffer
            batch_size (int): size of sample
            gamma (float): discount factor
            tau (float): (soft) update of target parameters
            lr (float): learning rate
            update_local (int): update local after every x steps
            update_target (int): update target after every x steps
            ddqn (boolean): Double Deep Q-Learning
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Hyperparameters
        self.buffer_size = buffer_size      # replay buffer size
        self.batch_size = batch_size        # minibatch size
        self.gamma = gamma                  # discount factor
        self.tau = tau                      # (soft) update of target parameters
        self.lr = lr                        # learning rate
        self.update_local = update_local    # update local network after every x steps
        self.update_target = update_target  # update target network with local network weights after every x steps

        # Q Network
        self.qnet_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnet_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=lr)

        # Replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # Initialize time step
        self.t_step = 0

        # Double Deep Q-Learning flag
        self.ddqn = ddqn

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE LOCAL time steps
        self.t_step += 1
        if self.t_step % self.update_local == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                sample = self.memory.sample()
                do_target_update = (self.t_step % self.update_target == 0)
                self.__learn(sample, self.gamma, do_target_update)

    def act(self, state, epsilon=0):
        """Returns action given a state according to local Q Network (current policy)

        Params
        ======
            state (array_like): current state
            epsilon (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        # Epsilon greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def __learn(self, sample, gamma, do_target_update):
        """Update value parameters using given batch of sampled experience tuples

        Params
        ======
            sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            do_target_update (boolean): whether to update the target network this step
        """
        states, actions, rewards, next_states, dones = sample

        if not self.ddqn:
            # Get max predicted Q values (for next states) from target model
            Q_targets_next = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
        else:
            # Get actions (for next states) with max Q values from local net
            next_actions = self.qnet_local(next_states).detach().max(1)[1].unsqueeze(1)
            # Get predicted Q values from target model
            Q_targets_next = self.qnet_target(next_states).detach().gather(1, next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnet_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if do_target_update:
            self.__target_net_update(self.qnet_local, self.qnet_target, self.tau)

    def __target_net_update(self, local_net, target_net, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_net (PyTorch model): weights will be copied from
            target_net (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def get_info(self):
        output = """
        Replay Buffer size: {}
        Batch size: {}
        Discount factor: {}
        tau: {}
        Learning Rate: {}
        Update local network after every {} steps
        Update target network with local network parameters after every {} steps
        DDQN: {}
        """
        print(output.format(self.buffer_size, self.batch_size, self.gamma, self.tau,
                            self.lr, self.update_local, self.update_target, self.ddqn))
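Because tau and update_target are constructor arguments here, this Agent can be configured for either soft updates (small tau, frequent target updates) or DQN-style hard updates (tau=1 with infrequent target updates). A hypothetical usage sketch under that second configuration; QNetwork, ReplayBuffer, and device come from the rest of the repository, and all numbers are illustrative only.

# Hypothetical usage of the configurable Agent above (illustrative values only).
import numpy as np

agent = Agent(state_size=8, action_size=4, hidden_layers=[64, 64],
              tau=1, update_target=10000,   # tau=1 + infrequent updates = hard target update
              ddqn=True, seed=0)
agent.get_info()

state = np.zeros(8, dtype=np.float32)          # placeholder observation
action = agent.act(state, epsilon=0.1)         # epsilon-greedy action from the local network
agent.step(state, action, 0.0, state, False)   # store the transition; learning runs every update_local steps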
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        for i in range(len(states)):    # loop through all experiences in the batch
            self.optimizer.zero_grad()  # zero out gradient
            if dones[i]:
                # if the experience ended the episode, y is just this reward
                y = rewards[i]
            else:
                # otherwise y = immediate reward + gamma * (value of best action in next state)
                y = rewards[i] + gamma * max(self.qnetwork_target(next_states[i]).detach())
            q = self.qnetwork_local(states[i])  # value of the current state under the local model
            z = (y - q[actions[i]]) ** 2        # squared TD error for this experience
            z.backward()                        # compute gradient and update weights using the optimizer
            self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
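The per-experience loop above applies one optimizer step per transition. The same TD update can also be expressed in batched form, processing the whole minibatch in a single forward/backward pass, as the earlier learn implementations in this document do. A sketch of that batched equivalent, written as it would appear inside learn and reusing the same names (states, actions, rewards, next_states, dones, gamma), with F assumed to be torch.nn.functional:

        # Batched equivalent of the per-experience loop in learn above (sketch only).
        with torch.no_grad():
            # max_a' Q_target(s', a'), zeroed out on terminal transitions
            q_next = self.qnetwork_target(next_states).max(dim=1)[0].unsqueeze(1)
            y = rewards + gamma * q_next * (1 - dones)

        q = self.qnetwork_local(states).gather(1, actions.view(-1, 1))
        loss = F.mse_loss(q, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()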