# Shared imports; the Qnetwork / replay-buffer classes, `device`, and the upper-case
# hyperparameters (BATCH_SIZE, BUFFER_SIZE, LR, GAMMA, ...) are assumed to be defined
# elsewhere in the project.
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Agent:
    def __init__(self, seed, state_size, action_size, net_type="dqn"):
        """If net_type is "dqn", use a deep Q-network; if "ddqn", use a double deep Q-network."""
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.net_type = net_type
        # replay buffer
        self.memory = replaybuffer(action_size, BATCH_SIZE, seed)
        # define target and local Q-networks
        self.qnetwork_local = Qnetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = Qnetwork(state_size, action_size, seed).to(device)
        # define optimizer for qnetwork_local
        self.optim = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # define time step for the soft-update cycle
        self.time_step = 0

    def collect(self, state, action, reward, next_state, done):
        # collect the new sample
        self.memory.add(state, action, reward, next_state, done)
        # use the time step to decide whether to learn
        self.time_step = (self.time_step + 1) % UPDATE_EVERY
        if self.time_step == 0:
            if len(self.memory) >= BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, ALPHA)

    def act(self, state):
        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        # get action values
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_vals = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # use an epsilon-greedy policy to decide which action to take
        policy = np.ones(self.action_size) * (EPSILON / self.action_size)
        best = torch.argmax(action_vals).item()
        policy[best] = 1 - EPSILON + (EPSILON / self.action_size)
        return np.random.choice(np.arange(self.action_size), p=policy)

    def learn(self, experiences, alpha):
        states, actions, rewards, next_states, dones = experiences
        # parameter learning for the local network
        if self.net_type == "dqn":
            TD_target = rewards + GAMMA * (
                self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)) * (1 - dones)
        if self.net_type == "ddqn":
            # double DQN: select the action with the local network, evaluate it with the target network
            best = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            TD_target = rewards + GAMMA * (
                self.qnetwork_target(next_states).detach().gather(1, best)) * (1 - dones)
        TD_estimate = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(TD_estimate, TD_target)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        # soft update of the target network parameters
        self.soft_update(self.qnetwork_local, self.qnetwork_target, alpha)

    def soft_update(self, local_network, target_network, alpha):
        for local_params, target_params in zip(local_network.parameters(),
                                               target_network.parameters()):
            target_params.data.copy_(alpha * local_params.data +
                                     (1 - alpha) * target_params.data)
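# --- Usage sketch (not part of the original listing) -------------------------
# A minimal training loop showing how an agent with the interface above could be
# driven. It assumes a Gym-style environment `env` with reset()/step(); the names
# `env`, `n_episodes`, and `max_t` are illustrative, not taken from the source.
def train(agent, env, n_episodes=500, max_t=1000):
    scores = []
    for episode in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)                              # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)         # advance the environment
            agent.collect(state, action, reward, next_state, done) # store sample, maybe learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores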
class smart_agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        # self.device = device
        self.seed = random.seed(seed)
        self.q_network_local = Qnetwork(state_size, action_size, seed).to(device)
        self.q_network_target = Qnetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.q_network_local.parameters(), lr=LR)
        # replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                xp = self.memory.sample()
                self.learn(xp, GAMMA)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_network_local.eval()
        with torch.no_grad():
            action_value = self.q_network_local(state)
        self.q_network_local.train()
        # epsilon-greedy selection
        if random.random() > eps:
            return np.argmax(action_value.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, xp, gamma):
        state, action, reward, next_state, done = xp
        q_target_next = self.q_network_target(next_state).detach().max(1)[0].unsqueeze(1)
        q_target = reward + (gamma * q_target_next * (1 - done))
        q_expected = self.q_network_local(state).gather(1, action)
        # MSE loss
        loss = F.mse_loss(q_expected, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.q_network_local, self.q_network_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class agent():
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # Q-networks
        self.Qnetwork_local = Qnetwork(state_size, action_size, seed).to(device)
        self.Qnetwork_target = Qnetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.Qnetwork_local.parameters(), lr=lr)
        # replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # initialize time step
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # save step to replay memory
        self.memory.add(state, action, reward, next_state, done)
        # learn every update_every time steps
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # check if enough samples are in memory; if so, learn
            if len(self.memory) > batch_size:
                exps = self.memory.sample()
                self.learn(exps, gamma)

    def act(self, state, eps=0.):
        '''Returns actions for a given state based on the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon for epsilon-greedy action selection
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.Qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.Qnetwork_local(state)
        self.Qnetwork_local.train()
        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, exps, gamma):
        """Update value parameters using a batch of experience tuples.

        Params
        ======
            exps (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, done = exps
        # get max predicted Q values for the next states from the target model
        Q_targets_next = self.Qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # compute Q targets for the current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done))
        # calculate expected Q values from the local model
        Q_expected = self.Qnetwork_local(states).gather(1, actions)
        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # minimize loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # update target network
        self.soft_update(self.Qnetwork_local, self.Qnetwork_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class DQNAgent():
    def __init__(self, state_size, action_size, double=False, duel=False):
        self.state_size = state_size
        self.action_size = action_size
        self.discounted_factor = 0.99
        self.learning_rate = 0.001
        self.double = double
        # define model (dueling architecture if requested)
        if duel:
            self.local_model = Duel_Qnetwork(state_size, action_size).to(device)
            self.target_model = Duel_Qnetwork(state_size, action_size).to(device)
        else:
            self.local_model = Qnetwork(state_size, action_size).to(device)
            self.target_model = Qnetwork(state_size, action_size).to(device)
        # define optimizer
        self.optimizer = optim.Adam(self.local_model.parameters(), lr=self.learning_rate)
        # define replay buffer
        self.buffer = Replay_buffer(action_size, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE)
        # time step counters for local-model and target-model updates
        self.t_step = 0
        self.target_update_t = 0

    def get_action(self, state, eps=0.0):
        """state (numpy.ndarray)"""
        state = torch.from_numpy(state.reshape(1, self.state_size)).float().to(device)
        self.local_model.eval()
        with torch.no_grad():
            action_values = self.local_model(state)
        self.local_model.train()
        # epsilon-greedy policy
        if random.random() < eps:
            return np.random.randint(self.action_size)
        else:
            action = np.argmax(action_values.cpu().data.numpy())
            return int(action)

    def append_sample(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)
        self.t_step += 1
        if self.t_step % LOCAL_UPDATE == 0:
            # learn only if there are enough experiences in the buffer
            if len(self.buffer) > BATCH_SIZE:
                experiences = self.buffer.sample()
                self.learn(experiences)
                # self.target_update_t += 1
                # if self.target_update_t % TARGET_UPDATE == 0:
                self.soft_target_model_update(TAU)

    def learn(self, experiences):
        """experiences: tuple of tensors"""
        states, actions, rewards, next_states, dones = experiences
        pred_q = self.local_model(states).gather(1, actions)
        if self.double:
            # double DQN: select actions with the local network, evaluate with the target network
            _, argmax_actions = torch.max(self.local_model(next_states).detach(), 1, keepdim=True)
            pred_next_q = self.target_model(next_states).detach().gather(1, argmax_actions)
        else:
            pred_next_q, _ = torch.max(self.target_model(next_states).detach(), 1, keepdim=True)
        target_q = rewards + ((1 - dones) * self.discounted_factor * pred_next_q)
        loss = F.mse_loss(pred_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_target_model_update(self, tau):
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
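# --- Dueling network sketch (assumption; the original Duel_Qnetwork is not shown) ---
# A minimal dueling head in the spirit of Wang et al. (2016) that would satisfy the
# Duel_Qnetwork(state_size, action_size) constructor used above: a shared trunk feeds
# a state-value stream V(s) and an advantage stream A(s, a), combined as
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). The layer sizes are illustrative only.
import torch.nn as nn

class DuelQnetworkSketch(nn.Module):
    def __init__(self, state_size, action_size, hidden=64):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.value = nn.Linear(hidden, 1)                 # V(s)
        self.advantage = nn.Linear(hidden, action_size)   # A(s, a)

    def forward(self, state):
        x = self.trunk(state)
        v = self.value(x)
        a = self.advantage(x)
        # subtract the mean advantage so V and A are identifiable
        return v + a - a.mean(dim=1, keepdim=True)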
class Agent:
    def __init__(self, state_size, action_size, gamma=0.99, lr=5e-4,
                 buffer_size=int(1e5), batch_size=64, tau=1e-3):
        # define local and target networks
        self.qnet_local = Qnetwork(state_size, action_size).to(device)
        self.qnet_target = Qnetwork(state_size, action_size).to(device)
        # set local and target parameters equal to each other
        self.soft_update(tau=1.0)
        # experience replay buffer
        self.memory = ReplayBuffer(buffer_size, batch_size)
        # hyperparameters and counters
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.t_step = 0
        # optimizer
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

    def step(self, state, action, reward, next_state, done):
        """Save the step info in the memory buffer and perform a learning iteration.

        Input : state, action, reward, next_state, done : non-batched numpy arrays
        Output : none
        """
        # add sample to the memory buffer
        self.memory.add(state, action, reward, next_state, done)
        # learn every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # use the replay buffer to learn once it has enough samples
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Perform a learning iteration using a sampled experience batch.

        Input : experiences : tuple from the memory buffer, e.g. states.shape = [N, state_size]
        Output : none
        """
        states, actions, rewards, next_states, dones, wj, choose = experiences
        # states, actions, rewards, next_states, dones = experiences
        # set optimizer gradient to zero
        self.optimizer.zero_grad()
        # predicted action value
        q_pred = self.qnet_local(states).gather(1, actions)
        # target action value
        # use double DQN, see https://arxiv.org/abs/1509.06461
        next_action_local = self.qnet_local(next_states).max(1)[1]
        q_target = rewards + self.gamma * (1 - dones) * self.qnet_target(next_states)[
            range(self.batch_size), next_action_local].unsqueeze(1)
        # compute the TD error
        td_error = q_target - q_pred
        # update the TD error in the replay buffer (for prioritized replay)
        self.memory.update_td_error(choose, td_error.detach().cpu().numpy().squeeze())
        # importance-sampling-weighted loss
        loss = ((wj * td_error) ** 2).mean()
        # run backprop and an optimizer step
        loss.backward()
        self.optimizer.step()
        # run a soft update of the target network
        self.soft_update(self.tau)

    def act(self, state, eps=0.):
        """Return the local model's predicted action for the given state.

        Input : state : [state_size]
        Output : action : scalar, as the action space is discrete with dim = 1
        """
        # convert the numpy array to a torch tensor
        state = torch.from_numpy(state).float().unsqueeze(dim=0).to(device)
        self.qnet_local.eval()  # put the net in eval mode
        with torch.no_grad():
            max_action = np.argmax(self.qnet_local(state)[0].cpu().data.numpy())
        self.qnet_local.train()  # put the net back in train mode
        # sample a random number uniformly between 0 and 1
        rand_num = np.random.rand()
        # epsilon-greedy policy
        if rand_num < eps:
            return np.random.randint(self.action_size)
        else:
            return max_action

    def soft_update(self, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
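# --- Prioritized replay sketch (assumption; the original ReplayBuffer is not shown) ---
# The learn() above expects sample() to also return importance-sampling weights `wj`
# and the sampled indices `choose`, plus an update_td_error() hook. This is a minimal
# proportional-prioritization sketch (Schaul et al., 2016) matching that interface;
# the alpha/beta values and the list-based storage are illustrative choices only.
class PrioritizedReplaySketch:
    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, eps=1e-5):
        self.buffer_size, self.batch_size = buffer_size, batch_size
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data, self.priorities, self.pos = [], [], 0

    def add(self, state, action, reward, next_state, done):
        # new samples get the current maximum priority so they are seen at least once
        max_p = max(self.priorities, default=1.0)
        if len(self.data) < self.buffer_size:
            self.data.append((state, action, reward, next_state, done))
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = (state, action, reward, next_state, done)
            self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.buffer_size

    def sample(self):
        # sample indices with probability proportional to priority^alpha
        p = np.array(self.priorities) ** self.alpha
        p /= p.sum()
        choose = np.random.choice(len(self.data), self.batch_size, p=p)
        # importance-sampling weights, normalized by their maximum for stability
        wj = (len(self.data) * p[choose]) ** (-self.beta)
        wj /= wj.max()
        batch = [self.data[i] for i in choose]
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return (torch.as_tensor(states, dtype=torch.float32),
                torch.as_tensor(actions, dtype=torch.int64).view(-1, 1),
                torch.as_tensor(rewards, dtype=torch.float32).view(-1, 1),
                torch.as_tensor(next_states, dtype=torch.float32),
                torch.as_tensor(dones, dtype=torch.float32).view(-1, 1),
                torch.as_tensor(wj, dtype=torch.float32).view(-1, 1),
                choose)

    def update_td_error(self, choose, td_errors):
        # refresh priorities with the absolute TD errors of the sampled transitions
        for i, err in zip(choose, np.atleast_1d(td_errors)):
            self.priorities[i] = abs(float(err)) + self.eps

    def __len__(self):
        return len(self.data)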
class Agent():
    '''Agent interacts with the env and learns the optimal policy by learning the optimal value function.'''

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.Qlocal = Qnetwork(self.state_size, self.action_size).to(device)   # local network
        self.Qtarget = Qnetwork(self.state_size, self.action_size).to(device)  # target network
        self.optim = optim.Adam(self.Qlocal.parameters(), lr)
        self.buffer = replay_buffer(buffer_max_size, batch_size)  # replay buffer
        self.t_step = 0  # used when updating the target network weights from the local network

    def learn(self, exp, gamma):
        '''Takes exp and gamma and trains the local network to predict the proper Q value.
        Calculates the next-time-step Q value from the next state.
        '''
        state, action, reward, next_state, done = exp
        # double Q-learning: pick the argmax action with the local network ...
        index = self.Qlocal(next_state).detach().max(1)[1].unsqueeze(1)
        # ... and get its Q value from the target network
        q_val = self.Qtarget(next_state).detach().gather(1, index)
        # one-hot mask that selects the chosen action
        y_onehot = torch.zeros(batch_size, self.action_size).to(device)
        y_onehot.scatter_(1, action, 1)
        # estimated target Q value for the state
        Q_val_n = reward + (gamma * q_val * (1 - done))
        Q_target = y_onehot * Q_val_n         # target-network Q value for the given action
        pre = self.Qlocal(state)              # Q value estimated by the local network
        Q_local = y_onehot * pre              # local-network Q value for the given action
        loss = F.mse_loss(Q_local, Q_target)  # loss function
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        # update the target network weights with the local network weights
        self.update(self.Qlocal, self.Qtarget, tau)

    def step(self, state, action, reward, next_state, done):
        '''Interacts with the env to get a one-step experience and updates the replay buffer.
        Trains the local network once every four interactions with the env.
        '''
        self.buffer.add(state, action, reward, next_state, done)  # add to the replay buffer
        self.t_step += 1
        if self.t_step % 4 == 0:  # train once every four steps
            if len(self.buffer) > batch_size:
                experiences = self.buffer.sample()
                self.learn(experiences, gamma)

    def act(self, state, eps):
        '''Given the state, return the action chosen by the epsilon-greedy policy.'''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.Qlocal.eval()  # network in eval mode to calculate the Q values
        with torch.no_grad():
            action_values = self.Qlocal(state)  # Q-value estimate for the given state
        self.Qlocal.train()  # network back in train mode
        # epsilon-greedy policy for choosing the action from the Q values
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def update(self, local_model, target_model, tau):
        '''Soft-updates the target network toward the local network.'''
        for l, t in zip(local_model.parameters(), target_model.parameters()):
            t.data.copy_(t.data * (1.0 - tau) + l.data * tau)
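# --- Note on the one-hot masking used above (illustrative check, not from the source) ---
# Masking both the prediction and the target with a one-hot action vector before
# F.mse_loss yields the same per-action TD error as the gather() form used by the other
# agents, but averaged over batch_size * action_size entries instead of batch_size, so
# the loss (and its gradient) is scaled down by a factor of 1/action_size.
def _check_onehot_vs_gather(batch_size=4, action_size=3):
    q = torch.randn(batch_size, action_size)
    target = torch.randn(batch_size, 1)
    actions = torch.randint(action_size, (batch_size, 1))
    onehot = torch.zeros(batch_size, action_size).scatter_(1, actions, 1)
    loss_masked = F.mse_loss(onehot * q, onehot * target)
    loss_gather = F.mse_loss(q.gather(1, actions), target)
    assert torch.allclose(loss_masked * action_size, loss_gather, atol=1e-6)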
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action """ self.state_size = state_size self.action_size = action_size #self.seed = random.seed(seed) # Q-Network self.qnetwork_local = Qnetwork(state_size, action_size).to(device) self.qnetwork_target = Qnetwork(state_size, action_size).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > 1000: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## TODO: compute and minimize the loss "*** YOUR CODE HERE ***" nxt_pred = 0.04 * torch.log( torch.exp(self.qnetwork_target(next_states) * 25.0).sum(-1)).detach().unsqueeze(1) y_onehot = torch.zeros(BATCH_SIZE, self.action_size).to(device) y_onehot.scatter_(1, actions, 1) Q_val_n = rewards + (gamma * nxt_pred * (1 - dones)) Q_target = y_onehot * Q_val_n pre = self.qnetwork_local(states) Q_local = y_onehot * pre loss = F.mse_loss(Q_local, Q_target) self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1) self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)