class Agent():
    """Learns a reward function (R), a shifted Q-function (Q_shift) and Q-values from expert transitions."""

    def __init__(self, state_size, action_size, config):
        self.env_name = config["env_name"]
        self.state_size = state_size
        self.action_size = action_size
        self.seed = config["seed"]
        self.clip = config["clip"]
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.fc3 = config["fc3_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1)
        self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device)
        self.expert_q.load_state_dict(torch.load('checkpoint.pth'))
        self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device)
        self.t_step = 0
        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        print("summary writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 + a
            self.all_actions.append(action.to(self.device))

    def learn(self, memory):
        logging.debug("--------------------------New episode-----------------------------------------------")
        states, next_states, actions, dones = memory.expert_policy(self.batch_size)
        self.steps += 1
        self.state_action_frq(states, actions)
        self.compute_shift_function(states, next_states, actions, dones)
        for a in range(self.action_size):
            action = torch.ones([self.batch_size, 1], device=self.device) * a
            self.compute_r_function(states, action)
        self.compute_q_function(states, next_states, actions, dones)
        self.soft_update(self.q_shift_local, self.q_shift_target, self.tau)
        self.soft_update(self.R_local, self.R_target, self.tau)
        self.soft_update(self.qnetwork_local, self.qnetwork_target,
self.tau) return def learn_predicter(self, memory): """ """ states, next_states, actions, dones = memory.expert_policy(self.batch_size) self.state_action_frq(states, actions) def state_action_frq(self, states, action): """ Train classifer to compute state action freq """ self.predicter.train() output = self.predicter(states, train=True) output = output.squeeze(0) # logging.debug("out predicter {})".format(output)) y = action.type(torch.long).squeeze(1) #print("y shape", y.shape) loss = nn.CrossEntropyLoss()(output, y) self.optimizer_pre.zero_grad() loss.backward() #torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1) self.optimizer_pre.step() self.writer.add_scalar('Predict_loss', loss, self.steps) self.predicter.eval() def test_predicter(self, memory): """ """ self.predicter.eval() same_state_predition = 0 for i in range(memory.idx): states = memory.obses[i] actions = memory.actions[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) actions = torch.as_tensor(actions, device=self.device) output = self.predicter(states) output = F.softmax(output, dim=1) # create one hot encode y from actions y = actions.type(torch.long).item() p =torch.argmax(output.data).item() if y==p: same_state_predition += 1 #self.average_prediction.append(same_state_predition) #average_pred = np.mean(self.average_prediction) #self.writer.add_scalar('Average prediction acc', average_pred, self.steps) #logging.debug("Same prediction {} of 100".format(same_state_predition)) text = "Same prediction {} of {} ".format(same_state_predition, memory.idx) print(text) # self.writer.add_scalar('Action prediction acc', same_state_predition, self.steps) self.predicter.train() def get_action_prob(self, states, actions): """ """ actions = actions.type(torch.long) # check if action prob is zero output = self.predicter(states) output = F.softmax(output, dim=1) # print("get action_prob ", output) # output = output.squeeze(0) action_prob = output.gather(1, actions) action_prob = action_prob + torch.finfo(torch.float32).eps # check if one action if its to small if action_prob.shape[0] == 1: if action_prob.cpu().detach().numpy()[0][0] < 1e-4: return None # logging.debug("action_prob {})".format(action_prob)) action_prob = torch.log(action_prob) action_prob = torch.clamp(action_prob, min= self.clip, max=0) return action_prob def compute_shift_function(self, states, next_states, actions, dones): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ actions = actions.type(torch.int64) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: qt = self.q_shift_local(next_states) max_q, max_actions = qt.max(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = (self.gamma * Q_targets_next * (dones)) # Get expected Q values from local model Q_expected = self.q_shift_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer_shift.zero_grad() loss.backward() self.writer.add_scalar('Shift_loss', loss, self.steps) self.optimizer_shift.step() def compute_r_function(self, states, actions, debug=False, log=False): """ """ actions = actions.type(torch.int64) # sum all other actions # print("state shape ", states.shape) size = states.shape[0] idx = 0 all_zeros = [] with torch.no_grad(): y_shift = self.q_shift_target(states).gather(1, actions) log_a = self.get_action_prob(states, actions) index_list = index_None_value(log_a) # print("is none", index_list) if index_list is None: return y_r_part1 = log_a - y_shift y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device) for a, s in zip(actions, states): y_h = 0 taken_actions = 0 for b in self.all_actions: b = b.type(torch.int64).unsqueeze(1) n_b = self.get_action_prob(s.unsqueeze(0), b) if torch.eq(a, b) or n_b is None: logging.debug("best action {} ".format(a)) logging.debug("n_b action {} ".format(b)) logging.debug("n_b {} ".format(n_b)) continue taken_actions += 1 r_hat = self.R_target(s.unsqueeze(0)).gather(1, b) y_s = self.q_shift_target(s.unsqueeze(0)).gather(1, b) n_b = n_b - y_s y_h += (r_hat - n_b) if debug: print("action", b.item()) print("r_pre {:.3f}".format(r_hat.item())) print("n_b {:.3f}".format(n_b.item())) if taken_actions == 0: all_zeros.append(idx) else: y_r_part2[idx] = (1. 
/ taken_actions) * y_h idx += 1 #print(y_r_part2, y_r_part1) y_r = y_r_part1 + y_r_part2 #print("_________________") #print("r update zeros ", len(all_zeros)) if len(index_list) > 0: print("none list", index_list) y = self.R_local(states).gather(1, actions) if log: text = "Action {:.2f} y target {:.2f} = n_a {:.2f} + {:.2f} and pre{:.2f}".format(actions.item(), y_r.item(), y_r_part1.item(), y_r_part2.item(), y.item()) logging.debug(text) if debug: print("expet action ", actions.item()) # print("y r {:.3f}".format(y.item())) # print("log a prob {:.3f}".format(log_a.item())) # print("n_a {:.3f}".format(y_r_part1.item())) print("Correct action p {:.3f} ".format(y.item())) print("Correct action target {:.3f} ".format(y_r.item())) print("part1 corret action {:.2f} ".format(y_r_part1.item())) print("part2 incorret action {:.2f} ".format(y_r_part2.item())) #print("y", y.shape) #print("y_r", y_r.shape) r_loss = F.mse_loss(y, y_r) #con = input() #sys.exit() # Minimize the loss self.optimizer_r.zero_grad() r_loss.backward() #torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5) self.optimizer_r.step() self.writer.add_scalar('Reward_loss', r_loss, self.steps) if debug: print("after update r pre ", self.R_local(states).gather(1, actions).item()) print("after update r target ", self.R_target(states).gather(1, actions).item()) # ------------------- update target network ------------------- # #self.soft_update(self.R_local, self.R_target, 5e-3) if debug: print("after soft upda r target ", self.R_target(states).gather(1, actions).item()) def compute_q_function(self, states, next_states, actions, dones, debug=False, log= False): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ actions = actions.type(torch.int64) if debug: print("---------------q_update------------------") print("expet action ", actions.item()) print("state ", states) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: qt = self.qnetwork_local(next_states) max_q, max_actions = qt.max(1) Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states rewards = self.R_target(states).gather(1, actions) Q_targets = rewards + (self.gamma * Q_targets_next * (dones)) if debug: print("reward {}".format(rewards.item())) print("Q target next {}".format(Q_targets_next.item())) print("Q_target {}".format(Q_targets.item())) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if log: text = "Action {:.2f} q target {:.2f} = r_a {:.2f} + target {:.2f} and pre{:.2f}".format(actions.item(), Q_targets.item(), rewards.item(), Q_targets_next.item(), Q_expected.item()) logging.debug(text) if debug: print("q for a {}".format(Q_expected)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) self.writer.add_scalar('Q_loss', loss, self.steps) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() if debug: print("q after update {}".format(self.qnetwork_local(states))) print("q loss {}".format(loss.item())) # ------------------- update target network ------------------- # def dqn_train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): env = gym.make('LunarLander-v2') scores = [] # list containing scores from each 
episode scores_window = deque(maxlen=100) # last 100 scores eps = eps_start for i_episode in range(1, n_episodes+1): state = env.reset() score = 0 for t in range(max_t): self.t_step += 1 action = self.dqn_act(state, eps) next_state, reward, done, _ = env.step(action) self.step(state, action, reward, next_state, done) state = next_state score += reward if done: self.test_q() break scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(eps_end, eps_decay*eps) # decrease epsilon print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="") if i_episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window))) if np.mean(scores_window)>=200.0: print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window))) break def test_policy(self): env = gym.make('LunarLander-v2') logging.debug("new episode") average_score = [] average_steps = [] average_action = [] for i in range(5): state = env.reset() score = 0 same_action = 0 logging.debug("new episode") for t in range(200): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) q_expert = self.expert_q(state) q_values = self.qnetwork_local(state) logging.debug("q expert a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}".format(q_expert.data[0][0], q_expert.data[0][1], q_expert.data[0][2], q_expert.data[0][3])) logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3])) action = torch.argmax(q_values).item() action_e = torch.argmax(q_expert).item() if action == action_e: same_action += 1 next_state, reward, done, _ = env.step(action) state = next_state score += reward if done: average_score.append(score) average_steps.append(t) average_action.append(same_action) break mean_steps = np.mean(average_steps) mean_score = np.mean(average_score) mean_action= np.mean(average_action) self.writer.add_scalar('Ave_epsiode_length', mean_steps , self.steps) self.writer.add_scalar('Ave_same_action', mean_action, self.steps) self.writer.add_scalar('Ave_score', mean_score, self.steps) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % 4 if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.update_q(experiences) def dqn_act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def update_q(self, experiences, debug=False): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model with torch.no_grad(): Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) if debug: print("----------------------") print("----------------------") print("Q target", Q_targets) print("pre", Q_expected) print("all local",self.qnetwork_local(states)) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def test_q(self): experiences = self.memory.test_sample() self.update_q(experiences, True) def test_q_value(self, memory): same_action = 0 test_elements = memory.idx all_diff = 0 error = True self.predicter.eval() for i in range(test_elements): # print("lop", i) states = memory.obses[i] next_states = memory.next_obses[i] actions = memory.actions[i] dones = memory.not_dones[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) next_states = torch.as_tensor(next_states, device=self.device) actions = torch.as_tensor(actions, device=self.device) dones = torch.as_tensor(dones, device=self.device) with torch.no_grad(): output = self.predicter(states) output = F.softmax(output, dim=1) q_values = self.qnetwork_local(states) expert_values = self.expert_q(states) print("q values ", q_values) print("ex values ", expert_values) best_action = torch.argmax(q_values).item() actions = actions.type(torch.int64) q_max = q_values.max(1) #print("q values", q_values) q = q_values[0][actions.item()].item() #print("q action", q) max_q = q_max[0].data.item() diff = max_q - q all_diff += diff #print("q best", max_q) #print("difference ", diff) if actions.item() != best_action: r = self.R_local(states) rt = self.R_target(states) qt = self.qnetwork_target(states) logging.debug("------------------false action --------------------------------") logging.debug("expert action {})".format(actions.item())) logging.debug("out predicter a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(output.data[0][0], output.data[0][1], output.data[0][2], output.data[0][3])) logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3])) logging.debug("q target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(qt.data[0][0], qt.data[0][1], qt.data[0][2], qt.data[0][3])) logging.debug("rewards a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(r.data[0][0], r.data[0][1], r.data[0][2], r.data[0][3])) logging.debug("re target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f} )".format(rt.data[0][0], rt.data[0][1], rt.data[0][2], rt.data[0][3])) """ logging.debug("---------Reward Function------------") action = torch.Tensor(1) * 0 + 0 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = torch.Tensor(1) * 0 + 1 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = torch.Tensor(1) * 0 + 2 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) action = 
torch.Tensor(1) * 0 + 3 self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True) logging.debug("------------------Q Function --------------------------------") action = torch.Tensor(1) * 0 + 0 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 1 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 2 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) action = torch.Tensor(1) * 0 + 3 self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True) """ if actions.item() == best_action: same_action += 1 continue print("-------------------------------------------------------------------------------") print("state ", i) print("expert ", actions) print("q values", q_values.data) print("action prob predicter ", output.data) self.compute_r_function(states, actions.unsqueeze(0), True) self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True) else: if error: continue print("-------------------------------------------------------------------------------") print("expert action ", actions.item()) print("best action q ", best_action) print(i) error = False continue # logging.debug("experte action {} q fun {}".format(actions.item(), q_values)) print("-------------------------------------------------------------------------------") print("state ", i) print("expert ", actions) print("q values", q_values.data) print("action prob predicter ", output.data) self.compute_r_function(states, actions.unsqueeze(0), True) self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True) self.writer.add_scalar('diff', all_diff, self.steps) self.average_same_action.append(same_action) av_action = np.mean(self.average_same_action) self.writer.add_scalar('Same_action', same_action, self.steps) print("Same actions {} of {}".format(same_action, test_elements)) self.predicter.train() def soft_update(self, local_model, target_model, tau=4): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ # print("use tau", tau) for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save(self, filename): """ """ mkdir("", filename) torch.save(self.predicter.state_dict(), filename + "_predicter.pth") torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth") torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth") """ torch.save(self.optimizer_q.state_dict(), filename + "_q_net_optimizer.pth") torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth") torch.save(self.optimizer_q_shift.state_dict(), filename + "_q_shift_net_optimizer.pth") """ print("save models to {}".format(filename)) def load(self, filename): self.predicter.load_state_dict(torch.load(filename + "_predicter.pth")) self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth")) print("Load models to {}".format(filename))
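# A hedged, minimal numeric sketch of the reward target assembled in
# compute_r_function above. Toy tensors stand in for the outputs of the
# project's predicter, q_shift_target and R_target networks; only the formula
# mirrors the code:
#   y_r = [log pi_E(a|s) - Q_shift(s, a)]
#         + mean over b != a of [ R(s, b) - (log pi_E(b|s) - Q_shift(s, b)) ]
import torch

log_pi = torch.log(torch.tensor([[0.70, 0.10, 0.15, 0.05]]))  # predicter softmax over 4 actions
q_shift = torch.tensor([[0.20, 0.05, 0.10, 0.02]])            # q_shift_target(s)
r_hat = torch.tensor([[0.00, -0.30, -0.10, -0.50]])           # R_target(s)
a = 0                                                         # expert action

y_r_part1 = log_pi[0, a] - q_shift[0, a]
other_actions = [b for b in range(4) if b != a]
y_r_part2 = torch.stack(
    [r_hat[0, b] - (log_pi[0, b] - q_shift[0, b]) for b in other_actions]
).mean()
y_r = y_r_part1 + y_r_part2  # regression target for R_local(s).gather(1, a)
print("toy reward target:", y_r.item())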
class Agent(): def __init__(self, state_size, action_size, config): self.seed = config["seed"] torch.manual_seed(self.seed) np.random.seed(seed=self.seed) random.seed(self.seed) env = gym.make(config["env_name"]) self.env = FrameStack(env, config) self.env.seed(self.seed) self.state_size = state_size self.action_size = action_size self.clip = config["clip"] self.device = 'cuda' self.double_dqn = config["DDQN"] self.lr_pre = config["lr_pre"] self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] self.gamma = 0.99 self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr) self.soft_update(self.q_shift_local, self.q_shift_target, 1) self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr) self.soft_update(self.R_local, self.R_target, 1) self.steps = 0 self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre) #self.encoder_freq = Encoder(config).to(self.device) #self.encoder_optimizer_frq = torch.optim.Adam(self.encoder_freq.parameters(), self.lr) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format( self.lr, self.batch_size, self.fc1, self.fc2, self.seed) pathname += "_clip_{}".format(config["clip"]) pathname += "_tau_{}".format(config["tau"]) now = datetime.now() dt_string = now.strftime("%d_%m_%Y_%H:%M:%S") pathname += dt_string tensorboard_name = str(config["locexp"]) + '/runs/' + pathname self.vid_path = str(config["locexp"]) + '/vid' self.writer = SummaryWriter(tensorboard_name) self.average_prediction = deque(maxlen=100) self.average_same_action = deque(maxlen=100) self.all_actions = [] for a in range(self.action_size): action = torch.Tensor(1) * 0 + a self.all_actions.append(action.to(self.device)) def learn(self, memory_ex): logging.debug( "--------------------------New update-----------------------------------------------" ) self.steps += 1 states, next_states, actions, dones = memory_ex.expert_policy( self.batch_size) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) next_states = next_states.type(torch.float32).div_(255) next_states = self.encoder.create_vector(next_states) self.state_action_frq(states, actions) actions = torch.randint(0, 3, (self.batch_size, 1), dtype=torch.int64, device=self.device) self.compute_shift_function(states.detach(), next_states, actions, dones) self.compute_r_function(states.detach(), actions) self.compute_q_function(states.detach(), next_states, actions, dones) self.soft_update(self.R_local, self.R_target, 
self.tau) self.soft_update(self.q_shift_local, self.q_shift_target, self.tau) self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) return def compute_q_function(self, states, next_states, actions, dones): """Update value parameters using given batch of experience tuples. """ actions = actions.type(torch.int64) # Get max predicted Q values (for next states) from target model if self.double_dqn: q_values = self.qnetwork_local(next_states).detach() _, best_action = q_values.max(1) best_action = best_action.unsqueeze(1) Q_targets_next = self.qnetwork_target(next_states).detach() Q_targets_next = Q_targets_next.gather(1, best_action) else: Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states # Get expected Q values from local model # Compute loss rewards = self.R_target(states).detach().gather( 1, actions.detach()).squeeze(0) Q_targets = rewards + (self.gamma * Q_targets_next * (dones)) Q_expected = self.qnetwork_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets.detach()) # Get max predicted Q values (for next states) from target model self.writer.add_scalar('Q_loss', loss, self.steps) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.encoder_optimizer.step() # torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1) self.optimizer.step() def compute_shift_function(self, states, next_states, actions, dones): """Update Q shift parameters using given batch of experience tuples """ actions = actions.type(torch.int64) with torch.no_grad(): # Get max predicted Q values (for next states) from target model if self.double_dqn: q_shift = self.q_shift_local(next_states) max_q, max_actions = q_shift.max(1) Q_targets_next = self.qnetwork_target(next_states).gather( 1, max_actions.unsqueeze(1)) else: Q_targets_next = self.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = self.gamma * Q_targets_next # Get expected Q values from local model Q_expected = self.q_shift_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets.detach()) # Minimize the loss self.optimizer_shift.zero_grad() loss.backward() self.writer.add_scalar('Shift_loss', loss, self.steps) self.optimizer_shift.step() def compute_r_function(self, states, actions, debug=False, log=False): """ compute reward for the state action pair """ actions = actions.type(torch.int64) # sum all other actions size = states.shape[0] idx = 0 all_zeros = [1 for i in range(actions.shape[0])] zeros = False y_shift = self.q_shift_target(states).gather(1, actions).detach() log_a = self.get_action_prob(states, actions).detach() y_r_part1 = log_a - y_shift y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device) for a, s in zip(actions, states): y_h = 0 taken_actions = 0 for b in self.all_actions: b = b.type(torch.int64).unsqueeze(1) n_b = self.get_action_prob(s.unsqueeze(0), b) if torch.eq(a, b) or n_b is None: continue taken_actions += 1 y_s = self.q_shift_target(s.unsqueeze(0)).detach().gather( 1, b).item() n_b = n_b.data.item() - y_s r_hat = self.R_target(s.unsqueeze(0)).gather(1, b).item() y_h += (r_hat - n_b) if log: text = "a {} r _hat {:.2f} - n_b {:.2f} | sh {:.2f} ".format( b.item(), r_hat, n_b, y_s) logging.debug(text) if taken_actions == 0: all_zeros[idx] = 0 zeros = True y_r_part2[idx] = 0.0 else: y_r_part2[idx] = (1. 
/ taken_actions) * y_h idx += 1 y_r = y_r_part1 + y_r_part2 # check if there are zeros (no update for this tuble) remove them from states and if zeros: mask = torch.BoolTensor(all_zeros) states = states[mask] actions = actions[mask] y_r = y_r[mask] y = self.R_local(states).gather(1, actions) if log: text = "Action {:.2f} r target {:.2f} = n_a {:.2f} + n_b {:.2f} y {:.2f}".format( actions[0].item(), y_r[0].item(), y_r_part1[0].item(), y_r_part2[0].item(), y[0].item()) logging.debug(text) r_loss = F.mse_loss(y, y_r.detach()) # Minimize the loss self.optimizer_r.zero_grad() r_loss.backward() # torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5) self.optimizer_r.step() self.writer.add_scalar('Reward_loss', r_loss, self.steps) def get_action_prob(self, states, actions): """ compute prob for state action pair """ actions = actions.type(torch.long) # check if action prob is zero output = self.predicter(states) output = F.softmax(output, dim=1) action_prob = output.gather(1, actions) action_prob = action_prob + torch.finfo(torch.float32).eps # check if one action if its to small if action_prob.shape[0] == 1: if action_prob.cpu().detach().numpy()[0][0] < 1e-4: return None action_prob = torch.log(action_prob) action_prob = torch.clamp(action_prob, min=self.clip, max=0) return action_prob def state_action_frq(self, states, action): """ Train classifer to compute state action freq """ self.predicter.train() output = self.predicter(states) output = output.squeeze(0) y = action.type(torch.long).squeeze(1) loss = nn.CrossEntropyLoss()(output, y) self.optimizer_pre.zero_grad() self.encoder_optimizer.zero_grad() loss.backward() # torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1) self.optimizer_pre.step() self.writer.add_scalar('Predict_loss', loss, self.steps) self.predicter.eval() def test_predicter(self, memory): """ Test the classifier """ self.predicter.eval() same_state_predition = 0 for i in range(memory.idx): states = memory.obses[i] actions = memory.actions[i] states = torch.as_tensor(states, device=self.device).unsqueeze(0) states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) actions = torch.as_tensor(actions, device=self.device) output = self.predicter(states) output = F.softmax(output, dim=1) # create one hot encode y from actions y = actions.type(torch.long).item() p = torch.argmax(output.data).item() if y == p: same_state_predition += 1 text = "Same prediction {} of {} ".format(same_state_predition, memory.idx) print(text) logging.debug(text) def soft_update(self, local_model, target_model, tau=4): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        # print("use tau", tau)
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def load(self, filename):
        self.predicter.load_state_dict(torch.load(filename + "_predicter.pth"))
        self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth"))
        self.R_local.load_state_dict(torch.load(filename + "_r_net.pth"))
        self.qnetwork_local.load_state_dict(torch.load(filename + "_q_net.pth"))
        print("Load models to {}".format(filename))

    def save(self, filename):
        """ """
        mkdir("", filename)
        torch.save(self.predicter.state_dict(), filename + "_predicter.pth")
        torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth")
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        torch.save(self.optimizer.state_dict(), filename + "_q_net_optimizer.pth")
        torch.save(self.R_local.state_dict(), filename + "_r_net.pth")
        torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth")
        print("Save models to {}".format(filename))

    def test_q_value(self, memory):
        test_elements = memory.idx
        test_elements = 100  # evaluate at most 100 stored transitions
        all_diff = 0
        error = True
        used_elements_r = 0
        used_elements_q = 0
        r_error = 0
        q_error = 0
        for i in range(test_elements):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            one_hot = torch.Tensor([0 for i in range(self.action_size)], device="cpu")
            one_hot[actions.item()] = 1
            with torch.no_grad():
                r_values = self.R_local(states.detach()).detach()
                q_values = self.qnetwork_local(states.detach()).detach()
                soft_r = F.softmax(r_values, dim=1).to("cpu")
                soft_q = F.softmax(q_values, dim=1).to("cpu")
            actions = actions.type(torch.int64)
            kl_q = F.kl_div(soft_q.log(), one_hot, None, None, 'sum')
            kl_r = F.kl_div(soft_r.log(), one_hot, None, None, 'sum')
            if kl_r != float("inf"):
                r_error += kl_r
                used_elements_r += 1
            if kl_q != float("inf"):
                q_error += kl_q
                used_elements_q += 1
        average_q_kl = q_error / used_elements_q
        average_r_kl = r_error / used_elements_r
        text = "Kl div of Reward {} of {} elements".format(average_r_kl, used_elements_r)
        print(text)
        text = "Kl div of Q_values {} of {} elements".format(average_q_kl, used_elements_q)
        print(text)
        self.writer.add_scalar('KL_reward', average_r_kl, self.steps)
        self.writer.add_scalar('KL_q_values', average_q_kl, self.steps)

    def act(self, states):
        states = torch.as_tensor(states, device=self.device).unsqueeze(0)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        q_values = self.qnetwork_local(states.detach()).detach()
        action = torch.argmax(q_values).item()
        return action

    def eval_policy(self, record=False, eval_episodes=2):
        if record:
            env = wrappers.Monitor(self.env, str(self.vid_path) + "/{}".format(self.steps), video_callable=lambda episode_id: True, force=True)
        else:
            env = self.env
        average_reward = 0
        scores_window = deque(maxlen=100)
        s = 0
        for i_episode in range(eval_episodes):
            episode_reward = 0
            state = env.reset()
            while True:
                s += 1
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        if record:
            return
        average_reward = np.mean(scores_window)
        print("Eval Episode {} average Reward {} ".format(eval_episodes, average_reward))
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
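# A tiny, self-contained illustration of the soft_update rule used throughout
# these agents (theta_target = tau * theta_local + (1 - tau) * theta_target),
# applied to two toy linear layers instead of the project's QNetwork. With
# tau=1, as called from the constructors, it reduces to a hard copy.
import torch.nn as nn

local_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)
tau = 0.005  # illustrative value; the real value comes from config["tau"]
for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)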
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, algorithm='DQN'): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # set algorithm if algorithm == "DQN": self.learn = self.learnDQN elif algorithm == "DDQN": self.learn = self.learnDDQN else: raise ('algorithm {} not implemented'.format(algorithm)) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learnDQN(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ## compute and minimize the loss self.optimizer.zero_grad() # target (Qsa_next) Qsa_next = torch.max(self.qnetwork_target(next_states), dim=1, keepdim=True)[0] targets = rewards + gamma * Qsa_next * (1 - dones) # output (Qsa) action_values = self.qnetwork_local(states) outputs = action_values.gather(1, actions) loss = F.mse_loss(outputs, targets) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def learnDDQN(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Compute and minimize the loss self.optimizer.zero_grad() # target (Qsa_next) next_actions = torch.argmax(self.qnetwork_local(next_states), dim=1, keepdim=True) Qsa_next = self.qnetwork_target(next_states).gather(1, next_actions) targets = rewards + gamma * Qsa_next * (1 - dones) # output (Qsa) action_values = self.qnetwork_local(states) outputs = action_values.gather(1, actions) loss = F.mse_loss(outputs, targets) loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
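# A hedged, self-contained sketch contrasting the two bootstrap targets built
# in learnDQN and learnDDQN above, using small dummy Q tables in place of
# qnetwork_local / qnetwork_target.
import torch

q_local_next = torch.tensor([[1.0, 2.0], [0.5, 0.1]])   # qnetwork_local(next_states)
q_target_next = torch.tensor([[0.8, 1.5], [0.9, 0.2]])  # qnetwork_target(next_states)
rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])
gamma = 0.99

# DQN: the target network both selects and evaluates the next action.
dqn_next = torch.max(q_target_next, dim=1, keepdim=True)[0]
# Double DQN: the local network selects the action, the target network evaluates it.
best_actions = torch.argmax(q_local_next, dim=1, keepdim=True)
ddqn_next = q_target_next.gather(1, best_actions)

dqn_targets = rewards + gamma * dqn_next * (1 - dones)
ddqn_targets = rewards + gamma * ddqn_next * (1 - dones)
print(dqn_targets.squeeze().tolist(), ddqn_targets.squeeze().tolist())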
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) print(device) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences ##compute and minimize the loss state_action_values = self.q_value(states, actions) next_state_action_values = self.max_q_value(next_states) expected_state_action_values = (next_state_action_values * gamma * (1 - dones)) + rewards loss = F.mse_loss(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def q_value(self, state, action): q_values = self.qnetwork_local(state) state_action_value = q_values.gather(1, action) return state_action_value def max_q_value(self, state): max_state_action_value = self.qnetwork_target(state).max(1)[0].detach() return max_state_action_value.unsqueeze(1) def save(self): print("Model save as chechpint.pth") torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
def train(args): chrome_driver_path = args.chrome_driver_path checkpoint_path = args.checkpoint_path nb_actions = args.nb_actions initial_epsilon = args.initial_epsilon epsilon = initial_epsilon final_epsilon = args.final_epsilon gamma = args.gamma nb_memory = args.nb_memory nb_expolre = args.nb_expolre is_debug = args.is_debug batch_size = args.batch_size nb_observation = args.nb_observation desired_fps = args.desired_fps is_cuda = True if args.use_cuda and torch.cuda.is_available() else False log_frequency = args.log_frequency save_frequency = args.save_frequency ratio_of_win = args.ratio_of_win if args.exploiting: nb_observation = -1 epsilon = final_epsilon seed = 22 np.random.seed(seed) memory = deque() env = DinoSeleniumEnv(chrome_driver_path, speed=args.game_speed) agent = Agent(env) game_state = GameState(agent, debug=is_debug) qnetwork = QNetwork(nb_actions) if is_cuda: qnetwork.cuda() optimizer = torch.optim.Adam(qnetwork.parameters(), 1e-4) tmp_param = next(qnetwork.parameters()) try: m = torch.load(checkpoint_path) qnetwork.load_state_dict(m["qnetwork"]) optimizer.load_state_dict(m["optimizer"]) except: logger.warn("No model found in {}".format(checkpoint_path)) loss_fcn = torch.nn.MSELoss() action_indx = 0 # do nothing as the first action screen, reward, is_gameover, score = game_state.get_state(action_indx) current_state = np.expand_dims(screen, 0) # [IMAGE_CHANNELS,IMAGE_WIDTH,IMAGE_HEIGHT] current_state = np.tile(current_state, (IMAGE_CHANNELS, 1, 1)) initial_state = current_state t = 0 last_time = 0 sum_scores = 0 total_loss = 0 max_score = 0 qvalues = np.array([0, 0]) lost_action = [] win_actions = [] action_random = 0 action_greedy = 0 episodes = 0 nb_episodes = 0 if not args.exploiting: try: t, memory, epsilon, nb_episodes = pickle.load(open( "cache.p", "rb")) except: logger.warn("Could not load cache file! 
Starting from scratch.") try: while True: qnetwork.eval() if np.random.random() < epsilon: # epsilon greedy action_indx = np.random.randint(nb_actions) action_random += 1 else: action_greedy += 1 tensor = torch.from_numpy(current_state).float().unsqueeze(0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() _, action_indx = qvalues.max(-1) action_indx = action_indx.item() if epsilon > final_epsilon and t > nb_observation: epsilon -= (initial_epsilon - final_epsilon) / nb_expolre screen, reward, is_gameover, score = game_state.get_state( action_indx) if is_gameover: episodes += 1 nb_episodes += 1 lost_action.append(action_indx) sum_scores += score else: win_actions.append(action_indx) if score > max_score: max_score = score if last_time: fps = 1 / (time.time() - last_time) if fps > desired_fps: time.sleep(1 / desired_fps - 1 / fps) if last_time and t % log_frequency == 0: logger.info('fps: {0}'.format(1 / (time.time() - last_time))) last_time = time.time() screen = np.expand_dims(screen, 0) next_state = np.append(screen, current_state[:IMAGE_CHANNELS - 1, :, :], axis=0) if not args.exploiting and (is_gameover or np.random.random() < ratio_of_win): memory.append((current_state, action_indx, reward, next_state, is_gameover)) if len(memory) > nb_memory: memory.popleft() if nb_observation > 0 and t > nb_observation: indxes = np.random.choice(len(memory), batch_size, replace=False) minibatch = [memory[b] for b in indxes] inputs = tmp_param.new(batch_size, IMAGE_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT).zero_() targets = tmp_param.new(batch_size, nb_actions).zero_() for i, (state_t, action_t, reward_t, state_t1, is_gameover_t1) in enumerate(minibatch): inputs[i] = torch.from_numpy(state_t).float() tensor = inputs[i].unsqueeze(0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() targets[i] = qvalues if is_gameover_t1: assert reward_t == -1 targets[i, action_t] = reward_t else: tensor = torch.from_numpy(state_t1).float().unsqueeze( 0) with torch.no_grad(): qvalues = qnetwork(tensor).squeeze() qvalues = qvalues.cpu().numpy() targets[i, action_t] = reward_t + gamma * qvalues.max() qnetwork.train() qnetwork.zero_grad() q_values = qnetwork(inputs) loss = loss_fcn(q_values, targets) loss.backward() optimizer.step() total_loss += loss.item() current_state = initial_state if is_gameover else next_state t += 1 if t % log_frequency == 0: logger.info( "For t {}: mean score is {} max score is {} mean loss: {} number of episode: {}" .format(t, sum_scores / (episodes + 0.1), max_score, total_loss / 1000, episodes)) logger.info( "t: {} action_index: {} reward: {} max qvalue: {} total number of eposodes so far: {}" .format(t, action_indx, reward, qvalues.max(), nb_episodes)) tmp = np.array(lost_action) dnc = (tmp == 0).sum() logger.info( "Lost actions do_nothing: {} jump: {} length of memory {}". 
                    format(dnc, len(tmp) - dnc, len(memory)))
                tmp = np.array(win_actions)
                dnc = (tmp == 0).sum()
                logger.info("Win actions do_nothing: {} jump: {}".format(dnc, len(tmp) - dnc))
                logger.info("Greedy action {} Random action {}".format(action_greedy, action_random))
                action_greedy = 0
                action_random = 0
                lost_action = []
                win_actions = []
                if episodes != 0:
                    sum_scores = 0
                    total_loss = 0
                    episodes = 0
            if t % save_frequency == 0 and not args.exploiting:
                env.pause_game()
                with open("cache.p", "wb") as fh:
                    pickle.dump((t, memory, epsilon, nb_episodes), fh)
                gc.collect()
                torch.save({"qnetwork": qnetwork.state_dict(), "optimizer": optimizer.state_dict()}, checkpoint_path)
                env.resume_game()
    except KeyboardInterrupt:
        if not args.exploiting:
            torch.save({"qnetwork": qnetwork.state_dict(), "optimizer": optimizer.state_dict()}, checkpoint_path)
            with open("cache.p", "wb") as fh:
                pickle.dump((t, memory, epsilon, nb_episodes), fh)
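# A hedged sketch of an argument parser that supplies the fields train(args)
# reads above. The attribute names match the code; the default values are
# illustrative guesses, not the project's real settings.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--chrome_driver_path", default="./chromedriver")
    parser.add_argument("--checkpoint_path", default="checkpoint.pth")
    parser.add_argument("--nb_actions", type=int, default=2)
    parser.add_argument("--initial_epsilon", type=float, default=0.1)
    parser.add_argument("--final_epsilon", type=float, default=1e-4)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--nb_memory", type=int, default=50000)
    parser.add_argument("--nb_expolre", type=int, default=100000)  # spelling kept to match train()
    parser.add_argument("--is_debug", action="store_true")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--nb_observation", type=int, default=100)
    parser.add_argument("--desired_fps", type=int, default=30)
    parser.add_argument("--use_cuda", action="store_true")
    parser.add_argument("--log_frequency", type=int, default=1000)
    parser.add_argument("--save_frequency", type=int, default=10000)
    parser.add_argument("--ratio_of_win", type=float, default=0.1)
    parser.add_argument("--exploiting", action="store_true")
    parser.add_argument("--game_speed", type=int, default=0)
    return parser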
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, hidden_1, hidden_2, update_every, epsilon, epsilon_min, eps_decay, seed ): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.seed = random.seed(seed) self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # Sample if enough samples are available if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ self.epsilon = max(self.epsilon*self.eps_decay, self.epsilon_min) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > self.epsilon: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.learn_steps += 1 # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
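# A quick numeric check of the epsilon schedule implemented in act() above:
# epsilon is multiplied by eps_decay on every call and floored at epsilon_min,
# so after n calls epsilon = max(epsilon_min, epsilon0 * eps_decay ** n).
epsilon0, epsilon_min, eps_decay = 1.0, 0.01, 0.995  # illustrative values
for n in (1, 100, 500, 1000):
    print(n, round(max(epsilon_min, epsilon0 * eps_decay ** n), 4))
# -> 0.995, ~0.6058, ~0.0816, 0.01 (floor reached)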
class PrioritizedDQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr, lr_decay, update_every, update_mem_every, update_mem_par_every, experience_per_sampling, seed, epsilon, epsilon_min, eps_decay, compute_weights, hidden_1, hidden_2, ): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_decay = lr_decay self.update_every = update_every self.experience_per_sampling = experience_per_sampling self.update_mem_every = update_mem_every self.update_mem_par_every = update_mem_par_every self.seed = random.seed(seed) self.epsilon= epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay self.compute_weights = compute_weights self.hidden_1 = hidden_1 self.hidden_2 = hidden_2 self.learn_steps = 0 self.epsilon = epsilon self.epsilon_min = epsilon_min self.eps_decay = eps_decay self.compute_weights = compute_weights # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) self.scheduler = StepLR(self.optimizer, step_size=1, gamma=self.lr_decay) # Replay memory self.memory = PrioritizedReplayBuffer( self.action_size, self.buffer_size, self.batch_size, self.experience_per_sampling, self.seed, self.compute_weights) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % self.update_every self.t_step_mem = (self.t_step_mem + 1) % self.update_mem_every self.t_step_mem_par = (self.t_step_mem_par + 1) % self.update_mem_par_every if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # If enough samples are available in memory, get random subset and learn if self.memory.experience_count > self.experience_per_sampling: sampling = self.memory.sample() self.learn(sampling) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ self.epsilon = max(self.epsilon*self.eps_decay, self.epsilon_min) state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) #print(action_values) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > self.epsilon: #print(np.argmax(action_values.cpu().data.numpy())) return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, sampling): """Update value parameters using given batch of experience tuples. 
Params ====== sampling (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, weights, indices = sampling # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) if self.compute_weights: with torch.no_grad(): weight = sum(np.multiply(weights, loss.data.cpu().numpy())) loss *= weight # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.scheduler.step() self.learn_steps += 1 # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) # ------------------- update priorities ------------------- # delta = abs(Q_targets - Q_expected.detach()).cpu().numpy() self.memory.update_priorities(delta, indices) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
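# A small, self-contained sketch of the priority signal computed at the end of
# learn() above: the absolute TD error per transition, which the buffer's
# update_priorities() uses for future sampling. Dummy tensors replace the
# network outputs.
import torch

Q_targets = torch.tensor([[1.2], [0.3], [2.0]])
Q_expected = torch.tensor([[1.0], [0.9], [2.5]])
delta = abs(Q_targets - Q_expected.detach()).cpu().numpy()
print(delta.squeeze())  # larger TD errors -> sampled more often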
class DQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, config): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(config["seed"]) self.seed = config["seed"] self.gamma = 0.99 self.batch_size = config["batch_size"] self.lr = config["lr"] self.tau = config["tau"] self.fc1 = config["fc1_units"] self.fc2 = config["fc2_units"] self.device = config["device"] # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) self.encoder = Encoder(config).to(self.device) self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr) # Replay memory # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, memory, writer): self.t_step += 1 if len(memory) > self.batch_size: if self.t_step % 4 == 0: experiences = memory.sample(self.batch_size) self.learn(experiences, writer) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) state = state.type(torch.float32).div_(255) self.qnetwork_local.eval() self.encoder.eval() with torch.no_grad(): state = self.encoder.create_vector(state) action_values = self.qnetwork_local(state) self.qnetwork_local.train() self.encoder.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, writer): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences states = states.type(torch.float32).div_(255) states = self.encoder.create_vector(states) next_states = next_states.type(torch.float32).div_(255) next_states = self.encoder.create_vector(next_states) actions = actions.type(torch.int64) # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * dones) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) writer.add_scalar('Q_loss', loss, self.t_step) # Minimize the loss self.optimizer.zero_grad() self.encoder_optimizer.zero_grad() loss.backward() self.optimizer.step() self.encoder_optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target) def soft_update(self, local_model, target_model): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) def save(self, filename): """ """ mkdir("", filename) torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth") torch.save(self.optimizer.state_dict(), filename + "_q_net_optimizer.pth") torch.save(self.encoder.state_dict(), filename + "_encoder.pth") torch.save(self.encoder_optimizer.state_dict(), filename + "_encoder_optimizer.pth") print("Save models to {}".format(filename))
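# A minimal, runnable sketch of the observation preprocessing used in act()
# and learn() above: stacked uint8 frames are cast to float and scaled to
# [0, 1] before the encoder turns them into a feature vector. The frame-stack
# shape is assumed, and a flatten stands in for the project's
# Encoder.create_vector.
import torch

frames = torch.randint(0, 256, (1, 4, 84, 84), dtype=torch.uint8)  # assumed stacked-frame shape
x = frames.type(torch.float32).div_(255)  # same normalization as in the agent
features = x.flatten(start_dim=1)         # placeholder for self.encoder.create_vector(x)
print(features.shape, float(x.max()) <= 1.0)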