import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

# QNet, S2VGraph, cmd_args, GLOBAL_EPISODE_STEPS and GLOBAL_NUM_STEPS are
# assumed to be defined elsewhere in this project.


class Agent(object):
    def __init__(self, g_list, test_g_list, env):
        self.g_list = g_list
        if test_g_list is None:
            self.test_g_list = g_list
        else:
            self.test_g_list = test_g_list
        self.env = env
        self.net = QNet()
        self.old_net = QNet()
        self.optimizer = optim.Adam(self.net.parameters(), lr=cmd_args.learning_rate)
        if cmd_args.ctx == 'gpu':
            self.net = self.net.cuda()
            self.old_net = self.old_net.cuda()
        # epsilon-greedy schedule (note: eps_start == eps_end keeps epsilon constant)
        self.eps_start = 1.0
        self.eps_end = 1.0
        self.eps_step = 10000
        self.burn_in = 100  # number of initial "burn-in" simulations before learning
        self.step = 0
        self.best_eval = None
        self.pos = 0
        self.sample_idxes = list(range(len(g_list)))
        random.shuffle(self.sample_idxes)
        self.take_snapshot()

    def take_snapshot(self):
        # copy the online network's weights into the target ("old") network
        self.old_net.load_state_dict(self.net.state_dict())

    # _type = 0 for adding an edge, 1 for subtracting an edge
    def make_actions(self, greedy=True, _type=0):
        # linearly anneal epsilon from eps_start to eps_end over eps_step steps
        self.eps = self.eps_end + max(
            0., (self.eps_start - self.eps_end)
            * (self.eps_step - max(0., self.step)) / self.eps_step)
        cur_state = self.env.getStateRef()
        actions, q_arrs = self.net(cur_state, None, greedy_acts=True, _type=_type)
        # collect the Q-value of the chosen action for every graph in the batch
        q_vals = []
        for i in range(len(q_arrs)):
            tmp = q_arrs[i].detach().cpu().numpy()  # detach/cpu so this also works on GPU
            q_vals.append(tmp[actions[i]][0])
        return actions, q_vals

    def run_simulation(self):
        self.env.setup(self.g_list)
        avg_rewards = []
        for t in range(GLOBAL_EPISODE_STEPS):
            if t % 2 == 0:
                assert self.env.first_nodes is None
                # connect every isolated node to a random node
                # (graphs are assumed to have 20 nodes)
                for i in range(len(self.g_list)):
                    g = self.g_list[i].to_networkx()
                    con_nodes = list(set(sum(g.edges, ())))  # nodes that already have an edge
                    for j in range(20):
                        if j not in con_nodes:
                            rand_num = np.random.randint(0, 20)
                            g.add_edge(j, rand_num)
                            self.env.added_edges.append((j, rand_num))
                    self.g_list[i] = S2VGraph(g, label=self.g_list[i].label)

            # alternate between edge addition (0) and edge subtraction (1) every two steps
            action_type = (t % 4) // 2

            # choose actions A and record the current state S
            list_at, _ = self.make_actions(_type=action_type)
            list_st = self.env.cloneState()
            cur_state = self.env.getStateRef()
            _, predicted_Q = self.net(cur_state, None, greedy_acts=False, _type=action_type)

            # rewards are only defined once the first node of an edge has been picked
            if self.env.first_nodes is not None:
                rewards = self.env.get_rewards(list_at, _type=action_type)
                avg_rewards.append(sum(rewards) / len(rewards))
            else:
                rewards = [0] * len(self.g_list)

            # apply the actions to obtain S'
            self.env.step(list_at, _type=action_type)
            if self.env.isTerminal():
                s_prime = None
            else:
                s_prime = self.env.cloneState()

            # pick A' in S' and read off Q(S', A')
            try:
                sprime_at, q_primes = self.make_actions(_type=action_type)
            except Exception:
                continue

            # one-step target (no discount factor): r + Q(S', A')
            actual_Q = torch.Tensor(rewards) + torch.Tensor(q_primes)

            # regress Q(S, A) toward the target
            loss = F.mse_loss(predicted_Q, actual_Q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        return avg_rewards

    def train(self):
        pbar = tqdm(range(GLOBAL_NUM_STEPS), unit='steps')
        avgs = []
        for self.step in pbar:
            # run_simulation mutates self.g_list and the environment in place
            avgs += self.run_simulation()
        print("avgs: ", avgs)
        # moving average of the per-step average rewards (window of 4)
        mov_avg = np.convolve(np.array(avgs), np.ones(4), 'valid') / 4
        print("mov avg: ", list(mov_avg))
        print(type(mov_avg))
        print(mov_avg.shape)
        plt.clf()
        plt.plot(list(mov_avg))
        plt.title('running average of average rewards')
        plt.savefig("Results.png")
        plt.show()
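A minimal, self-contained sketch of the update performed inside run_simulation above: the predicted Q-values for the chosen actions are regressed toward the undiscounted one-step target r + Q(S', A'). The tensors below are dummy stand-ins for the values produced by QNet and the environment, which are not defined in this listing.

import torch
import torch.nn.functional as F

# dummy stand-ins for a batch of three graphs
predicted_Q = torch.tensor([0.2, -0.1, 0.5], requires_grad=True)  # Q(S, A) from the online net
rewards     = torch.tensor([1.0,  0.0, 0.0])                      # env.get_rewards(...)
q_primes    = torch.tensor([0.3,  0.4, 0.1])                      # Q(S', A') from make_actions

actual_Q = rewards + q_primes             # one-step bootstrapped target (no discount)
loss = F.mse_loss(predicted_Q, actual_Q)  # mean squared TD error
loss.backward()                           # gradients flow only through predicted_Q
print(loss.item(), predicted_Q.grad)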
# ReplayBuffer and QNet are assumed to be defined elsewhere in this project;
# note that this QNet takes (obs_dims, act_dim), unlike the one used by Agent.


class DqnAgent:
    def __init__(self, obs_dims, act_dim, lr=1e-3, gamma=0.99,
                 replay_buffer_size=10000, batch_size=64,
                 epsilon_min=0.01, epsilon_dec=5e-5,
                 target_update_frequency=64):
        self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
        self.batch_size = batch_size
        self.q_eval = QNet(obs_dims, act_dim)    # online network
        self.q_target = QNet(obs_dims, act_dim)  # target network
        self.obs_dims = obs_dims
        self.act_dim = act_dim
        self.learn_ctr = 0
        self.target_update_frequency = target_update_frequency
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()

    def update_target(self):
        # sync the target network every target_update_frequency learning steps
        if self.learn_ctr % self.target_update_frequency == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        # linearly decay exploration down to epsilon_min
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec

    def choose_action(self, obs):
        # epsilon-greedy action selection
        if np.random.sample() < self.epsilon:
            return np.random.randint(self.act_dim)
        obs = torch.tensor(np.expand_dims(obs, axis=0), dtype=torch.float)
        return torch.argmax(self.q_eval(obs)).item()

    def store_transition(self, obs, act, rew, _obs, done):
        self.buffer.push(obs, act, rew, _obs, done)

    def sample_replay_buffer(self):
        return self.buffer.sample(self.batch_size)

    def learn(self):
        self.optimizer.zero_grad()
        obs, act, rew, _obs, done = self.sample_replay_buffer()
        obs = torch.tensor(obs, dtype=torch.float)
        act = torch.tensor(act, dtype=torch.long)
        rew = torch.tensor(rew, dtype=torch.float)  # float so fractional rewards are not truncated
        _obs = torch.tensor(_obs, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.long)

        # Q(s, a) for the actions actually taken
        idxs = torch.arange(self.batch_size, dtype=torch.long)
        q_pred = self.q_eval(obs)[idxs, act]

        # bootstrapped target from the target network (detached), masked for terminal states
        q_next = self.q_target(_obs).max(dim=1)[0].detach()
        q_target = rew + (1 - done) * self.gamma * q_next

        loss = self.loss_fn(q_pred, q_target)
        loss.backward()
        self.optimizer.step()

        self.update_target()
        self.learn_ctr += 1  # count learning steps for the periodic target sync above
        self.decrement_epsilon()
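A hedged usage sketch for DqnAgent, assuming a classic Gym-style environment (reset() returning an observation and step() returning a 4-tuple) and that the ReplayBuffer and QNet referenced above are importable. The environment name "CartPole-v1" and the episode count are illustrative only and not part of the original code.

import gym

env = gym.make("CartPole-v1")  # illustrative environment choice
agent = DqnAgent(obs_dims=env.observation_space.shape,
                 act_dim=env.action_space.n)

total_steps = 0
for episode in range(200):
    obs, done, ep_ret = env.reset(), False, 0.0
    while not done:
        act = agent.choose_action(obs)      # epsilon-greedy
        _obs, rew, done, _ = env.step(act)  # classic 4-tuple Gym API
        agent.store_transition(obs, act, rew, _obs, done)
        total_steps += 1
        if total_steps >= agent.batch_size:  # wait until one full batch can be sampled
            agent.learn()
        obs, ep_ret = _obs, ep_ret + rew
    print(f"episode {episode}: return {ep_ret:.1f}")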