def get_val_result(self, validation_graph):
    """Greedily roll out the current policy on each validation graph and
    return the average cover size (the MVC objective)."""
    objective_vals = []
    for g in validation_graph:
        env = MVC_environement(g)
        Xv, graph = env.reset_env()
        graph = torch.unsqueeze(graph, 0).to(self.device)
        Xv = Xv.clone().to(self.device)
        done = False
        self.non_selected = list(np.arange(env.num_nodes))
        self.selected = []
        while not done:
            Xv = Xv.to(self.device)
            # Greedy action from the learned Q-function (no exploration at validation).
            action = self.take_action(graph, Xv, is_validation=True)
            Xv_next, reward, done = env.take_action(action)
            Xv = Xv_next
        # The minimum-vertex-cover objective is the number of selected nodes.
        objective_vals.append(len(self.selected))
    return sum(objective_vals) / len(objective_vals)
def train(self, g, num_eps=20):
    """Train the agent on graph g for num_eps episodes with an
    epsilon-greedy policy and N-step fitted-Q transitions."""
    N_STEP = 2
    fitted_q_exp = namedtuple("fitted_exp", ['graph', 'Xv', 'action', 'reward'])
    experience = namedtuple(
        "experience", ['graph', 'Xv', 'action', 'reward', 'next_Xv', 'is_done'])
    EPS_START = 1.00
    EPS_END = 0.05
    EPS_DECAY = 500
    steps_done = 0
    for e in range(num_eps):
        env = MVC_environement(g)
        Xv, graph = env.reset_env()
        Xv = Xv.clone()
        graph = torch.unsqueeze(graph, 0)
        done = False
        non_selected = list(np.arange(env.num_nodes))
        selected = []
        N = 0
        fitted_experience_list = []
        reward_list = []
        self.agent.new_epsiode()
        while not done:
            # Exponentially decayed exploration rate.
            eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
                -1. * steps_done / EPS_DECAY)
            if np.random.uniform() > eps_threshold:
                # Exploit: pick the highest-Q node not yet in the cover.
                val = self.agent(graph, Xv)[0]
                val[selected] = -float('inf')
                action = int(torch.argmax(val).item())
            else:
                # Explore: pick a random unselected node.
                action = int(np.random.choice(non_selected))
            Xv_next, reward, done = env.take_action(action)
            Xv_next = Xv_next.clone()
            fitted_experience_list.append(fitted_q_exp(graph, Xv, action, reward))
            non_selected.remove(action)
            selected.append(action)
            N += 1
            reward_list.append(reward)
            if N >= N_STEP:
                # Emit an N-step transition: the state from N_STEP steps ago,
                # the accumulated reward, and the current next state.
                n_reward = sum(reward_list)
                n_prev_ex = fitted_experience_list[0]
                ex = experience(n_prev_ex.graph, n_prev_ex.Xv,
                                torch.tensor([n_prev_ex.action]),
                                torch.tensor([n_reward]), Xv_next, done)
                self.agent.store_transition(ex)
                fitted_experience_list.pop(0)
                reward_list.pop(0)
            Xv = Xv_next
            steps_done += 1
        self.agent.train(batch_size=8, fitted_Q=True)
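# The methods further below (train_with_graph, get_val_result_batch) use
# `fitted_q_exp` and `experience` without defining them locally, so the module
# presumably declares them once at import time. A minimal sketch of that
# assumed module-level setup, mirroring the fields used in train() above
# (the import list is an assumption inferred from the names used in this file):
import math
import random
from collections import namedtuple
from copy import deepcopy

import numpy as np
import torch

fitted_q_exp = namedtuple("fitted_exp", ['graph', 'Xv', 'action', 'reward'])
experience = namedtuple(
    "experience", ['graph', 'Xv', 'action', 'reward', 'next_Xv', 'is_done'])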
def get_val_result_batch(self, validation_graph, return_list=False):
    """Batched greedy rollout over a list of validation graphs.
    Returns the per-graph cover sizes or their mean."""
    N = len(validation_graph)
    all_graphs = []
    all_Xv = []
    all_envs = []
    for g in validation_graph:
        env = MVC_environement(g)
        all_envs.append(env)
        Xv, graph = env.reset_env()
        all_graphs.append(torch.unsqueeze(graph, 0))
        all_Xv.append(Xv)
    all_graphs = torch.cat(all_graphs, 0).to(self.device)
    all_Xv = torch.cat(all_Xv, 0).to(self.device)
    all_selected = [[] for _ in range(N)]
    all_dones = [False for _ in range(N)]
    done_count = 0
    while done_count < N:
        # One forward pass over the whole batch of graphs.
        q_val = self.dqn(all_graphs, all_Xv)
        for i in range(N):
            if all_dones[i]:
                continue
            # Mask nodes already in the cover before taking the argmax.
            q_val[i][all_selected[i]] = -float('inf')
            action = torch.argmax(q_val[i]).item()
            all_selected[i].append(action)
            _, _, done = all_envs[i].take_action(action)
            all_Xv[i][action] = 1
            if done:
                all_dones[i] = True
                done_count += 1
    objective_vals = [len(s) for s in all_selected]
    if return_list:
        return objective_vals
    return sum(objective_vals) / len(objective_vals)
def train_with_graph(self, g):
    """Run one training episode on graph g using the agent's own exploration
    policy, storing N-step fitted-Q transitions.

    g: networkx graph
    """
    N_STEP = 2
    env = MVC_environement(g)
    Xv, graph = env.reset_env()
    Xv = Xv.clone()
    graph = torch.unsqueeze(graph, 0)
    done = False
    fitted_experience_list = []
    reward_list = []
    self.non_selected = list(np.arange(env.num_nodes))
    self.selected = []
    self.N = 0
    while not done:
        action = self.take_action(graph, Xv)
        Xv_next, reward, done = env.take_action(action)
        Xv_next = Xv_next.clone()
        fitted_experience_list.append(fitted_q_exp(graph, Xv, action, reward))
        self.N += 1
        reward_list.append(reward)
        if self.N >= N_STEP:
            # Emit an N-step transition anchored at the state N_STEP steps back.
            n_reward = sum(reward_list)
            n_prev_ex = fitted_experience_list[0]
            ex = experience(n_prev_ex.graph, n_prev_ex.Xv,
                            torch.tensor([n_prev_ex.action]),
                            torch.tensor([n_reward]), Xv_next, done)
            self.store_transition(ex)
            fitted_experience_list.pop(0)
            reward_list.pop(0)
        Xv = Xv_next
        # One optimization step per environment step.
        self.train()
    self.episode_done += 1
    if self.episode_done % 8 == 0:
        # Periodically sync the target network with the online network.
        self.update_target_network()
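# A self-contained sketch of the N-step bookkeeping used in train() and
# train_with_graph() above: keep a sliding window of the last N_STEP
# (state, action) records and their rewards, and emit a transition that pairs
# the record from N_STEP steps ago with the accumulated reward and the current
# next state. The dummy per-step reward of -1 mirrors the MVC setting; the
# helper name demo_n_step_window is illustrative, not part of this module.
from collections import namedtuple

def demo_n_step_window(rewards, n_step=2):
    fitted = namedtuple("fitted", ['step', 'reward'])
    window, reward_window, emitted = [], [], []
    for t, r in enumerate(rewards):
        window.append(fitted(t, r))
        reward_window.append(r)
        if len(window) >= n_step:
            oldest = window.pop(0)
            n_reward = sum(reward_window)  # sum over the last n_step rewards
            reward_window.pop(0)
            # transition: (state at oldest.step, n-step reward, next state at t+1)
            emitted.append((oldest.step, n_reward, t + 1))
    return emitted

# Each MVC step gives reward -1, so every 2-step return is -2:
print(demo_n_step_window([-1, -1, -1, -1], n_step=2))
# -> [(0, -2, 2), (1, -2, 3), (2, -2, 4)]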
def get_val_result(self, validation_graph, run_sparse=False):
    """Greedy rollout on one or more validation graphs; returns the mean cover
    size. With run_sparse=True a single graph is evaluated using a sparse
    adjacency tensor."""
    if type(validation_graph) is not list:
        validation_graph = [validation_graph]
    objective_vals = []
    for g in validation_graph:
        env = MVC_environement(g)
        Xv, graph = env.reset_env()
        if run_sparse:
            # Sparse evaluation only supports a single graph at a time.
            assert len(validation_graph) == 1
            graph = to_sparse_tensor(graph)
        else:
            graph = torch.unsqueeze(graph, 0)
        Xv = Xv.clone().to(self.device)
        graph = graph.to(self.device)
        done = False
        self.non_selected = list(np.arange(env.num_nodes))
        self.selected = []
        while not done:
            Xv = Xv.to(self.device)
            # Greedy action from the learned Q-function (no exploration at validation).
            action = self.take_action(graph, Xv, is_validation=True)
            Xv_next, reward, done = env.take_action(action)
            Xv = Xv_next
        objective_vals.append(len(self.selected))
    return sum(objective_vals) / len(objective_vals)
def get_val_result_batch(self, validation_graph, return_list=False,
                         during_adaption=False):
    """Batched rollout over validation graphs. With during_adaption=True,
    actions are sampled from a softmax over the masked Q-values, N-step
    transitions are pushed into the adaptation buffer, and the network is
    fine-tuned on that buffer during the rollout."""
    N = len(validation_graph)
    N_STEP = 2
    all_graphs = []
    all_Xv = []
    all_envs = []
    fitted_experience_list = [[] for _ in range(N)]
    for g in validation_graph:
        env = MVC_environement(g)
        all_envs.append(env)
        Xv, graph = env.reset_env()
        all_graphs.append(torch.unsqueeze(graph, 0))
        all_Xv.append(Xv)
    all_graphs = torch.cat(all_graphs, 0).to(self.device)
    all_Xv = torch.cat(all_Xv, 0).to(self.device)
    all_selected = [[] for _ in range(N)]
    all_dones = [False for _ in range(N)]
    done_count = 0
    cur_step = 0
    while done_count < N:
        q_val = self.dqn(all_graphs, all_Xv)
        for i in range(N):
            if all_dones[i]:
                continue
            # Mask nodes already in the cover; act greedily by default.
            q_val[i][all_selected[i]] = -float('inf')
            action = torch.argmax(q_val[i]).item()
            if during_adaption:
                # During adaptation, sample from a softmax over the masked
                # Q-values instead of acting greedily.
                probs = torch.nn.functional.softmax(q_val[i], dim=0)
                m = torch.distributions.categorical.Categorical(probs=probs)
                action = m.sample().item()
            all_selected[i].append(action)
            Xv_next, rew, done = all_envs[i].take_action(action)
            if during_adaption:
                copy_xv = deepcopy(all_Xv[i:i + 1])
                fit_ex = fitted_q_exp(all_graphs[i:i + 1], copy_xv, action, rew)
                fitted_experience_list[i].append(fit_ex)
                if len(fitted_experience_list[i]) >= N_STEP:
                    # Each MVC step has reward -1, so the N-step return is -N_STEP.
                    n_reward = -N_STEP
                    n_prev_ex = fitted_experience_list[i][0]
                    ex = experience(n_prev_ex.graph, n_prev_ex.Xv,
                                    torch.tensor([n_prev_ex.action]),
                                    torch.tensor([n_reward]), Xv_next, done)
                    self.adaption_buffer.push(ex)
                    fitted_experience_list[i].pop(0)
            all_Xv[i][action] = 1
            if done:
                all_dones[i] = True
                done_count += 1
            self.adaption_steps += 1
        cur_step += 1
        if during_adaption and cur_step >= N_STEP + 1:
            # Fine-tune on the adaptation buffer once enough transitions exist.
            for ep in range(20):
                self.train(during_adaption=True, fitted_Q=False, batch_size=128)
            self.update_target_network()
    objective_vals = [len(s) for s in all_selected]
    if return_list:
        return objective_vals
    return sum(objective_vals) / len(objective_vals)
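# A self-contained sketch of the adaptation-time exploration used in
# get_val_result_batch() above: nodes already in the cover get a Q-value of
# -inf, so softmax assigns them probability zero and the categorical sample
# can only return nodes still outside the cover. The Q-values are dummies.
import torch

q = torch.tensor([0.3, 1.2, -0.5, 0.8])
selected = [1]                      # node 1 is already in the cover
q[selected] = -float('inf')
probs = torch.nn.functional.softmax(q, dim=0)
action = torch.distributions.categorical.Categorical(probs=probs).sample().item()
assert action != 1                  # the masked node can never be sampled
print(probs, action)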