def evaluate_instance(self, idx): """ Evaluate an instance with the current model :param idx: the index of the instance in the validation set :return: the reward collected for this instance """ instance = self.validation_set[idx] env = Environment(instance, self.num_node_feats, self.num_edge_feats, self.reward_scaling, self.args.grid_size, self.args.max_tw_gap, self.args.max_tw_size) cur_state = env.get_initial_environment() total_reward = 0 while True: graph = env.make_nn_input(cur_state, self.args.mode) avail = env.get_valid_actions(cur_state) action = self.select_action(graph, avail) cur_state, reward = env.get_next_state_with_reward( cur_state, action) total_reward += reward if cur_state.is_done(): break return total_reward
def evaluate_instance(self, idx): """ Evaluate an instance with the current model :param idx: the index of the instance in the validation set :return: the reward collected for this instance """ instance = self.validation_set[idx] env = Environment(instance, self.num_node_feats, self.num_edge_feats, self.reward_scaling, self.args.grid_size, self.args.max_tw_gap, self.args.max_tw_size) cur_state = env.get_initial_environment() total_reward = 0 while True: graph = env.make_nn_input(cur_state, self.args.mode) avail = env.get_valid_actions(cur_state) available_tensor = torch.FloatTensor(avail) batched_graph = dgl.batch([graph, ]) out_action, _, _ = self.brain.policy_old.act(batched_graph, available_tensor) action = out_action.item() cur_state, reward = env.get_next_state_with_reward(cur_state, action) total_reward += reward if cur_state.is_done(): break return total_reward
def run_episode(self): """ Run the training for a single episode """ # Generate a random instance instance = TSPTW.generate_random_instance( n_city=self.args.n_city, grid_size=self.args.grid_size, max_tw_gap=self.args.max_tw_gap, max_tw_size=self.args.max_tw_size, seed=-1, is_integer_instance=False) env = Environment(instance, self.num_node_feats, self.num_edge_feats, self.reward_scaling, self.args.grid_size, self.args.max_tw_gap, self.args.max_tw_size) cur_state = env.get_initial_environment() while True: self.time_step += 1 graph = env.make_nn_input(cur_state, self.args.mode) avail = env.get_valid_actions(cur_state) available_tensor = torch.FloatTensor(avail) out_action, log_prob_action, _ = self.brain.policy_old.act( graph, available_tensor) action = out_action.item() cur_state, reward = env.get_next_state_with_reward( cur_state, action) self.memory.add_sample(graph, out_action, log_prob_action, reward, cur_state.is_done(), available_tensor) if self.time_step % self.args.update_timestep == 0: self.brain.update(self.memory) self.memory.clear_memory() self.time_step = 0 if cur_state.is_done(): break
def run_episode(self, episode_idx, memory_initialization):
    """
    Run a single episode, either for initializing the memory (random episode in this case)
    or for training the model (following the DQN algorithm)
    :param episode_idx: the index of the current episode (memory-initialization episodes are not counted)
    :param memory_initialization: True if it is for initializing the memory
    :return: the loss and the current beta of the softmax selection
    """

    # Generate a random instance
    instance = TSPTW.generate_random_instance(n_city=self.args.n_city, grid_size=self.args.grid_size,
                                              max_tw_gap=self.args.max_tw_gap, max_tw_size=self.args.max_tw_size,
                                              seed=-1, is_integer_instance=False)

    env = Environment(instance, self.num_node_feats, self.num_edge_feats, self.reward_scaling,
                      self.args.grid_size, self.args.max_tw_gap, self.args.max_tw_size)

    cur_state = env.get_initial_environment()

    graph_list = [dgl.DGLGraph()] * self.n_action
    rewards_vector = np.zeros(self.n_action)
    actions_vector = np.zeros(self.n_action, dtype=np.int16)
    available_vector = np.zeros((self.n_action, self.args.n_city))

    idx = 0
    total_loss = 0

    # the current temperature for the softmax selection: increases from 0 to MAX_BETA
    temperature = max(0., min(self.args.max_softmax_beta,
                              (episode_idx - 1) / STEP_EPSILON * self.args.max_softmax_beta))

    # execute the episode
    while True:
        graph = env.make_nn_input(cur_state, self.args.mode)
        avail = env.get_valid_actions(cur_state)

        avail_idx = np.argwhere(avail == 1).reshape(-1)

        if memory_initialization:
            # during the memory initialization phase, actions are selected at random
            action = random.choice(avail_idx)
        else:
            # otherwise, we do the softmax selection
            action = self.soft_select_action(graph, avail, temperature)

            # each time we do a step, we increase the counter, and we periodically synchronize the target network
            self.steps_done += 1
            if self.steps_done % UPDATE_TARGET_FREQUENCY == 0:
                self.brain.update_target_model()

        cur_state, reward = env.get_next_state_with_reward(cur_state, action)

        graph_list[idx] = graph
        rewards_vector[idx] = reward
        actions_vector[idx] = action
        available_vector[idx] = avail

        if cur_state.is_done():
            break

        idx += 1

    episode_last_idx = idx

    # compute the n-step values
    for i in range(self.n_action):

        if i <= episode_last_idx:
            cur_graph = graph_list[i]
            cur_available = available_vector[i]
        else:
            cur_graph = graph_list[episode_last_idx]
            cur_available = available_vector[episode_last_idx]

        if i + self.n_step < self.n_action:
            next_graph = graph_list[i + self.n_step]
            next_available = available_vector[i + self.n_step]
        else:
            next_graph = dgl.DGLGraph()
            next_available = env.get_valid_actions(cur_state)

        # a state corresponds to the graph, together with the nodes that can still be visited
        state_features = (cur_graph, cur_available)
        next_state_features = (next_graph, next_available)

        # the n-step reward
        reward = sum(rewards_vector[i:i + self.n_step])
        action = actions_vector[i]

        sample = (state_features, action, reward, next_state_features)

        if memory_initialization:
            # at initialization, the priority (error) stored in the replay memory is simply the absolute reward
            error = abs(reward)
            self.init_memory_counter += 1
            step_loss = 0
        else:
            # feed the memory with the new samples
            x, y, errors = self.get_targets([(0, sample, 0)])
            error = errors[0]
            step_loss = self.learning()  # learning procedure

        self.memory.add(error, sample)

        total_loss += step_loss

    return total_loss, temperature
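# The n-step target computation above can be hard to follow inside the episode loop. The
# standalone, runnable sketch below isolates the reward part of it: for each step i, the target
# reward is the sum of the next n_step rewards, exactly as in sum(rewards_vector[i:i + self.n_step]);
# steps past the end of the episode simply contribute nothing. The helper name is hypothetical.
import numpy as np

def n_step_rewards_sketch(rewards, n_step):
    """Illustrative helper (not part of the trainer): n-step cumulated rewards."""
    rewards = np.asarray(rewards, dtype=float)
    return np.array([rewards[i:i + n_step].sum() for i in range(len(rewards))])

# Example with a 4-step episode and 3-step returns:
# n_step_rewards_sketch([1.0, 0.0, 2.0, 1.0], n_step=3) -> [3. 3. 3. 1.]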
args = parse_arguments()
sys.stdout.flush()

rl_algorithm = "ppo"

load_folder = "./selected-models/ppo/tsptw/n-city-%d/grid-%d-tw-%d-%d" % \
              (args.n_city, args.grid_size, args.max_tw_gap, args.max_tw_size)

solver_binding = SolverBinding(load_folder, args.n_city, args.grid_size, args.max_tw_gap,
                               args.max_tw_size, args.seed, rl_algorithm)

env = Environment(solver_binding.instance, solver_binding.n_node_feat, solver_binding.n_edge_feat,
                  1, args.grid_size, args.max_tw_gap, args.max_tw_size)

cur_state = env.get_initial_environment()

# each beam-search candidate stores the visited sequence, the environment state, and its score
sequences = [[[0], cur_state, 1.0]]

total_reward = 0

for _ in range(args.n_city - 1):

    all_candidates = list()

    for i in range(len(sequences)):
        seq, state, score = sequences[i]