Code example #1
    def evaluate_instance(self, idx):
        """
        Evaluate an instance with the current model
        :param idx: the index of the instance in the validation set
        :return: the reward collected for this instance
        """

        instance = self.validation_set[idx]
        env = Environment(instance, self.num_node_feats, self.num_edge_feats,
                          self.reward_scaling, self.args.grid_size,
                          self.args.max_tw_gap, self.args.max_tw_size)
        cur_state = env.get_initial_environment()

        total_reward = 0

        while True:
            # build the neural-network input and the mask of valid actions
            graph = env.make_nn_input(cur_state, self.args.mode)
            avail = env.get_valid_actions(cur_state)

            # let the current model select an action among the valid ones
            action = self.select_action(graph, avail)

            cur_state, reward = env.get_next_state_with_reward(
                cur_state, action)

            total_reward += reward

            if cur_state.is_done():
                break

        return total_reward
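As a usage sketch (not part of the original code), the method above can be averaged over the whole validation set to monitor training. The helper below is hypothetical and only relies on evaluate_instance and self.validation_set from the snippet:

    def evaluate_validation_set(self):
        """
        Hypothetical helper: average reward of the current model over the validation set
        """
        rewards = [self.evaluate_instance(i) for i in range(len(self.validation_set))]
        return sum(rewards) / len(rewards)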
Code example #2
    def evaluate_instance(self, idx):
        """
        Evaluate an instance with the current model
        :param idx: the index of the instance in the validation set
        :return: the reward collected for this instance
        """

        instance = self.validation_set[idx]
        env = Environment(instance, self.num_node_feats, self.num_edge_feats, self.reward_scaling,
                          self.args.grid_size, self.args.max_tw_gap, self.args.max_tw_size)
        cur_state = env.get_initial_environment()

        total_reward = 0

        while True:

            graph = env.make_nn_input(cur_state, self.args.mode)
            avail = env.get_valid_actions(cur_state)

            # mask of valid actions as a tensor, and the graph wrapped into a singleton batch
            available_tensor = torch.FloatTensor(avail)
            batched_graph = dgl.batch([graph])

            # query the acting policy (policy_old) and unwrap the chosen action index
            out_action, _, _ = self.brain.policy_old.act(batched_graph, available_tensor)
            action = out_action.item()

            cur_state, reward = env.get_next_state_with_reward(cur_state, action)

            total_reward += reward

            if cur_state.is_done():
                break

        return total_reward
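Compared with code example #1, this PPO variant differs only in how the action is obtained: the availability vector is converted to a torch.FloatTensor (which act can use to mask out invalid actions), the single graph is wrapped with dgl.batch because the actor expects batched graphs, and the action index comes from policy_old.act, the policy snapshot that PPO uses to interact with the environment.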
Code example #3
    def run_episode(self):
        """
        Run a single training episode
        """

        #  Generate a random instance
        instance = TSPTW.generate_random_instance(
            n_city=self.args.n_city,
            grid_size=self.args.grid_size,
            max_tw_gap=self.args.max_tw_gap,
            max_tw_size=self.args.max_tw_size,
            seed=-1,
            is_integer_instance=False)

        env = Environment(instance, self.num_node_feats, self.num_edge_feats,
                          self.reward_scaling, self.args.grid_size,
                          self.args.max_tw_gap, self.args.max_tw_size)

        cur_state = env.get_initial_environment()

        while True:

            self.time_step += 1

            graph = env.make_nn_input(cur_state, self.args.mode)
            avail = env.get_valid_actions(cur_state)

            available_tensor = torch.FloatTensor(avail)

            out_action, log_prob_action, _ = self.brain.policy_old.act(
                graph, available_tensor)

            action = out_action.item()
            cur_state, reward = env.get_next_state_with_reward(
                cur_state, action)

            #  store the transition; it is consumed at the next policy update
            self.memory.add_sample(graph, out_action, log_prob_action, reward,
                                   cur_state.is_done(), available_tensor)

            #  periodically update the policy and clear the collected transitions
            if self.time_step % self.args.update_timestep == 0:
                self.brain.update(self.memory)
                self.memory.clear_memory()
                self.time_step = 0

            if cur_state.is_done():
                break
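A minimal sketch of how this method might be driven; the trainer object, the episode count and the logging are illustrative assumptions, not part of the original code:

    # hypothetical outer loop (names not defined in the snippet above are assumptions)
    n_episodes = 10000
    for episode in range(n_episodes):
        trainer.run_episode()
        if (episode + 1) % 100 == 0:
            # average reward over the validation set, reusing evaluate_instance from code example #2
            avg_reward = sum(trainer.evaluate_instance(i)
                             for i in range(len(trainer.validation_set))) / len(trainer.validation_set)
            print("episode %d: average validation reward %.3f" % (episode + 1, avg_reward))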
Code example #4
    def run_episode(self, episode_idx, memory_initialization):
        """
        Run a single episode, either to initialize the replay memory (random actions in that case)
        or to train the model (following the DQN algorithm)
        :param episode_idx: the index of the current episode (memory-initialization episodes are not counted)
        :param memory_initialization: True if the episode is used to initialize the memory
        :return: the total loss over the episode and the current beta (temperature) of the softmax selection
        """

        #  Generate a random instance
        instance = TSPTW.generate_random_instance(
            n_city=self.args.n_city,
            grid_size=self.args.grid_size,
            max_tw_gap=self.args.max_tw_gap,
            max_tw_size=self.args.max_tw_size,
            seed=-1,
            is_integer_instance=False)

        env = Environment(instance, self.num_node_feats, self.num_edge_feats,
                          self.reward_scaling, self.args.grid_size,
                          self.args.max_tw_gap, self.args.max_tw_size)

        cur_state = env.get_initial_environment()

        graph_list = [dgl.DGLGraph()] * self.n_action
        rewards_vector = np.zeros(self.n_action)
        actions_vector = np.zeros(self.n_action, dtype=np.int16)
        available_vector = np.zeros((self.n_action, self.args.n_city))

        idx = 0
        total_loss = 0

        #  current temperature for the softmax selection: increases linearly from 0 to max_softmax_beta
        temperature = max(
            0.,
            min(self.args.max_softmax_beta,
                (episode_idx - 1) / STEP_EPSILON * self.args.max_softmax_beta))

        #  execute the episode
        while True:

            graph = env.make_nn_input(cur_state, self.args.mode)
            avail = env.get_valid_actions(cur_state)
            avail_idx = np.argwhere(avail == 1).reshape(-1)

            if memory_initialization:  # during memory initialization, actions are taken uniformly at random
                action = random.choice(avail_idx)
            else:  # otherwise, we do the softmax selection
                action = self.soft_select_action(graph, avail, temperature)

                #  increment the step counter and periodically synchronize the target network
                self.steps_done += 1
                if self.steps_done % UPDATE_TARGET_FREQUENCY == 0:
                    self.brain.update_target_model()

            cur_state, reward = env.get_next_state_with_reward(
                cur_state, action)

            graph_list[idx] = graph
            rewards_vector[idx] = reward
            actions_vector[idx] = action
            available_vector[idx] = avail

            if cur_state.is_done():
                break

            idx += 1

        episode_last_idx = idx

        #  compute the n-step values
        for i in range(self.n_action):

            if i <= episode_last_idx:
                cur_graph = graph_list[i]
                cur_available = available_vector[i]
            else:
                cur_graph = graph_list[episode_last_idx]
                cur_available = available_vector[episode_last_idx]

            if i + self.n_step < self.n_action:
                next_graph = graph_list[i + self.n_step]
                next_available = available_vector[i + self.n_step]
            else:
                next_graph = dgl.DGLGraph()
                next_available = env.get_valid_actions(cur_state)

            #  a state corresponds to the graph together with the nodes that can still be visited
            state_features = (cur_graph, cur_available)
            next_state_features = (next_graph, next_available)

            #  the n-step reward
            reward = sum(rewards_vector[i:i + self.n_step])
            action = actions_vector[i]

            sample = (state_features, action, reward, next_state_features)

            if memory_initialization:
                # at initialization, the priority (error) stored in the replay memory is simply the absolute reward
                error = abs(reward)
                self.init_memory_counter += 1
                step_loss = 0
            else:
                # compute the priority of the new sample, then perform a learning step
                x, y, errors = self.get_targets([(0, sample, 0)])
                error = errors[0]
                step_loss = self.learning()  # learning procedure

            self.memory.add(error, sample)

            total_loss += step_loss

        return total_loss, temperature
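For context, a sketch of how the two modes of run_episode might be combined into a full DQN training loop; the trainer object and the episode counts are illustrative assumptions only:

    # hypothetical driver (the trainer object and the constants are assumptions)
    n_init_episodes, n_episodes = 1000, 10000

    # phase 1: fill the prioritized replay memory with random episodes
    for _ in range(n_init_episodes):
        trainer.run_episode(episode_idx=0, memory_initialization=True)

    # phase 2: train with softmax exploration; beta is annealed inside run_episode
    for episode_idx in range(1, n_episodes + 1):
        loss, beta = trainer.run_episode(episode_idx, memory_initialization=False)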
Code example #5
    args = parse_arguments()

    sys.stdout.flush()
    rl_algorithm = "ppo"

    load_folder = "./selected-models/ppo/tsptw/n-city-%d/grid-%d-tw-%d-%d" % \
                  (args.n_city, args.grid_size, args.max_tw_gap, args.max_tw_size)

    solver_binding = SolverBinding(load_folder, args.n_city, args.grid_size,
                                   args.max_tw_gap, args.max_tw_size,
                                   args.seed, rl_algorithm)

    env = Environment(solver_binding.instance, solver_binding.n_node_feat,
                      solver_binding.n_edge_feat, 1, args.grid_size,
                      args.max_tw_gap, args.max_tw_size)

    cur_state = env.get_initial_environment()

    #  each candidate keeps its partial tour (starting at city 0), its environment state and its score
    sequences = [[[0], cur_state, 1.0]]

    total_reward = 0

    #  expand every candidate sequence by one city at a time
    for _ in range(args.n_city - 1):

        all_candidates = list()

        for i in range(len(sequences)):
            seq, state, score = sequences[i]