Example #1
import random

import torch
import torch.optim as optim

# DQN, ExperienceReplay, workflow_env, and environment are assumed to be
# project-local classes imported from elsewhere in the project.


class H_Agent:
    def __init__(self,
                 input_node_type=[0],
                 graph_size=4,
                 node_types=1,
                 environment_name=None,
                 total_episodes=None):
        self.max_number_nodes = graph_size
        self.node_types = node_types
        self.initial_nodes = input_node_type
        self.training_index = 0
        self.max_sample_size = 0
        self.loss_mini_batch = 0
        self.memory = ExperienceReplay(unusual_sample_factor=.4,
                                       buffer_size=100)
        self.epsilon = 0.5
        self.epsilon_delta = (.05 / total_episodes)
        self.gamma = 0.9
        self.episode_length = 30
        self.list_actions_taken = []

        if environment_name == 'workflow':
            self.env = workflow_env(number_inputs=self.initial_nodes,
                                    node_types=self.node_types,
                                    max_number_nodes=self.max_number_nodes)
        elif environment_name == 'single_action':
            self.env = workflow_env(number_inputs=self.initial_nodes,
                                    node_types=self.node_types,
                                    max_number_nodes=self.max_number_nodes)
        else:
            self.env = environment(number_inputs=self.initial_nodes,
                                   node_types=self.node_types,
                                   max_number_nodes=self.max_number_nodes)
        self.env.reset()

        D_in = self.env.node_types
        H1 = 20
        H2 = 20
        H3 = 20
        H4 = 20
        H5 = 1
        learning_rate = 1e-3
        weight_decay_val = 0.0

        self.NN = DQN(D_in, H1, H2, H3, H4, H5)
        self.TargetNetwork = DQN(D_in, H1, H2, H3, H4, H5)

        if torch.cuda.is_available():
            self.cuda = True
            self.NN = self.NN.cuda()
            self.TargetNetwork = self.TargetNetwork.cuda()
        else:
            self.cuda = False

        self.optimizer = optim.Adam(self.NN.parameters(),
                                    lr=learning_rate,
                                    weight_decay=weight_decay_val)
        self.optimizer.zero_grad()
        self.success_val = []

    def epsilon_greedy(self):
        self.env.reset()
        self.list_actions_taken = []
        self.epsilon -= self.epsilon_delta
        self.training_index += 1

        for e in range(self.episode_length):
            action = self.epsilon_greedy_action_selection()
            self.env + action  # environment overloads "+" to apply the action
            # replay stored transitions once the buffer holds at least 30
            if len(self.memory.buffer) >= 30:
                for (state, features, reward, done, time_step,
                     bellman_action) in self.memory.sample(30):
                    state_tuple = H_Agent.get_state(state, features)
                    q_val = self.compute_Q(state_tuple)
                    step_q = self.bellman_q_replay(reward, state, features,
                                                   done, bellman_action)
                    loss = (q_val - step_q).pow(2) / 2.0
                    loss.backward()
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            state_tuple = H_Agent.get_state(self.env.state, self.env.features)
            q_val = self.compute_Q(state_tuple)

            step_q, bellman_action = self.compute_bellman_Q()
            # store the transition (memory overloads "+" to append)
            self.memory + (self.env.state.copy(), self.env.features.copy(),
                           self.env.reward, self.env.terminated,
                           self.training_index, bellman_action)

            loss = (q_val - step_q).pow(2) / 2.0
            self.optimizer.zero_grad()

            loss.backward()
            self.optimizer.step()
            if self.training_index % 10 == 0:
                # sync the target network with the online network every 10th episode
                self.TargetNetwork.load_state_dict(self.NN.state_dict())
            if self.env.reward != 0.0:
                break

            if len(self.env.eligible_actions) == 0:
                break
        else:
            print('episode ended after the full {} steps without '
                  'terminating'.format(self.episode_length))

    def epsilon_greedy_action_selection(self):
        if random.random() <= self.epsilon:
            return random.choice(self.env.eligible_actions)
        else:
            return self.choose_maximum_action()[0]

    def choose_maximum_action(self, greedy=False):
        self.TargetNetwork.eval()
        max_q = -float('Inf')
        actions_list = self.env.eligible_actions.copy()
        random.shuffle(actions_list)
        best_action = actions_list[0]
        if self.max_sample_size == 0 or greedy:
            amount_to_sample = len(actions_list)
        else:
            amount_to_sample = min(self.max_sample_size, len(actions_list))
        for i in range(amount_to_sample):
            # search the whole action space, or at most max_sample_size actions
            action = actions_list[i]
            self.env + action  # tentatively apply the candidate action
            state_tuple = H_Agent.get_state(self.env.state, self.env.features)
            q = self.compute_Q(
                state_tuple,
                self.TargetNetwork)  # what is the value of this state
            if q > max_q:
                max_q = q
                best_action = action
            self.env - action  # revert the candidate action
        self.TargetNetwork.train()
        return best_action, max_q

    def best_action_replay(self, state, features, action):
        self.TargetNetwork.eval()

        new_state, new_features = self.env.take_action(state, features, action)
        state_tuple = H_Agent.get_state(new_state, new_features)
        q = self.compute_Q(
            state_tuple, self.TargetNetwork)  # what is the value of this state
        self.TargetNetwork.train()
        return q

    def bellman_q_replay(self, reward, state, features, done, action):
        r = reward
        if done:
            q_max = 0
        else:
            q_max = self.best_action_replay(state, features, action)
        return r + (self.gamma * q_max)

    def compute_bellman_Q(self):
        r = self.env.reward
        if len(self.env.eligible_actions) > 0:
            bellman_action, q_max = self.choose_maximum_action()
        else:
            q_max = 0
            bellman_action = 0
        return r + (self.gamma * q_max), bellman_action

    def compute_Q(self, state_tuple, network=None):
        if network is None:
            network = self.NN
        if self.cuda:
            state_tuple = tuple(s.cuda() for s in state_tuple)
        q = network(state_tuple)
        return q

    def greedy(self):
        self.env.reset()
        for e in range(self.episode_length):
            if self.env.reward > 0.0:
                # episode succeeded
                self.success_val.append(1.0)
                break
            if len(self.env.eligible_actions) == 0:
                # no more possible actions
                self.success_val.append(0.0)
                break
            a_max, _ = self.choose_maximum_action(greedy=True)
            self.env + a_max

    @staticmethod
    def get_state(state, features):
        in_adj_mat = torch.tensor(state, dtype=torch.float32)
        out_adj_mat = torch.t(in_adj_mat)
        v_feat = torch.tensor(features, dtype=torch.float32)
        return (v_feat, in_adj_mat, out_adj_mat)
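
For context, a minimal training loop for this agent might look like the sketch below. The environment name, episode count, and evaluation interval are illustrative assumptions, not values taken from the original code.

# Usage sketch (assumed values; H_Agent as defined above).
total_episodes = 500
agent = H_Agent(input_node_type=[0],
                graph_size=4,
                node_types=1,
                environment_name='workflow',
                total_episodes=total_episodes)

for episode in range(total_episodes):
    agent.epsilon_greedy()              # one exploratory training episode
    if (episode + 1) % 50 == 0:
        agent.greedy()                  # periodic greedy evaluation
        if agent.success_val:
            print('success rate so far:',
                  sum(agent.success_val) / len(agent.success_val))
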
Example #2
import random

import torch
import torch.optim as optim

# DQN, workflow_env, single_env, and environment are assumed to be
# project-local classes imported from elsewhere in the project.


class S_Agent:
    def __init__(self,
                 input_node_type=[0],
                 graph_size=4,
                 node_types=1,
                 environment_name=None,
                 total_episodes=None):
        self.max_number_nodes = graph_size
        self.node_types = node_types

        self.initial_nodes = input_node_type

        self.epsilon = 0.5
        self.epsilon_delta = (.05 / total_episodes)
        self.gamma = 0.9
        self.episode_length = 30

        if environment_name == 'workflow':
            self.env = workflow_env(number_inputs=self.initial_nodes,
                                    node_types=self.node_types,
                                    max_number_nodes=self.max_number_nodes)
        elif environment_name == 'single':
            self.env = single_env(number_inputs=self.initial_nodes,
                                  node_types=self.node_types,
                                  max_number_nodes=self.max_number_nodes)
        else:
            self.env = environment(number_inputs=self.initial_nodes,
                                   node_types=self.node_types,
                                   max_number_nodes=self.max_number_nodes)

        D_in = self.env.node_types
        H1 = 20
        H2 = 20
        H3 = 20
        H4 = 20
        H5 = 1
        learning_rate = 1e-3
        weight_decay_val = 0.0

        self.NN = DQN(D_in, H1, H2, H3, H4, H5)

        if torch.cuda.is_available():
            self.cuda = True
            self.NN = self.NN.cuda()
        else:
            self.cuda = False

        self.optimizer = optim.Adam(self.NN.parameters(),
                                    lr=learning_rate,
                                    weight_decay=weight_decay_val)
        self.success_val = []

    def epsilon_greedy(self):
        self.env.reset()

        self.epsilon -= self.epsilon_delta

        for e in range(self.episode_length):
            action = self.epsilon_greedy_action_selection()
            self.env + action

            q_val = self.compute_Q()

            step_q = self.compute_bellman_Q()

            loss = (q_val - step_q).pow(2) / 2.0
            self.optimizer.zero_grad()

            loss.backward()
            self.optimizer.step()
            if self.env.reward > 0.0:
                break

            if len(self.env.eligible_actions) == 0:
                break
        else:
            print('episode ended after the full {} steps without '
                  'terminating'.format(self.episode_length))

    def epsilon_greedy_action_selection(self):
        if random.random() <= self.epsilon:
            return random.choice(self.env.eligible_actions)
        else:
            return self.choose_maximum_action()[0]

    def choose_maximum_action(self):
        self.NN.eval()
        max_q = -float('Inf')
        actions_list = self.env.eligible_actions.copy()
        random.shuffle(actions_list)
        best_action = actions_list[0]
        amount_to_sample = len(actions_list)
        for i in range(amount_to_sample):
            # evaluate every eligible action
            action = actions_list[i]
            self.env + action  # tentatively apply the candidate action
            q = self.compute_Q()
            if q > max_q:
                max_q = q
                best_action = action
            self.env - action  # revert the candidate action
        self.NN.train()
        return best_action, max_q

    def compute_bellman_Q(self):
        r = self.env.reward
        if len(self.env.eligible_actions) > 0:
            _, q_max = self.choose_maximum_action()
        else:
            q_max = 0
        return r + (self.gamma * q_max)

    def compute_Q(self):
        state = S_Agent.get_state(self.env)
        if self.cuda:
            state = tuple(s.cuda() for s in state)
        q = self.NN(state)
        return q

    def greedy(self):
        self.env.reset()
        for e in range(self.episode_length):
            if self.env.reward > 0.0:
                self.success_val.append(1.0)
                break
            if len(self.env.eligible_actions) == 0:
                # no more possible actions
                self.success_val.append(0.0)
                break
            a_max, _ = self.choose_maximum_action()
            self.env + a_max

    @staticmethod
    def get_state(environment):
        in_adj_mat = torch.tensor(environment.state, dtype=torch.float32)
        out_adj_mat = torch.t(in_adj_mat)
        v_feat = torch.tensor(environment.features, dtype=torch.float32)
        return (v_feat, in_adj_mat, out_adj_mat)
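
Both agents assume a DQN class constructed as DQN(D_in, H1, H2, H3, H4, H5) and called on the (v_feat, in_adj_mat, out_adj_mat) tuple produced by get_state(). That class is not shown here; the sketch below is only a hypothetical module with a compatible interface (the layer wiring and the sum-pooled scalar readout are assumptions), included to make the expected input/output contract concrete.

# Hypothetical stand-in for the project's DQN; not the original implementation.
import torch
import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, D_in, H1, H2, H3, H4, H5):
        super().__init__()
        self.embed = nn.Linear(D_in, H1)       # per-node feature embedding
        self.in_mix = nn.Linear(H1, H2)        # aggregate over incoming edges
        self.out_mix = nn.Linear(H1, H2)       # aggregate over outgoing edges
        self.hidden = nn.Sequential(nn.Linear(H2, H3), nn.ReLU(),
                                    nn.Linear(H3, H4), nn.ReLU())
        self.readout = nn.Linear(H4, H5)       # H5 == 1 -> scalar value head

    def forward(self, state_tuple):
        v_feat, in_adj, out_adj = state_tuple
        h = torch.relu(self.embed(v_feat))
        # mix each node's embedding with its in- and out-neighbours
        h = torch.relu(self.in_mix(in_adj @ h) + self.out_mix(out_adj @ h))
        h = self.hidden(h)
        # pool per-node values into a single Q estimate for the graph state
        return self.readout(h).sum()
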