Example #1
    def __init__(self, path_name, surffix, path_surffix):
        """
        Parameter setup
        """
        self.NUM_NODES = params['number of nodes in the cluster']
        # self.NUM_CONTAINERS = params['number of containers']

        # self.sim = Simulator()
        self.env = LraClusterEnv(num_nodes=self.NUM_NODES)

        ckpt_path_1 = path_surffix + path_name + "_1006_1" + "/model.ckpt"
        ckpt_path_2 = path_surffix + path_name + "_1006_2" + "/model.ckpt"
        ckpt_path_3 = path_surffix + path_name + "_1006_3" + "/model.ckpt"
        self.nodes_per_group = int(params['nodes per group'])
        # self.number_of_node_groups = int(self.NUM_NODES / self.nodes_per_group)
        """
        Build Network
        """
        self.n_actions = self.nodes_per_group  #: 3 nodes per group
        self.n_features = int(self.n_actions *
                              (self.env.NUM_APPS + 1 + self.env.NUM_APPS) + 1 +
                              self.env.NUM_APPS)
        #: per-candidate app counts, overload flags and totals, plus the
        #: current app id and the remaining containers per app
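        # Three PolicyGradient agents are created below, one per layer of the
        # hierarchical placement (group -> sub-group -> node), and each is
        # restored from its own checkpoint.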

        self.RL_1 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '1a')

        self.RL_2 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '2a')

        self.RL_3 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '3a')

        self.RL_1.restore_session(ckpt_path_1)
        self.RL_2.restore_session(ckpt_path_2)
        self.RL_3.restore_session(ckpt_path_3)
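        # Trajectory buffers: *_episode_* collect the current episode's
        # (observation, action, reward, safety) samples, while *_optimal_*
        # retain the best episodes for experience replay.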

        self.observation_episode_1, self.action_episode_1, self.reward_episode_1, self.safety_episode_1 = [], [], [], []
        self.observation_optimal_1, self.action_optimal_1, self.reward_optimal_1, self.safety_optimal_1 = [], [], [], []

        self.observation_episode_2, self.action_episode_2, self.reward_episode_2, self.safety_episode_2 = [], [], [], []
        self.observation_optimal_2, self.action_optimal_2, self.reward_optimal_2, self.safety_optimal_2 = [], [], [], []

        self.observation_episode_3, self.action_episode_3, self.reward_episode_3, self.safety_episode_3 = [], [], [], []
        self.observation_optimal_3, self.action_optimal_3, self.reward_optimal_3, self.safety_optimal_3 = [], [], [], []
Example #2
def train(params):
    """
    Parameter setup
    """
    print("Current params", params)

    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"

    ckpt_path_rec_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_rec_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_rec_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"

    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if the number of layers changes, training_times_per_episode should be modified
    # safety_requirement = 2.0 / 100.
    safety_requirement = params['safety_requirement']
    print(
        "######## safety_requirement = {} ########".format(safety_requirement))
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)  #: per-candidate counts/flags/total + current app id + remaining containers per app
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '1a',
                          safety_requirement=safety_requirement,
                          params=params)

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '2a',
                          safety_requirement=safety_requirement,
                          params=params)

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '3a',
                          safety_requirement=safety_requirement,
                          params=params)

    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []

    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []

    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []

    epoch_i = 0

    thre_entropy = 0.1
    # TODO: delete this range

    names = locals()
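    # names = locals() is used as a dict of dynamically named buffers, one set
    # per throughput class: 'highest_tput_*' / '*_optimal_*' form an elite
    # buffer of the best-throughput episodes, and 'lowest_vio_*' / '*_vio_*'
    # keep the least-violating episodes, replayed before CPO kicks in.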
    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
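    # NUM_CONTAINERS (and app_node_set used further below) are not defined in
    # this snippet; they are presumably module-level globals in the original
    # source.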

    time_ep_acc = 0.0
    time_al_acc = 0.0
    while epoch_i < params['epochs']:
        time_ep_start = time.time()

        if Recover:
            print("Recover from {}".format(ckpt_path_rec_1))
            RL_1.restore_session(ckpt_path_rec_1)
            RL_2.restore_session(ckpt_path_rec_2)
            RL_3.restore_session(ckpt_path_rec_3)
            Recover = False

        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        time_al_start = time.time()

        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)  # 9*20

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)

                action_2, prob_weights = RL_2.choose_action(
                    observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)

            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)  # 9*20
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1

                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)

                action_3, prob_weights = RL_3.choose_action(
                    observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)

            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)

        time_al_end = time.time()
        time_al_acc += time_al_end - time_al_start
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        tput_state = env.state
        tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS))
        tput = (tput_breakdown * tput_state).sum() / NUM_CONTAINERS
        reward_ratio = (tput - 0)

        state = env.state
        # These three are not actually used in training, just for logging
        list_check_per_app = (env.state > 1).sum() + max(
            (env.state - 1).max(), 0)
        list_check_sum = sum(
            env.state.sum(1) > params['container_limitation per node']
        ) + max(
            max(env.state.sum(1) - params['container_limitation per node']), 0)
        list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0))

        # list_check = list_check_sum + list_check_coex + list_check_per_app
        list_check = 0
        # error = 0
        # for node in range(NUM_NODES):
        #     for app in range(env.NUM_APPS):
        #         if env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
        #             error += env.state[node, app]
        # assert error==0

        # container limitation & deployment spread
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params[
                        'container_limitation per node']:  #or env.state[node, app] > 1:
                    list_check += env.state[node, app]
        # hardware affinity & incremental deployment
        for app in range(7):
            node_now = np.where(env.state[:, app] > 0)[0]
            for node_ in node_now:
                if node_ not in app_node_set[app]:
                    list_check += env.state[node_, app]

        list_check_ratio = list_check / NUM_CONTAINERS
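        # The violation count is normalised by the number of containers; the
        # resulting ratio is compared against safety_requirement below to
        # decide whether the episode enters the elite (highest-throughput)
        # buffer.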

        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1)

        safety_episode_2 = [list_check_ratio * 1.0
                            ] * len(observation_episode_2)
        reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2)

        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3)

        RL_1.store_tput_per_episode(tput, epoch_i, list_check,
                                    list_check_per_app, list_check_coex,
                                    list_check_sum)
        RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1,
                                                safety_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2,
                                                safety_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3,
                                                safety_episode_3)
        """
        check_tput_quality(tput)
        """
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(
                tput_origimal_class
            )], names[
                'action_optimal_1_vio_' + str(tput_origimal_class)], names[
                    'observation_optimal_2_vio_' +
                    str(tput_origimal_class)], names[
                        'action_optimal_2_vio_' +
                        str(tput_origimal_class)], names[
                            'number_optimal_vio_' +
                            str(tput_origimal_class)], names[
                                'safety_optimal_vio_1_' +
                                str(tput_origimal_class)], names[
                                    'safety_optimal_vio_2_' +
                                    str(tput_origimal_class)], names[
                                        'safety_optimal_vio_3_' + str(
                                            tput_origimal_class
                                        )] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)], names[
                      'action_optimal_3_vio_' +
                      str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' +
                   str(tput_origimal_class)] >= list_check / names[
                       'optimal_range_vio_' + str(tput_origimal_class)]:
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

        # if list_check_ratio <= safety_requirement*0.5:
        if list_check_ratio <= safety_requirement:
            if names['highest_tput_' + str(tput_origimal_class)] < tput:
                names['highest_tput_' + str(tput_origimal_class)] = tput

                names['observation_optimal_1_' + str(tput_origimal_class)], names[
                    'action_optimal_1_' + str(tput_origimal_class)], names[
                    'observation_optimal_2_' + str(tput_origimal_class)], names[
                    'action_optimal_2_' + str(tput_origimal_class)], \
                names['reward_optimal_1_' + str(tput_origimal_class)], names[
                    'reward_optimal_2_' + str(tput_origimal_class)], names[
                    'reward_optimal_3_' + str(tput_origimal_class)], \
                names['number_optimal_' + str(tput_origimal_class)], \
                names['safety_optimal_1_' + str(tput_origimal_class)], names[
                    'safety_optimal_2_' + str(tput_origimal_class)], names[
                    'safety_optimal_3_' + str(tput_origimal_class)] \
                    = [], [], [], [], [], [], [], [], [], [], []
                names['observation_optimal_3_' +
                      str(tput_origimal_class)], names[
                          'action_optimal_3_' +
                          str(tput_origimal_class)] = [], []

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

                names['optimal_range_' + str(tput_origimal_class)] = 1.05

            elif names['highest_tput_' +
                       str(tput_origimal_class)] < tput * names[
                           'optimal_range_' + str(tput_origimal_class)]:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            for replay_class in range(0, 1):

                number_optimal = names['number_optimal_' + str(replay_class)]

                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]

                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]

                buffer_size = int(len(number_optimal))

                if buffer_size < replay_size:
                    # TODO: if the number of layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)

                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)

                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)

                else:
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])

                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])

                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])

            if not RL_1.start_cpo:
                for replay_class in range(0, 1):
                    number_optimal = names['number_optimal_vio_' +
                                           str(replay_class)]
                    safety_optimal_1 = names['safety_optimal_vio_1_' +
                                             str(replay_class)]
                    safety_optimal_2 = names['safety_optimal_vio_2_' +
                                             str(replay_class)]
                    safety_optimal_3 = names['safety_optimal_vio_3_' +
                                             str(replay_class)]
                    reward_optimal = names['reward_optimal_vio_' +
                                           str(replay_class)]

                    observation_optimal_1 = names['observation_optimal_1_vio_'
                                                  + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_vio_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_vio_'
                                                  + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_vio_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_vio_'
                                                  + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_vio_' +
                                             str(replay_class)]

                    buffer_size = int(len(number_optimal))

                    if buffer_size < replay_size:
                        # TODO: if the number of layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_ss.extend(safety_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal)
                        RL_2.ep_ss.extend(safety_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal)
                        RL_3.ep_ss.extend(safety_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start])
                            stop_location = sum(number_optimal[:replace_start +
                                                               1])
                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_1.ep_ss.extend(
                                safety_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_2.ep_ss.extend(
                                safety_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_3.ep_ss.extend(
                                safety_optimal_3[start_location:stop_location])

            time_s = time.time()
            RL_1.learn(epoch_i, thre_entropy, Ifprint=True)
            RL_2.learn(epoch_i, thre_entropy)
            optim_case = RL_3.learn(epoch_i, thre_entropy)
            time_e = time.time()
            print("learning time epoch_i:", epoch_i, time_e - time_s)
            print("End2End time epoch_i", epoch_i, time_ep_acc)
            print("Allocate time epoch_i", epoch_i, time_al_acc)
            time_al_acc = 0.0
            time_ep_acc = 0.0
        """
        checkpoint, every 3000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            adaptively shrink the optimal range
            """
            for class_replay in range(0, 1):
                number_optimal = names['number_optimal_' + str(class_replay)]
                count_size = int(len(number_optimal))

                if (count_size > 300):
                    names['optimal_range_' + str(class_replay)] *= 0.99
                    names['optimal_range_' + str(class_replay)] = max(
                        names['optimal_range_' + str(class_replay)], 1.01)
                    start_location = sum(names['number_optimal_' + str(
                        class_replay)][:-50]) * training_times_per_episode
                    names['observation_optimal_1_' +
                          str(class_replay)] = names[
                              'observation_optimal_1_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_1_' + str(class_replay)] = names[
                        'action_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_2_' +
                          str(class_replay)] = names[
                              'observation_optimal_2_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_2_' + str(class_replay)] = names[
                        'action_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_3_' +
                          str(class_replay)] = names[
                              'observation_optimal_3_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_3_' + str(class_replay)] = names[
                        'action_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['number_optimal_' +
                          str(class_replay)] = names['number_optimal_' +
                                                     str(class_replay)][-50:]
                    names['safety_optimal_1_' + str(class_replay)] = names[
                        'safety_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_2_' + str(class_replay)] = names[
                        'safety_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_3_' + str(class_replay)] = names[
                        'safety_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_1_' + str(class_replay)] = names[
                        'reward_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_2_' + str(class_replay)] = names[
                        'reward_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_3_' + str(class_replay)] = names[
                        'reward_optimal_3_' +
                        str(class_replay)][start_location:]

                print("optimal_range:",
                      names['optimal_range_' + str(class_replay)])

            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.0001)

        epoch_i += 1

        time_ep_end = time.time()
        time_ep_acc += time_ep_end - time_ep_start

        if epoch_i > 10000:
            batch_size = 100
Example #3
def train(params):
    """
    Parameter setup
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    useExternal = False
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if the number of layers changes, training_times_per_episode should be modified
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)  #: per-candidate counts/flags/total + current app id + remaining containers per app
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix="100" + '1a')

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix="100" + '2a')

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix="100" + '3a')
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []

    epoch_i = 0
    thre_entropy = 0.1

    names = locals()
    for i in range(0, 12):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]

    break_number = 0
    while epoch_i < params['epochs']:
        break_flag = False
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):

            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            if useExternal:
                action_1 = inter_episode_index % 3
                prob_weights = []
            else:
                action_1, prob_weights = RL_1.choose_action(
                    observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)  # 9*20

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):

                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)
                if useExternal:
                    action_2 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_2, prob_weights = RL_2.choose_action(
                        observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)  # 9*20
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):

            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy, mapping_index = handle_constraint(
                    observation_third_layer_copy.copy(), 3)
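                # handle_constraint presumably masks out nodes that would
                # violate placement constraints and returns mapping_index to
                # translate the chosen action back to a feasible node; if no
                # feasible node remains, the episode is abandoned via
                # break_flag.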
                if len(mapping_index) < 1:
                    break_flag = True
                    break
                assert len(mapping_index) > 0
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)
                if useExternal:
                    action_3 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_3, prob_weights = RL_3.choose_action(
                        observation_third_layer_copy.copy())
                observation_third_layer[mapping_index[action_3], appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            if break_flag:
                break
            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        if break_flag:
            break_number += 1
        """
        After an entire allocation, calculate total throughput, reward
        """
        if not break_flag:
            env.state = observation_third_layer_aggregation.copy()
            tput_state = env.get_tput_total_env()
            tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                    tput_state).sum() / NUM_CONTAINERS
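            # Note: `sim` is not created inside this train(); it is presumably
            # a module-level Simulator() instance (cf. Example #2).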
            assert sum(sum(env.state)) == NUM_CONTAINERS
            assert (env.state.sum(0) == source_batch_).all()
            list_check = 0
            for node in range(NUM_NODES):
                for app in range(env.NUM_APPS):
                    if env.state[node, :].sum(
                    ) > params['container_limitation per node'] or env.state[
                            node, app] > 1 or (app == 1
                                               and env.state[node, 2] > 0) or (
                                                   app == 2
                                                   and env.state[node, 1] > 0):
                        list_check += env.state[node, app]
            list_check_ratio = -1.0 * list_check / NUM_CONTAINERS
        else:
            tput = 0
            list_check_ratio = 0
            list_check, list_check_per_app, list_check_coex, list_check_sum = 0, 0, 0, 0

        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [tput * 1.0] * len(observation_episode_3)

        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend(
                [list_check_ratio * 1.0] *
                int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend(
                [tput * 1.0] *
                int(number_cont_second_layer[second_subcluster_index]))

        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)

        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [],
                                    list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                safety_episode_1,
                                                reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                safety_episode_2,
                                                reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                safety_episode_3,
                                                reward_episode_3)
        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' +
                 str(tput_origimal_class)] < tput and list_check_ratio == 0:
            names['highest_tput_' + str(tput_origimal_class)] = tput
            for key_prefix in ('observation_optimal_1_', 'action_optimal_1_',
                               'observation_optimal_2_', 'action_optimal_2_',
                               'observation_optimal_3_', 'action_optimal_3_',
                               'reward_optimal_1_', 'reward_optimal_2_',
                               'reward_optimal_3_', 'number_optimal_',
                               'safety_optimal_1_', 'safety_optimal_2_',
                               'safety_optimal_3_'):
                names[key_prefix + str(tput_origimal_class)] = []
            names['observation_optimal_1_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_1_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
            names['reward_optimal_2_' +
                  str(tput_origimal_class)].extend(reward_episode_2)
            names['reward_optimal_3_' +
                  str(tput_origimal_class)].extend(reward_episode_3)
            names['optimal_range_' + str(tput_origimal_class)] = 1.05
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size + 1):
            for replay_class in range(0, 1):

                number_optimal = names['number_optimal_' + str(replay_class)]

                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]

                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]

                buffer_size = int(len(number_optimal))

                if buffer_size < replay_size:
                    # TODO: if layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)

                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)

                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)

                else:
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
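                    # number_optimal records how many samples each stored optimal
                    # episode contributed (each layer keeps one sample per
                    # container), so its prefix sums give the slice boundaries of
                    # a sampled episode inside the flat *_optimal_* buffers.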
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])

                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])

                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])

            RL_1.learn(epoch_i, thre_entropy, IfPrint=True)
            RL_2.learn(epoch_i, thre_entropy)
            if len(RL_3.ep_obs) > 1:
                RL_3.learn(epoch_i, thre_entropy)
        """
        checkpoint, per 3000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit),
                     break_number=break_number)
            """
            entropy threshold adaptively decays
            """
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)
        epoch_i += 1
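

# A minimal offline-analysis sketch, not part of the original training loop: it
# assumes training above has produced the .npz trace written in the checkpoint
# block, and reloads the arrays by the keys used in that np.savez call.
import numpy as np


def load_training_trace(np_path):
    data = np.load(np_path)
    return {key: data[key]
            for key in ('tputs', 'candidate', 'vi_perapp', 'vi_coex', 'vi_sum',
                        'break_number')}
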
Example #4
0
def train(params):

    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    alpha = params['alpha']

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 + env.NUM_APPS)  # per-group counts, flags and totals + current app id + remaining per-app counts
    RL_1 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '1a')

    RL_2 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '2a')

    RL_3 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '3a')

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []

    epoch_i = 0
    thre_entropy = 0.1

    names = locals()
    for i in range(0, 12):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_' + str(i)] = []
        names['safety_optimal_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS = 100
    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(NUM_CONTAINERS, env.NUM_APPS)
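    # batch_data returns the per-app container counts (source_batch_) and the
    # per-container sequence of app ids (index_data_) used as the placement order.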
    while epoch_i < params['epochs']:
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()

        """
        Episode
        """
        """
        first layer
        """
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):

            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy > 9 * 2, axis=1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, np.array(source_batch_first)).reshape(1, -1)
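            # The flattened feature vector fed to RL_1: per-group app counts, the
            # same counts thresholded at 9 * 2, per-group totals, the current app
            # id, and the remaining unscheduled containers of each app.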
            action_1, prob_weights = RL_1.choose_action(observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)

        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):

                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy > 3 * 2, axis=1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = RL_2.choose_action(observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)

            observation_second_layer_aggregation = np.append(observation_second_layer_aggregation, observation_second_layer, 0)

        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):

            rnd_array = observation_second_layer_aggregation[third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy > 1 * 2, axis=1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, np.array(source_batch_third)).reshape(1, -1)

                action_3, prob_weights = RL_3.choose_action(observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            observation_third_layer_aggregation = np.append(observation_third_layer_aggregation, observation_third_layer, 0)

        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        tput_state = env.get_tput_total_env()
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()

        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]

        list_check_ratio = -1.0 * list_check / NUM_CONTAINERS

        safety_episode_3, reward_episode_3 = [], []
        for third_subcluster_index in range(nodes_per_group * nodes_per_group):
            list_check_ratio = 0 - list_check  # - list_check_baseline
            safety_episode_3.extend([list_check_ratio * 1.0] * int(number_cont_third_layer[third_subcluster_index]))
            reward_episode_3.extend([tput * 1.0] * int(number_cont_third_layer[third_subcluster_index]))

        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend([list_check_ratio * 1.0] * int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend([tput * 1.0] * int(number_cont_second_layer[second_subcluster_index]))

        safety_episode_1 = [list_check_ratio * 1.0] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)
        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [], list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [],[],[])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [],[],[])
        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, safety_episode_1, reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, safety_episode_2, reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, safety_episode_3, reward_episode_3)

        """
        check_tput_quality(tput)
        """
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(tput_origimal_class)], names['action_optimal_1_vio_' + str(tput_origimal_class)], names['observation_optimal_2_vio_' + str(tput_origimal_class)], names['action_optimal_2_vio_' + str(tput_origimal_class)], names['number_optimal_vio_' + str(tput_origimal_class)], names['safety_optimal_vio_1_' + str(tput_origimal_class)], names['safety_optimal_vio_2_' + str(tput_origimal_class)], names['safety_optimal_vio_3_' + str(tput_origimal_class)] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' + str(tput_origimal_class)], names['action_optimal_3_vio_' + str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)

            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' + str(tput_origimal_class)] >= list_check / names['optimal_range_vio_' + str(tput_origimal_class)]:
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            names['highest_tput_' + str(tput_origimal_class)] = tput
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []

        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size+1):
            RL_1.learn(epoch_i, thre_entropy, IfPrint=True, alpha=alpha)
            RL_2.learn(epoch_i, thre_entropy, alpha=alpha)
            RL_3.learn(epoch_i, thre_entropy, alpha=alpha)

        """
        checkpoint, per 3000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode), vi_perapp=np.array(RL_1.ss_perapp_persisit), vi_coex=np.array(RL_1.ss_coex_persisit), vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            entropy threshold adaptively decays
            """
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)
        epoch_i += 1
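

# make_path is called above but not shown in this snippet; it is assumed to
# create the per-policy checkpoint directory if it does not yet exist. A minimal
# stand-in consistent with the "./checkpoint/<path>" layout used here:
import os


def make_path(dirname):
    os.makedirs("./checkpoint/" + dirname, exist_ok=True)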
Example #5
0
    def get_total_tput(self, rnd_array):

        # assert sum(rnd_array) == 81
        source_batch_, index_data = self.batch_data(
            rnd_array.astype(int))  # index_data = [0,1,2,0,1,2]
        env = LraClusterEnv(num_nodes=self.NUM_NODES)
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        nodes_per_group = int(params['nodes per group'])
        NUM_CONTAINERS = int(sum(rnd_array))
        """
        Episode
        """
        """
        first layer
        """
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1

            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            # observation_first_layer_copy = np.append(observation_first_layer_copy, ((observation_first_layer_copy[:, 2] > 0) * (observation_first_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)

            action_1, prob_weights = self.RL_1.choose_action_determine(
                observation_first_layer_copy.copy())

            observation_first_layer[action_1, appid] += 1

            # self.store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)  # 9*20

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = self.batch_data_sub(rnd_array)

            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)

            NUM_CONTAINERS_second = sum(source_batch_second)

            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1

                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                # observation_second_layer_copy = np.append(observation_second_layer_copy, ((observation_second_layer_copy[:, 2] > 0) * (observation_second_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)

                action_2, prob_weights = self.RL_2.choose_action_determine(
                    observation_second_layer_copy.copy())

                observation_second_layer[action_2, appid] += 1

                # self.store_episode_2(observation_second_layer_copy, action_2)

            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)  # 9*20
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):

            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = self.batch_data_sub(rnd_array)

            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)

            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1

                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                # observation_third_layer_copy = np.append(observation_third_layer_copy, ((observation_third_layer_copy[:, 2] > 0) * (observation_third_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)

                action_3, prob_weights = self.RL_3.choose_action_determine(
                    observation_third_layer_copy.copy())

                observation_third_layer[action_3, appid] += 1

                # self.store_episode_3(observation_third_layer_copy, action_3)

            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        """
        After an entire allocation, calculate total throughput, reward
        """
        # state = env.state
        # assert sum(sum(self.env.state)) == 81

        return env.state
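
    # Illustrative (hypothetical) usage: given a per-app container-count vector,
    # get_total_tput rolls the three restored policies out deterministically and
    # returns the resulting (NUM_NODES, NUM_APPS) placement matrix, e.g.
    #   placement = scheduler.get_total_tput(np.array([12, 10, 8, 9, 11, 7, 24]))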
def train(params):

    import pandas as pd
    df = pd.read_csv(params['alloc_path'], sep=',', header=0)
    allocation = df.values[:, 0:7]
    df = pd.read_csv(params['rps_path'], sep=',', header=0)
    rps = df.values
    env = LraClusterEnv(num_nodes=9)
    capacity = params['container_limitation per node']
    NUM_APP = 7
    miss_sample = 0
    state_set = np.empty((0, 4), int)
    rps_set = np.empty((0, capacity), float)
    num_cardinality = 0
    for s_tag in range(NUM_APP):
        container_list = np.zeros([1, NUM_APP])
        container_list[0, s_tag] += 1
        exist = (allocation == container_list[0]).all(1).any()
        if exist:
            tput_breakdown_single = rps[allocation.tolist().index(
                container_list[0].tolist())]
        else:
            tput_node, tput_breakdown_single = (
                env.get_throughput_given_state(container_list))
            tput_breakdown_single = tput_breakdown_single[0]
            miss_sample += 1
        tput_s_tag_original = tput_breakdown_single[s_tag]
        for c_tag in range(NUM_APP):
            tput_s_tag_set = []
            for num_c_tag in range(0, capacity):
                container_list = np.zeros([1, NUM_APP])
                container_list[0, s_tag] += 1
                container_list[0, c_tag] += num_c_tag
                exist = (allocation == container_list[0]).all(1).any()
                if exist:
                    tput_breakdown_single = rps[allocation.tolist().index(
                        container_list[0].tolist())]

                else:
                    tput_node, tput_breakdown_single = (
                        env.get_throughput_given_state(container_list))
                    tput_breakdown_single = tput_breakdown_single[0]
                    miss_sample += 1
                tput_s_tag = tput_breakdown_single[s_tag]
                tput_s_tag_set.append(tput_s_tag)
            if np.max(tput_s_tag_set) / np.min(tput_s_tag_set) > 1.1:
                for check_num in range(1, capacity):
                    if tput_s_tag_set[check_num] / tput_s_tag_original > 1.1:  # 1.4
                        interference_tag = 1  # larger than
                        state_set = np.append(state_set,
                                              np.array([
                                                  s_tag + 1, c_tag + 1,
                                                  interference_tag, check_num
                                              ]).reshape([1, 4]),
                                              axis=0)
                        rps_set = np.append(
                            rps_set,
                            np.array(tput_s_tag_set /
                                     tput_s_tag_original).reshape([1, 8]),
                            axis=0)
                        break
                    if tput_s_tag_set[check_num] / tput_s_tag_original < 0.9:  # 0.6
                        interference_tag = 0  # less than
                        if c_tag == s_tag:
                            check_num += 1
                        state_set = np.append(state_set,
                                              np.array([
                                                  s_tag + 1, c_tag + 1,
                                                  interference_tag, check_num - 1
                                              ]).reshape([1, 4]),
                                              axis=0)
                        rps_set = np.append(
                            rps_set,
                            np.array(tput_s_tag_set /
                                     tput_s_tag_original).reshape([1, 8]),
                            axis=0)
                        break
            rise = 0
            fall = 0
            for check_num in range(1, capacity):
                if tput_s_tag_set[check_num] / tput_s_tag_set[check_num - 1] > 1.2:
                    rise = 1
                if tput_s_tag_set[check_num] / tput_s_tag_set[check_num - 1] < 0.8:
                    fall = 1
            if rise * fall > 0:
                num_cardinality += 1
                print(s_tag + 1, c_tag + 1)

    import pandas as pd
    save = pd.DataFrame(
        state_set, columns=["app_s", "app_c", "less_or_lager", "threshold"])
    save.to_csv('./interference_applist.csv', index=False, header=True)

    save_1 = pd.DataFrame(rps_set, columns=[0, 1, 2, 3, 4, 5, 6, 7])
    save_1.to_csv('./interference_rpslist.csv', index=False, header=True)
    print("num_cardinality: %d" % num_cardinality)
    print("miss_sample:", miss_sample)
Example #7
0
def train(params):
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"

    ckpt_path_rec_1 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "1/model.ckpt"
    ckpt_path_rec_2 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "2/model.ckpt"
    ckpt_path_rec_3 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "3/model.ckpt"

    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers changes, training_times_per_episode should be modified
    safety_requirement = 6
    # if NUM_CONTAINERS_start > 100:
    #     safety_requirement = 1.0
    # if NUM_CONTAINERS_start > 150:
    #     safety_requirement = 1.0
    # if NUM_CONTAINERS_start > 180:
    #     safety_requirement = 1.0
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)  # per-group counts, flags and totals + current app id + remaining per-app counts
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '1a',
                          safety_requirement=safety_requirement)

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '2a',
                          safety_requirement=safety_requirement)

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '3a',
                          safety_requirement=safety_requirement)

    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []

    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []

    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []

    epoch_i = 0

    thre_entropy = 0.1
    # TODO: delete this range

    names = locals()
    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0

        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []

        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []

        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05

        names['lowest_vio_' + str(i)] = 500

        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []

        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []

        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS = 50
    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]

    while epoch_i < params['epochs']:
        if Recover:
            RL_1.restore_session(ckpt_path_rec_1)
            RL_2.restore_session(ckpt_path_rec_2)
            RL_3.restore_session(ckpt_path_rec_3)
            Recover = False

        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):

            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1

            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            # observation_first_layer_copy = np.append(observation_first_layer_copy, ((observation_first_layer_copy[:, 2] > 0) * (observation_first_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)

            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer_copy.copy())

            observation_first_layer[action_1, appid] += 1

            store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)  # 9*20

        number_cont_second_layer = []

        for second_layer_index in range(nodes_per_group):

            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)

            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)

            NUM_CONTAINERS_second = sum(source_batch_second)

            number_cont_second_layer.append(NUM_CONTAINERS_second)

            for inter_episode_index in range(NUM_CONTAINERS_second):

                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1

                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                # observation_second_layer_copy = np.append(observation_second_layer_copy, ((observation_second_layer_copy[:, 2] > 0) * (observation_second_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)

                action_2, prob_weights = RL_2.choose_action(
                    observation_second_layer_copy.copy())

                observation_second_layer[action_2, appid] += 1

                store_episode_2(observation_second_layer_copy, action_2)

            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)  # 9*20
        number_cont_third_layer = []

        for third_layer_index in range(nodes_per_group * nodes_per_group):

            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)

            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)

            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)

            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1

                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                # observation_third_layer_copy = np.append(observation_third_layer_copy, ((observation_third_layer_copy[:, 2] > 0) * (observation_third_layer_copy[:, 3] > 0)).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)

                action_3, prob_weights = RL_3.choose_action(
                    observation_third_layer_copy.copy())

                observation_third_layer[action_3, appid] += 1

                store_episode_3(observation_third_layer_copy, action_3)

            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        tput_state = env.state
        tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS))
        tput = (tput_state.sum(1) < 1e-10).sum()
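        # In this variant "tput" counts the nodes left completely empty, so the
        # reward favours consolidating containers onto as few nodes as possible.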
        reward_ratio = (tput - 0)

        state = env.state
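        # The violation count combines four terms: more than one replica of an
        # app on a node, node totals above the per-node container limit,
        # co-location of apps 1 and 2, and containers whose simulator-predicted
        # throughput ratio falls below 0.8.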
        list_check_per_app = (env.state > 1).sum() + max(
            (env.state - 1).max(), 0)
        list_check_sum = sum(
            env.state.sum(1) > params['container_limitation per node']
        ) + max(
            max(env.state.sum(1) - params['container_limitation per node']), 0)
        list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0))
        list_check = list_check_sum + list_check_coex + list_check_per_app

        list_check += ((tput_breakdown < 0.8) * tput_state).sum()

        list_check_ratio = list_check

        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1)

        safety_episode_2 = [list_check_ratio * 1.0
                            ] * len(observation_episode_2)
        reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2)

        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3)

        RL_1.store_tput_per_episode(tput, epoch_i, list_check,
                                    list_check_per_app, list_check_coex,
                                    list_check_sum)
        RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1,
                                                safety_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2,
                                                safety_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3,
                                                safety_episode_3)
        """
        check_tput_quality(tput)
        """
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            for key_prefix in ('observation_optimal_1_vio_', 'action_optimal_1_vio_',
                               'observation_optimal_2_vio_', 'action_optimal_2_vio_',
                               'observation_optimal_3_vio_', 'action_optimal_3_vio_',
                               'number_optimal_vio_', 'safety_optimal_vio_1_',
                               'safety_optimal_vio_2_', 'safety_optimal_vio_3_',
                               'reward_optimal_vio_'):
                names[key_prefix + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' +
                   str(tput_origimal_class)] >= list_check / names[
                       'optimal_range_vio_' + str(tput_origimal_class)]:
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)

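        # Only when the violation is well below the safety requirement is the
        # episode eligible for the throughput-optimal replay buffers below.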
        if list_check <= safety_requirement * 0.5:
            if names['highest_tput_' + str(tput_origimal_class)] < tput:
                names['highest_tput_' + str(tput_origimal_class)] = tput

                for key in [
                        'observation_optimal_1_', 'action_optimal_1_',
                        'observation_optimal_2_', 'action_optimal_2_',
                        'observation_optimal_3_', 'action_optimal_3_',
                        'reward_optimal_1_', 'reward_optimal_2_',
                        'reward_optimal_3_', 'number_optimal_',
                        'safety_optimal_1_', 'safety_optimal_2_',
                        'safety_optimal_3_'
                ]:
                    names[key + str(tput_origimal_class)] = []

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

                names['optimal_range_' + str(tput_origimal_class)] = 1.05

            elif names['highest_tput_' +
                       str(tput_origimal_class)] < tput * names[
                           'optimal_range_' + str(tput_origimal_class)]:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)

                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)

                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            for replay_class in range(0, 1):

                number_optimal = names['number_optimal_' + str(replay_class)]

                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]

                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]

                buffer_size = int(len(number_optimal))

                if buffer_size < replay_size:
                    # TODO: if layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)

                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)

                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)

                else:
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
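                    # number_optimal holds the length (NUM_CONTAINERS) of each
                    # stored optimal episode, so prefix sums give the slice
                    # boundaries of a sampled episode in the flat buffers.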
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])

                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])

                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])

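            # Until RL_1.start_cpo is set, also replay the lowest-violation
            # episodes so the policies are first pushed towards satisfying
            # the safety constraint.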
            if not RL_1.start_cpo:
                for replay_class in range(0, 1):
                    number_optimal = names['number_optimal_vio_' +
                                           str(replay_class)]
                    safety_optimal_1 = names['safety_optimal_vio_1_' +
                                             str(replay_class)]
                    safety_optimal_2 = names['safety_optimal_vio_2_' +
                                             str(replay_class)]
                    safety_optimal_3 = names['safety_optimal_vio_3_' +
                                             str(replay_class)]
                    reward_optimal = names['reward_optimal_vio_' +
                                           str(replay_class)]

                    observation_optimal_1 = names['observation_optimal_1_vio_'
                                                  + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_vio_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_vio_'
                                                  + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_vio_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_vio_'
                                                  + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_vio_' +
                                             str(replay_class)]

                    buffer_size = int(len(number_optimal))

                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_ss.extend(safety_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal)
                        RL_2.ep_ss.extend(safety_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal)
                        RL_3.ep_ss.extend(safety_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start])
                            stop_location = sum(number_optimal[:replace_start +
                                                               1])
                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_1.ep_ss.extend(
                                safety_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_2.ep_ss.extend(
                                safety_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_3.ep_ss.extend(
                                safety_optimal_3[start_location:stop_location])

            RL_1.learn(epoch_i, thre_entropy, Ifprint=True)
            RL_2.learn(epoch_i, thre_entropy)
            optim_case = RL_3.learn(epoch_i, thre_entropy)
        """
        checkpoint, every 3000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                print("\n epoch: %d, highest tput: %f" %
                      (epoch_i, highest_value))

                # lowest_vio_ = names['lowest_vio_' + str(class_replay)]
                # print("\n epoch: %d, lowest_vio: %f" % (epoch_i, lowest_vio_))

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))
            # print("epoch:", epoch_i, "mean(sum): ", np.mean(RL_1.ss_sum_persisit[-500:]), "mean(coex): ", np.mean(RL_1.coex_persisit[-500:]))
            """
            optimal range adaptively change
            """
            for class_replay in range(0, 1):
                number_optimal = names['number_optimal_' + str(class_replay)]
                count_size = int(len(number_optimal))

                if (count_size > 300):
                    names['optimal_range_' + str(class_replay)] *= 0.99
                    names['optimal_range_' + str(class_replay)] = max(
                        names['optimal_range_' + str(class_replay)], 1.01)

                    start_location = sum(names['number_optimal_' + str(
                        class_replay)][:-50]) * training_times_per_episode

                    names['observation_optimal_1_' +
                          str(class_replay)] = names[
                              'observation_optimal_1_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_1_' + str(class_replay)] = names[
                        'action_optimal_1_' +
                        str(class_replay)][start_location:]

                    names['observation_optimal_2_' +
                          str(class_replay)] = names[
                              'observation_optimal_2_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_2_' + str(class_replay)] = names[
                        'action_optimal_2_' +
                        str(class_replay)][start_location:]

                    names['observation_optimal_3_' +
                          str(class_replay)] = names[
                              'observation_optimal_3_' +
                              str(class_replay)][start_location:]
                    names['action_optimal_3_' + str(class_replay)] = names[
                        'action_optimal_3_' +
                        str(class_replay)][start_location:]

                    names['number_optimal_' +
                          str(class_replay)] = names['number_optimal_' +
                                                     str(class_replay)][-50:]

                    names['safety_optimal_1_' + str(class_replay)] = names[
                        'safety_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_2_' + str(class_replay)] = names[
                        'safety_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_3_' + str(class_replay)] = names[
                        'safety_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_1_' + str(class_replay)] = names[
                        'reward_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_2_' + str(class_replay)] = names[
                        'reward_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_3_' + str(class_replay)] = names[
                        'reward_optimal_3_' +
                        str(class_replay)][start_location:]

                print("optimal_range:",
                      names['optimal_range_' + str(class_replay)])

            print(prob_weights)
            # if optim_case > 0:
            thre_entropy *= 0.5
            # if epoch_i < 50000:
            #     thre_entropy = max(thre_entropy, 0.01)
            # else:
            thre_entropy = max(thre_entropy, 0.0001)

        epoch_i += 1
        if epoch_i > 20000:
            batch_size = 200
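
# Both hierarchical examples turn the three per-layer group choices
# (action_1, action_2, action_3) into a flat node index; see the
# `final_decision` line in the episode loop of Example #8 below. A minimal
# standalone sketch of that arithmetic, assuming 27 nodes and 3 groups per
# layer as hard-coded in these examples; the helper name `flat_node_index`
# is illustrative only and not part of the original code.
def flat_node_index(action_1, action_2, action_3,
                    num_nodes=27, nodes_per_group=3):
    """Map hierarchical group choices onto a flat node id."""
    first = num_nodes // nodes_per_group     # 9 nodes per level-1 block
    second = first // nodes_per_group        # 3 nodes per level-2 block
    third = second // nodes_per_group        # 1 node per level-3 block
    return action_1 * first + action_2 * second + action_3 * third

# e.g. block 2 (nodes 18-26), sub-block 1 (nodes 21-23), node 0 -> node 21
assert flat_node_index(2, 1, 0) == 21
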
Example #8
0
def train(params):
    time_epoch_set = []
    start_time = time.time()
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    NUM_CONTAINERS = params['number of containers']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 +
                     env.NUM_APPS)  #: 3*7+1+7 = 29
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '1')

    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '2')

    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['number of containers']) + '3')
    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time

    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

    epoch_i = 0
    entropy_weight = 0.1
    for i in range(0, 1):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2

    for i in range(0, 1):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []

    for i in range(0, 1):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    while epoch_i < params['epochs']:

        tput_origimal_class = 0
        source_batch_, index_data = batch_data(
            NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
        observation = env.reset().copy()  # (NUM_NODES, NUM_APPS) placement matrix
        source_batch = source_batch_.copy()
        source_batch_copy = source_batch.copy()

        total = source_batch
        # observation = observation_original.copy()
        limit = (1 - observation)
        capacity = (params['container_limitation per node'] -
                    observation.sum(1)).reshape(-1)  # remaining slots per node, shape (NUM_NODES,)
        s = Solver()
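        # Z3 feasibility model: names['x' + str(i)] (declared earlier,
        # presumably one Z3 integer variable per node) counts containers of
        # app i on each of the 27 nodes; the constraints below encode batch
        # totals, node capacity, non-negativity, per-app spread limits and
        # the App1/App2 anti-affinity rule.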
        # app sum == batch

        for i in range(7):
            s.add(z3.Sum(names['x' + str(i)]) == int(total[i]))

        # node capacity
        for node in range(27):
            s.add(
                z3.Sum([names['x' + str(i)][node]
                        for i in range(7)]) <= int(capacity[node]))

        # >=0
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] >= 0)

        # per app spread
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] <= int(limit[node, i]))

        # App1 and App2 not exist
        for node in range(27):
            s.add(names['x' + str(1)][node] + names['x' + str(2)][node] <= 1)

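        # For a given app, tentatively assert "one more container of this app
        # on node `place`" and check satisfiability; nodes for which this is
        # unsat are masked by mapping their observation rows onto feasible
        # nodes, and the final decision is translated back via mapping_index.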
        def handle_constraint(NUM_NODES, appid, source_batch):

            observation_original = observation.copy()

            mapping_index = []
            list_check = []

            t2 = time.time()
            for place in range(27):
                s.push()
                s.add(names['x' +
                            str(appid)][place] >= env.state[place][appid] + 1)

                if s.check() == z3.sat:
                    list_check.append(False)
                else:
                    list_check.append(True)
                s.pop()

            t3 = time.time()
            # print("formulate: ", t2 - t1)
            # print("calculate: ", t3 - t2)
            good_index = np.where(np.array(list_check) == False)[0]
            length = len(good_index)
            if length < 1:
                test = 1
            index_replace = 0
            for node in range(NUM_NODES):
                if list_check[node]:  # bad node
                    # index_this_replace = good_index[np.random.randint(length)]
                    index_this_replace = good_index[index_replace % length]
                    index_replace += 1
                    observation_original[node] = observation[
                        index_this_replace]
                    mapping_index.append(index_this_replace)
                else:
                    mapping_index.append(node)
                    observation_original[node] = observation[node]

            return observation_original, mapping_index

        """
        Episode
        """
        for inter_episode_index in range(NUM_CONTAINERS):

            source_batch[index_data[inter_episode_index]] -= 1

            appid = index_data[inter_episode_index]
            observation, mapping_index = handle_constraint(
                NUM_NODES, appid, source_batch_copy)
            observation[:, index_data[inter_episode_index]] += 1
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:(i + 1) *
                                number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(
                observation_first_layer).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                np.array(source_batch)).reshape(1, -1)  # (1,29)

            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer.copy())

            observation_copy = observation.copy()
            observation_copy = observation_copy[action_1 *
                                                number_of_first_layer_nodes:
                                                (action_1 + 1) *
                                                number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:(i + 1) *
                                     number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(
                observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = RL_2.choose_action(
                observation_second_layer.copy())

            observation_copy = observation_copy[action_2 *
                                                number_of_second_layer_nodes:
                                                (action_2 + 1) *
                                                number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:(i + 1) *
                                     number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(
                observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                np.array(source_batch)).reshape(1, -1)

            action_3, prob_weights = RL_3.choose_action(
                observation_third_layer.copy())

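            # With 27 nodes and 3 groups per layer, action_1 selects a block
            # of 9 nodes, action_2 a sub-block of 3, and action_3 the single
            # node, so the flat index is a1*9 + a2*3 + a3*1.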
            final_decision = (action_1 * number_of_first_layer_nodes +
                              action_2 * number_of_second_layer_nodes +
                              action_3 * number_of_third_layer_nodes)

            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            decision = mapping_index[final_decision]
            s.add(
                names['x' +
                      str(appid)][decision] >= int(env.state[decision][appid]))
            # for i in range(number_of_node_groups):
            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)
            observation = observation_.copy()  # (NUM_NODES, NUM_APPS) placement matrix
        """
        After an entire allocation, calculate total throughput, reward
        """
        # start_ = time.time()
        tput_state = env.get_tput_total_env()
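        # sim.predict gives (presumably per-container) throughput for each app
        # under every node's co-location state; weighting by the container
        # counts and dividing by NUM_CONTAINERS yields the average throughput
        # per container for this allocation.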
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                tput_state).sum() / NUM_CONTAINERS

        # print(time.time() - start_)
        # tput = 1.0 * tput / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)
        assert (np.sum(env.state, axis=1) <=
                params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS
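        # Sanity check: the Z3-guided rollout must yield zero violations of
        # node capacity, the one-replica-per-node spread rule, and the
        # App1/App2 anti-affinity constraint.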
        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum(
                ) > params['container_limitation per node'] or env.state[
                        node, app] > 1 or (app == 1 and env.state[node, 2] > 0
                                           ) or (app == 2
                                                 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]
        assert (list_check == 0)

        reward_ratio = tput

        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1, 0)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2, 0)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3, 0)
        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            highest_tput_original = names['highest_tput_' +
                                          str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' +
                                           str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['number_optimal_' + str(tput_origimal_class)] = []

            for layer in ('1', '2', '3'):
                names['observation_optimal_' + layer + '_' +
                      str(tput_origimal_class)] = []
                names['action_optimal_' + layer + '_' +
                      str(tput_origimal_class)] = []
                names['reward_optimal_' + layer + '_' +
                      str(tput_origimal_class)] = []
            if UseExperienceReplay:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['optimal_range_' + str(tput_origimal_class)] = min(
                1.2, tput / (highest_tput_original / optimal_range_original))
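            # Rescale the acceptance band so the previous admission threshold
            # (highest_tput / optimal_range) carries over to the new best
            # throughput, capped at a range of 1.2.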
        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names[
                'optimal_range_' + str(tput_origimal_class)]:

            if UseExperienceReplay:

                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)

                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)

                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)

            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
        """
        Each batch, RL.learn()
        """
        # records_per_episode = NUM_CONTAINERS * training_times_per_episode
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            if UseExperienceReplay:
                for replay_class in range(0, 1):

                    reward_optimal_1 = names['reward_optimal_1_' +
                                             str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' +
                                                  str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' +
                                             str(replay_class)]

                    reward_optimal_2 = names['reward_optimal_2_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' +
                                                  str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' +
                                             str(replay_class)]

                    reward_optimal_3 = names['reward_optimal_3_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' +
                                                  str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' +
                                             str(replay_class)]

                    number_optimal = names['number_optimal_' +
                                           str(replay_class)]

                    buffer_size = int(len(number_optimal))
                    assert sum(
                        number_optimal) * training_times_per_episode == len(
                            action_optimal_1)

                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)

                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)

                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)

                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(number_optimal[:replace_start]
                                                 ) * training_times_per_episode
                            stop_location = sum(
                                number_optimal[:replace_start +
                                               1]) * training_times_per_episode

                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal_1[start_location:stop_location])

                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal_2[start_location:stop_location])

                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal_3[start_location:stop_location])

            # entropy_weight=0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)
        """
        checkpoint, every 500 episodes
        """
        if (epoch_i % 500 == 0) & (epoch_i > 1):
            highest_value = 0
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' +
                                           str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" %
                      (epoch_i, highest_value, optimal_number))

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)

            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode))
            """
            entropy weight adaptively decays
            """
            print(prob_weights)
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)
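            # Halve the entropy bonus every 500 episodes (floored at 0.002) to
            # shift gradually from exploration towards exploitation.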
            print("time by now: ", time.time() - start_time)

        epoch_i += 1