def __init__(self, path_name, surffix, path_surffix):
    """Restore the three pre-trained hierarchical policy networks.

    Builds the cluster environment, instantiates one PolicyGradient agent
    per hierarchy level (1a/2a/3a), restores each from its "_1006_<k>"
    checkpoint, and zero-initializes the per-level episode/optimal buffers.

    Args:
        path_name: base name of the checkpoint directory family.
        surffix: suffix string used to namespace the agents' TF variables.
        path_surffix: directory prefix prepended to ``path_name``.

    NOTE(review): reads the module-level ``params`` dict — confirm it is
    populated before construction.
    """
    self.NUM_NODES = params['number of nodes in the cluster']
    # self.NUM_CONTAINERS = params['number of containers']
    # self.sim = Simulator()
    self.env = LraClusterEnv(num_nodes=self.NUM_NODES)

    # Checkpoint paths for the three per-level agents.
    ckpt_paths = [
        path_surffix + path_name + "_1006_{}".format(level) + "/model.ckpt"
        for level in (1, 2, 3)
    ]

    self.nodes_per_group = int(params['nodes per group'])
    # self.number_of_node_groups = int(self.NUM_NODES / self.nodes_per_group)

    # One sub-action per node in a group; feature vector packs the group
    # state (per-app counts + overflow flags + per-node totals), the app id
    # and the remaining-demand vector.
    self.n_actions = self.nodes_per_group  #: 3 nodes per group
    self.n_features = int(
        self.n_actions * (self.env.NUM_APPS + 1 + self.env.NUM_APPS)
        + 1 + self.env.NUM_APPS)  #: 29

    # Build all three agents first, then restore them in order, matching
    # the original construct-all-then-restore-all side-effect ordering.
    for level, tag in ((1, '1a'), (2, '2a'), (3, '3a')):
        agent = PolicyGradient(n_actions=self.n_actions,
                               n_features=self.n_features,
                               learning_rate=params['learning rate'],
                               suffix=surffix + tag)
        setattr(self, 'RL_{}'.format(level), agent)
    for level, ckpt in enumerate(ckpt_paths, start=1):
        getattr(self, 'RL_{}'.format(level)).restore_session(ckpt)

    # Fresh (empty) rollout buffers for every level: current-episode and
    # best-seen ("optimal") observations/actions/rewards/safety costs.
    for level in (1, 2, 3):
        for stem in ('observation', 'action', 'reward', 'safety'):
            setattr(self, '{}_episode_{}'.format(stem, level), [])
            setattr(self, '{}_optimal_{}'.format(stem, level), [])
def train(params):
    """Train the three-level hierarchical policy-gradient scheduler.

    Each epoch places NUM_CONTAINERS containers onto NUM_NODES nodes in
    three recursive steps (cluster -> group -> sub-group -> node), scores
    the final placement with a throughput simulator and a constraint
    violation count, keeps elite ("optimal") trajectories in replay
    buffers, and periodically calls ``RL_k.learn`` on a mix of fresh and
    replayed samples.

    Args:
        params: configuration dict (node/group counts, learning rate,
            batch/replay sizes, checkpoint path, epochs,
            safety_requirement, container_limitation per node, recover).

    NOTE(review): relies on module-level globals not defined here —
    ``NUM_CONTAINERS``, ``app_node_set``, ``batch_data``,
    ``batch_data_sub``, ``Simulator`` — verify against the module header.
    """
    print("Current params", params)
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # Save paths and recovery paths are identical: recovery resumes from
    # the latest saved checkpoint of the same run.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    ckpt_path_rec_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_rec_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_rec_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers changes, training_times_per_episode should be modified
    # safety_requirement = 2.0 / 100.
    # Maximum tolerated violation ratio (violations / NUM_CONTAINERS) for a
    # trajectory to be eligible for the elite throughput buffer.
    safety_requirement = params['safety_requirement']
    print(
        "######## safety_requirement = {} ########".format(safety_requirement))
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    # Per-group feature vector: per-app counts + overflow flags + per-node
    # totals, followed by the current app id and remaining-demand vector.
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)  #: 3*9+1 = 28
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '1a',
                          safety_requirement=safety_requirement,
                          params=params)
    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '2a',
                          safety_requirement=safety_requirement,
                          params=params)
    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '3a',
                          safety_requirement=safety_requirement,
                          params=params)
    # Throughput predictor used to score a finished placement.
    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-level rollout buffers for the current episode and for the elite
    # ("optimal") trajectories retained across episodes.
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    # Entropy threshold passed to learn(); decayed at every checkpoint.
    thre_entropy = 0.1
    # TODO: delete this range
    # NOTE(review): ``names = locals()`` is used as a poor-man's dynamic
    # variable table; all later access goes through this one dict object.
    # This is fragile (CPython-specific locals() snapshot semantics) —
    # a plain dict would be safer.
    names = locals()
    for i in range(0, 10):
        # Elite buffers keyed by class index: best throughput so far,
        # trajectories achieving it, and the acceptance slack
        # ('optimal_range').
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        # Parallel buffers tracking the lowest-violation trajectories,
        # used while the agent has not yet entered the CPO phase.
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        # NOTE(review): these keys are 'reward_optimal_vio_1_<i>' etc., but
        # the loop body below reads/writes 'reward_optimal_vio_<i>', which
        # is only created inside the new-lowest-violation branch. If the
        # elif branch fires first, this raises KeyError — confirm intended.
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        # Append one (state, action) pair to the level-1 episode buffer.
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        # Append one (state, action) pair to the level-2 episode buffer.
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        # Append one (state, action) pair to the level-3 episode buffer.
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    # Fixed container demand for every epoch; index_data lists the app id
    # of each container in placement order.
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
    time_ep_acc = 0.0
    time_al_acc = 0.0
    while epoch_i < params['epochs']:
        time_ep_start = time.time()
        if Recover:
            # One-shot restore from the last saved checkpoints.
            print("Recover from {}".format(ckpt_path_rec_1))
            RL_1.restore_session(ckpt_path_rec_1)
            RL_2.restore_session(ckpt_path_rec_2)
            RL_3.restore_session(ckpt_path_rec_3)
            Recover = False

        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        time_al_start = time.time()
        source_batch_first = source_batch_.copy()
        # Level-1 state: container counts per (group, app).
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            # Build the feature vector for placing this container: counts
            # with the candidate app tentatively added everywhere, overflow
            # flags (> 9*2 at this level), per-group totals, the app id,
            # and the remaining demand.
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)  # 9*20
        number_cont_second_layer = []
        # Recurse into each level-1 group: redistribute its containers
        # over nodes_per_group sub-groups with RL_2.
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                # Same feature construction as level 1 (overflow threshold
                # 3*2 at this level).
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = RL_2.choose_action(
                    observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)  # 9*20
        number_cont_third_layer = []
        # Final level: every sub-group is spread over its actual nodes by
        # RL_3; the aggregation becomes the full per-node placement.
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                # Same feature construction (overflow threshold 1*2 here).
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)
                action_3, prob_weights = RL_3.choose_action(
                    observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        time_al_end = time.time()
        time_al_acc += time_al_end - time_al_start
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        # Sanity: every container placed, per-app totals match the demand.
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        tput_state = env.state
        tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS))
        # Mean predicted per-container throughput; used directly as reward.
        tput = (tput_breakdown * tput_state).sum() / NUM_CONTAINERS
        reward_ratio = (tput - 0)
        state = env.state
        # These three are not actually used in training, just for logging
        list_check_per_app = (env.state > 1).sum() + max(
            (env.state - 1).max(), 0)
        list_check_sum = sum(
            env.state.sum(1) > params['container_limitation per node']
        ) + max(
            max(env.state.sum(1) - params['container_limitation per node']),
            0)
        list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0))
        # list_check = list_check_sum + list_check_coex + list_check_per_app
        # Actual training constraint signal, counted below.
        list_check = 0
        # error = 0
        # for node in range(NUM_NODES):
        #     for app in range(env.NUM_APPS):
        #         if env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
        #             error += env.state[node, app]
        # assert error==0
        # container limitation & deployment spread
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params[
                        'container_limitation per node']:  #or env.state[node, app] > 1:
                    list_check += env.state[node, app]
        # hardware affinity & increamental deployment
        # NOTE(review): only apps 0..6 have affinity sets — presumably
        # app_node_set has exactly 7 entries; confirm against its definition.
        for app in range(7):
            node_now = np.where(env.state[:, app] > 0)[0]
            for node_ in node_now:
                if node_ not in app_node_set[app]:
                    list_check += env.state[node_, app]
        list_check_ratio = list_check / NUM_CONTAINERS
        # Broadcast the episode-level reward/violation to every step of the
        # trajectory at each level.
        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1)
        safety_episode_2 = [list_check_ratio * 1.0
                            ] * len(observation_episode_2)
        reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2)
        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3)
        RL_1.store_tput_per_episode(tput, epoch_i, list_check,
                                    list_check_per_app, list_check_coex,
                                    list_check_sum)
        RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1,
                                                safety_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2,
                                                safety_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3,
                                                safety_episode_3)
        """
        check_tput_quality(tput)
        """
        # --- lowest-violation elite buffer update ---
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            # Strictly fewer violations than ever seen: reset the vio
            # buffers and keep only this trajectory.
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(
                tput_origimal_class
            )], names[
                'action_optimal_1_vio_' + str(tput_origimal_class)], names[
                    'observation_optimal_2_vio_' + str(tput_origimal_class)], names[
                        'action_optimal_2_vio_' + str(tput_origimal_class)], names[
                            'number_optimal_vio_' + str(tput_origimal_class)], names[
                                'safety_optimal_vio_1_' + str(tput_origimal_class)], names[
                                    'safety_optimal_vio_2_' + str(tput_origimal_class)], names[
                                        'safety_optimal_vio_3_' + str(
                                            tput_origimal_class
                                        )] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)], names[
                      'action_optimal_3_vio_' +
                      str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' +
                   str(tput_origimal_class)] >= list_check / names[
                       'optimal_range_vio_' + str(tput_origimal_class)]:
            # Within the acceptance slack of the best: append (no reset).
            # NOTE(review): 'reward_optimal_vio_<class>' is only created in
            # the branch above — KeyError if this elif fires first.
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
        # --- highest-throughput elite buffer update (safe episodes only) ---
        # if list_check_ratio <= safety_requirement*0.5:
        if list_check_ratio <= safety_requirement:
            if names['highest_tput_' + str(tput_origimal_class)] < tput:
                # New best throughput: reset and keep only this trajectory.
                names['highest_tput_' + str(tput_origimal_class)] = tput
                names['observation_optimal_1_' + str(tput_origimal_class)], names[
                    'action_optimal_1_' + str(tput_origimal_class)], names[
                        'observation_optimal_2_' + str(tput_origimal_class)], names[
                            'action_optimal_2_' + str(tput_origimal_class)], \
                names['reward_optimal_1_' + str(tput_origimal_class)], names[
                    'reward_optimal_2_' + str(tput_origimal_class)], names[
                        'reward_optimal_3_' + str(tput_origimal_class)], \
                names['number_optimal_' + str(tput_origimal_class)], \
                names['safety_optimal_1_' + str(tput_origimal_class)], names[
                    'safety_optimal_2_' + str(tput_origimal_class)], names[
                        'safety_optimal_3_' + str(tput_origimal_class)] \
                    = [], [], [], [], [], [], [], [], [], [], []
                names['observation_optimal_3_' + str(tput_origimal_class)], names[
                    'action_optimal_3_' + str(tput_origimal_class)] = [], []
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)
                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
                names['optimal_range_' + str(tput_origimal_class)] = 1.05
            elif names['highest_tput_' +
                       str(tput_origimal_class)] < tput * names[
                           'optimal_range_' + str(tput_origimal_class)]:
                # Within the acceptance slack of the best: append.
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)
                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
        # Clear the per-episode buffers for the next rollout.
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        # Bitwise & on two bools — behaves like `and` here.
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            # Replay elite throughput trajectories into each agent's
            # training buffers (all of them if few, a random sample of
            # replay_size episodes otherwise).
            for replay_class in range(0, 1):
                number_optimal = names['number_optimal_' + str(replay_class)]
                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]
                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]
                buffer_size = int(len(number_optimal))
                if buffer_size < replay_size:
                    # TODO: if layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)
                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)
                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)
                else:
                    # number_optimal holds per-episode step counts, so the
                    # prefix sums delimit each episode's slice.
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start +
                                                           1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])
                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])
                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])
            if not RL_1.start_cpo:
                # Before CPO kicks in, additionally replay the
                # lowest-violation trajectories (shared reward list).
                for replay_class in range(0, 1):
                    number_optimal = names['number_optimal_vio_' +
                                           str(replay_class)]
                    safety_optimal_1 = names['safety_optimal_vio_1_' +
                                             str(replay_class)]
                    safety_optimal_2 = names['safety_optimal_vio_2_' +
                                             str(replay_class)]
                    safety_optimal_3 = names['safety_optimal_vio_3_' +
                                             str(replay_class)]
                    reward_optimal = names['reward_optimal_vio_' +
                                           str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_vio_'
                                                  + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_vio_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_vio_'
                                                  + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_vio_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_vio_'
                                                  + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_vio_' +
                                             str(replay_class)]
                    buffer_size = int(len(number_optimal))
                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_ss.extend(safety_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal)
                        RL_2.ep_ss.extend(safety_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal)
                        RL_3.ep_ss.extend(safety_optimal_3)
                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start])
                            stop_location = sum(number_optimal[:replace_start
                                                               + 1])
                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_1.ep_ss.extend(
                                safety_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_2.ep_ss.extend(
                                safety_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_3.ep_ss.extend(
                                safety_optimal_3[start_location:stop_location])
            time_s = time.time()
            RL_1.learn(epoch_i, thre_entropy, Ifprint=True)
            RL_2.learn(epoch_i, thre_entropy)
            optim_case = RL_3.learn(epoch_i, thre_entropy)
            time_e = time.time()
            print("learning time epoch_i:", epoch_i, time_e - time_s)
            print("End2End time epoch_i", epoch_i, time_ep_acc)
            print("Allocate time epoch_i", epoch_i, time_al_acc)
            time_al_acc = 0.0
            time_ep_acc = 0.0
        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            # Persist training curves for offline analysis.
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            optimal range adaptively change
            """
            # Tighten the elite acceptance slack and trim the elite buffers
            # to the most recent 50 episodes once they grow past 300.
            for class_replay in range(0, 1):
                number_optimal = names['number_optimal_' + str(class_replay)]
                count_size = int(len(number_optimal))
                if (count_size > 300):
                    names['optimal_range_' + str(class_replay)] *= 0.99
                    names['optimal_range_' + str(class_replay)] = max(
                        names['optimal_range_' + str(class_replay)], 1.01)
                    start_location = sum(names['number_optimal_' + str(
                        class_replay)][:-50]) * training_times_per_episode
                    names['observation_optimal_1_' + str(class_replay)] = names[
                        'observation_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_1_' + str(class_replay)] = names[
                        'action_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_2_' + str(class_replay)] = names[
                        'observation_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_2_' + str(class_replay)] = names[
                        'action_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_3_' + str(class_replay)] = names[
                        'observation_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_3_' + str(class_replay)] = names[
                        'action_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['number_optimal_' +
                          str(class_replay)] = names['number_optimal_' +
                                                     str(class_replay)][-50:]
                    names['safety_optimal_1_' + str(class_replay)] = names[
                        'safety_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_2_' + str(class_replay)] = names[
                        'safety_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_3_' + str(class_replay)] = names[
                        'safety_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_1_' + str(class_replay)] = names[
                        'reward_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_2_' + str(class_replay)] = names[
                        'reward_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_3_' + str(class_replay)] = names[
                        'reward_optimal_3_' +
                        str(class_replay)][start_location:]
                print("optimal_range:",
                      names['optimal_range_' + str(class_replay)])
            # Halve the entropy threshold (floored) at every checkpoint.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.0001)
        epoch_i += 1
        time_ep_end = time.time()
        time_ep_acc += time_ep_end - time_ep_start
        # Switch to a larger learning batch late in training.
        if epoch_i > 10000:
            batch_size = 100
def train(params):
    """Train the three-level hierarchical policy-gradient container scheduler.

    One PolicyGradient network is built per hierarchy level (cluster ->
    sub-cluster -> node group).  Each episode allocates all containers
    greedily level by level, then the whole allocation is scored by the
    simulator's predicted throughput and by constraint violations; the best
    violation-free episodes are kept in an "optimal" replay buffer that is
    mixed into each batched policy update.

    Args:
        params: configuration dict; keys read here:
            'number of nodes in the cluster', 'batch_size', 'path',
            'recover', 'nodes per group', 'replay size', 'learning rate',
            'epochs', 'container_limitation per node'.

    Side effects:
        Saves TensorFlow checkpoints and an ``.npz`` metrics archive under
        ``./checkpoint/<path>*`` every 3000 episodes.
    """
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # One checkpoint directory per hierarchy level.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    # When True, a fixed round-robin action replaces the learned policy
    # (used below as a debugging/baseline switch).
    useExternal = False
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    # NOTE(review): Recover is read from params but never used in this
    # function — confirm whether checkpoint restore was meant to happen here.
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers changes, training_times_per_episode should be modified

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 + env.NUM_APPS)  #: 3*9+1 = 28
    RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '1a')
    RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '2a')
    RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix="100" + '3a')

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-level episode buffers (current rollout) and "optimal" buffers
    # (best rollouts kept for replay).
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    thre_entropy = 0.1
    # names is used purely as a dict keyed by generated strings
    # ('highest_tput_0', ...); the function never relies on these entries
    # becoming real local variables.
    names = locals()

    # Initialise the per-class bookkeeping: best throughput seen, optimal
    # replay buffers, and the (unused-in-this-variant) violation buffers.
    for i in range(0, 12):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    # Small closures that append one (observation, action) step to the
    # current episode buffer of the corresponding level.
    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    tput_origimal_class = 0
    # NOTE(review): NUM_CONTAINERS is not assigned anywhere in this
    # function — presumably a module-level constant (the sibling train()
    # variant uses NUM_CONTAINERS = 100); confirm it is defined at import
    # time or this raises NameError.
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
    break_number = 0
    while epoch_i < params['epochs']:
        break_flag = False
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()

        """
        Episode
        """
        """
        first layer
        """
        # Level 1: place every container into one of the nodes_per_group
        # top-level groups.  The feature vector fed to the policy is:
        # per-group app counts, an over-threshold indicator (> 9*2),
        # per-group totals, the current app id, and the remaining batch.
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            if useExternal:
                # Baseline: deterministic round-robin placement.
                action_1 = inter_episode_index % 3
                prob_weights = []
            else:
                action_1, prob_weights = RL_1.choose_action(
                    observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)

        """
        second layer
        """
        # Level 2: re-split every level-1 group into nodes_per_group
        # sub-groups using the same featurisation (threshold > 3*2).
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)
                if useExternal:
                    action_2 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_2, prob_weights = RL_2.choose_action(
                        observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)

        """
        third layer
        """
        # Level 3: map each level-2 sub-group onto concrete nodes
        # (threshold > 1*2).  handle_constraint masks out nodes that would
        # violate hard constraints; if no feasible node remains the whole
        # episode is abandoned via break_flag.
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy, mapping_index = handle_constraint(
                    observation_third_layer_copy.copy(), 3)
                if len(mapping_index) < 1:
                    # No feasible node left: abandon this episode.
                    break_flag = True
                    break
                assert len(mapping_index) > 0
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)
                if useExternal:
                    action_3 = inter_episode_index % 3
                    prob_weights = []
                else:
                    action_3, prob_weights = RL_3.choose_action(
                        observation_third_layer_copy.copy())
                # The chosen action indexes into the feasible-node mapping,
                # not directly into the node array.
                observation_third_layer[mapping_index[action_3], appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            if break_flag:
                break
            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        if break_flag:
            break_number += 1

        """
        After an entire allocation, calculate total throughput, reward
        """
        if not break_flag:
            env.state = observation_third_layer_aggregation.copy()
            tput_state = env.get_tput_total_env()
            # Per-container average of simulator-predicted throughput.
            tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                    tput_state).sum() / NUM_CONTAINERS
            assert sum(sum(env.state)) == NUM_CONTAINERS
            assert (env.state.sum(0) == source_batch_).all()
            # Count containers violating: node capacity, >1 replica of an
            # app per node, or the app-1/app-2 anti-affinity pair.
            list_check = 0
            for node in range(NUM_NODES):
                for app in range(env.NUM_APPS):
                    if env.state[node, :].sum(
                    ) > params['container_limitation per node'] or env.state[
                            node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (
                                app == 2 and env.state[node, 1] > 0):
                        list_check += env.state[node, app]
            list_check_ratio = -1.0 * list_check / NUM_CONTAINERS
        else:
            # Abandoned episode: zero reward, no violation signal.
            tput = 0
            list_check_ratio = 0
            list_check, list_check_per_app, list_check_coex, list_check_sum = 0, 0, 0, 0

        # Broadcast the episode-level reward/safety scalars to every step
        # of each level's trajectory.
        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [tput * 1.0] * len(observation_episode_3)
        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend(
                [list_check_ratio * 1.0] *
                int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend(
                [tput * 1.0] *
                int(number_cont_second_layer[second_subcluster_index]))
        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)

        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [],
                                    list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                safety_episode_1,
                                                reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                safety_episode_2,
                                                reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                safety_episode_3,
                                                reward_episode_3)

        """
        check_tput_quality(tput)
        """
        # A new violation-free best: reset the optimal buffers and seed
        # them with this episode's trajectories.
        if names['highest_tput_' + str(tput_origimal_class)] < tput and list_check_ratio == 0:
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['observation_optimal_1_' + str(tput_origimal_class)], names[
                'action_optimal_1_' + str(tput_origimal_class)], names[
                    'observation_optimal_2_' +
                    str(tput_origimal_class)], names['action_optimal_2_' + str(
                        tput_origimal_class)], names['reward_optimal_1_' + str(
                            tput_origimal_class
                        )], names['reward_optimal_2_' + str(
                            tput_origimal_class
                        )], names['reward_optimal_3_' + str(
                            tput_origimal_class
                        )], names['number_optimal_' + str(
                            tput_origimal_class
                        )], names['safety_optimal_1_' + str(
                            tput_origimal_class
                        )], names['safety_optimal_2_' + str(
                            tput_origimal_class
                        )], names['safety_optimal_3_' + str(
                            tput_origimal_class
                        )] = [], [], [], [], [], [], [], [], [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names[
                'action_optimal_3_' + str(tput_origimal_class)] = [], []
            names['observation_optimal_1_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_1_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
            names['reward_optimal_2_' +
                  str(tput_origimal_class)].extend(reward_episode_2)
            names['reward_optimal_3_' +
                  str(tput_origimal_class)].extend(reward_episode_3)
            names['optimal_range_' + str(tput_origimal_class)] = 1.05

        # Reset the episode buffers for the next rollout.
        # NOTE(review): reward_episode_1/2/3 each appear twice in their
        # target list — redundant (same value assigned twice) but harmless.
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1, reward_episode_1 = [], [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2, reward_episode_2 = [], [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3, reward_episode_3 = [], [], [], [], []

        """
        Each batch, RL.learn()
        """
        # '&' on the two comparison results is bitwise-and of bools, which
        # is equivalent to 'and' here (both sides are already evaluated).
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size + 1):
            for replay_class in range(0, 1):
                number_optimal = names['number_optimal_' + str(replay_class)]
                reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' + str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' + str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' + str(replay_class)]
                observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' + str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' + str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' + str(replay_class)]
                buffer_size = int(len(number_optimal))
                if buffer_size < replay_size:
                    # Buffer smaller than the replay budget: replay all of it.
                    # TODO: if layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)
                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)
                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)
                else:
                    # Sample replay_size stored episodes without replacement;
                    # number_optimal gives each episode's step count, so the
                    # prefix sums delimit its slice in the flat buffers.
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start + 1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])
                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])
                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])
                RL_1.learn(epoch_i, thre_entropy, IfPrint=True)
                RL_2.learn(epoch_i, thre_entropy)
                # RL_3 may have an empty batch if every episode broke early.
                if len(RL_3.ep_obs) > 1:
                    RL_3.learn(epoch_i, thre_entropy)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit),
                     break_number=break_number)

            """
            optimal range adaptively change
            """
            # Halve the entropy threshold at every checkpoint, floored at 0.01.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)

        epoch_i += 1
def train(params):
    """Train the three-level hierarchical scheduler (safety-weighted variant).

    Same hierarchical rollout as the sibling ``train`` but: no external
    baseline switch and no feasibility masking (actions map directly to
    nodes); an ``alpha`` coefficient from ``params`` is passed to
    ``PolicyGradient.learn``; and instead of a throughput-keyed optimal
    buffer it maintains violation-keyed ("vio") replay buffers tracking
    the lowest-violation episodes.

    Args:
        params: configuration dict; keys read here:
            'number of nodes in the cluster', 'batch_size', 'path',
            'recover', 'nodes per group', 'replay size', 'alpha',
            'learning rate', 'epochs', 'container_limitation per node'.

    Side effects:
        Saves TensorFlow checkpoints and an ``.npz`` metrics archive under
        ``./checkpoint/<path>*`` every 3000 episodes.
    """
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # One checkpoint directory per hierarchy level.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    make_path(params['path'] + "1")
    make_path(params['path'] + "2")
    make_path(params['path'] + "3")
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    # NOTE(review): Recover is read from params but never used in this
    # function — confirm whether checkpoint restore was meant to happen here.
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    # Safety/throughput trade-off coefficient forwarded to learn().
    alpha = params['alpha']

    """
    Build Network
    """
    n_actions = nodes_per_group  # 3 nodes per group
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS )+ 1 + env.NUM_APPS)  # 3*9+1 = 28
    RL_1 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '1a')
    RL_2 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '2a')
    RL_3 = PolicyGradient(
        n_actions=n_actions,
        n_features=n_features,
        learning_rate=params['learning rate'],
        suffix="100" + '3a')

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-level episode buffers (current rollout) and "optimal" buffers.
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    thre_entropy = 0.1
    # names is used purely as a dict keyed by generated strings; the
    # function never relies on these entries becoming real locals.
    names = locals()

    # Initialise per-class bookkeeping: best throughput, optimal buffers,
    # and the violation-keyed ("vio") replay buffers used below.
    for i in range(0, 12):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_' + str(i)] = []
        names['safety_optimal_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    # Small closures appending one (observation, action) step per level.
    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS = 100
    tput_origimal_class = 0
    source_batch_, index_data_ = batch_data(NUM_CONTAINERS, env.NUM_APPS)
    while epoch_i < params['epochs']:
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()

        """
        Episode
        """
        """
        first layer
        """
        # Level 1: place every container into one of the nodes_per_group
        # top-level groups (over-threshold indicator uses > 9*2).
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy > 9 * 2, axis=1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, observation_first_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
            observation_first_layer_copy = np.array(observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(observation_first_layer_copy, np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)

        """
        second layer
        """
        # Level 2: re-split each level-1 group (threshold > 3*2).
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy > 3 * 2, axis=1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, observation_second_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_second_layer_copy = np.array(observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(observation_second_layer_copy, np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = RL_2.choose_action(observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(observation_second_layer_aggregation, observation_second_layer, 0)

        """
        third layer
        """
        # Level 3: map each level-2 sub-group onto concrete nodes
        # (threshold > 1*2).  Unlike the sibling variant there is no
        # feasibility masking — the chosen action indexes nodes directly.
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS], int)  # 9*20
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy > 1 * 2, axis=1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, observation_third_layer_copy.sum(axis=1).reshape(nodes_per_group, 1), axis=1)
                observation_third_layer_copy = np.array(observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(observation_third_layer_copy, np.array(source_batch_third)).reshape(1, -1)
                action_3, prob_weights = RL_3.choose_action(observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            observation_third_layer_aggregation = np.append(observation_third_layer_aggregation, observation_third_layer, 0)

        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        tput_state = env.get_tput_total_env()
        # Per-container average of simulator-predicted throughput.
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        # Count containers violating: node capacity, >1 replica of an app
        # per node, or the app-1/app-2 anti-affinity pair.
        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]
        list_check_ratio = -1.0 * list_check / NUM_CONTAINERS
        safety_episode_3, reward_episode_3 = [], []
        # NOTE(review): the loop below immediately overwrites the
        # normalized ratio computed above with the raw negative count
        # (0 - list_check), and that overwritten value is then reused for
        # layers 2 and 1 as well — confirm this is intentional.
        for thrid_subcluster_index in range(nodes_per_group * nodes_per_group):
            list_check_ratio = 0 - list_check  # - list_check_baseline
            safety_episode_3.extend([list_check_ratio * 1.0] * int(number_cont_third_layer[thrid_subcluster_index]))
            reward_episode_3.extend([tput * 1.0] * int(number_cont_third_layer[thrid_subcluster_index]))
        safety_episode_2, reward_episode_2 = [], []
        for second_subcluster_index in range(nodes_per_group):
            safety_episode_2.extend([list_check_ratio * 1.0] * int(number_cont_second_layer[second_subcluster_index]))
            reward_episode_2.extend([tput * 1.0] * int(number_cont_second_layer[second_subcluster_index]))
        safety_episode_1 = [list_check_ratio * 1.0] * len(observation_episode_1)
        reward_episode_1 = [tput * 1.0] * len(observation_episode_1)

        RL_1.store_tput_per_episode(tput, list_check, epoch_i, [], [], list_check)
        RL_2.store_tput_per_episode(tput, list_check, epoch_i, [],[],[])
        RL_3.store_tput_per_episode(tput, list_check, epoch_i, [],[],[])
        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, safety_episode_1, reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, safety_episode_2, reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, safety_episode_3, reward_episode_3)

        """
        check_tput_quality(tput)
        """
        # New lowest violation count: reset the vio buffers and seed them
        # with this episode's trajectories.
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(tput_origimal_class)], names['action_optimal_1_vio_' + str(tput_origimal_class)], names['observation_optimal_2_vio_' + str(tput_origimal_class)], names['action_optimal_2_vio_' + str(tput_origimal_class)], names['number_optimal_vio_' + str(tput_origimal_class)], names['safety_optimal_vio_1_' + str(tput_origimal_class)], names['safety_optimal_vio_2_' + str(tput_origimal_class)], names['safety_optimal_vio_3_' + str(tput_origimal_class)] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' + str(tput_origimal_class)], names['action_optimal_3_vio_' + str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)
            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        # Episode is within the adaptive tolerance band of the best:
        # append it to the vio buffers without resetting.
        elif names['lowest_vio_' + str(tput_origimal_class)] >= list_check / names['optimal_range_vio_' + str(tput_origimal_class)]:
            names['observation_optimal_1_vio_' + str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' + str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' + str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' + str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' + str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' + str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' + str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' + str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' + str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' + str(tput_origimal_class)].extend(reward_episode_1)
        # Track the best throughput seen (recorded only; no buffer reset
        # in this variant).
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            names['highest_tput_' + str(tput_origimal_class)] = tput

        # Reset the episode buffers for the next rollout.
        # NOTE(review): reward_episode_1/2/3 each appear twice in their
        # target list — redundant (same value assigned twice) but harmless.
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1, reward_episode_1 = [], [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2, reward_episode_2 = [], [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3, reward_episode_3 = [], [], [], [], []

        """
        Each batch, RL.learn()
        """
        # '&' on the two comparison results is bitwise-and of bools, which
        # is equivalent to 'and' here (both sides are already evaluated).
        if (epoch_i % batch_size == 0) & (epoch_i > batch_size+1):
            RL_1.learn(epoch_i, thre_entropy, IfPrint=True, alpha=alpha)
            RL_2.learn(epoch_i, thre_entropy, alpha=alpha)
            RL_3.learn(epoch_i, thre_entropy, alpha=alpha)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))

            """
            optimal range adaptively change
            """
            # Halve the entropy threshold at every checkpoint, floored at 0.01.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.01)

        epoch_i += 1
    def get_total_tput(self, rnd_array):
        """Deterministically place all containers in ``rnd_array`` and return the allocation.

        Runs one greedy (non-exploring) hierarchical rollout: the cluster is split
        into ``nodes_per_group`` groups three times (first/second/third layer), and at
        each layer the corresponding restored policy (``self.RL_1/2/3``) assigns each
        container to a sub-group via ``choose_action_determine``.

        Args:
            rnd_array: per-app container counts to place (length ``env.NUM_APPS``;
                cast to int internally).

        Returns:
            ``env.state``: the final node-by-app allocation matrix whose entries sum
            to ``sum(rnd_array)`` and whose column sums match ``rnd_array``.

        NOTE(review): reads the module-level ``params`` dict for 'nodes per group'
        rather than an instance attribute — confirm ``params`` is in scope at call time.
        """
        # assert sum(rnd_array) == 81
        source_batch_, index_data = self.batch_data(
            rnd_array.astype(int))  # index_data = flat per-container app ids, e.g. [0,1,2,0,1,2]
        env = LraClusterEnv(num_nodes=self.NUM_NODES)
        observation = env.reset().copy()  # (num_nodes, NUM_APPS); presumably all zeros after reset — verify in env
        source_batch = source_batch_.copy()
        nodes_per_group = int(params['nodes per group'])
        NUM_CONTAINERS = int(sum(rnd_array))

        """
        Episode
        """
        """
        first layer
        """
        # First layer: assign each container to one of `nodes_per_group` super-groups.
        # The observation fed to RL_1 is the current group-by-app count matrix with
        # the candidate container tentatively added, augmented with:
        #   * a boolean mask of counts exceeding 9*2 (first-layer overload indicator),
        #   * per-group totals, flattened, plus the app id and remaining per-app counts.
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS], int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            # Deterministic (argmax-style) action — no exploration at inference time.
            action_1, prob_weights = self.RL_1.choose_action_determine(
                observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
        """
        second layer
        """
        # Second layer: each first-layer group's contents are re-batched and split
        # again into `nodes_per_group` sub-groups by RL_2 (overload mask uses 3*2).
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = self.batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = self.RL_2.choose_action_determine(
                    observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        # Third layer: each of the nodes_per_group^2 second-layer sub-groups is
        # split by RL_3 into individual nodes (overload mask uses 1*2).
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = self.batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)
                action_3, prob_weights = self.RL_3.choose_action_determine(
                    observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        """
        After an entire allocation, calculate total throughput, reward
        """
        # Sanity checks: nothing lost or invented during the hierarchical split.
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        return env.state
def train(params):
    """Profile pairwise app interference from measured (or simulated) RPS data.

    For every (source app ``s_tag``, colocated app ``c_tag``) pair, sweep the
    number of colocated ``c_tag`` containers from 0 to ``capacity - 1`` on one
    node and record, relative to the source app's solo throughput, the first
    point where its RPS rises above 1.1x ("larger than" interference, tag 1)
    or falls below 0.9x ("less than" interference, tag 0).

    Outputs (written to the working directory):
        interference_applist.csv  — rows [app_s, app_c, less_or_lager, threshold]
        interference_rpslist.csv  — the normalized RPS curve for each flagged pair

    Args:
        params: dict with 'alloc_path' / 'rps_path' (CSVs of sampled
            allocations and their per-app RPS) and
            'container_limitation per node' (per-node container capacity).

    Also prints `num_cardinality` (pairs whose curve both rises >1.2x and
    falls <0.8x step-to-step, i.e. non-monotone interference) and
    `miss_sample` (allocations absent from the CSVs, filled by the simulator).
    """
    import pandas as pd

    df = pd.read_csv(params['alloc_path'], sep=',', header=0)
    allocation = df.values[:, 0:7]  # sampled allocations: one 7-app container vector per row
    df = pd.read_csv(params['rps_path'], sep=',', header=0)
    rps = df.values  # per-app RPS aligned row-for-row with `allocation`
    env = LraClusterEnv(num_nodes=9)
    capacity = params['container_limitation per node']
    NUM_APP = 7
    miss_sample = 0
    state_set = np.empty((0, 4), int)         # [app_s, app_c, less_or_larger, threshold]
    rps_set = np.empty((0, capacity), float)  # normalized RPS curve per flagged pair
    num_cardinality = 0

    def lookup_breakdown(container_list):
        """Per-app RPS vector for one allocation: CSV row if sampled, else simulator."""
        nonlocal miss_sample
        if (allocation == container_list[0]).all(1).any():
            return rps[allocation.tolist().index(container_list[0].tolist())]
        tput_node, tput_breakdown_single = (
            env.get_throughput_given_state(container_list))
        miss_sample += 1
        return tput_breakdown_single[0]

    for s_tag in range(NUM_APP):
        # Baseline: the source app running alone on a node.
        container_list = np.zeros([1, NUM_APP])
        container_list[0, s_tag] += 1
        tput_s_tag_original = lookup_breakdown(container_list)[s_tag]

        for c_tag in range(NUM_APP):
            # Sweep 0..capacity-1 colocated containers of app c_tag.
            tput_s_tag_set = []
            for num_c_tag in range(0, capacity):
                container_list = np.zeros([1, NUM_APP])
                container_list[0, s_tag] += 1
                container_list[0, c_tag] += num_c_tag
                tput_s_tag_set.append(lookup_breakdown(container_list)[s_tag])

            # Only record pairs whose curve varies by more than 10% overall.
            if np.max(tput_s_tag_set) / np.min(tput_s_tag_set) > 1.1:
                for check_num in range(1, capacity):
                    ratio = tput_s_tag_set[check_num] / tput_s_tag_original
                    if ratio > 1.1:  # was 1.4 in an earlier tuning
                        interference_tag = 1  # throughput larger than baseline
                        threshold = check_num
                    elif ratio < 0.9:  # was 0.6 in an earlier tuning
                        interference_tag = 0  # throughput less than baseline
                        # Self-colocation: the source container occupies one of the
                        # counted slots, so the threshold is not shifted down by one.
                        threshold = check_num if c_tag == s_tag else check_num - 1
                    else:
                        continue
                    state_set = np.append(
                        state_set,
                        np.array([s_tag + 1, c_tag + 1, interference_tag,
                                  threshold]).reshape([1, 4]),
                        axis=0)
                    # BUGFIX: row width was hard-coded to 8; it must match the
                    # `capacity`-wide buffer allocated above.
                    rps_set = np.append(
                        rps_set,
                        (np.array(tput_s_tag_set) /
                         tput_s_tag_original).reshape([1, capacity]),
                        axis=0)
                    break

            # Flag non-monotone curves: a >1.2x rise AND a <0.8x fall between
            # consecutive colocation counts.
            rise = 0
            fall = 0
            for check_num in range(1, capacity):
                if tput_s_tag_set[check_num] / tput_s_tag_set[check_num - 1] > 1.2:
                    rise = 1
                if tput_s_tag_set[check_num] / tput_s_tag_set[check_num - 1] < 0.8:
                    fall = 1
            if rise * fall > 0:
                num_cardinality += 1
                print(s_tag + 1, c_tag + 1)

    save = pd.DataFrame(
        state_set, columns=["app_s", "app_c", "less_or_lager", "threshold"])
    save.to_csv('./interference_applist.csv', index=False, header=True)
    # Column labels follow the buffer width instead of a hard-coded 0..7.
    save_1 = pd.DataFrame(rps_set, columns=list(range(capacity)))
    save_1.to_csv('./interference_rpslist.csv', index=False, header=True)
    print("num_cardinality: %d" % num_cardinality)
    print("miss_sample:", miss_sample)
def train(params):
    """Train the three-layer hierarchical CPO scheduling policies.

    Repeatedly rolls out a full placement of NUM_CONTAINERS containers through
    three PolicyGradient agents (group -> sub-group -> node), scores each
    episode by throughput (`tput`) and a constraint-violation count
    (`list_check`), maintains elite replay buffers of the best episodes, and
    periodically calls `RL_*.learn` and checkpoints the sessions.

    Args:
        params: configuration dict ('number of nodes in the cluster', 'path',
            'rec_path', 'recover', 'batch_size', 'nodes per group',
            'replay size', 'learning rate', 'epochs',
            'container_limitation per node').
    """
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # Checkpoints written during this run vs. pre-trained ones restored on recover.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "1/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "2/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "3/model.ckpt"
    ckpt_path_rec_1 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "1/model.ckpt"
    ckpt_path_rec_2 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "2/model.ckpt"
    ckpt_path_rec_3 = "../results/cpo/newhypernode/" + params[
        'rec_path'] + "3/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers changes, training_times_per_episode should be modified
    # CPO safety budget: episodes with list_check <= 0.5 * this may enter the
    # "highest throughput" elite buffer.
    safety_requirement = 6
    """
    Build Network
    """
    n_actions = nodes_per_group  # one action per sub-group at each layer
    # Observation layout: per-group (counts + overload mask + total) flattened,
    # plus current app id, plus remaining per-app counts.
    n_features = int(n_actions * (env.NUM_APPS + 1 + env.NUM_APPS) + 1 +
                     env.NUM_APPS)
    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '1a',
                          safety_requirement=safety_requirement)
    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '2a',
                          safety_requirement=safety_requirement)
    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(100) + '3a',
                          safety_requirement=safety_requirement)
    sim = Simulator()
    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    number_optimal = []
    # Per-episode rollout buffers (cleared after every episode) and elite buffers.
    observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
    observation_optimal_1, action_optimal_1, reward_optimal_1, safety_optimal_1 = [], [], [], []
    observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
    observation_optimal_2, action_optimal_2, reward_optimal_2, safety_optimal_2 = [], [], [], []
    observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
    observation_optimal_3, action_optimal_3, reward_optimal_3, safety_optimal_3 = [], [], [], []
    epoch_i = 0
    thre_entropy = 0.1
    # TODO: delete this range
    # NOTE(review): mutating the dict returned by locals() to create dynamic
    # variables is CPython-specific and fragile; a plain dict keyed by class id
    # would be safer. Kept as-is (doc-only change).
    names = locals()
    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
        names['safety_optimal_1_' + str(i)] = []
        names['safety_optimal_2_' + str(i)] = []
        names['safety_optimal_3_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.05
        names['lowest_vio_' + str(i)] = 500
        names['observation_optimal_1_vio_' + str(i)] = []
        names['action_optimal_1_vio_' + str(i)] = []
        names['observation_optimal_2_vio_' + str(i)] = []
        names['action_optimal_2_vio_' + str(i)] = []
        names['observation_optimal_3_vio_' + str(i)] = []
        names['action_optimal_3_vio_' + str(i)] = []
        names['reward_optimal_vio_1_' + str(i)] = []
        names['reward_optimal_vio_2_' + str(i)] = []
        names['reward_optimal_vio_3_' + str(i)] = []
        names['safety_optimal_vio_1_' + str(i)] = []
        names['safety_optimal_vio_2_' + str(i)] = []
        names['safety_optimal_vio_3_' + str(i)] = []
        names['number_optimal_vio_' + str(i)] = []
        names['optimal_range_vio_' + str(i)] = 1.1

    def store_episode_1(observations, actions):
        # Append one (observation, action) pair to the layer-1 episode buffer.
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        # Append one (observation, action) pair to the layer-2 episode buffer.
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        # Append one (observation, action) pair to the layer-3 episode buffer.
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS = 50
    tput_origimal_class = 0  # single workload class; indexes the names[...] buffers
    source_batch_, index_data_ = batch_data(
        NUM_CONTAINERS, env.NUM_APPS)  # index_data = flat per-container app ids
    while epoch_i < params['epochs']:
        if Recover:
            # One-time restore from pre-trained checkpoints on the first epoch.
            RL_1.restore_session(ckpt_path_rec_1)
            RL_2.restore_session(ckpt_path_rec_2)
            RL_3.restore_session(ckpt_path_rec_3)
            Recover = False

        observation = env.reset().copy()
        source_batch = source_batch_.copy()
        index_data = index_data_.copy()
        """
        Episode
        """
        """
        first layer
        """
        # Layer 1: stochastic policy (choose_action) assigns each container to a
        # super-group; every step is recorded for training. Overload mask uses 9*2.
        source_batch_first = source_batch_.copy()
        observation_first_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                           int)
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            source_batch_first[appid] -= 1
            observation_first_layer_copy = observation_first_layer.copy()
            observation_first_layer_copy[:, appid] += 1
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy > 9 * 2,
                axis=1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                observation_first_layer_copy.sum(axis=1).reshape(
                    nodes_per_group, 1),
                axis=1)
            observation_first_layer_copy = np.array(
                observation_first_layer_copy).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy, appid).reshape(1, -1)
            observation_first_layer_copy = np.append(
                observation_first_layer_copy,
                np.array(source_batch_first)).reshape(1, -1)
            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer_copy.copy())
            observation_first_layer[action_1, appid] += 1
            store_episode_1(observation_first_layer_copy, action_1)
        """
        second layer
        """
        # Layer 2: each super-group is re-batched and split again. Overload mask 3*2.
        observation_second_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                        int)
        number_cont_second_layer = []
        for second_layer_index in range(nodes_per_group):
            rnd_array = observation_first_layer[second_layer_index].copy()
            source_batch_second, index_data = batch_data_sub(rnd_array)
            observation_second_layer = np.zeros(
                [nodes_per_group, env.NUM_APPS], int)
            NUM_CONTAINERS_second = sum(source_batch_second)
            number_cont_second_layer.append(NUM_CONTAINERS_second)
            for inter_episode_index in range(NUM_CONTAINERS_second):
                appid = index_data[inter_episode_index]
                source_batch_second[appid] -= 1
                observation_second_layer_copy = observation_second_layer.copy()
                observation_second_layer_copy[:, appid] += 1
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy > 3 * 2,
                    axis=1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    observation_second_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_second_layer_copy = np.array(
                    observation_second_layer_copy).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy, appid).reshape(1, -1)
                observation_second_layer_copy = np.append(
                    observation_second_layer_copy,
                    np.array(source_batch_second)).reshape(1, -1)
                action_2, prob_weights = RL_2.choose_action(
                    observation_second_layer_copy.copy())
                observation_second_layer[action_2, appid] += 1
                store_episode_2(observation_second_layer_copy, action_2)
            observation_second_layer_aggregation = np.append(
                observation_second_layer_aggregation, observation_second_layer,
                0)
        """
        third layer
        """
        # Layer 3: nodes_per_group^2 sub-groups are split into individual nodes.
        observation_third_layer_aggregation = np.empty([0, env.NUM_APPS],
                                                       int)
        number_cont_third_layer = []
        for third_layer_index in range(nodes_per_group * nodes_per_group):
            rnd_array = observation_second_layer_aggregation[
                third_layer_index].copy()
            source_batch_third, index_data = batch_data_sub(rnd_array)
            observation_third_layer = np.zeros([nodes_per_group, env.NUM_APPS],
                                               int)
            NUM_CONTAINERS_third = sum(source_batch_third)
            number_cont_third_layer.append(NUM_CONTAINERS_third)
            for inter_episode_index in range(NUM_CONTAINERS_third):
                appid = index_data[inter_episode_index]
                source_batch_third[appid] -= 1
                observation_third_layer_copy = observation_third_layer.copy()
                observation_third_layer_copy[:, appid] += 1
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy > 1 * 2,
                    axis=1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    observation_third_layer_copy.sum(axis=1).reshape(
                        nodes_per_group, 1),
                    axis=1)
                observation_third_layer_copy = np.array(
                    observation_third_layer_copy).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy, appid).reshape(1, -1)
                observation_third_layer_copy = np.append(
                    observation_third_layer_copy,
                    np.array(source_batch_third)).reshape(1, -1)
                action_3, prob_weights = RL_3.choose_action(
                    observation_third_layer_copy.copy())
                observation_third_layer[action_3, appid] += 1
                store_episode_3(observation_third_layer_copy, action_3)
            observation_third_layer_aggregation = np.append(
                observation_third_layer_aggregation, observation_third_layer,
                0)
        """
        After an entire allocation, calculate total throughput, reward
        """
        env.state = observation_third_layer_aggregation.copy()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        assert (env.state.sum(0) == source_batch_).all()
        tput_state = env.state
        tput_breakdown = sim.predict(tput_state.reshape(-1, env.NUM_APPS))
        # Reward here counts nodes with (near-)zero total load, i.e. it rewards
        # packing containers onto fewer nodes.
        tput = (tput_state.sum(1) < 1e-10).sum()
        reward_ratio = (tput - 0)
        state = env.state
        # Constraint violations:
        #  - per-app: more than one container of an app on a node,
        #  - sum: node total exceeding the per-node capacity,
        #  - coex: apps 1 and 2 colocated on the same node,
        #  - plus containers whose predicted per-app throughput ratio < 0.8.
        list_check_per_app = (env.state > 1).sum() + max(
            (env.state - 1).max(), 0)
        list_check_sum = sum(
            env.state.sum(1) > params['container_limitation per node']
        ) + max(
            max(env.state.sum(1) - params['container_limitation per node']),
            0)
        list_check_coex = sum((env.state[:, 1] > 0) * (env.state[:, 2] > 0))
        list_check = list_check_sum + list_check_coex + list_check_per_app
        list_check += ((tput_breakdown < 0.8) * tput_state).sum()
        list_check_ratio = list_check
        # Broadcast the episode-level reward/safety signal to every step taken.
        safety_episode_1 = [list_check_ratio * 1.0
                            ] * len(observation_episode_1)
        reward_episode_1 = [reward_ratio * 1.0] * len(observation_episode_1)
        safety_episode_2 = [list_check_ratio * 1.0
                            ] * len(observation_episode_2)
        reward_episode_2 = [reward_ratio * 1.0] * len(observation_episode_2)
        safety_episode_3 = [list_check_ratio * 1.0
                            ] * len(observation_episode_3)
        reward_episode_3 = [reward_ratio * 1.0] * len(observation_episode_3)
        RL_1.store_tput_per_episode(tput, epoch_i, list_check,
                                    list_check_per_app, list_check_coex,
                                    list_check_sum)
        RL_2.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_3.store_tput_per_episode(tput, epoch_i, list_check, [], [], [])
        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1,
                                                safety_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2,
                                                safety_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3,
                                                safety_episode_3)
        """
        check_tput_quality(tput)
        """
        # Elite "lowest violation" buffer: reset and refill when a new minimum
        # list_check is found; otherwise append episodes within optimal_range_vio.
        if names['lowest_vio_' + str(tput_origimal_class)] > list_check:
            names['lowest_vio_' + str(tput_origimal_class)] = list_check
            names['observation_optimal_1_vio_' + str(
                tput_origimal_class
            )], names[
                'action_optimal_1_vio_' +
                str(tput_origimal_class)], names[
                    'observation_optimal_2_vio_' +
                    str(tput_origimal_class)], names[
                        'action_optimal_2_vio_' +
                        str(tput_origimal_class)], names[
                            'number_optimal_vio_' +
                            str(tput_origimal_class)], names[
                                'safety_optimal_vio_1_' +
                                str(tput_origimal_class)], names[
                                    'safety_optimal_vio_2_' +
                                    str(tput_origimal_class)], names[
                                        'safety_optimal_vio_3_' + str(
                                            tput_origimal_class
                                        )] = [], [], [], [], [], [], [], []
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)], names[
                      'action_optimal_3_vio_' +
                      str(tput_origimal_class)] = [], []
            names['reward_optimal_vio_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            # NOTE(review): layer-1 rewards reused for all three layers here —
            # all layers share the same broadcast episode reward, so equivalent.
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
            names['optimal_range_vio_' + str(tput_origimal_class)] = 1.1
        elif names['lowest_vio_' +
                   str(tput_origimal_class)] >= list_check / names[
                       'optimal_range_vio_' + str(tput_origimal_class)]:
            # Near-minimum-violation episode: append without resetting.
            names['observation_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_1)
            names['action_optimal_1_vio_' +
                  str(tput_origimal_class)].extend(action_episode_1)
            names['observation_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_2)
            names['action_optimal_2_vio_' +
                  str(tput_origimal_class)].extend(action_episode_2)
            names['observation_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(observation_episode_3)
            names['action_optimal_3_vio_' +
                  str(tput_origimal_class)].extend(action_episode_3)
            names['number_optimal_vio_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['safety_optimal_vio_1_' +
                  str(tput_origimal_class)].extend(safety_episode_1)
            names['safety_optimal_vio_2_' +
                  str(tput_origimal_class)].extend(safety_episode_2)
            names['safety_optimal_vio_3_' +
                  str(tput_origimal_class)].extend(safety_episode_3)
            names['reward_optimal_vio_' +
                  str(tput_origimal_class)].extend(reward_episode_1)
        # Elite "highest throughput" buffer: only episodes safely within half
        # the safety budget are eligible.
        if list_check <= safety_requirement * 0.5:
            if names['highest_tput_' + str(tput_origimal_class)] < tput:
                names['highest_tput_' + str(tput_origimal_class)] = tput
                names['observation_optimal_1_' + str(tput_origimal_class)], names['action_optimal_1_' + str(tput_origimal_class)], names['observation_optimal_2_' + str(tput_origimal_class)], names['action_optimal_2_' + str(tput_origimal_class)],\
                    names['reward_optimal_1_' + str(tput_origimal_class)], names['reward_optimal_2_' + str(tput_origimal_class)], names['reward_optimal_3_' + str(tput_origimal_class)], \
                    names['number_optimal_' + str(tput_origimal_class)],\
                    names['safety_optimal_1_' + str(tput_origimal_class)], names['safety_optimal_2_' + str(tput_origimal_class)], names['safety_optimal_3_' + str(tput_origimal_class)]\
                    = [], [], [], [], [], [], [], [], [], [], []
                names['observation_optimal_3_' +
                      str(tput_origimal_class)], names[
                          'action_optimal_3_' +
                          str(tput_origimal_class)] = [], []
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)
                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
                names['optimal_range_' + str(tput_origimal_class)] = 1.05
            elif names['highest_tput_' +
                       str(tput_origimal_class)] < tput * names[
                           'optimal_range_' + str(tput_origimal_class)]:
                # Near-best episode: append without resetting.
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['number_optimal_' +
                      str(tput_origimal_class)].append(NUM_CONTAINERS)
                names['safety_optimal_1_' +
                      str(tput_origimal_class)].extend(safety_episode_1)
                names['safety_optimal_2_' +
                      str(tput_origimal_class)].extend(safety_episode_2)
                names['safety_optimal_3_' +
                      str(tput_origimal_class)].extend(safety_episode_3)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
        # Clear per-episode buffers for the next rollout.
        observation_episode_1, action_episode_1, reward_episode_1, safety_episode_1 = [], [], [], []
        observation_episode_2, action_episode_2, reward_episode_2, safety_episode_2 = [], [], [], []
        observation_episode_3, action_episode_3, reward_episode_3, safety_episode_3 = [], [], [], []
        """
        Each batch, RL.learn()
        """
        # NOTE(review): `&` is a bitwise AND on two bools here — works, but
        # `and` is the conventional operator.
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            # Refill each agent's training buffers from the elite
            # highest-throughput episodes (sampled if the buffer is large).
            for replay_class in range(0, 1):
                number_optimal = names['number_optimal_' + str(replay_class)]
                reward_optimal_1 = names['reward_optimal_1_' +
                                         str(replay_class)]
                reward_optimal_2 = names['reward_optimal_2_' +
                                         str(replay_class)]
                reward_optimal_3 = names['reward_optimal_3_' +
                                         str(replay_class)]
                safety_optimal_1 = names['safety_optimal_1_' +
                                         str(replay_class)]
                safety_optimal_2 = names['safety_optimal_2_' +
                                         str(replay_class)]
                safety_optimal_3 = names['safety_optimal_3_' +
                                         str(replay_class)]
                observation_optimal_1 = names['observation_optimal_1_' +
                                              str(replay_class)]
                action_optimal_1 = names['action_optimal_1_' +
                                         str(replay_class)]
                observation_optimal_2 = names['observation_optimal_2_' +
                                              str(replay_class)]
                action_optimal_2 = names['action_optimal_2_' +
                                         str(replay_class)]
                observation_optimal_3 = names['observation_optimal_3_' +
                                              str(replay_class)]
                action_optimal_3 = names['action_optimal_3_' +
                                         str(replay_class)]
                buffer_size = int(len(number_optimal))
                if buffer_size < replay_size:
                    # TODO: if layers changes, training_times_per_episode should be modified
                    RL_1.ep_obs.extend(observation_optimal_1)
                    RL_1.ep_as.extend(action_optimal_1)
                    RL_1.ep_rs.extend(reward_optimal_1)
                    RL_1.ep_ss.extend(safety_optimal_1)
                    RL_2.ep_obs.extend(observation_optimal_2)
                    RL_2.ep_as.extend(action_optimal_2)
                    RL_2.ep_rs.extend(reward_optimal_2)
                    RL_2.ep_ss.extend(safety_optimal_2)
                    RL_3.ep_obs.extend(observation_optimal_3)
                    RL_3.ep_as.extend(action_optimal_3)
                    RL_3.ep_rs.extend(reward_optimal_3)
                    RL_3.ep_ss.extend(safety_optimal_3)
                else:
                    # Sample replay_size whole episodes; number_optimal holds the
                    # per-episode step counts used to locate each episode's slice.
                    replay_index = np.random.choice(range(buffer_size),
                                                    size=replay_size,
                                                    replace=False)
                    for replay_id in range(replay_size):
                        replace_start = replay_index[replay_id]
                        start_location = sum(number_optimal[:replace_start])
                        stop_location = sum(number_optimal[:replace_start +
                                                           1])
                        RL_1.ep_obs.extend(
                            observation_optimal_1[start_location:stop_location]
                        )
                        RL_1.ep_as.extend(
                            action_optimal_1[start_location:stop_location])
                        RL_1.ep_rs.extend(
                            reward_optimal_1[start_location:stop_location])
                        RL_1.ep_ss.extend(
                            safety_optimal_1[start_location:stop_location])
                        RL_2.ep_obs.extend(
                            observation_optimal_2[start_location:stop_location]
                        )
                        RL_2.ep_as.extend(
                            action_optimal_2[start_location:stop_location])
                        RL_2.ep_rs.extend(
                            reward_optimal_2[start_location:stop_location])
                        RL_2.ep_ss.extend(
                            safety_optimal_2[start_location:stop_location])
                        RL_3.ep_obs.extend(
                            observation_optimal_3[start_location:stop_location]
                        )
                        RL_3.ep_as.extend(
                            action_optimal_3[start_location:stop_location])
                        RL_3.ep_rs.extend(
                            reward_optimal_3[start_location:stop_location])
                        RL_3.ep_ss.extend(
                            safety_optimal_3[start_location:stop_location])
            # Before CPO kicks in, also replay the lowest-violation episodes.
            if not RL_1.start_cpo:
                for replay_class in range(0, 1):
                    number_optimal = names['number_optimal_vio_' +
                                           str(replay_class)]
                    safety_optimal_1 = names['safety_optimal_vio_1_' +
                                             str(replay_class)]
                    safety_optimal_2 = names['safety_optimal_vio_2_' +
                                             str(replay_class)]
                    safety_optimal_3 = names['safety_optimal_vio_3_' +
                                             str(replay_class)]
                    reward_optimal = names['reward_optimal_vio_' +
                                           str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_vio_'
                                                  + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_vio_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_vio_'
                                                  + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_vio_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_vio_'
                                                  + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_vio_' +
                                             str(replay_class)]
                    buffer_size = int(len(number_optimal))
                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_ss.extend(safety_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal)
                        RL_2.ep_ss.extend(safety_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal)
                        RL_3.ep_ss.extend(safety_optimal_3)
                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start])
                            stop_location = sum(number_optimal[:replace_start
                                                               + 1])
                            RL_1.ep_obs.extend(observation_optimal_1[
                                start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_1.ep_ss.extend(
                                safety_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(observation_optimal_2[
                                start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_2.ep_ss.extend(
                                safety_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(observation_optimal_3[
                                start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal[start_location:stop_location])
                            RL_3.ep_ss.extend(
                                safety_optimal_3[start_location:stop_location])
            RL_1.learn(epoch_i, thre_entropy, Ifprint=True)
            RL_2.learn(epoch_i, thre_entropy)
            optim_case = RL_3.learn(epoch_i, thre_entropy)
        """
        checkpoint, per 1000 episodes
        """
        # NOTE(review): comment says "per 1000" but the modulus is 3000.
        if (epoch_i % 3000 == 0) & (epoch_i > 1):
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                print("\n epoch: %d, highest tput: %f" %
                      (epoch_i, highest_value))
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode),
                     vi_perapp=np.array(RL_1.ss_perapp_persisit),
                     vi_coex=np.array(RL_1.ss_coex_persisit),
                     vi_sum=np.array(RL_1.ss_sum_persisit))
            """
            optimal range adaptively change
            """
            # Once the elite buffer is large, tighten the acceptance range and
            # drop all but the most recent 50 elite episodes.
            for class_replay in range(0, 1):
                number_optimal = names['number_optimal_' + str(class_replay)]
                count_size = int(len(number_optimal))
                if (count_size > 300):
                    names['optimal_range_' + str(class_replay)] *= 0.99
                    names['optimal_range_' + str(class_replay)] = max(
                        names['optimal_range_' + str(class_replay)], 1.01)
                    start_location = sum(names['number_optimal_' + str(
                        class_replay)][:-50]) * training_times_per_episode
                    names['observation_optimal_1_' + str(class_replay)] = names[
                        'observation_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_1_' + str(class_replay)] = names[
                        'action_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_2_' + str(class_replay)] = names[
                        'observation_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_2_' + str(class_replay)] = names[
                        'action_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['observation_optimal_3_' + str(class_replay)] = names[
                        'observation_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['action_optimal_3_' + str(class_replay)] = names[
                        'action_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['number_optimal_' +
                          str(class_replay)] = names['number_optimal_' +
                                                     str(class_replay)][-50:]
                    names['safety_optimal_1_' + str(class_replay)] = names[
                        'safety_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_2_' + str(class_replay)] = names[
                        'safety_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['safety_optimal_3_' + str(class_replay)] = names[
                        'safety_optimal_3_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_1_' + str(class_replay)] = names[
                        'reward_optimal_1_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_2_' + str(class_replay)] = names[
                        'reward_optimal_2_' +
                        str(class_replay)][start_location:]
                    names['reward_optimal_3_' + str(class_replay)] = names[
                        'reward_optimal_3_' +
                        str(class_replay)][start_location:]
                print("optimal_range:",
                      names['optimal_range_' + str(class_replay)])
            print(prob_weights)
            # Entropy threshold decays every checkpoint interval, floored at 1e-4.
            thre_entropy *= 0.5
            thre_entropy = max(thre_entropy, 0.0001)
        epoch_i += 1
        # After 20000 epochs, learn less frequently (larger batch interval).
        if epoch_i > 20000:
            batch_size = 200
def train(params):
    """Train the three-level hierarchical policy-gradient scheduler.

    Each episode places ``NUM_CONTAINERS`` containers one at a time: a z3
    solver prunes nodes where placement would make the batch infeasible,
    then three chained policy networks (RL_1 -> RL_2 -> RL_3) narrow the
    choice from node-group to sub-group to a single node. Episode reward is
    the simulator-predicted per-container throughput; networks learn every
    ``batch_size`` epochs and checkpoint every 500 epochs.

    NOTE(review): relies on the module-global ``names`` dict (shared with
    other functions in this file) for z3 variables ``names['x<i>']`` and for
    the experience-replay buffers.

    :param params: dict of hyper-parameters; keys used here include
        'number of nodes in the cluster', 'number of containers',
        'batch_size', 'path', 'recover', 'nodes per group', 'replay size',
        'learning rate', 'epochs', 'container_limitation per node'.
    """
    time_epoch_set = []  # NOTE(review): never used below — candidate for removal
    start_time = time.time()
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    NUM_CONTAINERS = params['number of containers']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    # One checkpoint path per hierarchy level, plus an .npz for training curves.
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']  # NOTE(review): read but never used in this function
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1  # TODO: if layers change, adjust (see replay code below)
    UseExperienceReplay = False  # replay branches below are disabled by this flag
    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 + env.NUM_APPS)  #: 3*7+1+7 = 29
    # Three agents, one per hierarchy level; suffix keeps their TF scopes distinct.
    RL_1 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(params['number of containers']) + '1')
    RL_2 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(params['number of containers']) + '2')
    RL_3 = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=params['learning rate'], suffix=str(params['number of containers']) + '3')
    sim = Simulator()  # throughput predictor used to compute the episode reward
    """
    Training
    """
    start_time = time.time()  # NOTE(review): shadows the start_time set above
    global_start_time = start_time
    # Per-episode trajectory buffers for each hierarchy level.
    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
    epoch_i = 0
    entropy_weight = 0.1  # exploration bonus weight; halved at each checkpoint below
    # Initialise the (single-class) best-episode bookkeeping in the global `names` dict.
    for i in range(0, 1):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2
    for i in range(0, 1):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []
    for i in range(0, 1):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []

    def store_episode_1(observations, actions):
        # Append one (state, action) pair to the level-1 trajectory buffers.
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        # Append one (state, action) pair to the level-2 trajectory buffers.
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        # Append one (state, action) pair to the level-3 trajectory buffers.
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    while epoch_i < params['epochs']:
        tput_origimal_class = 0  # single workload class; kept as the `names` key suffix
        source_batch_, index_data = batch_data(NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]
        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        source_batch_cpoy = source_batch.copy()
        total = source_batch
        # observation = observation_original.copy()
        limit = (1 - observation)  # per-(node, app) remaining headroom given the <=1-per-node rule
        capicity = (params['container_limitation per node'] - observation.sum(1)).reshape(-1)  # 27
        # Fresh solver per episode; constraints are added incrementally as containers land.
        # NOTE(review): loop bounds 7 and 27 are hard-coded — presumably env.NUM_APPS
        # and NUM_NODES; confirm before changing cluster size.
        s = Solver()
        # app sum == batch
        for i in range(7):
            s.add(z3.Sum(names['x' + str(i)]) == int(total[i]))
        # node capacity
        for node in range(27):
            s.add(z3.Sum([names['x' + str(i)][node] for i in range(7)]) <= int(capicity[node]))
        # >=0
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] >= 0)
        # per app spread
        for i in range(7):
            for node in range(27):
                s.add(names['x' + str(i)][node] <= int(limit[node, i]))
        # App1 and App2 not exist
        for node in range(27):
            s.add(names['x' + str(1)][node] + names['x' + str(2)][node] <= 1)

        def handle_constraint(NUM_NODES, appid, source_batch):
            """Mask infeasible nodes for `appid` by remapping them onto feasible ones.

            For each node, a hypothetical constraint ("one more `appid` here")
            is pushed onto the solver; if unsat, the node is infeasible and its
            row in the returned observation is replaced by a feasible node's
            row. Returns (masked observation, mapping_index) where
            mapping_index[i] is the real node that position i stands for.
            """
            observation_original = observation.copy()
            mapping_index = []
            list_check = []  # True => node is infeasible for one more `appid`
            t2 = time.time()
            for place in range(27):
                s.push()  # scoped hypothetical: would one more container fit here?
                s.add(names['x' + str(appid)][place] >= env.state[place][appid] + 1)
                if s.check() == z3.sat:
                    list_check.append(False)
                else:
                    list_check.append(True)
                s.pop()  # discard the hypothetical before testing the next node
            t3 = time.time()
            # print("formulate: ", t2 - t1)
            # print("calculate: ", t3 - t2)
            good_index = np.where(np.array(list_check) == False)[0]
            length = len(good_index)
            if length < 1:
                test = 1  # NOTE(review): dead assignment; next line would raise ZeroDivisionError if length == 0
            index_replace = 0
            for node in range(NUM_NODES):
                if list_check[node]:  # bad node
                    # index_this_replace = good_index[np.random.randint(length)]
                    # Round-robin over feasible nodes so the policy never sees an infeasible row.
                    index_this_replace = good_index[index_replace % length]
                    index_replace += 1
                    observation_original[node] = observation[index_this_replace]
                    mapping_index.append(index_this_replace)
                else:
                    mapping_index.append(node)
                    observation_original[node] = observation[node]
            return observation_original, mapping_index

        """
        Episode
        """
        for inter_episode_index in range(NUM_CONTAINERS):
            source_batch[index_data[inter_episode_index]] -= 1  # this container leaves the pending pool
            appid = index_data[inter_episode_index]
            observation, mapping_index = handle_constraint(NUM_NODES, appid, source_batch_cpoy)
            # Tentatively count the in-flight container into every node row the policy sees.
            observation[:, index_data[inter_episode_index]] += 1
            assert len(mapping_index) > 0
            # --- Level 1: choose among nodes_per_group groups of NUM_NODES/nodes_per_group nodes ---
            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(observation[i * number_of_first_layer_nodes:(i + 1) * number_of_first_layer_nodes], 0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer, observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(observation_first_layer).reshape(1, -1)
            # Feature vector = per-group app counts + current app id + remaining batch.
            observation_first_layer = np.append(observation_first_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(observation_first_layer, np.array(source_batch)).reshape(1, -1)  # (1,29)
            action_1, prob_weights = RL_1.choose_action(observation_first_layer.copy())
            # --- Level 2: zoom into the chosen group and pick a sub-group ---
            observation_copy = observation.copy()
            observation_copy = observation_copy[action_1 * number_of_first_layer_nodes: (action_1 + 1) * number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes / nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(observation_copy[i * number_of_second_layer_nodes:(i + 1) * number_of_second_layer_nodes], 0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer, observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(observation_second_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(observation_second_layer, np.array(source_batch)).reshape(1, -1)
            action_2, prob_weights = RL_2.choose_action(observation_second_layer.copy())
            # --- Level 3: zoom into the chosen sub-group and pick the node ---
            observation_copy = observation_copy[action_2 * number_of_second_layer_nodes: (action_2 + 1) * number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes / nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(observation_copy[i * number_of_third_layer_nodes:(i + 1) * number_of_third_layer_nodes], 0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer, observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(observation_third_layer, index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(observation_third_layer, np.array(source_batch)).reshape(1, -1)
            action_3, prob_weights = RL_3.choose_action(observation_third_layer.copy())
            # Compose the three level choices into an index in the masked observation,
            # then map back to the real node via mapping_index.
            final_decision = action_1 * number_of_first_layer_nodes + action_2 * number_of_second_layer_nodes + action_3 * number_of_third_layer_nodes
            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            decision = mapping_index[final_decision]
            # Pin the placement into the solver so later feasibility checks respect it.
            s.add(names['x' + str(appid)][decision] >= int(env.state[decision][appid]))
            # for i in range(number_of_node_groups):
            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)
            observation = observation_.copy()  # (9,9)

        """
        After an entire allocation, calculate total throughput, reward
        """
        # start_ = time.time()
        tput_state = env.get_tput_total_env()
        # Reward = simulator-predicted throughput weighted by placement counts, per container.
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) * tput_state).sum() / NUM_CONTAINERS
        # print(time.time() - start_)
        # tput = 1.0 * tput / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)
        # Sanity checks: capacity respected, all containers placed, constraints honoured.
        assert (np.sum(env.state, axis=1) <= params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        list_check = 0
        for node in range(NUM_NODES):
            for app in range(env.NUM_APPS):
                # Violations: over-capacity node, >1 replica of an app per node,
                # or apps 1 and 2 co-located (the z3 exclusion constraint).
                if env.state[node, :].sum() > params['container_limitation per node'] or env.state[node, app] > 1 or (app == 1 and env.state[node, 2] > 0) or (app == 2 and env.state[node, 1] > 0):
                    list_check += env.state[node, app]
        assert (list_check == 0)
        reward_ratio = (tput)
        # Monte-Carlo return: every step in the episode gets the same terminal reward.
        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)
        RL_1.store_training_samples_per_episode(observation_episode_1, action_episode_1, reward_episode_1, 0)
        RL_2.store_training_samples_per_episode(observation_episode_2, action_episode_2, reward_episode_2, 0)
        RL_3.store_training_samples_per_episode(observation_episode_3, action_episode_3, reward_episode_3, 0)

        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            # New best episode: reset the optimal buffers and tighten the optimal range.
            highest_tput_original = names['highest_tput_' + str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' + str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['number_optimal_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_' + str(tput_origimal_class)], names['action_optimal_1_' + str(tput_origimal_class)], names['reward_optimal_1_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_2_' + str(tput_origimal_class)], names['action_optimal_2_' + str(tput_origimal_class)], names['reward_optimal_2_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names['action_optimal_3_' + str(tput_origimal_class)], names['reward_optimal_3_' + str(tput_origimal_class)] = [], [], []
            if UseExperienceReplay:
                names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3)
            names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['optimal_range_' + str(tput_origimal_class)] = min(1.2, tput / (highest_tput_original / optimal_range_original))
        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names['optimal_range_' + str(tput_origimal_class)]:
            # Near-best episode (within the optimal range): keep it in the replay buffers.
            if UseExperienceReplay:
                names['observation_optimal_1_' + str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' + str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' + str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' + str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' + str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' + str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' + str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' + str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' + str(tput_origimal_class)].extend(reward_episode_3)
            names['number_optimal_' + str(tput_origimal_class)].append(NUM_CONTAINERS)
        # Clear the per-episode buffers for the next episode.
        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

        """
        Each batch, RL.learn()
        """
        # records_per_episode = NUM_CONTAINERS * training_times_per_episode
        if (epoch_i % batch_size == 0) & (epoch_i > 1):  # NOTE(review): bitwise & on bools — works, but `and` is idiomatic
            if UseExperienceReplay:
                for replay_class in range(0, 1):
                    reward_optimal_1 = names['reward_optimal_1_' + str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' + str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' + str(replay_class)]
                    reward_optimal_2 = names['reward_optimal_2_' + str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' + str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' + str(replay_class)]
                    reward_optimal_3 = names['reward_optimal_3_' + str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' + str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' + str(replay_class)]
                    number_optimal = names['number_optimal_' + str(replay_class)]
                    buffer_size = int(len(number_optimal))
                    assert sum(number_optimal) * training_times_per_episode == len(action_optimal_1)
                    if buffer_size < replay_size:
                        # Buffer small enough: replay every stored optimal episode.
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)
                    else:
                        # Sample replay_size episodes without replacement and replay their slices.
                        replay_index = np.random.choice(range(buffer_size), size=replay_size, replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            # Episode boundaries via prefix sums of per-episode lengths.
                            start_location = sum(number_optimal[:replace_start]) * training_times_per_episode
                            stop_location = sum(number_optimal[:replace_start + 1]) * training_times_per_episode
                            RL_1.ep_obs.extend(observation_optimal_1[start_location:stop_location])
                            RL_1.ep_as.extend(action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(reward_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(observation_optimal_2[start_location:stop_location])
                            RL_2.ep_as.extend(action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(reward_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(observation_optimal_3[start_location:stop_location])
                            RL_3.ep_as.extend(action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(reward_optimal_3[start_location:stop_location])
            # entropy_weight=0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 500 == 0) & (epoch_i > 1):  # NOTE(review): comment above says 1000, code checkpoints every 500
            highest_value = 0
            for class_replay in range(0, 1):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' + str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" % (epoch_i, highest_value, optimal_number))
            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path, tputs=np.array(RL_1.tput_persisit), candidate=np.array(RL_1.episode))
            """
            optimal range adaptively change
            """
            print(prob_weights)
            print(prob_weights)
            # Decay the exploration bonus, floored at 0.002.
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)
            print("time by now: ", time.time() - start_time)
        epoch_i += 1