def train(num=1000):
    agent = PolicyGradient(env.observation_space.shape[0], env.action_space.n)
    # agent.load_model()
    steps = []
    outputs = []
    # NOTE: these buffers accumulate across episodes; they are never cleared per episode.
    states = []
    actions = []
    rewards = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        step = 0
        while True:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            states.append(old_observation)
            actions.append(old_action)
            rewards.append(reward)
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(observation, [1, env.observation_space.shape[0]]))
            if step > 50000:
                steps.append(step)
                break
            if done:
                print("{}:{} steps: {}".format(i_episode, step, reward))
                agent.train(np.array(rewards), np.array(actions),
                            np.array(states))
                steps.append(step)
                agent.save_model()
                break
        # If the average step count over the last 100 episodes falls below the
        # threshold, we consider the environment solved.
        score = sum(steps[-100:]) / 100
        if len(steps) >= 100 and score < 275:
            print("---------------------------------------------------------------")
            print("done")
            break
def train(num=2000):
    agent = PolicyGradient(env.observation_space.shape[0], env.action_space.n)
    # agent.load_model()
    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(
            np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0
        states = []
        actions = []
        rewards = []
        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            states.append(old_observation)
            actions.append(old_action)
            rewards.append(reward)
            old_observation = observation
            old_action = agent.get_action(
                np.reshape(old_observation, [1, env.observation_space.shape[0]]))
            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                agent.train(np.array(rewards), np.array(actions),
                            np.array(states))
                agent.save_model()
                break
        # If the average step count over the last 200 episodes reaches 195,
        # we consider the environment solved.
        if len(steps) > 200 and sum(steps[-200:]) / 200 >= 195:
            print(sum(steps[-200:]) / 200)
            break
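# For reference: agent.train() above consumes raw per-step rewards, actions and
# states; a REINFORCE-style agent typically converts the rewards into discounted
# returns before scaling its log-probability gradients. The PolicyGradient class
# used above is not shown here, so this is only a minimal sketch of that step,
# with a hypothetical discount factor `gamma`:
import numpy as np


def discounted_returns(rewards, gamma=0.99):
    """Compute the reward-to-go G_t = sum_k gamma^k * r_{t+k} for each step,
    then normalize; REINFORCE-style updates commonly weight log-probabilities
    by this quantity. Illustrative helper, not part of PolicyGradient."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # Normalization is an optional, common variance-reduction step.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns


# Example: three steps of reward 1 each with gamma = 0.9 give raw returns
# [2.71, 1.9, 1.0] before normalization.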
import gym
from PolicyGradient import PolicyGradient
import matplotlib.pyplot as plt

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped
RENDER = False

RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.02,
                    reward_decay=0.995,
                    print_graph=True)

total_steps = 0

for i_episode in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
class RLift:

    def __init__(self, trains, validates, tests, n_feature, n_action,
                 name_data, metric, Zero_is_Action, hidden_layers,
                 train_eval_func, test_eval_func,
                 reward_design='action_depend_baseline', model_base=None,
                 validate_max_steps=1000, size_bag=5000, parallel=False,
                 learner=None, sess=None, coor=None, n_bags=1, isLoad=False):
        self.name_data = name_data
        self.label = 'visit'
        self.n_action = n_action
        self.n_feature = n_feature
        self.Zero_is_Action = Zero_is_Action
        self.isLoad = isLoad
        # self.model_save
        self.saved_model_dir = None
        self.trains = trains
        self.tests = tests
        self.validates = validates
        self.treatment_weight = [1.0]
        self.treatment_keys = ['reward']
        # if self.isLoad is True:
        #     self.sess = tf.Session()
        #
        #     ans = self.load_model(
        #         self.sess, self.trains[0][0].reshape((-1, 8)))
        #     print('ans', ans)
        #     print('..', self.trains[0][0])
        #     exit()
        #
        # else:
        if self.Zero_is_Action is True:
            self.learner = PolicyGradient(
                n_action=self.n_action,
                n_feature=self.n_feature,
                hidden_layers=hidden_layers)
        else:
            self.learner = PolicyGradient(
                n_action=self.n_action - 1,
                n_feature=self.n_feature,
                hidden_layers=hidden_layers)
        self.sess = self.learner.sess
        self.n_train = len(self.trains)
        self.n_test = len(self.tests)
        self.n_validate = len(self.validates)
        self.max_epoch = 1000000
        self.validate_max_steps = validate_max_steps
        self.n_bags = n_bags
        self.output_steps = 10
        self.parallel = parallel
        self.batch_size = 512
        self.size_bag = size_bag
        self.train_eval_func = train_eval_func
        self.test_eval_func = test_eval_func
        self.n_step_repeat = n_bags
        self.metric = metric
        self.reward_design = reward_design
        self.start_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
        # self.saved_model_dir
        # os.mknod("test.txt")
        # self.log_file =
        self.log = open(
            'logs/log_' + time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()),
            'a')
        print('train eval_func', getattr(self.train_eval_func, '__name__'))
        print('test eval_func', getattr(self.test_eval_func, '__name__'))
        # self.start_state = [np.zeros_like(self.trains[0][0]), 0, 0]
        # self.end_state = [np.ones_like(self.trains[0][0]), 0, 0]
        # print('train', self.trains)
        # print('test', self.tests)
        # print('train', self.trains)

    def split_bags(self):
        bags = [[]] * self.n_bags
        self.n_train = len(self.trains)
        indexs = np.arange(self.n_train)
        np.random.shuffle(indexs)
        for i in range(self.n_bags):
            bags[i] = [self.trains[j] for j in range(self.n_train)
                       if (j % self.n_bags) == i]
        return bags

    def next_batch(self):
        '''
        Return:
            batch = [state, real_action, response]
        '''
        indexs = np.arange(self.n_train)
        np.random.shuffle(indexs)
        batch = [self.trains[i] for i in indexs[:self.size_bag]]
        return batch

    def stat_actions(self, actions):
        p = [np.sum(actions == i) for i in range(self.n_action)]
        p = np.array(p)
        p = p / np.sum(p)
        return p

    def train(self):
        uplift_validate_max = -1
        max_result_record = None
        uplift_train = -1
        uplift_train_actions = [-1] * self.n_action
        validate_max_steps = self.validate_max_steps
        validate_cur_steps = 0
        for epoch in range(self.max_epoch):
            self.log.write('Epoch' + str(epoch) + '\n')
            bag_rewards = np.zeros((self.n_step_repeat,))
            trans = []
            for eid in range(self.n_step_repeat):
                bag = self.next_batch()
                datas = np.array([data[0] for data in bag])
                # lifts = data[4]
                actions, probs = self.learner.choose_action(
                    datas, mode='random', greedy=0.1)
                if self.Zero_is_Action is False:
                    actions = [x + 1 for x in actions]
                    tmp_probs = []
                    for prob in probs:
                        tmp = [0.0]
                        tmp.extend(prob.tolist())
                        tmp_probs.append(tmp)
                    probs = np.array(tmp_probs)
                records = []
                real_probs = np.ones(self.n_action) * 0.2
                for a, data, p in zip(actions, bag, probs):
                    # Record: [Algo Action, Real Action, {Reaction}, Prob_sample, Prob_Algo]
                    records.append(
                        [int(a), data[1], {'reward': data[2]}, real_probs, p])
                # exit()
                # record = [[a, bag[i][1], bag[i][2], probs[i][1]]
                #           for i, a in enumerate(actions)]
                # if self.metric == 'same_diff':
                #     bag_rewards[eid], lifts_actions, pro_actions, lift_treatment, algo_treatment, algo_control, algo_treatment = self.eval_func(
                #         record=records, n_action=self.n_action)
                # elif self.metric == 'qini':
                #     bag_rewards_qini[eid] = qini_Q(
                #         record=record, n_action=self.n_action)
                eval_res = self.train_eval_func(
                    records=records,
                    treatment_weight=self.treatment_weight,
                    treatment_keys=self.treatment_keys,
                    n_action=self.n_action)
                bag_rewards[eid] = eval_res['reward']
                algo_treatment = eval_res['response']
                algo_control = eval_res['control']
                # variance = eval_res['variance']
                algo_probs = eval_res['algo_action_prob']
                # print('variance', variance)
                algo_action_base = eval_res['algo_action_base']
                # print('eval_res', eval_res['reward'], eval_res['response'],
                #       eval_res['control'], eval_res['algo_action_prob'])
                tran = Transition()
                # print('actions', actions)
                for i, (data, algo_prob) in enumerate(zip(bag, probs)):
                    feature = data[0]
                    algo_action = actions[i]
                    real_action = data[1]
                    response = data[2]
                    lifts = data[4]
                    val_next = bag_rewards[eid]
                    if self.reward_design == 'UMG':
                        # print('reward_design is same diff')
                        # print('algo_action', algo_action, 'real_action', real_action)
                        if algo_action == real_action or real_action == 0:
                            if algo_action == real_action:
                                rwd = (response - algo_control) + val_next
                            elif real_action == 0:
                                rwd = -(response - algo_control) + val_next
                            if self.Zero_is_Action is False:
                                algo_action -= 1
                            tran.append(
                                state=data[0],
                                real_action=real_action,
                                algo_action=algo_action,
                                reward=rwd)
                    elif self.reward_design == 'action_depend_baseline':
                        # if algo_action == real_action or real_action == 0:
                        # print('algo_action_base[0]', algo_action_base[0])
                        if algo_action == real_action or real_action == 0:
                            if algo_action == real_action:
                                rwd = (response -
                                       algo_action_base[0][algo_action]) + val_next
                            elif real_action == 0:
                                rwd = -(response -
                                        algo_action_base[0][algo_action]) + val_next
                            if self.Zero_is_Action is False:
                                algo_action -= 1
                            tran.append(
                                state=data[0],
                                real_action=real_action,
                                algo_action=algo_action,
                                reward=rwd)
                    else:
                        print('Error! Reward Design is not found!')
                        exit()
                trans.append(tran)
            # print('bag_rewards', bag_rewards)
            reward_mean = float(np.mean(bag_rewards))
            reward_std = 1
            print('mean', reward_mean, 'std', max(1e-4, reward_std))
            self.log.write('mean:' + str(reward_mean) + ' std:' +
                           str(max(1e-4, reward_std)) + '\n')
            for tran in trans:
                tran.avg_reward(reward_mean, reward_std)
                self.learner.learn(tran)
            # If we only use RLift, it needs to record the best result by itself.
            # if self.parallel is False:
            eval_res = self.test(eval_func=self.test_eval_func,
                                 datas=self.validates,
                                 epoch=epoch,
                                 result_output=False)
            print('validate eval_res', eval_res['reward'], eval_res['response'],
                  eval_res['control'], eval_res['algo_action_prob'],
                  eval_res['algo_action_nums'])
            uplift_validate = eval_res['reward']
            print('uplift_validate', uplift_validate)
            if uplift_validate > uplift_validate_max:
                uplift_validate_max = uplift_validate
                if self.saved_model_dir is not None:
                    self.save_model(sess=self.learner.sess)
                    # exit()
                print('uplift_validate_max', uplift_validate_max)
                self.log.write('uplift_validate_max:' +
                               str(uplift_validate_max) + '\n')
                validate_cur_steps = 0
                max_result_record = self.results_calc(
                    eval_func=self.test_eval_func,
                    epoch=epoch,
                    result_output=True,
                    outputs_list=['test'],
                    isMax=True)
                print('max result test')
                uplift_test_max = max_result_record['test']
                # print('uplift_test_max', uplift_test_max)
                print('tests', uplift_test_max['reward'], uplift_test_max['reward'],
                      uplift_test_max['response'], uplift_test_max['control'],
                      uplift_test_max['algo_action_prob'])
                print('max result train')
                # uplift_test_max = max_result_record['train']
                # # print('uplift_test_max', uplift_test_max)
                # print('trains', uplift_test_max['reward'], uplift_test_max['reward'],
                #       uplift_test_max['response'], uplift_test_max['control'], uplift_test_max['algo_action_prob'])
                self.log.write('max_result_record:' +
                               str(max_result_record) + '\n')
            else:
                validate_cur_steps += 1
                if validate_cur_steps >= validate_max_steps:
                    print('Training Finished', epoch)
                    print('uplift_test_max', uplift_test_max,
                          'uplift_validate_max', uplift_validate_max)
                    print('max result', max_result_record)
                    self.log.write('Training Finished:' + str(epoch) + '\n')
                    self.log.write('uplift_validate_max:' +
                                   str(uplift_validate_max) + '\n')
                    self.sess.close()
                    sys.exit()
            if epoch % self.output_steps == 0 and epoch > 0:
                print('Epoch', epoch)
                print('uplift_test_max', uplift_test_max)
                print('uplift_validate_max', uplift_validate_max)
                print('max result', max_result_record)
                # self.results_calc(eval_func=self.eval_func, outputs_list=[
                #     'test', 'validate'], epoch=epoch, result_output=False)

    # def results_store(self):
    #     self.results_trains =

    def save_model(self, sess):
        builder = tf.saved_model.builder.SavedModelBuilder(
            self.saved_model_dir)
        # x is the input tensor; keep_prob is the dropout keep-probability tensor
        inputs = {'input_x': tf.saved_model.utils.build_tensor_info(
            self.learner.tf_obs)}
        # y is the final output tensor we need
        outputs = {'output': tf.saved_model.utils.build_tensor_info(
            self.learner.all_act_prob)}
        signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs, outputs, 'test_sig_name')
        # signature = None
        # Register the signature under the key that load_model looks up.
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'test_signature': signature})
        builder.save()

    def load_model(self, sess, _x):
        # Keys must match those used in save_model's signature definition.
        signature_key = 'test_signature'
        input_key = 'input_x'
        output_key = 'output'
        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING],
            self.saved_model_dir)
        # Extract the SignatureDef map from meta_graph_def.
        signature = meta_graph_def.signature_def
        # Look up the concrete input/output tensor names from the signature.
        x_tensor_name = signature[signature_key].inputs[input_key].name
        y_tensor_name = signature[signature_key].outputs[output_key].name
        # Fetch the tensors and run inference.
        x = sess.graph.get_tensor_by_name(x_tensor_name)
        y = sess.graph.get_tensor_by_name(y_tensor_name)
        res = sess.run(y, feed_dict={x: _x})
        print('res shape', res.shape)
        return res

    def results_calc(self, epoch, eval_func,
                     outputs_list=['train', 'test', 'validate'],
                     result_output=False, isMax=False):
        res = {}
        if isMax is True:
            suffix = 'max'
        else:
            suffix = str(epoch).zfill(5)
        if 'train' in outputs_list:
            uplift_train = self.test(eval_func=eval_func,
                                     datas=self.trains,
                                     epoch=epoch,
                                     output_filename='train_' + suffix,
                                     result_output=result_output)
            res['train'] = uplift_train
        if 'validate' in outputs_list:
            uplift_validate = self.test(eval_func=eval_func,
                                        datas=self.validates,
                                        epoch=epoch,
                                        output_filename='validate_' + suffix,
                                        result_output=result_output)
            res['validate'] = uplift_validate
        if 'test' in outputs_list:
            uplift_test = self.test(eval_func=eval_func,
                                    datas=self.tests,
                                    epoch=epoch,
                                    output_filename='test_' + suffix,
                                    result_output=result_output)
            res['test'] = uplift_test
        print('Epoch', epoch)
        self.log.write('Epoch:' + str(epoch) + '\n')
        for name in res.keys():
            print(name, res[name])
        return res

    def test(self, datas, epoch, eval_func, output_filename=None,
             result_output=False):
        '''
        Test on the datas = [feature, action_real, reaction]
        '''
        real_probs = np.ones(self.n_action) / self.n_action
        records = []
        features = [data[0] for data in datas]
        actions_algo, probs = self.learner.choose_action(
            features, mode='random', greedy=None)
        # print('probs', probs)
        # for i, a in enumerate(actions_algo):
        #     record.append([int(a), datas[i][1], datas[i][2], None])
        # for i, (a, data, prob) in enumerate(zip(actions_algo, datas, probs)):
        for i, (data, action, prob) in enumerate(zip(datas, actions_algo, probs)):
            # action = int(actions_algo[0])
            # prob = probs[0]
            if self.Zero_is_Action is False:
                action = action + 1
                tmp = [0.0]
                tmp.extend(prob.tolist())
                prob = np.array(tmp)
            records.append(
                [action, data[1], {'reward': data[2]}, real_probs, prob])
        # reactions = [datas[i][2] for i, a in enumerate(actions_algo)]
        # reactions = np.array(reactions)
        # reactions = np.reshape(reactions, (len(datas), 1))
        # if output_filename is not None and result_output is True:
        #     name_func = getattr(self.eval_func, '__name__')
        #     np.save('../output/' + output_filename + '_' + name_func + '_' + self.start_time,
        #             np.hstack((probs, reactions, actions_real)))
        #     print(output_filename + '_' + name_func + ' saved')
        ans = eval_func(records=records,
                        treatment_weight=self.treatment_weight,
                        treatment_keys=self.treatment_keys,
                        n_action=self.n_action)
        return ans
import gym
from PolicyGradient import PolicyGradient
import matplotlib.pyplot as plt

DISPLAY_REWARD_THRESHOLD = -2000
RENDER = False

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped

print("action space:", env.action_space)
print("observation space:", env.observation_space,
      " , high:", env.observation_space.high,
      " , low:", env.observation_space.low)

pg = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
)

for i_episode in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = pg.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        pg.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(pg.ep_rs)
                    metavar='G', help='learning rate (default: 1e-4)')
parser.add_argument('--batch_size', type=int, default=5, metavar='G',
                    help='Every how many episodes to do a param update')
parser.add_argument('--seed', type=int, default=87, metavar='N',
                    help='random seed (default: 87)')
args = parser.parse_args()

policy = PolicyGradient()

env = gym.make('trade-v0')
env.seed(args.seed)
torch.manual_seed(args.seed)

optimizer = optim.RMSprop(policy.parameters(),
                          lr=args.learning_rate,
                          weight_decay=args.decay_rate)

# # check & load pretrain model
# if os.path.isfile('pg_params.pkl'):
#     print('Load Policy Network parameters ...')
#     policy.load_state_dict(torch.load('pg_params.pkl'))
class subScheduler():

    def __init__(self, path_name, surffix, path_surffix):
        """
        parameters set
        """
        self.NUM_NODES = params['number of nodes in the cluster']
        self.NUM_APPS = 7
        # self.NUM_CONTAINERS = params['number of containers']
        # self.sim = Simulator()
        # self.env = LraClusterEnv(num_nodes=self.NUM_NODES)
        ckpt_path_1 = path_surffix + path_name + "_1" + "/model.ckpt"
        ckpt_path_2 = path_surffix + path_name + "_2" + "/model.ckpt"
        ckpt_path_3 = path_surffix + path_name + "_3" + "/model.ckpt"
        self.nodes_per_group = int(params['nodes per group'])
        # self.number_of_node_groups = int(self.NUM_NODES / self.nodes_per_group)
        """
        Build Network
        """
        self.n_actions = self.nodes_per_group  #: 3 nodes per group
        self.n_features = int(self.n_actions * self.NUM_APPS +
                              1 + self.NUM_APPS)  #: 29

        self.RL_1 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '1')
        self.RL_2 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '2')
        self.RL_3 = PolicyGradient(n_actions=self.n_actions,
                                   n_features=self.n_features,
                                   learning_rate=params['learning rate'],
                                   suffix=surffix + '3')

        self.RL_1.restore_session(ckpt_path_1)
        self.RL_2.restore_session(ckpt_path_2)
        self.RL_3.restore_session(ckpt_path_3)

    def batch_data(self, rnd_array):
        index_data = []
        for i in range(7):
            index_data.extend([i] * rnd_array[i])
        return rnd_array, index_data

    def get_total_tput(self, rnd_array):
        # assert sum(rnd_array) == 81
        source_batch, index_data = self.batch_data(
            rnd_array.astype(int))  # index_data = [0,1,2,0,1,2]
        env = LraClusterEnv(num_nodes=self.NUM_NODES, ifSimulator=False)
        observation = env.reset().copy()  # (9,9)

        """
        Episode
        """
        for inter_episode_index in range(int(sum(rnd_array))):
            # observation_new_list = []
            # observation[:, index_data[inter_episode_index]] += 1
            source_batch[index_data[inter_episode_index]] -= 1
            observation, mapping_index = handle_constraint(
                observation, self.NUM_NODES)
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(self.NUM_NODES /
                                              self.nodes_per_group)  # 9
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:
                                (i + 1) * number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.append(
                np.append(observation_first_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)
            # observation_first_layer = np.array(observation_first_layer).reshape(1, -1)
            # observation_first_layer = np.append(observation_first_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_first_layer = np.append(observation_first_layer, np.array(source_batch)).reshape(1, -1)  # (1,29)

            action_1, prob_weights = self.RL_1.choose_action_determine(
                observation_first_layer)

            observation_copy = observation
            observation_copy = observation_copy[
                action_1 * number_of_first_layer_nodes:
                (action_1 + 1) * number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               self.nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:
                                     (i + 1) * number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.append(
                np.append(observation_second_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)
            # observation_second_layer = np.array(observation_second_layer).reshape(1, -1)
            # observation_second_layer = np.append(observation_second_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_second_layer = np.append(observation_second_layer, np.array(source_batch)).reshape(1, -1)

            action_2, prob_weights = self.RL_2.choose_action_determine(
                observation_second_layer)

            observation_copy = observation_copy[
                action_2 * number_of_second_layer_nodes:
                (action_2 + 1) * number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              self.nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(self.nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:
                                     (i + 1) * number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.append(
                np.append(observation_third_layer,
                          index_data[inter_episode_index]),
                np.array(source_batch)).reshape(1, -1)
            # observation_third_layer = np.array(observation_third_layer).reshape(1, -1)
            # observation_third_layer = np.append(observation_third_layer, index_data[inter_episode_index]).reshape(1, -1)
            # observation_third_layer = np.append(observation_third_layer, np.array(source_batch)).reshape(1, -1)

            action_3, prob_weights = self.RL_3.choose_action_determine(
                observation_third_layer)

            final_decision = (action_1 * number_of_first_layer_nodes +
                              action_2 * number_of_second_layer_nodes +
                              action_3 * number_of_third_layer_nodes)
            appid = index_data[inter_episode_index]
            # observation_ = env.step(action*nodes_per_group + Node_index[action], appid)
            observation_ = env.step(mapping_index[final_decision], appid)
            observation = observation_.copy()  # (9,9)

        """
        After an entire allocation, calculate total throughput, reward
        """
        state = env.get_tput_total_env()
        # assert sum(sum(self.env.state)) == 81
        return state
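# The three-level scheduler above encodes its final node choice as
#   final_decision = action_1 * 9 + action_2 * 3 + action_3 * 1,
# i.e. a base-3 index over the candidate positions (9, 3 and 1 nodes per
# group at the first, second and third layer, per the inline comments).
# A minimal sketch of that arithmetic with illustrative action values:
def hierarchical_node_index(action_1, action_2, action_3,
                            first_layer_nodes=9, second_layer_nodes=3,
                            third_layer_nodes=1):
    """Map per-layer group choices to a flat node index (illustrative helper,
    not part of subScheduler)."""
    return (action_1 * first_layer_nodes +
            action_2 * second_layer_nodes +
            action_3 * third_layer_nodes)


# e.g. choosing group 2, then sub-group 0, then node 1 selects node 19.
assert hierarchical_node_index(2, 0, 1) == 19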
def train(params):
    """
    parameters set
    """
    NUM_NODES = params['number of nodes in the cluster']
    env = LraClusterEnv(num_nodes=NUM_NODES)
    batch_size = params['batch_size']
    ckpt_path_1 = "./checkpoint/" + params['path'] + "_1" + "/model.ckpt"
    ckpt_path_2 = "./checkpoint/" + params['path'] + "_2" + "/model.ckpt"
    ckpt_path_3 = "./checkpoint/" + params['path'] + "_3" + "/model.ckpt"
    np_path = "./checkpoint/" + params['path'] + "/optimal_file_name.npz"
    Recover = params['recover']
    nodes_per_group = int(params['nodes per group'])
    replay_size = params['replay size']
    training_times_per_episode = 1
    UseExperienceReplay = False

    """
    Build Network
    """
    n_actions = nodes_per_group  #: 3 nodes per group
    n_features = int(n_actions * env.NUM_APPS + 1 + env.NUM_APPS)  #: 3*7+1+7 = 29

    RL_1 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '1')
    RL_2 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '2')
    RL_3 = PolicyGradient(n_actions=n_actions,
                          n_features=n_features,
                          learning_rate=params['learning rate'],
                          suffix=str(params['NUM_CONTAINERS_start']) + '3')
    sim = Simulator()

    """
    Training
    """
    start_time = time.time()
    global_start_time = start_time
    observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
    observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
    observation_episode_3, action_episode_3, reward_episode_3 = [], [], []
    epoch_i = 0
    entropy_weight = 0.01
    names = locals()

    for i in range(0, 10):
        names['highest_tput_' + str(i)] = 0.1
        names['observation_optimal_1_' + str(i)] = []
        names['action_optimal_1_' + str(i)] = []
        names['reward_optimal_1_' + str(i)] = []
        names['number_optimal_' + str(i)] = []
        names['optimal_range_' + str(i)] = 1.2

    for i in range(0, 10):
        names['observation_optimal_2_' + str(i)] = []
        names['action_optimal_2_' + str(i)] = []
        names['reward_optimal_2_' + str(i)] = []

    for i in range(0, 10):
        names['observation_optimal_3_' + str(i)] = []
        names['action_optimal_3_' + str(i)] = []
        names['reward_optimal_3_' + str(i)] = []
    # TODO: delete this range

    def store_episode_1(observations, actions):
        observation_episode_1.append(observations)
        action_episode_1.append(actions)

    def store_episode_2(observations, actions):
        observation_episode_2.append(observations)
        action_episode_2.append(actions)

    def store_episode_3(observations, actions):
        observation_episode_3.append(observations)
        action_episode_3.append(actions)

    NUM_CONTAINERS_start = params['NUM_CONTAINERS_start']

    while epoch_i < params['epochs']:
        NUM_CONTAINERS = np.random.randint(NUM_CONTAINERS_start + 1,
                                           NUM_CONTAINERS_start + 11)
        tput_origimal_class = int(NUM_CONTAINERS - NUM_CONTAINERS_start - 1)
        source_batch_, index_data = batch_data(
            NUM_CONTAINERS, env.NUM_APPS)  # index_data = [0,1,2,0,1,2]

        observation = env.reset().copy()  # (9,9)
        source_batch = source_batch_.copy()
        for inter_episode_index in range(NUM_CONTAINERS):
            appid = index_data[inter_episode_index]
            observation_ = env.step(inter_episode_index % NUM_NODES,
                                    appid)  # load-balancing
            observation = observation_.copy()  # (9,9)
        tput_state = env.get_tput_total_env()
        tput_baseline = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                         tput_state).sum() / NUM_CONTAINERS

        """
        Episode
        """
        observation = env.reset().copy()
        for inter_episode_index in range(NUM_CONTAINERS):
            source_batch[index_data[inter_episode_index]] -= 1
            observation, mapping_index = handle_constraint(
                observation.copy(), NUM_NODES)
            assert len(mapping_index) > 0

            observation_first_layer = np.empty([0, env.NUM_APPS], int)
            number_of_first_layer_nodes = int(NUM_NODES / nodes_per_group)  # 9
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation[i * number_of_first_layer_nodes:
                                (i + 1) * number_of_first_layer_nodes],
                    0).reshape(1, -1)
                observation_first_layer = np.append(observation_first_layer,
                                                    observation_new, 0)
            observation_first_layer[:, index_data[inter_episode_index]] += 1
            observation_first_layer = np.array(
                observation_first_layer).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_first_layer = np.append(
                observation_first_layer,
                np.array(source_batch)).reshape(1, -1)  # (1,29)

            action_1, prob_weights = RL_1.choose_action(
                observation_first_layer.copy())

            observation_copy = observation.copy()
            observation_copy = observation_copy[
                action_1 * number_of_first_layer_nodes:
                (action_1 + 1) * number_of_first_layer_nodes]
            number_of_second_layer_nodes = int(number_of_first_layer_nodes /
                                               nodes_per_group)  # 9/3 = 3
            observation_second_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_second_layer_nodes:
                                     (i + 1) * number_of_second_layer_nodes],
                    0).reshape(1, -1)
                observation_second_layer = np.append(observation_second_layer,
                                                     observation_new, 0)
            observation_second_layer[:, index_data[inter_episode_index]] += 1
            observation_second_layer = np.array(
                observation_second_layer).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_second_layer = np.append(
                observation_second_layer,
                np.array(source_batch)).reshape(1, -1)

            action_2, prob_weights = RL_2.choose_action(
                observation_second_layer.copy())

            observation_copy = observation_copy[
                action_2 * number_of_second_layer_nodes:
                (action_2 + 1) * number_of_second_layer_nodes]
            number_of_third_layer_nodes = int(number_of_second_layer_nodes /
                                              nodes_per_group)  # 3/3 = 1
            observation_third_layer = np.empty([0, env.NUM_APPS], int)
            for i in range(nodes_per_group):
                observation_new = np.sum(
                    observation_copy[i * number_of_third_layer_nodes:
                                     (i + 1) * number_of_third_layer_nodes],
                    0).reshape(1, -1)
                observation_third_layer = np.append(observation_third_layer,
                                                    observation_new, 0)
            observation_third_layer[:, index_data[inter_episode_index]] += 1
            observation_third_layer = np.array(
                observation_third_layer).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                index_data[inter_episode_index]).reshape(1, -1)
            observation_third_layer = np.append(
                observation_third_layer,
                np.array(source_batch)).reshape(1, -1)

            action_3, prob_weights = RL_3.choose_action(
                observation_third_layer.copy())

            final_decision = (action_1 * number_of_first_layer_nodes +
                              action_2 * number_of_second_layer_nodes +
                              action_3 * number_of_third_layer_nodes)
            appid = index_data[inter_episode_index]
            observation_ = env.step(mapping_index[final_decision], appid)

            store_episode_1(observation_first_layer, action_1)
            store_episode_2(observation_second_layer, action_2)
            store_episode_3(observation_third_layer, action_3)

            observation = observation_.copy()  # (9,9)

        """
        After an entire allocation, calculate total throughput, reward
        """
        tput_state = env.get_tput_total_env()
        tput = (sim.predict(tput_state.reshape(-1, env.NUM_APPS)) *
                tput_state).sum() / NUM_CONTAINERS
        RL_1.store_tput_per_episode(tput, epoch_i)
        assert (np.sum(env.state, axis=1) <=
                params['container_limitation per node']).all()
        assert sum(sum(env.state)) == NUM_CONTAINERS
        reward_ratio = (tput - tput_baseline)

        reward_episode_1 = [reward_ratio] * len(observation_episode_1)
        reward_episode_2 = [reward_ratio] * len(observation_episode_2)
        reward_episode_3 = [reward_ratio] * len(observation_episode_3)

        RL_1.store_training_samples_per_episode(observation_episode_1,
                                                action_episode_1,
                                                reward_episode_1)
        RL_2.store_training_samples_per_episode(observation_episode_2,
                                                action_episode_2,
                                                reward_episode_2)
        RL_3.store_training_samples_per_episode(observation_episode_3,
                                                action_episode_3,
                                                reward_episode_3)

        """
        check_tput_quality(tput)
        """
        if names['highest_tput_' + str(tput_origimal_class)] < tput:
            highest_tput_original = names['highest_tput_' +
                                          str(tput_origimal_class)]
            optimal_range_original = names['optimal_range_' +
                                           str(tput_origimal_class)]
            names['highest_tput_' + str(tput_origimal_class)] = tput
            names['number_optimal_' + str(tput_origimal_class)] = []
            names['observation_optimal_1_' + str(tput_origimal_class)], names[
                'action_optimal_1_' + str(tput_origimal_class)], names[
                    'reward_optimal_1_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_2_' + str(tput_origimal_class)], names[
                'action_optimal_2_' + str(tput_origimal_class)], names[
                    'reward_optimal_2_' + str(tput_origimal_class)] = [], [], []
            names['observation_optimal_3_' + str(tput_origimal_class)], names[
                'action_optimal_3_' + str(tput_origimal_class)], names[
                    'reward_optimal_3_' + str(tput_origimal_class)] = [], [], []
            if UseExperienceReplay:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)
            names['optimal_range_' + str(tput_origimal_class)] = min(
                1.2, tput / (highest_tput_original / optimal_range_original))
        elif names['highest_tput_' + str(tput_origimal_class)] < tput * names[
                'optimal_range_' + str(tput_origimal_class)]:
            if UseExperienceReplay:
                names['observation_optimal_1_' +
                      str(tput_origimal_class)].extend(observation_episode_1)
                names['action_optimal_1_' +
                      str(tput_origimal_class)].extend(action_episode_1)
                names['reward_optimal_1_' +
                      str(tput_origimal_class)].extend(reward_episode_1)
                names['observation_optimal_2_' +
                      str(tput_origimal_class)].extend(observation_episode_2)
                names['action_optimal_2_' +
                      str(tput_origimal_class)].extend(action_episode_2)
                names['reward_optimal_2_' +
                      str(tput_origimal_class)].extend(reward_episode_2)
                names['observation_optimal_3_' +
                      str(tput_origimal_class)].extend(observation_episode_3)
                names['action_optimal_3_' +
                      str(tput_origimal_class)].extend(action_episode_3)
                names['reward_optimal_3_' +
                      str(tput_origimal_class)].extend(reward_episode_3)
            names['number_optimal_' +
                  str(tput_origimal_class)].append(NUM_CONTAINERS)

        observation_episode_1, action_episode_1, reward_episode_1 = [], [], []
        observation_episode_2, action_episode_2, reward_episode_2 = [], [], []
        observation_episode_3, action_episode_3, reward_episode_3 = [], [], []

        """
        Each batch, RL.learn()
        """
        if (epoch_i % batch_size == 0) & (epoch_i > 1):
            if UseExperienceReplay:
                for replay_class in range(0, 10):
                    reward_optimal_1 = names['reward_optimal_1_' +
                                             str(replay_class)]
                    observation_optimal_1 = names['observation_optimal_1_' +
                                                  str(replay_class)]
                    action_optimal_1 = names['action_optimal_1_' +
                                             str(replay_class)]
                    reward_optimal_2 = names['reward_optimal_2_' +
                                             str(replay_class)]
                    observation_optimal_2 = names['observation_optimal_2_' +
                                                  str(replay_class)]
                    action_optimal_2 = names['action_optimal_2_' +
                                             str(replay_class)]
                    reward_optimal_3 = names['reward_optimal_3_' +
                                             str(replay_class)]
                    observation_optimal_3 = names['observation_optimal_3_' +
                                                  str(replay_class)]
                    action_optimal_3 = names['action_optimal_3_' +
                                             str(replay_class)]
                    number_optimal = names['number_optimal_' +
                                           str(replay_class)]

                    buffer_size = int(len(number_optimal))
                    assert sum(number_optimal) * training_times_per_episode == len(
                        action_optimal_1)
                    if buffer_size < replay_size:
                        # TODO: if layers changes, training_times_per_episode should be modified
                        RL_1.ep_obs.extend(observation_optimal_1)
                        RL_1.ep_as.extend(action_optimal_1)
                        RL_1.ep_rs.extend(reward_optimal_1)
                        RL_2.ep_obs.extend(observation_optimal_2)
                        RL_2.ep_as.extend(action_optimal_2)
                        RL_2.ep_rs.extend(reward_optimal_2)
                        RL_3.ep_obs.extend(observation_optimal_3)
                        RL_3.ep_as.extend(action_optimal_3)
                        RL_3.ep_rs.extend(reward_optimal_3)
                    else:
                        replay_index = np.random.choice(range(buffer_size),
                                                        size=replay_size,
                                                        replace=False)
                        for replay_id in range(replay_size):
                            replace_start = replay_index[replay_id]
                            start_location = sum(
                                number_optimal[:replace_start]
                            ) * training_times_per_episode
                            stop_location = sum(
                                number_optimal[:replace_start + 1]
                            ) * training_times_per_episode
                            RL_1.ep_obs.extend(
                                observation_optimal_1[start_location:stop_location])
                            RL_1.ep_as.extend(
                                action_optimal_1[start_location:stop_location])
                            RL_1.ep_rs.extend(
                                reward_optimal_1[start_location:stop_location])
                            RL_2.ep_obs.extend(
                                observation_optimal_2[start_location:stop_location])
                            RL_2.ep_as.extend(
                                action_optimal_2[start_location:stop_location])
                            RL_2.ep_rs.extend(
                                reward_optimal_2[start_location:stop_location])
                            RL_3.ep_obs.extend(
                                observation_optimal_3[start_location:stop_location])
                            RL_3.ep_as.extend(
                                action_optimal_3[start_location:stop_location])
                            RL_3.ep_rs.extend(
                                reward_optimal_3[start_location:stop_location])

            # entropy_weight=0.1
            RL_1.learn(epoch_i, entropy_weight, True)
            RL_2.learn(epoch_i, entropy_weight, False)
            RL_3.learn(epoch_i, entropy_weight, False)

        """
        checkpoint, per 1000 episodes
        """
        if (epoch_i % 1000 == 0) & (epoch_i > 1):
            highest_value = 0
            for class_replay in range(0, 10):
                highest_value = names['highest_tput_' + str(class_replay)]
                optimal_number = len(names['number_optimal_' +
                                           str(class_replay)])
                print("\n epoch: %d, highest tput: %f, optimal_number: %d" %
                      (epoch_i, highest_value, optimal_number))

            RL_1.save_session(ckpt_path_1)
            RL_2.save_session(ckpt_path_2)
            RL_3.save_session(ckpt_path_3)
            np.savez(np_path,
                     tputs=np.array(RL_1.tput_persisit),
                     candidate=np.array(RL_1.episode))

            """
            optimal range adaptively change
            """
            print(prob_weights)
            print(prob_weights)
            entropy_weight *= 0.5
            entropy_weight = max(entropy_weight, 0.002)

        epoch_i += 1