os.makedirs(args.result_folder, exist_ok=True)

# tensorflow seeding
tf.set_random_seed(args.seed)

# set up environment
env = Environment()

# set up agents
agents = {}

for scheme in args.test_schemes:
    if scheme == 'learn':
        sess = tf.Session()
        agents[scheme] = ActorAgent(
            sess, args.node_input_dim, args.job_input_dim,
            args.hid_dims, args.output_dim, args.max_depth,
            range(1, args.exec_cap + 1))
    elif scheme == 'dynamic_partition':
        agents[scheme] = DynamicPartitionAgent()
    elif scheme == 'spark_fifo':
        agents[scheme] = SparkAgent(exec_cap=args.exec_cap)
    elif scheme == 'pso':
        agents[scheme] = PSOAgent(env.wall_time)
    else:
        print('scheme ' + str(scheme) + ' not recognized')
        exit(1)

# store info for all schemes
all_total_reward = {}
for scheme in args.test_schemes:
    all_total_reward[scheme] = []
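# Illustrative sketch (not part of the original script): one way the agents
# dict above could be consumed -- run every scheme over the same seeds and
# accumulate episode reward into all_total_reward. The agent.get_action(obs)
# interface and the num_test_ep argument are assumptions for illustration
# only; they may not match the repo's actual API.
def run_test_episodes_sketch(num_test_ep):
    for scheme in args.test_schemes:
        agent = agents[scheme]
        for ep in range(num_test_ep):
            env.seed(args.seed + ep)
            env.reset()  # assumes the environment has a default max_time
            total_reward = 0
            obs = env.observe()
            done = False
            while not done:
                # assumed common interface across schemes
                node, use_exec = agent.get_action(obs)
                obs, reward, done = env.step(node, use_exec)
                total_reward += reward
            all_total_reward[scheme].append(total_reward)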
def train_agent(agent_id, param_queue, reward_queue, adv_queue, gradient_queue):
    # model evaluation seed
    tf.set_random_seed(agent_id)

    # set up environment
    env = Environment()

    # gpu configuration
    config = tf.ConfigProto(
        device_count={'GPU': args.worker_num_gpu},
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=args.worker_gpu_fraction))

    sess = tf.Session(config=config)

    # set up actor agent
    actor_agent = ActorAgent(
        sess, args.node_input_dim, args.job_input_dim,
        args.hid_dims, args.output_dim, args.max_depth,
        range(1, args.exec_cap + 1))

    # collect experiences
    while True:
        # get parameters from master
        (actor_params, seed, max_time, entropy_weight) = \
            param_queue.get()

        # synchronize model
        actor_agent.set_params(actor_params)

        # reset environment
        env.seed(seed)
        env.reset(max_time=max_time)

        # set up storage for experience
        exp = {'node_inputs': [], 'job_inputs': [],
               'gcn_mats': [], 'gcn_masks': [],
               'summ_mats': [], 'running_dag_mat': [],
               'dag_summ_back_mat': [],
               'node_act_vec': [], 'job_act_vec': [],
               'node_valid_mask': [], 'job_valid_mask': [],
               'reward': [], 'wall_time': [], 'job_state_change': []}

        try:
            # The masking functions (node_valid_mask and
            # job_valid_mask in actor_agent.py) have a small
            # chance (roughly once every few thousand iterations)
            # of leaving non-zero probability mass on a masked-out
            # action. This trips the "node_act and job_act should
            # be valid" assertion in actor_agent.py. Whenever this
            # is detected, we throw out the rollout of that
            # iteration and try again.

            # run experiment
            obs = env.observe()
            done = False

            # initial time
            exp['wall_time'].append(env.wall_time.curr_time)

            while not done:
                node, use_exec = invoke_model(actor_agent, obs, exp)

                obs, reward, done = env.step(node, use_exec)

                if node is not None:
                    # valid action, store reward and time
                    exp['reward'].append(reward)
                    exp['wall_time'].append(env.wall_time.curr_time)
                elif len(exp['reward']) > 0:
                    # Note: if we skipped the reward when node is None
                    # (i.e., no available actions), a sneaky agent would
                    # learn to exhaustively pick all nodes in one
                    # scheduling round in order to avoid the negative reward
                    exp['reward'][-1] += reward
                    exp['wall_time'][-1] = env.wall_time.curr_time

            # report reward signals to master
            assert len(exp['node_inputs']) == len(exp['reward'])
            reward_queue.put(
                [exp['reward'], exp['wall_time'],
                 len(env.finished_job_dags),
                 np.mean([j.completion_time - j.start_time
                          for j in env.finished_job_dags]),
                 env.wall_time.curr_time >= env.max_time])

            # get advantage term from master
            batch_adv = adv_queue.get()

            if batch_adv is None:
                # another agent hit the assertion and the master
                # threw out this rollout; reset and try again
                continue

            # compute gradients
            actor_gradient, loss = compute_actor_gradients(
                actor_agent, exp, batch_adv, entropy_weight)

            # report gradient to master
            gradient_queue.put([actor_gradient, loss])

        except AssertionError:
            # ask the master to abort this rollout and try again
            reward_queue.put(None)
            # still need to drain adv_queue to prevent blocking
            adv_queue.get()
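# The (reward, wall_time) lists that the worker reports above are turned into
# discounted cumulative returns by the master via a discount() helper (see
# main() below). Its implementation is not shown in this section; the sketch
# below mirrors the standard backward recursion it presumably performs and is
# not the repo's actual code.
def discount_sketch(x, gamma):
    # out[t] = x[t] + gamma * out[t + 1], computed right-to-left
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out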
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # create result and model folders
    create_folder_if_not_exists(args.result_folder)
    create_folder_if_not_exists(args.model_folder)

    # initialize communication queues
    params_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    reward_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    adv_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    gradient_queues = [mp.Queue(1) for _ in range(args.num_agents)]

    # set up training agents
    agents = []
    for i in range(args.num_agents):
        agents.append(
            mp.Process(target=train_agent, args=(
                i, params_queues[i], reward_queues[i],
                adv_queues[i], gradient_queues[i])))

    # start training agents
    for i in range(args.num_agents):
        agents[i].start()

    # gpu configuration
    config = tf.ConfigProto(
        device_count={'GPU': args.master_num_gpu},
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=args.master_gpu_fraction))

    sess = tf.Session(config=config)

    # set up actor agent
    actor_agent = ActorAgent(
        sess, args.node_input_dim, args.job_input_dim,
        args.hid_dims, args.output_dim, args.max_depth,
        range(1, args.exec_cap + 1))

    pre_train_actor_agent(
        actor_agent, args.seed, args.heuristic, args.num_heur_ep,
        args.reset_prob, args.entropy_weight_init, args.lr, args.exec_cap)

    # initialize entropy weight
    entropy_weight = args.entropy_weight_init

    # initialize episode reset probability
    reset_prob = args.reset_prob

    # tensorboard logging
    tf_logger = TFLogger(sess, [
        'actor_loss', 'entropy', 'value_loss', 'episode_length',
        'average_reward_per_second', 'sum_reward', 'reset_probability',
        'num_jobs', 'reset_hit', 'average_job_duration', 'entropy_weight'])

    # store average reward for computing differential rewards
    avg_reward_calculator = AveragePerStepReward(
        args.average_reward_storage_size)

    jcts = []

    # ---- start training process ----
    for ep in range(1, args.num_ep + 1):
        print('training epoch', ep)

        # synchronize the model parameters for each training agent
        actor_params = actor_agent.get_params()

        # generate max time stochastically based on reset prob
        max_time = generate_coin_flips(reset_prob)

        # send out parameters to training agents
        for i in range(args.num_agents):
            params_queues[i].put(
                [actor_params, args.seed + ep, max_time, entropy_weight])

        # storage for advantage computation
        all_rewards, all_diff_times, all_times, \
            all_num_finished_jobs, all_avg_job_duration, \
            all_reset_hit = [], [], [], [], [], []

        t1 = time.time()

        # get rewards from agents
        any_agent_panic = False

        for i in range(args.num_agents):
            result = reward_queues[i].get()

            if result is None:
                any_agent_panic = True
                continue
            else:
                batch_reward, batch_time, \
                    num_finished_jobs, avg_job_duration, \
                    reset_hit = result

            diff_time = np.array(batch_time[1:]) - \
                np.array(batch_time[:-1])

            all_rewards.append(batch_reward)
            all_diff_times.append(diff_time)
            all_times.append(batch_time[1:])
            all_num_finished_jobs.append(num_finished_jobs)
            all_avg_job_duration.append(avg_job_duration)
            all_reset_hit.append(reset_hit)

            avg_reward_calculator.add_list_filter_zero(
                batch_reward, diff_time)

        t2 = time.time()
        print('got reward from workers', t2 - t1, 'seconds')

        jcts.append([ep, all_avg_job_duration])

        if any_agent_panic:
            # The try block failed in some agent (should happen
            # rarely); throw out this rollout and try again in the
            # next iteration (TODO: log this event)
            for i in range(args.num_agents):
                adv_queues[i].put(None)
            continue

        # compute differential reward
        all_cum_reward = []
        avg_per_step_reward = avg_reward_calculator.get_avg_per_step_reward()

        for i in range(args.num_agents):
            if args.diff_reward_enabled:
                # differential reward mode on
                rewards = np.array([r - avg_per_step_reward * t
                                    for (r, t) in zip(all_rewards[i], all_diff_times[i])])
            else:
                # regular reward
                rewards = np.array([r
                                    for (r, t) in zip(all_rewards[i], all_diff_times[i])])

            cum_reward = discount(rewards, args.gamma)

            all_cum_reward.append(cum_reward)

        # compute baseline
        baselines = get_piecewise_linear_fit_baseline(all_cum_reward, all_times)

        # give the workers back the advantage
        for i in range(args.num_agents):
            batch_adv = all_cum_reward[i] - baselines[i]
            batch_adv = np.reshape(batch_adv, [len(batch_adv), 1])
            adv_queues[i].put(batch_adv)

        t3 = time.time()
        print('advantage ready', t3 - t2, 'seconds')

        actor_gradients = []
        all_action_loss = []  # for tensorboard
        all_entropy = []      # for tensorboard
        all_value_loss = []   # for tensorboard

        for i in range(args.num_agents):
            (actor_gradient, loss) = gradient_queues[i].get()

            actor_gradients.append(actor_gradient)
            all_action_loss.append(loss[0])
            all_entropy.append(-loss[1] /
                               float(all_cum_reward[i].shape[0]))
            all_value_loss.append(loss[2])

        t4 = time.time()
        print('workers sent back gradients', t4 - t3, 'seconds')

        actor_agent.apply_gradients(
            aggregate_gradients(actor_gradients), args.lr)

        t5 = time.time()
        print('apply gradient', t5 - t4, 'seconds')

        tf_logger.log(ep, [
            np.mean(all_action_loss),
            np.mean(all_entropy),
            np.mean(all_value_loss),
            np.mean([len(b) for b in baselines]),
            avg_per_step_reward * args.reward_scale,
            np.mean([cr[0] for cr in all_cum_reward]),
            reset_prob,
            np.mean(all_num_finished_jobs),
            np.mean(all_reset_hit),
            np.mean(all_avg_job_duration),
            entropy_weight])

        # decrease entropy weight
        entropy_weight = decrease_var(entropy_weight,
            args.entropy_weight_min, args.entropy_weight_decay)

        # decrease reset probability
        reset_prob = decrease_var(reset_prob,
            args.reset_prob_min, args.reset_prob_decay)

        if ep % args.model_save_interval == 0:
            actor_agent.save_model(args.model_folder +
                'model_ep_' + str(ep))
            save_data.save_jcts_ep(jcts, args.model_folder)

    sess.close()
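# main() above relies on two small helpers whose bodies are not shown in this
# section: generate_coin_flips(), which draws the episode's max_time from the
# reset probability, and decrease_var(), which anneals the entropy weight and
# reset probability. The sketches below capture the behavior suggested by how
# they are used (a geometric draw and a floored linear decay); the repo's
# actual implementations may differ.
def generate_coin_flips_sketch(reset_prob):
    # number of coin flips until the first "reset" lands; with
    # reset_prob == 0 the episode is never cut short
    if reset_prob == 0:
        return np.inf
    return np.random.geometric(reset_prob)

def decrease_var_sketch(var, min_var, decay):
    # linearly decay a value toward its floor
    if var - decay >= min_var:
        var -= decay
    else:
        var = min_var
    return var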
def train_agent(agent_id, param_queue, reward_queue, adv_queue, gradient_queue):
    # model evaluation seed
    global idxs
    tf.set_random_seed(agent_id)

    # set up environment
    env = Environment()

    # gpu configuration
    config = tf.ConfigProto(
        device_count={'GPU': args.worker_num_gpu},
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=args.worker_gpu_fraction))

    sess = tf.Session(config=config)

    # set up actor agent
    max_num = max(args.exec_level_num)
    exec_cpu = np.asarray(args.exec_cpus)
    exec_mem = np.asarray(args.exec_mems)
    type_num = exec_cpu
    exec_num = args.exec_level_num
    actor_agent = ActorAgent(
        sess, args.node_input_dim, args.job_input_dim,
        args.hid_dims, args.output_dim, args.max_depth,
        range(1, max_num + 1), type_num, exec_mem, exec_num)

    # collect experiences
    while True:
        # get parameters from master
        (actor_params, seed, max_time, entropy_weight) = \
            param_queue.get()

        # synchronize model
        actor_agent.set_params(actor_params)

        # reset environment
        env.seed(seed)
        env.reset(max_time=max_time)

        # set up storage for experience
        exp = {'node_inputs': [], 'job_inputs': [],
               'gcn_mats': [], 'gcn_masks': [],
               'summ_mats': [], 'running_dag_mat': [],
               'dag_summ_back_mat': [],
               'node_act_vec': [], 'job_act_vec': [], 'type_act_vec': [],
               'node_valid_mask': [], 'job_valid_mask': [], 'type_valid_mask': [],
               'reward': [], 'wall_time': [], 'job_state_change': []}

        try:
            # The masking functions (node_valid_mask and
            # job_valid_mask in actor_agent.py) have a small
            # chance (roughly once every few thousand iterations)
            # of leaving non-zero probability mass on a masked-out
            # action. This trips the "node_act and job_act should
            # be valid" assertion in actor_agent.py. Whenever this
            # is detected, we throw out the rollout of that
            # iteration and try again.

            # run experiment
            obs = env.observe()
            done = False

            # initial time
            exp['wall_time'].append(env.wall_time.curr_time)
            job_dags = obs[0]

            while not done:
                node, use_exec, use_type = invoke_model(actor_agent, obs, exp)

                # log the scheduling decision
                if node is None:
                    with open('result.txt', 'a', encoding='utf-8') as f:
                        f.write(str(idxs) + ' no scheduling this round; ' +
                                str(len(job_dags)) + ' jobs remaining.\n')
                    idxs = idxs + 1
                else:
                    job_idx = job_dags.index(node.job_dag)
                    with open('result.txt', 'a', encoding='utf-8') as f:
                        f.write(str(idxs) + ' scheduled node ' + str(node.idx) +
                                ' of job ' + str(job_idx) + ', allocating ' +
                                str(use_exec) + ' executor(s) with ' +
                                str(args.exec_cpus[use_type]) + ' cores and ' +
                                str(args.exec_mems[use_type]) + ' GB memory each; ' +
                                str(len(job_dags)) + ' jobs remaining.\n')
                    idxs = idxs + 1

                obs, reward, done = env.step(node, use_exec, use_type)

                if node is not None:
                    # valid action, store reward and time
                    exp['reward'].append(reward)
                    exp['wall_time'].append(env.wall_time.curr_time)
                elif len(exp['reward']) > 0:
                    # Note: if we skipped the reward when node is None
                    # (i.e., no available actions), a sneaky agent would
                    # learn to exhaustively pick all nodes in one
                    # scheduling round in order to avoid the negative reward
                    exp['reward'][-1] += reward
                    exp['wall_time'][-1] = env.wall_time.curr_time

            # report reward signals to master
            assert len(exp['node_inputs']) == len(exp['reward'])

            reward_queue.put(
                [exp['reward'], exp['wall_time'],
                 len(env.finished_job_dags),
                 np.mean([j.completion_time - j.start_time
                          for j in env.finished_job_dags]),
                 env.wall_time.curr_time >= env.max_time])

            # get advantage term from master
            batch_adv = adv_queue.get()

            if batch_adv is None:
                # another agent hit the assertion and the master
                # threw out this rollout; reset and try again
                continue

            # compute gradients
            actor_gradient, loss = compute_actor_gradients(
                actor_agent, exp, batch_adv, entropy_weight)

            # report gradient to master
            gradient_queue.put([actor_gradient, loss])

        except AssertionError:
            # ask the master to abort this rollout and try again
            traceback.print_exc()
            reward_queue.put(None)
            # still need to drain adv_queue to prevent blocking
            adv_queue.get()
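# Illustrative only (not part of the original code): the multi-resource
# variant of train_agent above extends the action with an executor type drawn
# from the parallel lists args.exec_cpus, args.exec_mems and
# args.exec_level_num. The helper below just makes that mapping explicit,
# assuming the three lists have one entry per executor type.
def describe_executor_types(exec_cpus, exec_mems, exec_level_num):
    descriptions = []
    for i, (cpu, mem, cap) in enumerate(zip(exec_cpus, exec_mems, exec_level_num)):
        descriptions.append('type %d: %d cores, %d GB memory, up to %d executors'
                            % (i, cpu, mem, cap))
    return descriptions

# Example: describe_executor_types([2, 4], [8, 16], [10, 5]) returns
#   ['type 0: 2 cores, 8 GB memory, up to 10 executors',
#    'type 1: 4 cores, 16 GB memory, up to 5 executors']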