def train_agent(agent_id, param_queue, reward_queue, adv_queue, gradient_queue):
    # model evaluation seed
    tf.set_random_seed(agent_id)

    # set up environment
    env = Environment()

    # gpu configuration
    config = tf.ConfigProto(
        device_count={'GPU': args.worker_num_gpu},
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=args.worker_gpu_fraction))

    sess = tf.Session(config=config)

    # set up actor agent
    actor_agent = ActorAgent(
        sess, args.node_input_dim, args.job_input_dim,
        args.hid_dims, args.output_dim, args.max_depth,
        range(1, args.exec_cap + 1))

    # collect experiences
    while True:
        # get parameters from master
        (actor_params, seed, max_time, entropy_weight) = \
            param_queue.get()

        # synchronize model
        actor_agent.set_params(actor_params)

        # reset environment
        env.seed(seed)
        env.reset(max_time=max_time)

        # set up storage for experience
        exp = {'node_inputs': [], 'job_inputs': [],
               'gcn_mats': [], 'gcn_masks': [],
               'summ_mats': [], 'running_dag_mat': [],
               'dag_summ_back_mat': [],
               'node_act_vec': [], 'job_act_vec': [],
               'node_valid_mask': [], 'job_valid_mask': [],
               'reward': [], 'wall_time': [],
               'job_state_change': []}

        try:
            # The masking functions (node_valid_mask and
            # job_valid_mask in actor_agent.py) have a small
            # chance (once in every few thousand iterations)
            # of leaving some non-zero probability mass on a
            # masked-out action. This triggers the "node_act
            # and job_act should be valid" check in
            # actor_agent.py. Whenever this is detected, we
            # throw out the rollout of that iteration and
            # try again.

            # run experiment
            obs = env.observe()
            done = False

            # initial time
            exp['wall_time'].append(env.wall_time.curr_time)

            while not done:
                node, use_exec = invoke_model(actor_agent, obs, exp)

                obs, reward, done = env.step(node, use_exec)

                if node is not None:
                    # valid action, store reward and time
                    exp['reward'].append(reward)
                    exp['wall_time'].append(env.wall_time.curr_time)
                elif len(exp['reward']) > 0:
                    # Note: if we skip the reward when node is None
                    # (i.e., no available actions), the sneaky
                    # agent will learn to exhaustively pick all
                    # nodes in one scheduling round, in order to
                    # avoid the negative reward
                    exp['reward'][-1] += reward
                    exp['wall_time'][-1] = env.wall_time.curr_time

            # report reward signals to master
            assert len(exp['node_inputs']) == len(exp['reward'])
            reward_queue.put(
                [exp['reward'], exp['wall_time'],
                 len(env.finished_job_dags),
                 np.mean([j.completion_time - j.start_time
                          for j in env.finished_job_dags]),
                 env.wall_time.curr_time >= env.max_time])

            # get advantage term from master
            batch_adv = adv_queue.get()

            if batch_adv is None:
                # some other agent hit the assertion and the
                # master threw out this rollout; reset and
                # try again
                continue

            # compute gradients
            actor_gradient, loss = compute_actor_gradients(
                actor_agent, exp, batch_adv, entropy_weight)

            # report gradient to master
            gradient_queue.put([actor_gradient, loss])

        except AssertionError:
            # ask the master to abort this rollout and
            # try again
            reward_queue.put(None)
            # still need to get from adv_queue to
            # prevent blocking
            adv_queue.get()
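

# For context, a minimal sketch of the master-side half of the queue protocol
# that train_agent expects in each training round: broadcast parameters,
# collect reward traces, send back advantages (or None to abort the round),
# then gather gradients. This is illustrative only; the helper name
# compute_advantages is a hypothetical placeholder, not a function defined
# in this repository.
def _master_round_sketch(param_queues, reward_queues, adv_queues,
                         gradient_queues, actor_params, seed, max_time,
                         entropy_weight):
    # 1. broadcast the current model parameters and rollout settings
    for q in param_queues:
        q.put([actor_params, seed, max_time, entropy_weight])

    # 2. collect per-worker reward traces; a None entry means that worker
    #    hit the assertion in actor_agent.py and the round must be aborted
    results = [q.get() for q in reward_queues]
    if any(r is None for r in results):
        for q in adv_queues:
            q.put(None)  # tell every worker to discard this rollout and retry
        return None

    # 3. compute advantages from the rewards (baseline subtraction omitted)
    #    and send each worker its advantage vector
    for q, result in zip(adv_queues, results):
        rewards = result[0]
        adv = compute_advantages(rewards)  # hypothetical helper
        q.put(adv)

    # 4. gather gradients and losses from all workers
    return [q.get() for q in gradient_queues]
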
def train_agent(agent_id, param_queue, reward_queue, adv_queue, gradient_queue):
    # model evaluation seed
    global idxs
    tf.set_random_seed(agent_id)

    # set up environment
    env = Environment()

    # gpu configuration
    config = tf.ConfigProto(
        device_count={'GPU': args.worker_num_gpu},
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=args.worker_gpu_fraction))

    sess = tf.Session(config=config)

    # set up actor agent
    max_num = max(args.exec_level_num)
    exec_cpu = np.asarray(args.exec_cpus)
    exec_mem = np.asarray(args.exec_mems)
    type_num = exec_cpu
    exec_num = args.exec_level_num

    actor_agent = ActorAgent(
        sess, args.node_input_dim, args.job_input_dim,
        args.hid_dims, args.output_dim, args.max_depth,
        range(1, max_num + 1), type_num, exec_mem, exec_num)

    # collect experiences
    while True:
        # get parameters from master
        (actor_params, seed, max_time, entropy_weight) = \
            param_queue.get()

        # synchronize model
        actor_agent.set_params(actor_params)

        # reset environment
        env.seed(seed)
        env.reset(max_time=max_time)

        # set up storage for experience
        exp = {'node_inputs': [], 'job_inputs': [],
               'gcn_mats': [], 'gcn_masks': [],
               'summ_mats': [], 'running_dag_mat': [],
               'dag_summ_back_mat': [],
               'node_act_vec': [], 'job_act_vec': [], 'type_act_vec': [],
               'node_valid_mask': [], 'job_valid_mask': [], 'type_valid_mask': [],
               'reward': [], 'wall_time': [],
               'job_state_change': []}

        try:
            # The masking functions (node_valid_mask and
            # job_valid_mask in actor_agent.py) have a small
            # chance (once in every few thousand iterations)
            # of leaving some non-zero probability mass on a
            # masked-out action. This triggers the "node_act
            # and job_act should be valid" check in
            # actor_agent.py. Whenever this is detected, we
            # throw out the rollout of that iteration and
            # try again.

            # run experiment
            obs = env.observe()
            done = False

            # initial time
            exp['wall_time'].append(env.wall_time.curr_time)

            job_dags = obs[0]

            while not done:
                node, use_exec, use_type = invoke_model(actor_agent, obs, exp)

                # log the scheduling decision
                if node is None:
                    with open('result.txt', 'a', encoding='utf-8') as f:
                        f.write(str(idxs) + " no scheduling this round; " +
                                str(len(job_dags)) + " jobs remaining." + '\n')
                    idxs = idxs + 1
                else:
                    job_idx = job_dags.index(node.job_dag)
                    with open('result.txt', 'a', encoding='utf-8') as f:
                        f.write(str(idxs) + " scheduling node " + str(node.idx) +
                                " of job " + str(job_idx) + ", allocating " +
                                str(use_exec) + " executor(s) with " +
                                str(args.exec_cpus[use_type]) + " cores and " +
                                str(args.exec_mems[use_type]) + " GB memory; " +
                                str(len(job_dags)) + " jobs remaining." + '\n')
                    idxs = idxs + 1

                obs, reward, done = env.step(node, use_exec, use_type)

                if node is not None:
                    # valid action, store reward and time
                    exp['reward'].append(reward)
                    exp['wall_time'].append(env.wall_time.curr_time)
                elif len(exp['reward']) > 0:
                    # Note: if we skip the reward when node is None
                    # (i.e., no available actions), the sneaky
                    # agent will learn to exhaustively pick all
                    # nodes in one scheduling round, in order to
                    # avoid the negative reward
                    exp['reward'][-1] += reward
                    exp['wall_time'][-1] = env.wall_time.curr_time

            # report reward signals to master
            assert len(exp['node_inputs']) == len(exp['reward'])
            reward_queue.put(
                [exp['reward'], exp['wall_time'],
                 len(env.finished_job_dags),
                 np.mean([j.completion_time - j.start_time
                          for j in env.finished_job_dags]),
                 env.wall_time.curr_time >= env.max_time])

            # get advantage term from master
            batch_adv = adv_queue.get()

            if batch_adv is None:
                # some other agent hit the assertion and the
                # master threw out this rollout; reset and
                # try again
                continue

            # compute gradients
            actor_gradient, loss = compute_actor_gradients(
                actor_agent, exp, batch_adv, entropy_weight)

            # report gradient to master
            gradient_queue.put([actor_gradient, loss])

        except AssertionError:
            # ask the master to abort this rollout and
            # try again
            traceback.print_exc()
            reward_queue.put(None)
            # still need to get from adv_queue to
            # prevent blocking
            adv_queue.get()
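

# A minimal usage sketch: spawning one train_agent process per worker, each
# with its own parameter/reward/advantage/gradient queue. The flag
# args.num_agents and the helper name _spawn_workers_sketch are assumptions
# for illustration; the actual flag and driver code may differ.
def _spawn_workers_sketch():
    import multiprocessing as mp  # local import to keep this sketch self-contained

    param_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    reward_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    adv_queues = [mp.Queue(1) for _ in range(args.num_agents)]
    gradient_queues = [mp.Queue(1) for _ in range(args.num_agents)]

    agents = []
    for i in range(args.num_agents):
        p = mp.Process(target=train_agent,
                       args=(i, param_queues[i], reward_queues[i],
                             adv_queues[i], gradient_queues[i]))
        p.start()
        agents.append(p)

    return agents, param_queues, reward_queues, adv_queues, gradient_queues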