def test(policy_net, validation_traces, logger, step, tb_logger):
    """Run validation for the central agent and record the results.

    Computes the validation loss (only when ``pm.TRAINING_MODE == "SL"``) and
    the JCT / makespan / reward metrics via the ``validate`` module, writes
    them to ``tb_logger`` as scalars, and appends one line to
    ``sl_validation.txt`` or ``rl_validation.txt`` under ``LOG_DIR``.

    Args:
        policy_net: policy network handed to the ``validate`` helpers.
        validation_traces: validation traces; a deep copy is passed to each
            ``validate`` call so the caller's traces are not modified here.
        logger: logger used for the timing message and for error reporting.
        step: training step used to tag the scalars and the log line.
        tb_logger: object exposing ``add_scalar``, ``add_text`` and ``flush``.

    Returns:
        ``(jct, makespan, reward)`` on success. If any exception is raised
        during validation it is logged and the function returns ``None``
        implicitly, so callers must not unpack the result blindly.
    """
    val_tic = time.time()
    tag_prefix = "Central "
    try:
        if pm.TRAINING_MODE == "SL":
            val_loss = validate.val_loss(policy_net, copy.deepcopy(validation_traces), logger, step)
            tb_logger.add_scalar(tag=tag_prefix + "Val Loss", value=val_loss, step=step)
        jct, makespan, reward = validate.val_jmr(policy_net, copy.deepcopy(validation_traces), logger, step, tb_logger)
        tb_logger.add_scalar(tag=tag_prefix + "Val JCT", value=jct, step=step)
        tb_logger.add_scalar(tag=tag_prefix + "Val Makespan", value=makespan, step=step)
        tb_logger.add_scalar(tag=tag_prefix + "Val Reward", value=reward, step=step)
        tb_logger.flush()
        val_toc = time.time()
        logger.info("Central Agent:" + " Validation at step " + str(step) + " Time: " + '%.3f' % (val_toc - val_tic))

        # log results
        if pm.TRAINING_MODE == "SL":
            result_path = LOG_DIR + "sl_validation.txt"
        else:
            result_path = LOG_DIR + "rl_validation.txt"
        # The context manager closes the file even if write() raises.
        with open(result_path, 'a') as f:
            f.write("step " + str(step) + ": " + str(jct) + " " + str(makespan) + " " + str(reward) + "\n")

        return (jct, makespan, reward)
    except Exception as e:
        # Deliberate best-effort: a failed validation is reported, not fatal.
        logger.error("Error when validation! " + str(e))
        tb_logger.add_text(tag="validation error", value=str(e), step=step)
def rl_agent(net_weights_q, net_gradients_q, stats_q, id):
    """Reinforcement-learning worker loop for one agent.

    The agent builds its own policy (and optionally value) network, pulls the
    initial weights from ``net_weights_q``, then repeatedly rolls out job
    traces in an RL environment, stores the collected samples in a replay
    memory, computes gradients on sampled mini-batches, and pushes those
    gradients to ``net_gradients_q``. Progress and results are reported
    through ``stats_q``.

    Args:
        net_weights_q: queue from which weights are pulled. The string
            ``"exit"`` (bare, or as the policy-weights element of the tuple
            sent when ``pm.VALUE_NET`` is set) makes the agent exit.
        net_gradients_q: queue onto which gradients are put; a
            ``(policy_grads, value_grads)`` tuple when ``pm.VALUE_NET`` is
            set, otherwise ``policy_grads`` alone.
        stats_q: queue receiving tagged statistics tuples (``"val"``,
            ``"step:policy"``, ``"step:policy+value"``,
            ``"trace:sched_result"``, ``"trace:job_stats"``).
        id: integer agent index; used for the logger name, the numpy seed
            offset, the GPU device (``id % 2``) and the epsilon temperature.

    The process is terminated with ``sys.exit(0)`` on an exit message and
    with ``sys.exit(1)`` when a NaN gradient is detected.
    """
    import sys  # local import: the file's import block is not visible from here

    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3

    def is_exit_signal(message):
        # True when the message is the "exit" string rather than weights.
        return isinstance(message, string_types) and message == "exit"

    logger = log.getLogger(name="agent_"+str(id), level=pm.LOG_MODE, mode="w", fh=True, ch=True, prefix="Agent " + str(id))
    logger.info("Start reinforcement learning, agent " + str(id) + " ...")

    if not pm.RANDOMNESS:
        np.random.seed(pm.np_seed + id + 1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess, tf.device("/gpu:" + str(id % 2)):
        policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
        if pm.VALUE_NET:
            value_net = network.ValueNetwork(sess, "value_net", pm.TRAINING_MODE, logger)
        sess.run(tf.global_variables_initializer())  # to avoid batch normalization error
        if pm.VALUE_NET:
            policy_weights, value_weights = net_weights_q.get()
            value_net.set_weights(value_weights)
        else:
            policy_weights = net_weights_q.get()
        policy_net.set_weights(policy_weights)  # initialization from master
        first_time = True

        global_step = 1
        if not pm.VAL_ON_MASTER:
            validation_traces = [trace.Trace(None).get_trace() for _ in range(pm.VAL_DATASET)]
        if pm.PRIORITY_REPLAY:
            mem_store = prioritized_memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
        else:
            mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
        logger.info("Filling experience buffer...")

        # generate training data
        traces = [trace.Trace(None).get_trace() for _ in range(pm.TRAIN_EPOCH_SIZE)]

        if pm.EPSILON_GREEDY:
            if pm.VARYING_EPSILON:
                temperature = pm.ANNEALING_TEMPERATURE * (1 + float(id)/pm.NUM_AGENTS)
            else:
                temperature = pm.ANNEALING_TEMPERATURE
        # One-shot switches: each trace regeneration below fires at most once.
        gates = [True, True, True]
        for epoch in range(pm.TOT_TRAIN_EPOCHS):
            for episode in range(pm.TRAIN_EPOCH_SIZE):
                if pm.CHANGING_JOB_TYPES:
                    # The regeneration loops use a throwaway variable so the
                    # outer ``episode`` index is not overwritten; reusing
                    # ``episode`` here made the current iteration train on
                    # (and report) the last trace instead of its own.
                    if global_step >= 0 and gates[0]:
                        gates[0] = False
                        traces = [trace.Trace(None).get_trace(4) for _ in range(pm.TRAIN_EPOCH_SIZE)]
                        logger.info("Changing job types 4")
                    elif global_step >= 1000 and gates[1]:
                        gates[1] = False
                        traces = [trace.Trace(None).get_trace(6) for _ in range(pm.TRAIN_EPOCH_SIZE)]
                        logger.info("Changing job types 6")
                    elif global_step >= 2000 and gates[2]:
                        gates[2] = False
                        traces = [trace.Trace(None).get_trace(8) for _ in range(pm.TRAIN_EPOCH_SIZE)]
                        logger.info("Changing job types 8")
                tic = time.time()
                if mem_store.full() and pm.ENABLE_K8S:
                    logger.info("Switching to k8s environment!!!")
                    env = k8s_rl_env.K8S_RL_Env("RL", copy.deepcopy(traces[episode]), logger)
                else:
                    env = rl_env.RL_Env("RL", copy.deepcopy(traces[episode]), logger)
                states = []
                masked_outputs = []
                actions = []
                rewards = []
                ts = 0
                while not env.end:
                    if pm.LOG_MODE == "DEBUG":
                        time.sleep(0.01)
                    state = env.observe()
                    output = policy_net.predict(np.reshape(state, (1, pm.STATE_DIM[0], pm.STATE_DIM[1])))
                    if pm.EPSILON_GREEDY:  # greedy epsilon
                        # NOTE(review): under Python 2 this is integer division
                        # if both operands are ints -- confirm that
                        # pm.ANNEALING_TEMPERATURE is a float.
                        env.epsilon = 2 / (1 + np.exp(global_step / temperature))
                    masked_output, action, reward, move_on, valid_state = env.step(output)

                    if valid_state:  # do not save state when move on except skip_ts, but need to save reward!!!
                        states.append(state)
                        masked_outputs.append(masked_output)
                        actions.append(action)
                        rewards.append(reward)
                    if move_on:
                        ts += 1
                        if ts % pm.LT_REWARD_NUM_TS == 0 and len(states) > 0:  # states can be [] due to no jobs in the ts
                            if pm.LT_REWARD_IN_TS:
                                # Discounted accumulation, back to front.
                                for i in reversed(range(0, len(rewards) - 1)):
                                    rewards[i] += rewards[i+1] * pm.DISCOUNT_FACTOR
                            elif pm.TS_REWARD_PLUS_JOB_REWARD:
                                rewards = env.get_job_reward()
                                assert len(rewards) == len(states)
                            else:
                                # Every stored state gets the latest step reward.
                                rewards = [reward for _ in range(len(states))]

                            # randomly fill samples to memory
                            if pm.RANDOM_FILL_MEMORY:
                                # Sampling without replacement cannot draw more
                                # items than exist, so cap the sample size.
                                num_samples = min(pm.MINI_BATCH_SIZE, len(states))
                                indexes = np.random.choice(len(states), size=num_samples, replace=False)
                                for i in indexes:
                                    mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])
                            else:
                                for i in range(len(states)):
                                    mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])

                            if mem_store.full() and ts % pm.NUM_TS_PER_UPDATE == 0:
                                # prepare a training batch
                                mem_indexes, trajectories, IS_weights = mem_store.sample(pm.MINI_BATCH_SIZE)
                                states_batch = [traj.state for traj in trajectories]
                                outputs_batch = [traj.output for traj in trajectories]
                                actions_batch = [traj.action for traj in trajectories]
                                rewards_batch = [traj.reward for traj in trajectories]

                                # pull latest weights before training
                                if not first_time:  # avoid pulling twice at the first update
                                    latest = net_weights_q.get()
                                    # Check for a bare "exit" before unpacking:
                                    # tuple-unpacking the string would raise.
                                    if is_exit_signal(latest):
                                        logger.info("Agent " + str(id) + " exits.")
                                        sys.exit(0)
                                    if pm.VALUE_NET:
                                        policy_weights, value_weights = latest
                                        if is_exit_signal(policy_weights):
                                            logger.info("Agent " + str(id) + " exits.")
                                            sys.exit(0)
                                        policy_net.set_weights(policy_weights)
                                        value_net.set_weights(value_weights)
                                    else:
                                        policy_weights = latest
                                        policy_net.set_weights(policy_weights)
                                else:
                                    first_time = False

                                # set entropy weight, both agent and central agent need to be set
                                policy_net.anneal_entropy_weight(global_step)

                                # reinforcement learning to calculate gradients
                                if pm.VALUE_NET:
                                    value_output = value_net.predict(np.stack(states_batch))
                                    td_loss = np.vstack(rewards_batch) - value_output
                                    adjusted_td_loss = td_loss * np.vstack(IS_weights)
                                    policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(
                                        np.stack(states_batch), np.vstack(outputs_batch),
                                        np.vstack(actions_batch), adjusted_td_loss)
                                    value_loss, value_grads = value_net.get_rl_gradients(
                                        np.stack(states_batch), value_output, np.vstack(rewards_batch))
                                else:
                                    if pm.PRIORITY_MEMORY_SORT_REWARD and pm.MEAN_REWARD_BASELINE:
                                        td_loss = np.vstack(rewards_batch) - mem_store.avg_reward()
                                    else:
                                        td_loss = np.vstack(rewards_batch) - 0
                                    adjusted_td_loss = td_loss * np.vstack(IS_weights)
                                    policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(
                                        np.stack(states_batch), np.vstack(outputs_batch),
                                        np.vstack(actions_batch), adjusted_td_loss)

                                # Debug-log samples whose last action entry is set.
                                for aa in range(len(actions_batch)):
                                    if actions_batch[aa][-1] == 1:
                                        logger.debug("rewards:" + str(rewards_batch[aa]) + "td_loss:" + str(td_loss[aa]))

                                # Explicit NaN checks (not ``assert``, which is
                                # removed when Python runs with -O).
                                for grad in policy_grads:
                                    if np.any(np.isnan(grad)):
                                        logger.error("Error: NaN in policy gradients")
                                        logger.error("Gradients: " + str(grad))
                                        # states_batch is a list; stack it
                                        # before slicing along the batch axis.
                                        logger.error("Input type: " + str(np.stack(states_batch)[:, 0]))
                                        logger.error("Masked Output: " + str(outputs_batch))
                                        logger.error("Action: " + str(actions_batch))
                                        logger.error("TD Loss: " + str(td_loss))
                                        logger.error("Policy Loss: " + str(policy_loss))
                                        logger.error("Policy Entropy: " + str(policy_entropy))
                                        sys.exit(1)  # another option is to continue
                                if pm.VALUE_NET:
                                    for grad in value_grads:
                                        if np.any(np.isnan(grad)):
                                            logger.error("Error: NaN in value gradients " + str(grad))
                                            sys.exit(1)

                                # send gradients to the central agent
                                if pm.VALUE_NET:
                                    net_gradients_q.put((policy_grads, value_grads))
                                else:
                                    net_gradients_q.put(policy_grads)
                                if pm.PRIORITY_REPLAY:
                                    mem_store.update(mem_indexes, abs(td_loss))
                                # validation
                                if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
                                    val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
                                    # NOTE(review): test() in this file passes a
                                    # fifth tb_logger argument to val_jmr --
                                    # confirm that parameter is optional.
                                    jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger, global_step)
                                    stats_q.put(("val", val_loss, jct, makespan, reward))

                                # statistics
                                if pm.VALUE_NET:
                                    stats_q.put(("step:policy+value", policy_entropy, policy_loss, value_loss,
                                                 sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
                                else:
                                    stats_q.put(("step:policy", policy_entropy, policy_loss,
                                                 sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
                                global_step += 1

                            # clear
                            states = []
                            masked_outputs = []
                            actions = []
                            rewards = []

                # collect statistics after training one trace
                num_jobs, jct, makespan, reward = env.get_results()
                stats_q.put(("trace:sched_result", jct, makespan, reward))
                trace_index = epoch * pm.TRAIN_EPOCH_SIZE + episode
                if trace_index % pm.DISP_INTERVAL == 0:
                    if trace_index % 50 == 0:
                        stats_q.put(("trace:job_stats", episode, env.get_jobstats()))
                    toc = time.time()
                    logger.info("--------------------------------------------------------------")
                    logger.info("Agent " + str(id) + " Epoch " + str(epoch) + " Trace " + str(episode) + " Step " + str(global_step))
                    logger.info("# of Jobs\t AVG JCT\t Makespan\t Reward\t Time")
                    logger.info(str(num_jobs) + " \t" + " \t" + " " + '%.3f' % jct + " \t\t" + " " + '%.3f' % makespan
                                + "\t\t" + " " + '%.3f' % reward + "\t" + " " + '%.3f' % (toc - tic))
def sl_agent(net_weights_q, net_gradients_q, stats_q, id):
    """Supervised-learning worker loop for one agent.

    The agent replays job traces through the heuristic environment selected
    by ``pm.HEURISTIC``, stores the ``(input, label)`` pairs returned by each
    environment step in a replay memory and, once the memory is full, pulls
    the latest weights from ``net_weights_q``, computes supervised gradients
    on a sampled mini-batch and puts them on ``net_gradients_q``.

    Args:
        net_weights_q: queue from which weights are pulled before every
            update; the string ``"exit"`` makes the agent exit.
        net_gradients_q: queue onto which the policy gradients are put.
        stats_q: queue receiving ``("step:sl", entropy, loss)`` and, when
            validation runs on the agent, ``("val", ...)`` tuples.
        id: integer agent index; used for the logger name, the numpy seed
            offset and the GPU device (``id % 2``).

    Raises:
        ValueError: if ``pm.HEURISTIC`` is not one of ``"DRF"``, ``"FIFO"``,
            ``"SRTF"`` or ``"Tetris"``.

    The process is terminated with ``sys.exit(0)`` on an exit message and
    with ``sys.exit(1)`` when a NaN gradient is detected.
    """
    import sys  # local import: the file's import block is not visible from here

    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3

    logger = log.getLogger(name="agent_"+str(id), level=pm.LOG_MODE)
    logger.info("Start supervised learning, agent " + str(id) + " ...")

    if not pm.RANDOMNESS:
        np.random.seed(pm.np_seed + id + 1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess, tf.device("/gpu:" + str(id % 2)):
        policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
        sess.run(tf.global_variables_initializer())  # to avoid batch normalization error

        global_step = 1
        # Per-trace results accumulated over the whole run for the running averages.
        avg_jct = []
        avg_makespan = []
        avg_reward = []
        if not pm.VAL_ON_MASTER:
            # validation traces
            validation_traces = [trace.Trace(None).get_trace() for _ in range(pm.VAL_DATASET)]
        # generate training traces
        traces = [trace.Trace(None).get_trace() for _ in range(pm.TRAIN_EPOCH_SIZE)]
        mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
        logger.info("Filling experience buffer...")
        for epoch in range(pm.TOT_TRAIN_EPOCHS):
            for episode in range(pm.TRAIN_EPOCH_SIZE):
                job_trace = copy.deepcopy(traces[episode])
                if pm.HEURISTIC == "DRF":
                    env = drf_env.DRF_Env("DRF", job_trace, logger)
                elif pm.HEURISTIC == "FIFO":
                    env = fifo_env.FIFO_Env("FIFO", job_trace, logger)
                elif pm.HEURISTIC == "SRTF":
                    env = srtf_env.SRTF_Env("SRTF", job_trace, logger)
                elif pm.HEURISTIC == "Tetris":
                    env = tetris_env.Tetris_Env("Tetris", job_trace, logger)
                else:
                    # Fail loudly instead of hitting an unbound (or stale) env.
                    raise ValueError("Unknown heuristic: " + str(pm.HEURISTIC))

                while not env.end:
                    if pm.LOG_MODE == "DEBUG":
                        time.sleep(0.01)
                    data = env.step()
                    logger.debug("ts length:" + str(len(data)))

                    # Only state and action are used for supervised samples;
                    # the output and reward slots are filled with 0.
                    for (sample_input, label) in data:
                        mem_store.store(sample_input, 0, label, 0)

                    if mem_store.full():
                        # prepare a training batch
                        _, trajectories, _ = mem_store.sample(pm.MINI_BATCH_SIZE)
                        input_batch = [traj.state for traj in trajectories]
                        label_batch = [traj.action for traj in trajectories]

                        # pull latest weights before training
                        weights = net_weights_q.get()
                        if isinstance(weights, string_types) and weights == "exit":
                            logger.info("Agent " + str(id) + " exits.")
                            sys.exit(0)
                        policy_net.set_weights(weights)

                        # supervised learning to calculate gradients
                        entropy, loss, policy_grads = policy_net.get_sl_gradients(np.stack(input_batch), np.vstack(label_batch))
                        # Explicit NaN check (not ``assert``, which is removed
                        # when Python runs with -O).
                        for grad in policy_grads:
                            if np.any(np.isnan(grad)):
                                logger.error("Error: NaN in policy gradients " + str(grad))
                                sys.exit(1)

                        # send gradients to the central agent
                        net_gradients_q.put(policy_grads)

                        # validation
                        if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
                            val_tic = time.time()
                            val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
                            # NOTE(review): test() in this file passes a fifth
                            # tb_logger argument to val_jmr -- confirm that
                            # parameter is optional.
                            jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger, global_step)
                            stats_q.put(("val", val_loss, jct, makespan, reward))
                            val_toc = time.time()
                            logger.info("Agent " + str(id) + " Validation at step " + str(global_step) + " Time: " + '%.3f' % (val_toc - val_tic))
                        stats_q.put(("step:sl", entropy, loss))

                        global_step += 1

                num_jobs, jct, makespan, reward = env.get_results()
                avg_jct.append(jct)
                avg_makespan.append(makespan)
                avg_reward.append(reward)
                if global_step % pm.DISP_INTERVAL == 0:
                    logger.info("Agent\t AVG JCT\t Makespan\t Reward")
                    logger.info(str(id) + " \t \t " + '%.3f' % (sum(avg_jct)/len(avg_jct)) + " \t\t" + " "
                                + '%.3f' % (1.0*sum(avg_makespan)/len(avg_makespan))
                                + " \t" + " " + '%.3f' % (sum(avg_reward)/len(avg_reward)))