def fifo(job_trace=None):
    # Run the FIFO heuristic scheduler on a job trace (a randomly generated
    # one if none is given) and return the scheduling results.
    if job_trace is None:
        job_trace = trace.Trace(None).get_trace()
    env = fifo_env.FIFO_Env("FIFO", job_trace, None)
    while not env.end:
        env.step()
    return env.get_results()
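
# A minimal, hypothetical usage sketch (not part of the original code): run the
# FIFO baseline once on a generated trace and print the outcome. It assumes
# env.get_results() returns (num_jobs, jct, makespan, reward), which is how the
# same call is unpacked in sl_agent() below.
def run_fifo_baseline():
    num_jobs, jct, makespan, reward = fifo()
    print("FIFO baseline: %d jobs, avg JCT %.3f, makespan %.3f, reward %.3f"
          % (num_jobs, jct, makespan, reward))
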
def val_loss(net, val_traces, logger, global_step):
    # Replay the configured heuristic on each validation trace, collect
    # (state, label) pairs, and compute the average supervised-learning loss
    # of the policy network over full mini-batches.
    avg_loss = 0
    step = 0
    data = []
    for episode in range(len(val_traces)):
        job_trace = val_traces[episode]
        if pm.HEURISTIC == "DRF":
            env = drf_env.DRF_Env("DRF", job_trace, logger)
        elif pm.HEURISTIC == "FIFO":
            env = fifo_env.FIFO_Env("FIFO", job_trace, logger)
        elif pm.HEURISTIC == "SRTF":
            env = srtf_env.SRTF_Env("SRTF", job_trace, logger)
        elif pm.HEURISTIC == "Tetris":
            env = tetris_env.Tetris_Env("Tetris", job_trace, logger)
        ts = 0
        while not env.end:
            data += env.step()
            ts += 1
            if len(data) >= pm.MINI_BATCH_SIZE:
                # prepare a validation batch
                indexes = np.random.choice(len(data), size=pm.MINI_BATCH_SIZE, replace=False)
                inputs = []
                labels = []
                for index in indexes:
                    input, label = data[index]
                    inputs.append(input)
                    labels.append(label)
                # supervised learning to calculate the loss
                output, loss = net.get_sl_loss(np.stack(inputs), np.vstack(labels))
                avg_loss += loss
                # if step % 50 == 0:
                #     # type, # of time slots in the system so far, normalized remaining epoch, dom resource
                #     tb_logger.add_text(tag="sl:input+label+output:" + str(episode) + "_" + str(ts),
                #                        value="input:" + " type: " + str(input[0]) + " stay_ts: " + str(input[1])
                #                              + " rt: " + str(input[2]) + " resr:" + str(input[3]) + "\n"
                #                              + " label: " + str(label) + "\n" + " output: " + str(output[-1]),
                #                        step=global_step)
                step += 1
                data = []
    if step == 0:
        # guard against division by zero when no full batch was ever assembled
        return 0
    return avg_loss / step
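
# Hypothetical helper (illustrative only): generate a small set of validation
# traces and evaluate the current policy network on them, mirroring the
# validate.val_loss() call made by sl_agent() below. pm.VAL_DATASET and the
# PolicyNetwork instance are assumed to exist as in the training code.
def run_validation(policy_net, logger, global_step):
    validation_traces = [trace.Trace(None).get_trace() for _ in range(pm.VAL_DATASET)]
    return val_loss(policy_net, validation_traces, logger, global_step)
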
def sl_agent(net_weights_q, net_gradients_q, stats_q, id):
    # Supervised-learning worker: imitates the configured heuristic scheduler,
    # pulls the latest weights from the central agent, and pushes back gradients.
    logger = log.getLogger(name="agent_" + str(id), level=pm.LOG_MODE)
    logger.info("Start supervised learning, agent " + str(id) + " ...")
    if not pm.RANDOMNESS:
        np.random.seed(pm.np_seed + id + 1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess, tf.device("/gpu:" + str(id % 2)):
        policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
        sess.run(tf.global_variables_initializer())  # to avoid batch normalization error

        global_step = 1
        avg_jct = []
        avg_makespan = []
        avg_reward = []
        if not pm.VAL_ON_MASTER:
            validation_traces = []  # validation traces
            for i in range(pm.VAL_DATASET):
                validation_traces.append(trace.Trace(None).get_trace())

        # generate training traces
        traces = []
        for episode in range(pm.TRAIN_EPOCH_SIZE):
            job_trace = trace.Trace(None).get_trace()
            traces.append(job_trace)

        mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
        logger.info("Filling experience buffer...")
        for epoch in range(pm.TOT_TRAIN_EPOCHS):
            for episode in range(pm.TRAIN_EPOCH_SIZE):
                tic = time.time()
                job_trace = copy.deepcopy(traces[episode])
                if pm.HEURISTIC == "DRF":
                    env = drf_env.DRF_Env("DRF", job_trace, logger)
                elif pm.HEURISTIC == "FIFO":
                    env = fifo_env.FIFO_Env("FIFO", job_trace, logger)
                elif pm.HEURISTIC == "SRTF":
                    env = srtf_env.SRTF_Env("SRTF", job_trace, logger)
                elif pm.HEURISTIC == "Tetris":
                    env = tetris_env.Tetris_Env("Tetris", job_trace, logger)

                while not env.end:
                    if pm.LOG_MODE == "DEBUG":
                        time.sleep(0.01)
                    data = env.step()
                    logger.debug("ts length:" + str(len(data)))
                    for (input, label) in data:
                        mem_store.store(input, 0, label, 0)

                    if mem_store.full():
                        # prepare a training batch
                        _, trajectories, _ = mem_store.sample(pm.MINI_BATCH_SIZE)
                        input_batch = [traj.state for traj in trajectories]
                        label_batch = [traj.action for traj in trajectories]
                        # if global_step % 10 == 0:
                        #     print "input", input_batch[0]
                        #     print "label", label_batch[0]

                        # pull latest weights before training
                        weights = net_weights_q.get()
                        if isinstance(weights, basestring) and weights == "exit":
                            logger.info("Agent " + str(id) + " exits.")
                            exit(0)
                        policy_net.set_weights(weights)

                        # supervised learning to calculate gradients
                        entropy, loss, policy_grads = policy_net.get_sl_gradients(np.stack(input_batch), np.vstack(label_batch))
                        for i in range(len(policy_grads)):
                            assert not np.any(np.isnan(policy_grads[i]))
                        # send gradients to the central agent
                        net_gradients_q.put(policy_grads)

                        # validation
                        if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
                            val_tic = time.time()
                            val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
                            jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger, global_step)
                            stats_q.put(("val", val_loss, jct, makespan, reward))
                            val_toc = time.time()
                            logger.info("Agent " + str(id) + " Validation at step " + str(global_step)
                                        + " Time: " + '%.3f' % (val_toc - val_tic))

                        stats_q.put(("step:sl", entropy, loss))
                        global_step += 1

                num_jobs, jct, makespan, reward = env.get_results()
                avg_jct.append(jct)
                avg_makespan.append(makespan)
                avg_reward.append(reward)
                if global_step % pm.DISP_INTERVAL == 0:
                    logger.info("Agent\t AVG JCT\t Makespan\t Reward")
                    logger.info(str(id) + " \t \t " + '%.3f' % (sum(avg_jct) / len(avg_jct))
                                + " \t\t " + '%.3f' % (1.0 * sum(avg_makespan) / len(avg_makespan))
                                + " \t " + '%.3f' % (sum(avg_reward) / len(avg_reward)))