Example #1
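# Third-party/stdlib imports used by this example; the project-specific modules
# referenced below (pm, log, network, trace, memory, prioritized_memory,
# rl_env, k8s_rl_env, validate) are assumed to come from the surrounding repo.
import copy
import time
import numpy as np
import tensorflow as tf
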
def rl_agent(net_weights_q, net_gradients_q, stats_q, id):
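	"""Worker process: pulls the latest network weights from the central agent,
	rolls out scheduling episodes on simulated (or Kubernetes) job traces,
	stores the experience, computes policy (and optionally value) gradients,
	and pushes gradients and statistics back through the queues."""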
	logger = log.getLogger(name="agent_" + str(id), level=pm.LOG_MODE, mode="w", fh=True, ch=True, prefix="Agent " + str(id))
	logger.info("Start reinforcement learning, agent " + str(id) + " ...")

	if not pm.RANDOMNESS:
		np.random.seed(pm.np_seed+id+1)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess, tf.device("/gpu:"+str(id%2)):
		policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
		if pm.VALUE_NET:
			value_net = network.ValueNetwork(sess, "value_net", pm.TRAINING_MODE, logger)
		sess.run(tf.global_variables_initializer())  # to avoid batch normalization error
		if pm.VALUE_NET:
			policy_weights, value_weights = net_weights_q.get()
			value_net.set_weights(value_weights)
		else:
			policy_weights = net_weights_q.get()
		policy_net.set_weights(policy_weights) # initialization from master
		first_time = True

		global_step = 1
		if not pm.VAL_ON_MASTER:
			validation_traces = []
			for i in range(pm.VAL_DATASET):
				validation_traces.append(trace.Trace(None).get_trace())
		if pm.PRIORITY_REPLAY:
			mem_store = prioritized_memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
		else:
			mem_store = memory.Memory(maxlen=pm.REPLAY_MEMORY_SIZE)
		logger.info("Filling experience buffer...")

		# generate training data
		traces = []
		for episode in range(pm.TRAIN_EPOCH_SIZE):
			job_trace = trace.Trace(None).get_trace()
			traces.append(job_trace)

		if pm.EPSILON_GREEDY:
			if pm.VARYING_EPSILON:
				temperature = pm.ANNEALING_TEMPERATURE * (1 + float(id)/pm.NUM_AGENTS)
			else:
				temperature = pm.ANNEALING_TEMPERATURE
		gates = [True, True, True]  # one-shot flags for switching the training traces to 4/6/8 job types below
		for epoch in range(pm.TOT_TRAIN_EPOCHS):
			for episode in range(pm.TRAIN_EPOCH_SIZE):
				if pm.CHANGING_JOB_TYPES:
					if global_step >= 0 and gates[0]:
						gates[0] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):  # fresh loop variable; do not shadow the outer episode
							job_trace = trace.Trace(None).get_trace(4)
							traces.append(job_trace)
						logger.info("Changing job types 4")
					elif global_step >= 1000 and gates[1]:
						gates[1] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):  # avoid shadowing the outer episode
							job_trace = trace.Trace(None).get_trace(6)
							traces.append(job_trace)
						logger.info("Changing job types 6")
					elif global_step >= 2000 and gates[2]:
						gates[2] = False
						traces = []
						for _ in range(pm.TRAIN_EPOCH_SIZE):  # avoid shadowing the outer episode
							job_trace = trace.Trace(None).get_trace(8)
							traces.append(job_trace)
						logger.info("Changing job types 8")
				tic = time.time()
				if mem_store.full() and pm.ENABLE_K8S:
					logger.info("Switching to k8s environment!!!")
					env = k8s_rl_env.K8S_RL_Env("RL", copy.deepcopy(traces[episode]), logger)
				else:
					env = rl_env.RL_Env("RL", copy.deepcopy(traces[episode]), logger)
				states = []
				masked_outputs = []
				actions = []
				rewards = []
				ts = 0
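				# roll out one job trace: observe the state, let the policy net pick actions, and record the experience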
				while not env.end:
					if pm.LOG_MODE == "DEBUG":
						time.sleep(0.01)
					state = env.observe()
					output = policy_net.predict(np.reshape(state, (1, pm.STATE_DIM[0], pm.STATE_DIM[1])))
					if pm.EPSILON_GREEDY:  # epsilon-greedy exploration with an annealed epsilon
						env.epsilon = 2 / (1 + np.exp(global_step / temperature))
					masked_output, action, reward, move_on, valid_state = env.step(output)

					if valid_state:  # do not store the state on a pure move_on (except skip_ts), but the reward still has to be saved
						states.append(state)
						masked_outputs.append(masked_output)
						actions.append(action)
						rewards.append(reward)
					if move_on:
						ts += 1
						# ts_reward = reward
						if ts%pm.LT_REWARD_NUM_TS == 0 and len(states) > 0: # states can be [] due to no jobs in the ts
							# lt_reward = sum(rewards)
							# ts_rewards = [0 for _ in range(pm.LT_REWARD_NUM_TS)]
							# ts_rewards[-1] = lt_reward
							# for i in reversed(range(0, len(ts_rewards) - 1)):
							# 	ts_rewards[i] += ts_rewards[i + 1] * pm.DISCOUNT_FACTOR

							if pm.LT_REWARD_IN_TS:
								for i in reversed(range(0,len(rewards)-1)):
									rewards[i] += rewards[i+1]*pm.DISCOUNT_FACTOR
							elif pm.TS_REWARD_PLUS_JOB_REWARD:
								rewards = env.get_job_reward()
								assert len(rewards) == len(states)
							else:
								rewards = [reward for _ in range(len(states))]

							# randomly fill samples to memory
							if pm.RANDOM_FILL_MEMORY:
								indexes = np.random.choice(len(states), size=pm.MINI_BATCH_SIZE, replace=False)
								for i in indexes:
									mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])
							else:
								for i in range(len(states)):
									mem_store.store(states[i], masked_outputs[i], actions[i], rewards[i])

							if mem_store.full() and ts%pm.NUM_TS_PER_UPDATE == 0:  # update only once the replay memory is full, every NUM_TS_PER_UPDATE time steps
								# prepare a training batch
								mem_indexes, trajectories, IS_weights = mem_store.sample(pm.MINI_BATCH_SIZE)
								states_batch = [traj.state for traj in trajectories]
								outputs_batch = [traj.output for traj in trajectories]
								actions_batch = [traj.action for traj in trajectories]
								rewards_batch = [traj.reward for traj in trajectories]

								# pull latest weights before training
								if not first_time: # avoid pulling twice at the first update
									if pm.VALUE_NET:
										policy_weights, value_weights = net_weights_q.get()
										if isinstance(policy_weights, str) and policy_weights == "exit":  # shutdown sentinel from the central agent
											logger.info("Agent " + str(id) + " exits.")
											exit(0)
										policy_net.set_weights(policy_weights)
										value_net.set_weights(value_weights)
									else:
										policy_weights = net_weights_q.get()
										if isinstance(policy_weights, str) and policy_weights == "exit":  # shutdown sentinel from the central agent
											logger.info("Agent " + str(id) + " exits.")
											exit(0)
										policy_net.set_weights(policy_weights)
								else:
									first_time = False

								# set entropy weight, both agent and central agent need to be set
								policy_net.anneal_entropy_weight(global_step)

								# reinforcement learning to calculate gradients
								if pm.VALUE_NET:
									value_output = value_net.predict(np.stack(states_batch))
									td_loss = np.vstack(rewards_batch) - value_output
									adjusted_td_loss = td_loss * np.vstack(IS_weights)
									policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(np.stack(states_batch), \
													np.vstack(outputs_batch), np.vstack(actions_batch), adjusted_td_loss)
									value_loss, value_grads = value_net.get_rl_gradients(np.stack(states_batch), value_output, np.vstack(rewards_batch))
								else:
									if pm.PRIORITY_MEMORY_SORT_REWARD and pm.MEAN_REWARD_BASELINE:
										td_loss = np.vstack(rewards_batch) - mem_store.avg_reward()
									else:
										td_loss = np.vstack(rewards_batch)  # no baseline
									adjusted_td_loss = td_loss * np.vstack(IS_weights)
									policy_entropy, policy_loss, policy_grads = policy_net.get_rl_gradients(np.stack(states_batch), np.vstack(outputs_batch), np.vstack(actions_batch), adjusted_td_loss)

								for aa in range(len(actions_batch)):
									if actions_batch[aa][-1] == 1:
										# print "rewards:", rewards_batch[aa], "td_loss:", td_loss[aa]
										logger.debug("rewards:" + str(rewards_batch[aa]) + "td_loss:" + str(td_loss[aa]))

								for i in range(len(policy_grads)):
									try:
										assert not np.any(np.isnan(policy_grads[i]))
										# print np.mean(np.abs(policy_grads[i])) # 10^-5 to 10^-2
									except Exception as e:
										logger.error("Error: " + str(e))
										logger.error("Gradients: " + str(policy_grads[i]))
										logger.error("Input type: " + str(np.stack(states_batch)[:, 0]))
										logger.error("Masked Output: " + str(outputs_batch))
										logger.error("Action: " + str(actions_batch))
										logger.error("TD Loss: " + str(td_loss))
										logger.error("Policy Loss: " + str(policy_loss))
										logger.error("Policy Entropy: " + str(policy_entropy))
										exit(1) # another option is to continue
								if pm.VALUE_NET:
									for i in range(len(value_grads)):
										try:
											assert not np.any(np.isnan(value_grads[i]))
										except Exception as e:
											logger.error("Error: " + str(e) + " " + str(value_grads[i]))
											exit(1)

								# send gradients to the central agent
								if pm.VALUE_NET:
									net_gradients_q.put((policy_grads, value_grads))
								else:
									net_gradients_q.put(policy_grads)
								if pm.PRIORITY_REPLAY:
									mem_store.update(mem_indexes, abs(td_loss))  # refresh sample priorities with the new TD errors
								# validation
								if not pm.VAL_ON_MASTER and global_step % pm.VAL_INTERVAL == 0:
									val_loss = validate.val_loss(policy_net, validation_traces, logger, global_step)
									jct, makespan, reward = validate.val_jmr(policy_net, validation_traces, logger,
																			 global_step)
									stats_q.put(("val", val_loss, jct, makespan, reward))

								# statistics
								if pm.VALUE_NET:
									stats_q.put(("step:policy+value", policy_entropy, policy_loss, value_loss, sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
								else:
									stats_q.put(("step:policy", policy_entropy, policy_loss, sum(td_loss)/len(td_loss), sum(rewards_batch)/len(rewards_batch), output))
								global_step += 1

							# clear
							states = []
							masked_outputs = []
							actions = []
							rewards = []

				# collect statistics after training one trace
				num_jobs, jct, makespan, reward = env.get_results()
				stats_q.put(("trace:sched_result", jct, makespan, reward))
				if (epoch*pm.TRAIN_EPOCH_SIZE+episode)%pm.DISP_INTERVAL == 0:
					if (epoch*pm.TRAIN_EPOCH_SIZE+episode)%50 == 0:
						stats_q.put(("trace:job_stats", episode, env.get_jobstats()))
					toc = time.time()
					logger.info("--------------------------------------------------------------")
					logger.info("Agent " + str(id) + " Epoch " + str(epoch) + " Trace " + str(episode) + " Step " + str(global_step))
					logger.info("# of Jobs\t AVG JCT\t Makespan\t Reward\t Time")
					logger.info("%s\t\t %.3f\t\t %.3f\t\t %.3f\t %.3f" % (num_jobs, jct, makespan, reward, toc - tic))
Example #2
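# Imports as in Example #1; in addition, tb_log, comparison, test, collect_stats,
# log_config and LOG_DIR are assumed to be defined in the surrounding project module.
import copy
import time
import numpy as np
import tensorflow as tf
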
def central_agent(net_weights_qs, net_gradients_qs, stats_qs):
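	"""Master process: owns the canonical policy (and optional value) network,
	broadcasts its weights to all agents each step, averages the gradients the
	agents send back, applies the update, and handles validation, checkpointing
	and TensorBoard logging."""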
	logger = log.getLogger(name="central_agent", level=pm.LOG_MODE)
	logger.info("Start central agent...")

	if not pm.RANDOMNESS:
		np.random.seed(pm.np_seed)
		tf.set_random_seed(pm.tf_seed)

	config = tf.ConfigProto()
	config.allow_soft_placement = False
	config.gpu_options.allow_growth = True
	tb_logger = tb_log.Logger(pm.SUMMARY_DIR)
	log_config(tb_logger)

	with tf.Session(config=config) as sess:
		policy_net = network.PolicyNetwork(sess, "policy_net", pm.TRAINING_MODE, logger)
		if pm.VALUE_NET:
			value_net = network.ValueNetwork(sess, "value_net", pm.TRAINING_MODE, logger)
		logger.info("Create the policy network, with "+str(policy_net.get_num_weights())+" parameters")

		sess.run(tf.global_variables_initializer())
		tb_logger.add_graph(sess.graph)
		tb_logger.flush()
		policy_tf_saver = tf.train.Saver(max_to_keep=pm.MAX_NUM_CHECKPOINTS, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='policy_net'))
		if pm.POLICY_NN_MODEL is not None:
			policy_tf_saver.restore(sess, pm.POLICY_NN_MODEL)
			logger.info("Policy model "+pm.POLICY_NN_MODEL+" is restored.")

		if pm.VALUE_NET:
			value_tf_saver = tf.train.Saver(max_to_keep=pm.MAX_NUM_CHECKPOINTS, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='value_net'))
			if pm.VALUE_NN_MODEL is not None:
				value_tf_saver.restore(sess, pm.VALUE_NN_MODEL)
				logger.info("Value model " + pm.VALUE_NN_MODEL + " is restored.")

		step = 1
		start_t = time.time()

		if pm.VAL_ON_MASTER:
			validation_traces = []  # validation traces
			tags_prefix = ["DRF: ", "SRTF: ", "FIFO: ", "Tetris: ", "Optimus: "]
			for i in range(pm.VAL_DATASET):
				validation_traces.append(trace.Trace(None).get_trace())
			stats = comparison.compare(copy.deepcopy(validation_traces), logger) # deep copy to avoid changes to validation_traces
			if not pm.SKIP_FIRST_VAL:
				stats.append(test(policy_net, copy.deepcopy(validation_traces), logger, step=0, tb_logger=tb_logger))
				tags_prefix.append("Init_NN: ")

			f = open(LOG_DIR + "baselines.txt", 'w')
			for i in range(len(stats)):
				jct, makespan, reward = stats[i]
				value = tags_prefix[i] + " JCT: " + str(jct) + " Makespan: " + str(makespan) + " Reward: " + str(reward) + "\n"
				f.write(value)
				tb_logger.add_text(tag=tags_prefix[i], value=value, step=step)
			f.close()
			tb_logger.flush()
			logger.info("Finish validation for heuristics and initialized NN.")

		while step <= pm.TOT_NUM_STEPS:  # main loop: broadcast weights, collect gradients from every agent, apply the averaged update
			# send updated parameters to agents
			policy_weights = policy_net.get_weights()
			if pm.VALUE_NET:
				value_weights = value_net.get_weights()
				for i in range(pm.NUM_AGENTS):
					net_weights_qs[i].put((policy_weights, value_weights))
			else:
				for i in range(pm.NUM_AGENTS):
					net_weights_qs[i].put(policy_weights)

			# display training speed (step % 1 == 0 is always true, so this logs every step)
			if step % 1 == 0:
				elaps_t = time.time() - start_t
				speed = step / elaps_t
				logger.info("Central agent: Step " + str(step) + " Speed " + '%.3f' % speed + " batches/sec" + " Time " + '%.3f' % elaps_t + " seconds")


			# statistics
			if pm.TRAINING_MODE == "RL":
				policy_net.anneal_entropy_weight(step)
				tb_logger.add_scalar(tag="Entropy Weight", value=policy_net.entropy_weight, step=step)
				if pm.EPSILON_GREEDY:
					eps = 2 / (1 + np.exp(step / pm.ANNEALING_TEMPERATURE)) * 0.6
					tb_logger.add_scalar(tag="Epsilon Greedy", value=eps, step=step)

			collect_stats(stats_qs, tb_logger, step)
			if not pm.FIX_LEARNING_RATE:
				if step in pm.ADJUST_LR_STEPS:
					policy_net.lr /= 2
					if pm.VALUE_NET:
						value_net.lr /= 2
					logger.info("Learning rate is decreased to " + str(policy_net.lr) + " at step " + str(step))
			if step < pm.STEP_TRAIN_CRITIC_NET:  # set policy net lr to 0 to train critic net only
				policy_net.lr = 0.0

			if step % pm.DISP_INTERVAL == 0:
				tb_logger.add_scalar(tag="Learning rate", value=policy_net.lr, step=step)

			# save model
			if step % pm.CHECKPOINT_INTERVAL == 0:
				name_prefix = ""
				if pm.TRAINING_MODE == "SL":
					name_prefix += "sl_"
				else:
					name_prefix += "rl_"
				if pm.PS_WORKER:
					name_prefix += "ps_worker_"
				else:
					name_prefix += "worker_"

				model_name = pm.MODEL_DIR + "policy_" + name_prefix + str(step) + ".ckpt"
				path = policy_tf_saver.save(sess, model_name)
				logger.info("Policy model saved: " + path)
				if pm.VALUE_NET and pm.SAVE_VALUE_MODEL:
					model_name = pm.MODEL_DIR + "value_" + name_prefix + str(step) + ".ckpt"
					path = value_tf_saver.save(sess, model_name)
					logger.info("Value model saved: " + path)

			# validation
			if pm.VAL_ON_MASTER and step % pm.VAL_INTERVAL == 0:
				test(policy_net, copy.deepcopy(validation_traces), logger, step, tb_logger)

			# poll and update parameters
			poll_ids = set(range(pm.NUM_AGENTS))
			avg_policy_grads = []
			avg_value_grads = []
			while True:
				for i in poll_ids.copy():
					try:
						if pm.VALUE_NET:
							policy_gradients, value_gradients = net_gradients_qs[i].get(False)
						else:
							policy_gradients = net_gradients_qs[i].get(False)
						poll_ids.remove(i)
						if len(avg_policy_grads) == 0:
							avg_policy_grads = policy_gradients
						else:
							for j in range(len(avg_policy_grads)):
								avg_policy_grads[j] += policy_gradients[j]
						if pm.VALUE_NET:
							if len(avg_value_grads) == 0:
								avg_value_grads = value_gradients
							else:
								for j in range(len(avg_value_grads)):
									avg_value_grads[j] += value_gradients[j]
					except Exception:  # queue still empty; poll this agent again on the next pass
						continue
				if len(poll_ids) == 0:
					break
			# average the gradients over all agents before applying them
			for i in range(len(avg_policy_grads)):
				avg_policy_grads[i] /= pm.NUM_AGENTS
			policy_net.apply_gradients(avg_policy_grads)

			if pm.VALUE_NET:
				for i in range(len(avg_value_grads)):
					avg_value_grads[i] /= pm.NUM_AGENTS
				value_net.apply_gradients(avg_value_grads)

			# visualize gradients and weights
			if step % pm.VISUAL_GW_INTERVAL == 0 and pm.EXPERIMENT_NAME is None:
				assert len(policy_weights) == len(avg_policy_grads)
				for i in range(0,len(policy_weights),10):
					tb_logger.add_histogram(tag="Policy weights " + str(i), value=policy_weights[i], step=step)
					tb_logger.add_histogram(tag="Policy gradients " + str(i), value=avg_policy_grads[i], step=step)
				if pm.VALUE_NET:
					assert len(value_weights) == len(avg_value_grads)
					for i in range(0,len(value_weights),10):
						tb_logger.add_histogram(tag="Value weights " + str(i), value=value_weights[i], step=step)
						tb_logger.add_histogram(tag="Value gradients " + str(i), value=avg_value_grads[i], step=step)

			step += 1

		logger.info("Training ends...")
		if pm.VALUE_NET:
			for i in range(pm.NUM_AGENTS):
				net_weights_qs[i].put(("exit", "exit"))
		else:
			for i in range(pm.NUM_AGENTS):
				net_weights_qs[i].put("exit")
		# os.system("sudo pkill -9 python")
		exit(0)
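
The two functions above are meant to run in separate processes that exchange weights, gradients, and statistics through queues. Below is a minimal sketch, not code from the original project, of how they might be wired together with multiprocessing; the per-agent queue layout and pm.NUM_AGENTS follow the examples above, and pm is assumed to be importable in the launcher module as well.

import multiprocessing as mp

def main():
	# one weights/gradients/stats queue per agent, indexed the way central_agent expects
	net_weights_qs = [mp.Queue() for _ in range(pm.NUM_AGENTS)]
	net_gradients_qs = [mp.Queue() for _ in range(pm.NUM_AGENTS)]
	stats_qs = [mp.Queue() for _ in range(pm.NUM_AGENTS)]

	# one master process plus pm.NUM_AGENTS worker processes
	master = mp.Process(target=central_agent, args=(net_weights_qs, net_gradients_qs, stats_qs))
	workers = []
	for i in range(pm.NUM_AGENTS):
		workers.append(mp.Process(target=rl_agent, args=(net_weights_qs[i], net_gradients_qs[i], stats_qs[i], i)))

	master.start()
	for w in workers:
		w.start()
	for w in workers:
		w.join()
	master.join()

if __name__ == "__main__":
	main()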