def __init__(self, sess):
    self.sess = sess
    self.actor = a3c.ActorNetwork(self.sess,
                                  state_dim=S_INFO, action_dim=A_DIM,
                                  learning_rate=ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess,
                                    state_dim=S_INFO,
                                    learning_rate=CRITIC_LR_RATE)
    self.summary_ops, self.summary_vars = a3c.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    self.saver = tf.train.Saver()

    # restore neural network
    if NN_MODEL is not None:
        self.saver.restore(self.sess, NN_MODEL)
        print("Model restored.")

    self.epoch = 0
    self.i_episode = 0
    self.total_reward = 0.0
    self.s = env.reset()
def __init__(self, sess, a_dims, s_lengths, nn_model):
    '''
    Initialize the learner.

    :a_dims: array containing the dimension space for each action
    :s_lengths: array containing the length of each metric information
    :nn_model: optional path to a checkpoint to restore
    '''
    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    self.sess = sess
    self.a_dims = a_dims
    self.s_lengths = s_lengths
    self.s_dims = len(self.s_lengths), max(self.s_lengths)

    self.actor = a3c.ActorNetwork(self.sess, self.a_dims,
                                  self.s_lengths, ACTOR_LR_RATE)
    self.critic = a3c.CriticNetwork(self.sess, self.a_dims,
                                    self.s_lengths, CRITIC_LR_RATE)

    self.summary_ops, self.summary_vars = a3c.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.writer = tf.summary.FileWriter(SUMMARY_DIR, self.sess.graph)
    self.saver = tf.train.Saver()

    if nn_model is not None:
        self.saver.restore(self.sess, nn_model)
        logging.info('Model restored.')

    self.entropy_record = list()
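# --- Usage sketch (assumption, not from the source): instantiating the
# multi-action learner above. `Learner` stands in for whatever class owns
# this __init__; all dimension values are illustrative.
import tensorflow as tf

with tf.Session() as sess:
    learner = Learner(sess,
                      a_dims=[6, 3],        # action-space size per action head
                      s_lengths=[8, 8, 1],  # history length per state metric
                      nn_model=None)        # or a checkpoint path to restore
    # learner.s_dims == (3, 8): 3 metrics, padded to the longest history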
def __init__(self, checkpoint):
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess,
                                  state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                  learning_rate=ACTOR_LR_RATE)
    summary_ops, summary_vars = a3c.build_summaries()
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()  # save neural net parameters

    # restore neural net parameters
    self.saver.restore(self.sess, checkpoint)
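# --- Usage sketch (assumption): the restore-only constructor above suggests
# an inference wrapper. `Tester` and the checkpoint path are illustrative;
# actor.predict() is assumed to return a probability row vector, as in the
# training code below.
import numpy as np

tester = Tester('./nn_model_ep_1000.ckpt')
state = np.zeros((S_INFO, S_LEN))           # fill with real observations
action_prob = tester.actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
bit_rate = int(np.argmax(action_prob[0]))   # greedy action at test time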
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=50000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the checkpoint file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while epoch <= num_epochs:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])
            # Note: this is the synchronous version of the parallel training,
            # which is easier to understand and probe. The framework can be
            # fairly easily modified to support asynchronous training. Some
            # practices of asynchronous training (lock-free SGD at its core)
            # are nicely explained in the following two papers:
            # https://arxiv.org/abs/1602.01783
            # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal,
                        actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) +
                         ' TD_loss: ' + str(avg_td_loss) +
                         ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" +
                                       str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch,
                        SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt",
                        test_log_file)
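# --- Worker-side counterpart (sketch, not from the source): each worker
# process pulls synchronized weights from its net_params_queue and pushes
# (s_batch, a_batch, r_batch, terminal, info) into its exp_queue, matching
# the protocol central_agent() consumes above. `rollout()` is a hypothetical
# helper; set_network_params() is assumed to mirror get_network_params().
def agent(agent_id, net_params_queue, exp_queue):
    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM, learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)
        while True:
            # receive the latest weights from the central agent
            actor_net_params, critic_net_params = net_params_queue.get()
            actor.set_network_params(actor_net_params)
            critic.set_network_params(critic_net_params)

            # collect one training sequence (rollout details elided)
            s_batch, a_batch, r_batch, entropies, terminal = rollout(actor)
            exp_queue.put([s_batch, a_batch, r_batch, terminal,
                           {'entropy': entropies}])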
def main():

    # utility_offset = -math.log(VIDEO_BIT_RATE[0])  # so utilities[0] = 0
    # utilities = [math.log(b) + utility_offset for b in VIDEO_BIT_RATE]

    np.random.seed(RANDOM_SEED)
    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, _ = load_trace.load_trace()
    load_trace.plot_bandwidth(all_cooked_time, all_cooked_bw, _)

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw)

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the checkpoint file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0
        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real system
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_counter, throughput, \
                video_chunk_remain = net_env.get_video_chunk(bit_rate)
            # print(net_env.get_video_chunk(bit_rate))

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smoothness penalty
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                               VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            r_batch.append(reward)
            last_bit_rate = bit_rate

            # retrieve the previous state
            if len(s_batch) == 0:
                state = np.zeros((S_INFO, S_LEN))
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue the oldest history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilobyte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # megabyte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            rand = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
            print(action_cumsum, action_cumsum > rand, (action_cumsum > rand).argmax())

            # compute Vp and map it to a bitrate
            # Note: we need to discretize the probability into 1/RAND_RANGE
            # steps, because there is an intrinsic discrepancy in passing a
            # single state versus batch states.
            # bit_rate = (action_cumsum > rand).argmax()
            Vp_index = (action_cumsum > rand).argmax()
            Vp = BUFFER_PARAMETER[Vp_index]

            config = {
                'buffer_size': env.BUFFER_THRESH,
                'gp': GP,
                'Vp': Vp,
                'abr_osc': False,
                'abr_basic': False,
                'no_ibr': False
            }
            bola = get_bitrate.Bola(config=config)
            bit_rate = bola.get_quality(Vp,
                                        buffer_size * env.MILLISECONDS_IN_SECOND,
                                        last_bit_rate, throughput)

            # information available before the decision
            print('[%d]: download time %.2fms, thrput=%.2f, chunk size %d, '
                  'buffer=%.2fs, bitrate=%d' %
                  (video_chunk_counter, delay, throughput,
                   video_chunk_size, buffer_size, last_bit_rate))

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' +
                           str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:  # do training once
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chunk
                        a_batch=np.vstack(a_batch[1:]),         # since we don't have
                        r_batch=np.vstack(r_batch[1:]),         # control over it
                        terminal=end_of_video,
                        actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                print("====")
                print("Epoch", epoch)
                print("TD_loss", td_loss,
                      "Avg_reward", np.mean(r_batch),
                      "Avg_entropy", np.mean(entropy_record))
                print("====")

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: td_loss,
                    summary_vars[1]: np.mean(r_batch),
                    summary_vars[2]: np.mean(entropy_record)
                })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                entropy_record = []

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    assert len(actor_gradient_batch) == len(critic_gradient_batch)

                    # assembled_actor_gradient = actor_gradient_batch[0]
                    # assembled_critic_gradient = critic_gradient_batch[0]
                    # for i in range(len(actor_gradient_batch) - 1):
                    #     for j in range(len(actor_gradient)):
                    #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
                    #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
                    # actor.apply_gradients(assembled_actor_gradient)
                    # critic.apply_gradients(assembled_critic_gradient)

                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        # Save the neural net parameters to disk.
                        save_path = saver.save(sess,
                                               SUMMARY_DIR + "/nn_model_ep_" +
                                               str(epoch) + ".ckpt")
                        print("Model saved in file: %s" % save_path)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
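# --- Illustration of the cumulative-sum sampling used above: draw a uniform
# value and take the first index whose cumulative probability exceeds it
# (inverse-transform sampling). All values are made up.
import numpy as np

action_prob = np.array([0.1, 0.2, 0.4, 0.3])  # made-up policy output
action_cumsum = np.cumsum(action_prob)        # [0.1, 0.3, 0.7, 1.0]
u = 0.55                                      # fixed draw for illustration
index = (action_cumsum > u).argmax()          # first True entry -> index 2
assert index == 2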
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    with tf.Session() as sess, open(SUMMARY_DIR + '/log_central', 'w') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=S_DIM, action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=S_DIM,
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the checkpoint file
            saver.restore(sess, nn_model)
            print("Model restored.")

        # while True:
        # assemble experiences from agents, compute the gradients
        for ep in range(TRAIN_EPOCH):
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        # s_batch=np.vstack(s_batch),
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal,
                        actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len

            log_file.write('Epoch: ' + str(ep) +
                           ' TD_loss: ' + str(avg_td_loss) +
                           ' Avg_reward: ' + str(avg_reward) + '\n')
            log_file.flush()

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward
            })

            writer.add_summary(summary_str, ep)
            writer.flush()

            if ep % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "/nn_model_ep_" +
                                       str(ep) + ".ckpt")
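# --- Launcher sketch (assumption, not from the source): how the queues this
# central_agent() consumes could be wired to worker processes. Assumes the
# worker signature agent(agent_id, net_params_queue, exp_queue) sketched
# earlier; `launch` is an illustrative name.
import multiprocessing as mp

def launch():
    # one parameter queue and one experience queue per worker
    net_params_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    exp_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]

    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues))
    coordinator.start()

    workers = [mp.Process(target=agent,
                          args=(i, net_params_queues[i], exp_queues[i]))
               for i in range(NUM_AGENTS)]
    for w in workers:
        w.start()

    coordinator.join()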
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver(max_to_keep=10000)  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model == "None":
            epoch = 0
            nn_model = None
        if nn_model is not None:  # nn_model is the checkpoint file name
            # recover the epoch counter from the checkpoint file name
            epoch = int(nn_model.replace("nn_model_ep_", "").split(".ckpt")[0])
            saver.restore(sess, MODEL_DIR + nn_model)
            print("Model restored.")

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal,
                        actor=actor, critic=critic)

                # guard against divergence before applying any updates
                for j in range(len(actor_gradient)):
                    assert not np.any(np.isnan(actor_gradient[j]))

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) +
                         ' TD_loss: ' + str(avg_td_loss) +
                         ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, MODEL_DIR + "nn_model_ep_" +
                                       str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                testing(epoch,
                        MODEL_DIR + "nn_model_ep_" + str(epoch) + ".ckpt",
                        test_log_file)
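# --- Quick check of the epoch-recovery parsing above (file name illustrative):
nn_model = "nn_model_ep_4200.ckpt"
epoch = int(nn_model.replace("nn_model_ep_", "").split(".ckpt")[0])
assert epoch == 4200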
def central_agent(net_params_queues, exp_queues):
    # The arguments are two lists of NUM_AGENTS inter-process queues.
    # Open a Session {
    #     build the neural networks
    #     build the tf.summary ops (used to monitor and visualize training)
    #     initialize the network parameters; restore a saved network if present
    #     loop {
    #         put the network parameters into each sub-agent's queue
    #         initialize the counters and batch lists
    #         fetch each sub-agent's batch from its queue, aggregate them,
    #         and run one gradient-descent step
    #         write the statistics to file
    #         save the network every MODEL_SAVE_INTERVAL epochs
    #     }
    # }

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    # set up logging
    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w', level=logging.INFO)

    with tf.Session() as sess, open(LOG_FILE + '_test', 'w') as test_log_file:

        # build the actor network: args are the TF session,
        # [number of input metrics, history length], the number of outputs
        # (bitrate levels), and the learning rate
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        # build the critic network: args are the TF session,
        # [number of input metrics, history length], and the learning rate
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # build the training summaries
        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the checkpoint file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in range(NUM_AGENTS):
                # put the parameters into each worker's queue
                net_params_queues[i].put([actor_net_params, critic_net_params])
            # Note: this is the synchronous version of the parallel training,
            # which is easier to understand and probe. The framework can be
            # fairly easily modified to support asynchronous training. Some
            # practices of asynchronous training (lock-free SGD at its core)
            # are nicely explained in the following two papers:
            # https://arxiv.org/abs/1602.01783
            # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in range(NUM_AGENTS):
                # fetch each worker's experience batch from its queue
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                # compute the gradients
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal,
                        actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])  # entropy values from the info dict

            # compute the aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in range(len(actor_gradient_batch) - 1):
            #     for j in range(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in range(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) +
                         ' TD_loss: ' + str(avg_td_loss) +
                         ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(sess, SUMMARY_DIR + "/nn_model_ep_" +
                                       str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)
                # run testing on the saved model
                testing(epoch,
                        SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt",
                        test_log_file)
def run(port=8333, log_file_path=LOG_FILE):

    np.random.seed(RANDOM_SEED)

    with tf.Session() as sess, open(log_file_path, 'w') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=S_DIM, action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=S_DIM,
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to the checkpoint file
            saver.restore(sess, nn_model)
            print("Model restored.")

        init_action = np.zeros(A_DIM)
        # by default we simply use the first lambda
        init_action[DEFAULT_LAMBDA] = 1

        s_batch = [np.zeros(S_DIM)]
        a_batch = [init_action]
        r_batch = []
        entropy_record = []

        # these are for training
        actor_gradient_batch = []
        critic_gradient_batch = []

        last_lambda = DEFAULT_LAMBDA
        epoch = 0
        end_of_training = False

        # Create a TCP/IP socket
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

        # Bind the socket to the port
        server_address = ('localhost', port)
        print('starting up on %s port %s' % server_address, file=sys.stderr)
        sock.bind(server_address)

        # Listen for incoming connections
        sock.listen(5)

        count = 0
        while True:
            # Wait for a connection
            print('waiting for a connection', file=sys.stderr)
            connection, addr = sock.accept()
            print('Connected with ' + addr[0] + ':' + str(addr[1]))

            # Receive the JSON message, which has the format:
            #   'reward': float
            #   'state': array, e.g. '{"state": ["1", "3", "4", ...]}'
            size = connection.recv(4)
            size = struct.unpack('!i', size)[0]
            print('received "%s"' % size, file=sys.stderr)

            data = connection.recv(size)
            jsonData = json.loads(data)
            print(jsonData)

            # receive the reward
            reward = float(jsonData['reward'])
            if count > 0:
                r_batch.append(reward)
            else:
                r_batch.append(0.0)
            count = count + 1

            # receive the state
            stateArray = jsonData['state']
            state = np.array(stateArray)
            print(state)

            # compute the action
            action_prob = actor.predict(np.reshape(state, (1, S_DIM)))
            print("action_prob: ")
            print(action_prob)
            action_cumsum = np.cumsum(action_prob)
            print("action_cumsum: ")
            print(action_cumsum)
            print("comparison: ")
            print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE))
            selectedLambda = action_prob.argmax()
            # selectedLambda = (action_cumsum >
            #                   np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            print('selectedLambda "%s"' % selectedLambda, file=sys.stderr)

            # update the entropy record
            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # TODO
            # update and apply the gradients
            if len(r_batch) >= TRAIN_SEQ_LEN:
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0),
                                          a_batch=np.vstack(a_batch[1:]),
                                          r_batch=np.vstack(r_batch[1:]),
                                          terminal=end_of_training,
                                          actor=actor, critic=critic)
                td_loss = np.mean(td_batch)
                print("td_loss: ")
                print(td_loss)
                print("actor_gradient: ")
                print(actor_gradient)
                print("critic_gradient: ")
                print(critic_gradient)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                entropy_record = []

                print("len(actor_gradient_batch) = ")
                print(len(actor_gradient_batch))

                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    print("GRADIENT_BATCH_SIZE reached")
                    assert len(actor_gradient_batch) == len(critic_gradient_batch)

                    for i in range(len(actor_gradient_batch)):
                        print("###################" + str(i) + "###################")
                        print(actor_gradient_batch[i])
                        print(critic_gradient_batch[i])
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    actor_gradient_batch = []
                    critic_gradient_batch = []

                avg_reward = np.mean(r_batch)

                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: td_loss,
                    summary_vars[1]: avg_reward
                })

                writer.add_summary(summary_str, epoch)
                writer.flush()

                log_file.write(str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) +
                               '\t' + str(epoch) +
                               '\t' + str(avg_reward) +
                               '\t' + str(td_loss) + '\n')
                log_file.flush()

                epoch += 1
                if epoch % MODEL_SAVE_INTERVAL == 0:
                    # save the neural net parameters to disk
                    save_path = saver.save(sess, "./nn_model_ep_" + str(epoch) + ".ckpt")
                    print("Model saved in file: %s" % save_path)
                if epoch == MAX_EPOCH:
                    end_of_training = True

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

            s_batch.append(state)

            action_vec = np.zeros(A_DIM)
            action_vec[selectedLambda] = 1
            a_batch.append(action_vec)

            # send the action back
            print('sending data back to the client', file=sys.stderr)
            connection.sendall(struct.pack('!i', int(selectedLambda)))
            last_lambda = selectedLambda

            connection.close()

        sock.close()