def Initial(self):
    # Initialize the TensorFlow session and build/restore the actor network.
    # with tf.Session() as sess:
    self.sess = tf.Session()
    self.actor = a3c.ActorNetwork(self.sess,
                                  state_dim=[S_INFO, S_LEN],
                                  action_dim=A_DIM,
                                  learning_rate=ACTOR_LR_RATE)
    self.sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()  # save neural net parameters

    # restore neural net parameters
    nn_model = NN_MODEL
    if nn_model is not None:  # nn_model is the path to file
        saver.restore(self.sess, nn_model)
        print("Model restored.")

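# A minimal usage sketch (assumption, not part of the original file): once
# Initial() has restored the model, the actor can be queried for a bitrate
# decision in the same way agent() does below. `state` is a hypothetical
# S_INFO x S_LEN observation matrix; BIT_RATE is the bitrate table defined
# in agent().
#
#     action_prob = self.actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
#     bit_rate = np.argmax(action_prob)   # index into BIT_RATE
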
def central_agent(net_params_queues, exp_queues):

    assert len(net_params_queues) == NUM_AGENTS
    assert len(exp_queues) == NUM_AGENTS

    logging.basicConfig(filename=LOG_FILE + '_central',
                        filemode='w',
                        level=logging.INFO)

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5

    with tf.Session(config=config) as sess, \
            open(LOG_FILE + '_test', 'wb') as test_log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = 0

        # assemble experiences from agents, compute the gradients
        while True:
            # synchronize the network parameters of the worker agents
            actor_net_params = actor.get_network_params()
            critic_net_params = critic.get_network_params()
            for i in xrange(NUM_AGENTS):
                net_params_queues[i].put([actor_net_params, critic_net_params])
                # Note: this is the synchronous version of the parallel training,
                # which is easier to understand and probe. The framework can be
                # fairly easily modified to support asynchronous training.
                # Some practices of asynchronous training (lock-free SGD at
                # its core) are nicely explained in the following two papers:
                # https://arxiv.org/abs/1602.01783
                # https://arxiv.org/abs/1106.5730

            # record average reward and td loss change
            # in the experiences from the agents
            total_batch_len = 0.0
            total_reward = 0.0
            total_td_loss = 0.0
            total_entropy = 0.0
            total_agents = 0.0

            # assemble experiences from the agents
            actor_gradient_batch = []
            critic_gradient_batch = []

            for i in xrange(NUM_AGENTS):
                # get states, actions and rewards from the agents
                s_batch, a_batch, r_batch, terminal, info = exp_queues[i].get()

                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch, axis=0),
                        a_batch=np.vstack(a_batch),
                        r_batch=np.vstack(r_batch),
                        terminal=terminal, actor=actor, critic=critic)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                total_reward += np.sum(r_batch)
                total_td_loss += np.sum(td_batch)
                total_batch_len += len(r_batch)
                total_agents += 1.0
                total_entropy += np.sum(info['entropy'])

            # compute aggregated gradient
            assert NUM_AGENTS == len(actor_gradient_batch)
            assert len(actor_gradient_batch) == len(critic_gradient_batch)

            # assembled_actor_gradient = actor_gradient_batch[0]
            # assembled_critic_gradient = critic_gradient_batch[0]
            # for i in xrange(len(actor_gradient_batch) - 1):
            #     for j in xrange(len(assembled_actor_gradient)):
            #         assembled_actor_gradient[j] += actor_gradient_batch[i][j]
            #         assembled_critic_gradient[j] += critic_gradient_batch[i][j]
            # actor.apply_gradients(assembled_actor_gradient)
            # critic.apply_gradients(assembled_critic_gradient)

            for i in xrange(len(actor_gradient_batch)):
                actor.apply_gradients(actor_gradient_batch[i])
                critic.apply_gradients(critic_gradient_batch[i])

            # log training information
            epoch += 1
            avg_reward = total_reward / total_agents
            avg_td_loss = total_td_loss / total_batch_len
            avg_entropy = total_entropy / total_batch_len

            logging.info('Epoch: ' + str(epoch) +
                         ' TD_loss: ' + str(avg_td_loss) +
                         ' Avg_reward: ' + str(avg_reward) +
                         ' Avg_entropy: ' + str(avg_entropy))

            summary_str = sess.run(summary_ops, feed_dict={
                summary_vars[0]: avg_td_loss,
                summary_vars[1]: avg_reward,
                summary_vars[2]: avg_entropy
            })

            writer.add_summary(summary_str, epoch)
            writer.flush()

            if epoch % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                save_path = saver.save(
                    sess,
                    SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt")
                logging.info("Model saved in file: " + save_path)

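# --------------------------------------------------------------------------- #
# A minimal launch sketch (not part of the original file) showing how
# central_agent() and agent() are typically wired together: one coordinator
# process plus NUM_AGENTS workers exchanging parameters and experience through
# multiprocessing queues. It assumes load_trace is importable at module level,
# as the commented-out call in agent() suggests; the trace directory string is
# a placeholder.
# --------------------------------------------------------------------------- #
def _example_launch():
    import multiprocessing as mp

    # one parameter queue and one experience queue per worker agent
    net_params_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]
    exp_queues = [mp.Queue(1) for _ in range(NUM_AGENTS)]

    # coordinator: aggregates gradients and broadcasts network parameters
    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues))
    coordinator.start()

    # workers: interact with the simulated environment and send experience back
    all_cooked_time, all_cooked_bw, all_file_names = \
        load_trace.load_trace('./dataset/network_trace/')  # placeholder path
    workers = []
    for i in range(NUM_AGENTS):
        workers.append(mp.Process(target=agent,
                                  args=(i, all_cooked_time, all_cooked_bw,
                                        all_file_names,
                                        net_params_queues[i], exp_queues[i])))
    for w in workers:
        w.start()

    # the coordinator never returns in this synchronous training loop
    coordinator.join()
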
def agent(agent_id, all_cooked_time, all_cooked_bw, all_file_names,
          net_params_queue, exp_queue):

    # create result directory
    # if not os.path.exists(LOG_FILE_PATH):
    #     os.makedirs(LOG_FILE_PATH)

    # -- End Configuration --
    # You shouldn't need to change the rest of the code here.

    log_file_path = LOG_FILE + '_agent_' + str(agent_id)
    video_trace_prefix = './dataset/video_trace' + VIDEO_TRACE + '/new_frame_trace_'

    # load the trace
    # all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(network_trace_dir)

    # random seed
    random_seed = agent_id
    count = 0
    trace_count = 1
    FPS = 25
    frame_time_len = 0.04
    reward_all_sum = 0
    run_time = 0

    # init the simulated streaming environment
    # setting one:
    #   1, all_cooked_time : timestamp
    #   2, all_cooked_bw   : throughput
    #   3, all_cooked_rtt  : rtt
    #   4, agent_id        : random_seed
    #   5, logfile_path    : logfile_path
    #   6, VIDEO_SIZE_FILE : video size file path
    #   7, Debug Setting   : Debug
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=random_seed,
                              logfile_path=log_file_path,
                              VIDEO_SIZE_FILE=video_trace_prefix,
                              Debug=DEBUG)

    BIT_RATE = [500.0, 850.0, 1200.0, 1850.0]  # kbps
    TARGET_BUFFER = [0, 0.04]  # seconds

    # ABR setting
    RESEVOIR = 0.5
    CUSHION = 2

    cnt = 0
    # default setting
    last_bit_rate = 0
    bit_rate = 0
    target_buffer = 1
    latency_limit = 4

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5

    with tf.Session(config=config) as sess:
        # open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:

        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # get the network parameters from the central agent; this only
        # initializes them, they will be refreshed inside the loop
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        # last_bit_rate = DEFAULT_QUALITY
        # bit_rate = DEFAULT_QUALITY

        reward = 0.0
        action_vec = np.zeros(A_DIM)
        # action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        # QoE setting
        reward_frame = 0
        reward_all = 0
        SMOOTH_PENALTY = 0
        REBUF_PENALTY = 7
        LANTENCY_PENALTY = 0.005
        SKIP_PENALTY = 0.5

        # past_info setting
        past_frame_num = 7500
        S_time = [0] * past_frame_num
        S_time_interval = [0] * past_frame_num
        S_send_data_size = [0] * past_frame_num
        S_chunk_len = [0] * past_frame_num
        S_rebuf = [0] * past_frame_num
        S_buffer_size = [0] * past_frame_num
        S_end_delay = [0] * past_frame_num
        S_chunk_size = [0] * past_frame_num
        S_play_time_len = [0] * past_frame_num
        S_decision_flag = [0] * past_frame_num
        S_buffer_flag = [0] * past_frame_num
        S_cdn_flag = [0] * past_frame_num
        S_skip_time = [0] * past_frame_num

        # params setting
        call_time_sum = 0
        time_previous = 0

        while True:
            reward_frame = 0

            time, time_interval, send_data_size, chunk_len, \
                rebuf, buffer_size, play_time_len, end_delay, \
                cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \
                buffer_flag, cdn_flag, skip_flag, end_of_video = \
                net_env.get_video_frame(bit_rate, target_buffer, latency_limit)

            # slide the frame-level history windows
            S_time.pop(0)
            S_time_interval.pop(0)
            S_send_data_size.pop(0)
            S_chunk_len.pop(0)
            S_buffer_size.pop(0)
            S_rebuf.pop(0)
            S_end_delay.pop(0)
            S_play_time_len.pop(0)
            S_decision_flag.pop(0)
            S_buffer_flag.pop(0)
            S_cdn_flag.pop(0)
            S_skip_time.pop(0)

            S_time.append(time)
            S_time_interval.append(time_interval)
            S_send_data_size.append(send_data_size)
            S_chunk_len.append(chunk_len)
            S_buffer_size.append(buffer_size)
            S_rebuf.append(rebuf)
            S_end_delay.append(end_delay)
            S_play_time_len.append(play_time_len)
            S_decision_flag.append(decision_flag)
            S_buffer_flag.append(buffer_flag)
            S_cdn_flag.append(cdn_flag)
            S_skip_time.append(skip_frame_time_len)

            # latency penalty grows once the end-to-end delay exceeds 1 s
            if end_delay <= 1.0:
                LANTENCY_PENALTY = 0.005
            else:
                LANTENCY_PENALTY = 0.01

            if not cdn_flag:
                reward_frame = 0.7 * frame_time_len * float(
                    BIT_RATE[bit_rate]) / 1000 - REBUF_PENALTY * rebuf
                reward += reward_frame
            else:
                reward_frame = -(REBUF_PENALTY * rebuf)
                reward += reward_frame

            if decision_flag or end_of_video:
                # reward format = play_time * BIT_RATE - 4.3 * rebuf - 1.2 * end_delay
                reward_frame += -1 * SMOOTH_PENALTY * (
                    abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]) / 1000)

                # adapt the penalty coefficients to recent history
                # (the +0 / -0 updates leave SMOOTH_PENALTY unchanged)
                if abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]) / 1000 >= 1.0:
                    SMOOTH_PENALTY += 0
                elif abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]
                         ) / 1000 < 1.0 and SMOOTH_PENALTY > 0.1:
                    SMOOTH_PENALTY -= 0
                if np.sum(S_rebuf[-51:-1]) >= 1 and REBUF_PENALTY <= 10.0:
                    REBUF_PENALTY += 0.3
                elif np.sum(S_rebuf[-51:-1]) < 1 and REBUF_PENALTY > 0.1:
                    REBUF_PENALTY -= 0.1

                reward += reward_frame

                # last_bit_rate
                last_bit_rate = bit_rate

                r_batch.append(reward)
                reward = 0.0

                # retrieve previous state
                if len(s_batch) == 0:
                    state = [np.zeros((S_INFO, S_LEN))]
                else:
                    state = np.array(s_batch[-1], copy=True)

                # dequeue history record
                state = np.roll(state, -1, axis=1)

                # A = S_buffer_size[-51:-1]
                # this should be S_INFO number of terms
                # T_all = float(np.sum(S_time_interval[-51:-1]))
                T_all = time - time_previous
                time_previous = time
                num_of_frame = float(GOP / T_all)
                # print 'number of frames:', num_of_frame
                throughput = float(np.sum(S_send_data_size[-51:-1])) / float(
                    np.sum(S_time_interval[-51:-1]))

                state[0, -1] = BIT_RATE[bit_rate] / float(np.max(BIT_RATE))  # last quality  # present
                state[1, -1] = num_of_frame / FPS
                state[2, -1] = throughput / M_IN_K / BW_NORM_FACTOR  # kilo byte / ms  # history
                # state[3, -1] = np.sum(S_skip_time[-51:-1]) / BUFFER_NORM_FACTOR  # skip frame  # present
                # state[4, -1] = S_end_delay[-1] / BUFFER_NORM_FACTOR  # latency  # present
                state[3, -1] = np.sum(S_rebuf[-51:-1]) / BUFFER_NORM_FACTOR
                # print 'state1:', (num_of_frame / FPS)
                # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

                # compute action probability vector
                action_prob = actor.predict(
                    np.reshape(state, (1, S_INFO, S_LEN)))
                bit_rate = np.argmax(action_prob)
                print("bitrate: ", BIT_RATE[bit_rate])

                # action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
                # action_cumsum = np.cumsum(action_prob)
                # bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
                # print 'bitrate: ', bit_rate
                # Note: we need to discretize the probability into 1/RAND_RANGE steps,
                # because there is an intrinsic discrepancy in passing single state and batch states

                entropy_record.append(a3c.compute_entropy(action_prob[0]))

                # once TRAIN_SEQ_LEN chunks have been collected, or the video has
                # ended, send the states, actions, rewards and entropies to the
                # central agent
                if (len(r_batch) >= TRAIN_SEQ_LEN) or end_of_video:
                    if len(r_batch) > 1:
                        exp_queue.put([
                            s_batch[1:],  # ignore the first chunk
                            a_batch[1:],  # since we don't have the
                            r_batch[1:],  # control over it
                            end_of_video,
                            {'entropy': entropy_record}
                        ])

                    # synchronize the network parameters from the coordinator
                    actor_net_params, critic_net_params = net_params_queue.get()
                    actor.set_network_params(actor_net_params)
                    critic.set_network_params(critic_net_params)

                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]
                    del entropy_record[:]

            if end_of_video:
                print("network traceID, network_reward, avg_running_time",
                      trace_count, reward_all)  # , call_time_sum / cnt

                reward_all_sum += reward_all
                # run_time += call_time_sum / cnt
                # if trace_count >= len(all_file_names):
                #     trace_count = 1
                #     break
                trace_count += 1
                cnt = 0
                call_time_sum = 0
                last_bit_rate = 0
                reward_all = 0
                bit_rate = 0
                target_buffer = 0

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)

                S_time_interval = [0] * past_frame_num
                S_send_data_size = [0] * past_frame_num
                S_chunk_len = [0] * past_frame_num
                S_rebuf = [0] * past_frame_num
                S_buffer_size = [0] * past_frame_num
                S_end_delay = [0] * past_frame_num
                S_chunk_size = [0] * past_frame_num
                S_play_time_len = [0] * past_frame_num
                S_decision_flag = [0] * past_frame_num
                S_buffer_flag = [0] * past_frame_num
                S_cdn_flag = [0] * past_frame_num
            else:
                if decision_flag:
                    s_batch.append(state)

                    action_vec = np.zeros(A_DIM)
                    action_vec[bit_rate] = 1
                    a_batch.append(action_vec)

            reward_all += reward_frame
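
# --------------------------------------------------------------------------- #
# A standalone sketch (assumption, for illustration and unit testing only) of
# the reward shaping used in the loop above: a per-frame quality/rebuffering
# term plus a per-chunk smoothness penalty applied at decision boundaries.
# Default coefficients mirror the initial values set in agent().
# --------------------------------------------------------------------------- #
def _frame_reward(bit_rate_kbps, rebuf, cdn_flag,
                  frame_time_len=0.04, rebuf_penalty=7.0):
    """Per-frame reward: bitrate utility while the CDN is serving, rebuffering otherwise."""
    if cdn_flag:
        return -rebuf_penalty * rebuf
    return 0.7 * frame_time_len * bit_rate_kbps / 1000.0 - rebuf_penalty * rebuf


def _smooth_penalty(bit_rate_kbps, last_bit_rate_kbps, smooth_penalty=0.0):
    """Per-chunk smoothness term added when decision_flag or end_of_video is set."""
    return -smooth_penalty * abs(bit_rate_kbps - last_bit_rate_kbps) / 1000.0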