def agent(agent_id, net_params_queue, exp_queue):
    net_env = env.Environment(random_seed=agent_id,
                              fixed_env=False,
                              trace_folder=TRAIN_TRACES)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        mask = net_env.video_masks[net_env.video_idx]

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY
        action = bitrate_to_action(bit_rate, mask)
        last_action = action

        action_vec = np.zeros(np.sum(mask))
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, \
                rebuf, video_chunk_size, end_of_video, \
                video_chunk_remain, video_num_chunks, \
                next_video_chunk_size, mask = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] -
                                               VIDEO_BIT_RATE[last_action]) / M_IN_K
            r_batch.append(reward)

            last_bit_rate = bit_rate
            last_action = action

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)
            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert nxt_chnk_cnt == np.sum(mask)
            state[6, -A_DIM:] = mask

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states
            action = bitrate_to_action(bit_rate, mask)

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' +
                           str(VIDEO_BIT_RATE[action]) + '\t' +
                           str(buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chunk
                               a_batch[1:],  # since we don't have
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here
                action = bitrate_to_action(bit_rate, mask)
                last_action = action

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(np.sum(mask))
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
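
# The bitrate_to_action() helper used above is not shown in this excerpt. Below is a
# minimal sketch of what it is assumed to do: map the index chosen over the masked
# (currently available) bitrates back to an index into the full VIDEO_BIT_RATE list.
# The implementation in the original repository may differ.
import numpy as np

def bitrate_to_action(bit_rate, mask):
    # bit_rate indexes only the bitrates whose mask entry is 1; return the matching
    # index into the full bitrate list (an assumption based on how the agent uses it).
    available = np.where(np.asarray(mask) == 1)[0]
    assert 0 <= bit_rate < len(available)
    return int(available[bit_rate])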
def main():
    summary_dir = SUMMARY_DIR
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir)
    log_file_dir = TEST_LOG_FOLDER
    if not os.path.exists(log_file_dir):
        os.makedirs(log_file_dir)

    TOTAL_REWARD_BITRATE = 0.0
    TOTAL_REWARD_HD_BITRATE = 0.0
    TOTAL_REWARD_REBUF = 0.0
    TOTAL_REWARD_SMOOTHNESS = 0.0
    TOTAL_REWARD = 0.0
    TOTAL_HOTSPOT_CHUNKS = 0.0

    np.random.seed(RANDOM_SEED)

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'wb')

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print "Testing model restored."

        time_stamp = 0

        prefetch_decision = DEFAULT_PREFETCH
        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[prefetch_decision] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        video_count = 0

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            state_data_for_action = net_env.execute_action(prefetch_decision)

            # normal chunk state information
            delay = state_data_for_action['delay']
            sleep_time = state_data_for_action['sleep_time']
            last_bit_rate = state_data_for_action['last_bit_rate']
            play_buffer_size = state_data_for_action['play_buffer_size']
            rebuf = state_data_for_action['rebuf']
            video_chunk_size = state_data_for_action['video_chunk_size']
            next_video_chunk_sizes = state_data_for_action['next_video_chunk_sizes']
            end_of_video = state_data_for_action['end_of_video']
            video_chunk_remain = state_data_for_action['video_chunk_remain']
            current_seq_no = state_data_for_action['current_seq_no']
            log_prefetch_decision = state_data_for_action['log_prefetch_decision']

            # hotspot chunk state information
            was_hotspot_chunk = 1.0 * state_data_for_action['was_hotspot_chunk']
            TOTAL_HOTSPOT_CHUNKS += was_hotspot_chunk
            hotspot_chunks_remain = state_data_for_action['hotspot_chunks_remain']
            chunks_till_played = state_data_for_action['chunks_till_played']
            total_buffer_size = state_data_for_action['total_buffer_size']
            last_hotspot_bit_rate = state_data_for_action['last_hotspot_bit_rate']
            next_hotspot_chunk_sizes = state_data_for_action['next_hotspot_chunk_sizes']
            dist_from_hotspot_chunks = state_data_for_action['dist_from_hotspot_chunks']
            smoothness_eval_bitrates = state_data_for_action['smoothness_eval_bitrates']

            # abr decision state information
            normal_bitrate_pensieve = state_data_for_action['normal_bitrate_pensieve']
            hotspot_bitrate_pensieve = state_data_for_action['hotspot_bitrate_pensieve']

            # print len(next_video_chunk_sizes)
            # print len(next_hotspot_chunk_sizes)

            last_overall_bitrate = last_bit_rate
            if prefetch_decision == 1:
                last_overall_bitrate = last_hotspot_bit_rate

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smoothness
            reward_normal_br = (1.0 - was_hotspot_chunk) * (VIDEO_BIT_RATE[last_bit_rate] / M_IN_K) * 1.0
            reward_hotspot_br = was_hotspot_chunk * HD_REWARD[last_hotspot_bit_rate] * 1.0
            reward_rebuffering = REBUF_PENALTY * rebuf * 1.0
            reward_smoothness = 0.0
            if len(smoothness_eval_bitrates) > 1:
                for i in xrange(len(smoothness_eval_bitrates) - 1):
                    reward_smoothness += 1.0 * SMOOTH_PENALTY * (1.0 * np.abs(
                        VIDEO_BIT_RATE[smoothness_eval_bitrates[i + 1]] -
                        VIDEO_BIT_RATE[smoothness_eval_bitrates[i]]) / M_IN_K)

            reward = (1.0 * reward_normal_br) + (1.0 * reward_hotspot_br) - \
                     (1.0 * reward_rebuffering) - (1.0 * reward_smoothness)

            TOTAL_REWARD_BITRATE += reward_normal_br
            TOTAL_REWARD_HD_BITRATE += reward_hotspot_br
            TOTAL_REWARD_REBUF += reward_rebuffering
            TOTAL_REWARD_SMOOTHNESS += reward_smoothness
            TOTAL_REWARD += reward

            # print "reward before: {}".format(reward)
            r_batch.append(reward)
            # print "reward after: {}".format(reward)

            # log time_stamp, bit_rate, buffer_size, reward
            if not end_of_video:
                log_file.write(str(time_stamp) + '\t' +
                               str(VIDEO_BIT_RATE[last_overall_bitrate]) + '\t' +
                               str(play_buffer_size) + '\t' +
                               str(rebuf) + '\t' +
                               str(video_chunk_size) + '\t' +
                               str(delay) + '\t' +
                               str(reward) + '\t' +
                               str(log_prefetch_decision) + '\t' +
                               str(int(was_hotspot_chunk)) + '\t' +
                               str(current_seq_no) + '\n')
                log_file.flush()

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            ## Normal state S_ABR_INFO
            state[0, -1] = VIDEO_BIT_RATE[last_overall_bitrate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = play_buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :BITRATE_LEVELS] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            ## Hotspot state S_HOT_INFO
            state[6, -1] = np.minimum(hotspot_chunks_remain,
                                      NUM_HOTSPOT_CHUNKS) / float(NUM_HOTSPOT_CHUNKS)
            state[7, -1] = np.minimum(chunks_till_played,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            state[8, -1] = total_buffer_size / BUFFER_NORM_FACTOR
            state[9, -1] = last_hotspot_bit_rate / float(np.max(VIDEO_BIT_RATE))
            state[10, :BITRATE_LEVELS] = np.array(next_hotspot_chunk_sizes) / M_IN_K / M_IN_K
            state[11, :NUM_HOTSPOT_CHUNKS] = (np.array(dist_from_hotspot_chunks) +
                                              CHUNK_TIL_VIDEO_END_CAP) / float(2 * CHUNK_TIL_VIDEO_END_CAP)
            ## Bitrate actions state S_BRT_INFO
            state[12, -1] = normal_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE))
            state[13, -1] = hotspot_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE))

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            prefetch_decision = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            s_batch.append(state)
            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            if end_of_video:
                log_file.write('\n')
                log_file.close()
                # break

                prefetch_decision = DEFAULT_PREFETCH

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

                action_vec = np.zeros(A_DIM)
                action_vec[prefetch_decision] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
                entropy_record = []

                video_count += 1
                if video_count >= len(all_file_names):
                    break

                # print "log file: {}".format(log_file)
                # print "Hot chunks: {}".format(TOTAL_HOTSPOT_CHUNKS)

                log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
                log_file = open(log_path, 'wb')

        print "Normal bitrate reward: {}".format(TOTAL_REWARD_BITRATE)
        print "Hotspot bitrate reward: {}".format(TOTAL_REWARD_HD_BITRATE)
        print "Rebuffering reward: {}".format(TOTAL_REWARD_REBUF)
        print "Smoothness reward: {}".format(TOTAL_REWARD_SMOOTHNESS)
        print "Total reward: {}".format(TOTAL_REWARD)
        print "Total hotspot chunks: {}".format(int(TOTAL_HOTSPOT_CHUNKS))
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue, epoch_queue):
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # 1. initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        epoch_num = epoch_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)  # initialize the action vector over the A_DIM actions
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # interact with the environment: the action is from the last decision
            # this is to make the framework similar to the real
            # delay, sleep_time, buffer_size, rebuf, \
            #     video_chunk_size, next_video_chunk_sizes, \
            #     end_of_video, video_chunk_remain = \
            #     net_env.get_video_chunk(bit_rate)
            assert bit_rate >= 0
            assert bit_rate < A_DIM
            bitrate_send_last, lossrate_recv_last, bitrate_real_recovery, \
                bitrate_send_last_probe, lossrate_recv_last_probe, bitrate_real_recovery_probe, \
                end_of_video \
                = net_env.action_dispatch_and_report_svr(VIDEO_BIT_RATE[bit_rate])

            time_stamp += 2

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            # print '1', net_env.netbw
            # print '2', bitrate_send_last_probe * (1 - lossrate_recv_last_probe)
            x_funtion_top = (bitrate_send_last_probe * (1 - lossrate_recv_last_probe) -
                             VIDEO_BIT_RATE[bit_rate]) / M_IN_K
            reward = -x_funtion_top * x_funtion_top  # 0.1 0.2 ... 1.1 1.2

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            # state = np.roll(state, -1, axis=1)
            # this should be S_INFO number of terms
            # state[0, -1] = bitrate_send_last / 1000.0  # last quality
            # state[1, -1] = lossrate_recv_last  # packet loss rate 0.1 0.2 0.3 0.4
            # state[2, -1] = bitrate_real_recovery / 1000.0  # kilo byte / ms
            state = np.roll(state, -1, axis=1)
            state[0, -1] = bitrate_send_last_probe / 1000.0  # last quality
            state[1, -1] = lossrate_recv_last_probe  # packet loss rate 0.1 0.2 0.3 0.4
            state[2, -1] = bitrate_real_recovery_probe / 1000.0  # kilo byte / ms
            state[3, :A_DIM] = np.array(VIDEO_BIT_RATE[:]) / 1000.0  # kilo byte / ms
            state[4, -1] = bitrate_send_last / 1000.0  # kilo byte / ms
            # print state[3, :A_DIM]

            # ================== Predict BandWidth =========================
            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' +
                           str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(bitrate_send_last) + '\t' +
                           str(lossrate_recv_last) + '\t' +
                           str(bitrate_real_recovery) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chunk
                               a_batch[1:],  # since we don't have
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)
                epoch_num = epoch_queue.get()

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
def main():
    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    net_env = env.Environment(fixed_env=True,
                              trace_folder=TEST_TRACES,
                              video_folder=TEST_VIDEO_FOLDER)

    log_path = LOG_FILE + '_' + net_env.all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'wb')

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY
        action = bitrate_to_action(bit_rate, net_env.video_masks[net_env.video_idx])
        last_action = action

        s_batch = [np.zeros((S_INFO, S_LEN))]
        entropy_record = []

        video_count = 0

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, \
                rebuf, video_chunk_size, end_of_video, \
                video_chunk_remain, video_num_chunks, \
                next_video_chunk_size, mask = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] -
                                               VIDEO_BIT_RATE[last_action]) / M_IN_K

            last_bit_rate = bit_rate
            last_action = action

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp / M_IN_K) + '\t' +
                           str(VIDEO_BIT_RATE[action]) + '\t' +
                           str(buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)
            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert nxt_chnk_cnt == np.sum(mask)
            state[6, -A_DIM:] = mask

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states
            action = bitrate_to_action(bit_rate, mask)

            s_batch.append(state)
            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            if end_of_video:
                log_file.write('\n')
                log_file.close()

                del s_batch[:]

                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here
                action = bitrate_to_action(bit_rate, mask)
                last_action = action

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                entropy_record = []

                video_count += 1
                if video_count >= len(net_env.all_cooked_bw):
                    break

                log_path = LOG_FILE + '_' + net_env.all_file_names[net_env.trace_idx]
                log_file = open(log_path, 'wb')
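
# The multi-video variants above fill state row 5 with the next chunk's size for every
# bitrate the current video actually offers, leaving unavailable slots at -1. Below is
# a minimal standalone sketch of that step; the constants and sizes are assumptions
# chosen only for illustration.
import numpy as np

A_DIM, M_IN_B = 10, 1000000.0                              # assumed constants
mask = np.array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1])            # which bitrates exist for this video
next_video_chunk_size = [2.1e6, 1.4e6, 1.0e6, 0.7e6, 0.4e6, 0.2e6]  # one entry per available bitrate

row = -np.ones(A_DIM)      # unavailable bitrates stay at -1
nxt_chnk_cnt = 0
for i in range(A_DIM):
    if mask[i] == 1:
        row[i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B  # size in megabytes
        nxt_chnk_cnt += 1
assert nxt_chnk_cnt == mask.sum()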
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):
    # arguments: agent id, trace data, and the two queues assigned to this agent
    # Summary: first create the environment, then open a Session() {
    #     build the neural networks
    #     (fetch parameters from the central agent to initialize them)
    #     pick the default action, initialize batch[] and entropy[]
    #     loop: {
    #         update the state from the environment, append the new state to batch[],
    #         pick a new action, write the data to the log file
    #         once a full batch has accumulated, put it on the multiprocessing Queue
    #         (where the central agent will take it)
    #         fetch fresh parameters from the central agent, clear the old batch[] data
    #     }
    # }
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        # create the actor network: TensorFlow Session, [input features, history length],
        # number of outputs (bitrate levels), learning rate
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        # create the critic network: TensorFlow Session, [input features, history length], learning rate
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)  # [0, 0, 0, 0, 0, 0]
        action_vec[bit_rate] = 1  # mark the chosen bitrate with a 1

        s_batch = [np.zeros((S_INFO, S_LEN))]  # [one S_INFO x S_LEN zero matrix], the state history
        a_batch = [action_vec]  # [[0, 0, 0, 0, 0, 0], ]
        r_batch = []  # rewards
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward --
            # reward is video quality - rebuffer penalty - smoothness
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                               VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            # -- log scale reward --
            # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # reward = log_bit_rate \
            #          - REBUF_PENALTY * rebuf \
            #          - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate)

            # -- HD reward --
            # reward = HD_REWARD[bit_rate] \
            #          - REBUF_PENALTY * rebuf \
            #          - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate])

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality (bitrate)
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec, current buffer size
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms, throughput measurement
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec, download time
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte, next chunk sizes per bitrate
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)  # remaining chunks

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) /
                        float(RAND_RANGE)).argmax()  # RAND_RANGE = 1000, defined above
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp) + '\t' +
                           str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chunk
                               a_batch[1:],  # since we don't have
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize (update) the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1
                a_batch.append(action_vec)
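
# The "-- linear reward --" used by this agent is the standard Pensieve QoE: bitrate
# utility minus a rebuffering penalty minus a smoothness penalty. A compact sketch
# follows; the penalty weights here are assumptions, since the real constants are
# defined elsewhere in the original file.
M_IN_K = 1000.0
REBUF_PENALTY = 4.3    # assumed; defined elsewhere in the original file
SMOOTH_PENALTY = 1.0   # assumed

def linear_qoe(bitrate_kbps, last_bitrate_kbps, rebuf_sec):
    # reward = quality - rebuffering penalty - smoothness penalty
    return (bitrate_kbps / M_IN_K
            - REBUF_PENALTY * rebuf_sec
            - SMOOTH_PENALTY * abs(bitrate_kbps - last_bitrate_kbps) / M_IN_K)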
def main():
    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(TEST_TRACES)

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'wb')

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)
            print("Testing model restored.")

        time_stamp = 0

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY

        action_vec = np.zeros(A_DIM)
        action_vec[bit_rate] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        video_count = 0

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, buffer_size, rebuf, \
                video_chunk_size, next_video_chunk_sizes, \
                end_of_video, video_chunk_remain = \
                net_env.get_video_chunk(bit_rate)

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # reward is video quality - rebuffer penalty - smoothness
            reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] -
                                               VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K

            r_batch.append(reward)

            last_bit_rate = bit_rate

            # log time_stamp, bit_rate, buffer_size, reward
            log_file.write(str(time_stamp / M_IN_K) + '\t' +
                           str(VIDEO_BIT_RATE[bit_rate]) + '\t' +
                           str(buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            DECISIONS.append(bit_rate)

            s_batch.append(state)
            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            if end_of_video:
                log_file.write('\n')
                log_file.close()

                last_bit_rate = DEFAULT_QUALITY
                bit_rate = DEFAULT_QUALITY  # use the default action here

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
                entropy_record = []

                video_count += 1
                if video_count >= len(all_file_names):
                    break

                log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
                log_file = open(log_path, 'wb')

    print "Decisions: {}".format(Counter(DECISIONS))
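
# Every loop above logs a3c.compute_entropy(action_prob[0]) to track how exploratory
# the policy still is. The helper below is a plausible sketch of what that function
# computes (the Shannon entropy of the policy output); it is an assumption, not a
# copy of the original a3c.py implementation.
import numpy as np

def compute_entropy(p, eps=1e-12):
    # Shannon entropy of a probability vector; higher means a more
    # exploratory (less peaked) policy output.
    p = np.asarray(p, dtype=np.float64)
    return float(-np.sum(p * np.log(p + eps)))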
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue):
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        prefetch_decision = DEFAULT_PREFETCH

        action_vec = np.zeros(A_DIM)
        action_vec[prefetch_decision] = 1  # Normal chunk action: [1, 0]; Hotspot chunk action: [0, 1]

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        time_stamp = 0
        while True:  # experience video streaming forever
            # ---------------------------------
            # the action is from the last decision
            # this is to make the framework similar to the real
            # TO-DO: Add additional state info
            state_data_for_action = net_env.execute_action(prefetch_decision)

            # normal chunk state information
            delay = state_data_for_action['delay']
            sleep_time = state_data_for_action['sleep_time']
            last_bit_rate = state_data_for_action['last_bit_rate']
            play_buffer_size = state_data_for_action['play_buffer_size']
            rebuf = state_data_for_action['rebuf']
            video_chunk_size = state_data_for_action['video_chunk_size']
            next_video_chunk_sizes = state_data_for_action['next_video_chunk_sizes']
            end_of_video = state_data_for_action['end_of_video']
            video_chunk_remain = state_data_for_action['video_chunk_remain']

            # hotspot chunk state information
            was_hotspot_chunk = state_data_for_action['was_hotspot_chunk']
            hotspot_chunks_remain = state_data_for_action['hotspot_chunks_remain']
            chunks_till_played = state_data_for_action['chunks_till_played']
            total_buffer_size = state_data_for_action['total_buffer_size']
            last_hotspot_bit_rate = state_data_for_action['last_hotspot_bit_rate']
            next_hotspot_chunk_sizes = state_data_for_action['next_hotspot_chunk_sizes']
            dist_from_hotspot_chunks = state_data_for_action['dist_from_hotspot_chunks']
            smoothness_eval_bitrates = state_data_for_action['smoothness_eval_bitrates']

            # abr decision state information
            normal_bitrate_pensieve = state_data_for_action['normal_bitrate_pensieve']
            hotspot_bitrate_pensieve = state_data_for_action['hotspot_bitrate_pensieve']

            # print len(next_video_chunk_sizes)
            # print len(next_hotspot_chunk_sizes)

            last_overall_bitrate = last_bit_rate
            if prefetch_decision == 1:
                last_overall_bitrate = last_hotspot_bit_rate
            # ---------------------------------

            time_stamp += delay  # in ms
            time_stamp += sleep_time  # in ms

            # -- linear reward -- (in hotspot aware scenario)
            # reward is video quality - rebuffer penalty - smoothness + hd reward for hotspot
            reward_normal_br = (1 - was_hotspot_chunk) * (VIDEO_BIT_RATE[last_bit_rate] / M_IN_K) * 1.0
            reward_hotspot_br = was_hotspot_chunk * HD_REWARD[last_hotspot_bit_rate] * 1.0
            reward_rebuffering = REBUF_PENALTY * rebuf * 1.0
            reward_smoothness = 0.0
            if len(smoothness_eval_bitrates) > 1:
                for i in xrange(len(smoothness_eval_bitrates) - 1):
                    reward_smoothness += 1.0 * SMOOTH_PENALTY * (1.0 * np.abs(
                        VIDEO_BIT_RATE[smoothness_eval_bitrates[i + 1]] -
                        VIDEO_BIT_RATE[smoothness_eval_bitrates[i]]) / M_IN_K)

            reward = (1.0 * reward_normal_br) + (1.0 * reward_hotspot_br) - \
                     (1.0 * reward_rebuffering) - (1.0 * reward_smoothness)

            # -- log scale reward --
            # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1]))
            # reward = log_bit_rate \
            #          - REBUF_PENALTY * rebuf \
            #          - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate)

            # -- HD reward --
            # reward = HD_REWARD[bit_rate] \
            #          - REBUF_PENALTY * rebuf \
            #          - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate])

            r_batch.append(reward)

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            ## Normal state S_ABR_INFO
            state[0, -1] = VIDEO_BIT_RATE[last_overall_bitrate] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = play_buffer_size / BUFFER_NORM_FACTOR  # 10 sec
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms
            state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR  # 10 sec
            state[4, :BITRATE_LEVELS] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K  # mega byte
            state[5, -1] = np.minimum(video_chunk_remain,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            ## Hotspot state S_HOT_INFO
            state[6, -1] = np.minimum(hotspot_chunks_remain,
                                      NUM_HOTSPOT_CHUNKS) / float(NUM_HOTSPOT_CHUNKS)
            state[7, -1] = np.minimum(chunks_till_played,
                                      CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP)
            state[8, -1] = total_buffer_size / BUFFER_NORM_FACTOR
            state[9, -1] = last_hotspot_bit_rate / float(np.max(VIDEO_BIT_RATE))
            state[10, :BITRATE_LEVELS] = np.array(next_hotspot_chunk_sizes) / M_IN_K / M_IN_K
            state[11, :NUM_HOTSPOT_CHUNKS] = (np.array(dist_from_hotspot_chunks) +
                                              CHUNK_TIL_VIDEO_END_CAP) / float(2 * CHUNK_TIL_VIDEO_END_CAP)
            ## Bitrate actions state S_BRT_INFO
            state[12, -1] = normal_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE))
            state[13, -1] = hotspot_bitrate_pensieve / float(np.max(VIDEO_BIT_RATE))

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            action_cumsum = np.cumsum(action_prob)
            prefetch_decision = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            # log time_stamp, bit_rate, buffer_size, reward, prefetch_decision
            log_file.write(str(time_stamp) + '\t' +
                           str(VIDEO_BIT_RATE[last_overall_bitrate]) + '\t' +
                           str(play_buffer_size) + '\t' +
                           str(rebuf) + '\t' +
                           str(video_chunk_size) + '\t' +
                           str(delay) + '\t' +
                           str(prefetch_decision) + '\t' +
                           str(reward) + '\n')
            log_file.flush()

            # report experience to the coordinator
            if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video:
                exp_queue.put([s_batch[1:],  # ignore the first chunk
                               a_batch[1:],  # since we don't have
                               r_batch[1:],  # control over it
                               end_of_video,
                               {'entropy': entropy_record}])

                # synchronize the network parameters from the coordinator
                actor_net_params, critic_net_params = net_params_queue.get()
                actor.set_network_params(actor_net_params)
                critic.set_network_params(critic_net_params)

                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                del entropy_record[:]

                log_file.write('\n')  # so that in the log we know where video ends

            # store the state and action into batches
            if end_of_video:
                prefetch_decision = DEFAULT_PREFETCH

                action_vec = np.zeros(A_DIM)
                action_vec[prefetch_decision] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
            else:
                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[prefetch_decision] = 1
                a_batch.append(action_vec)
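
# The workers above block on net_params_queue.get() and push rollouts through
# exp_queue.put(...). The sketch below shows one plausible shape of the coordinator
# side of that protocol. It assumes the actor/critic classes expose
# get_network_params(), mirroring their set_network_params() setters; the function
# name and the gradient step are illustrative only, not the original implementation.
def central_agent(net_params_queues, exp_queues, actor, critic):
    # One iteration: broadcast the current weights, then gather one batch per worker.
    while True:
        params = (actor.get_network_params(), critic.get_network_params())
        for q in net_params_queues:
            q.put(params)  # unblocks the workers' net_params_queue.get() above
        for q in exp_queues:
            s_batch, a_batch, r_batch, done, info = q.get()  # matches exp_queue.put([...])
            # ...compute advantages from (s_batch, a_batch, r_batch) and apply
            # actor/critic gradient updates here...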
def main():
    np.random.seed(RANDOM_SEED)

    assert len(PACKET_SIZE) == A_DIM

    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace()

    net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw)

    log_path = LOG_FILE + '_base2_' + all_file_names[net_env.trace_idx]
    log_file = open(log_path, 'wb')

    with tf.Session() as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        time_stamp = 0

        sensor_selection = DEFAULT_SELECTION

        action_vec = np.zeros(A_DIM)
        prob_violation = np.zeros(A_DIM)
        violation_n = np.zeros(A_DIM)
        action_vec[sensor_selection] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        video_count = 0
        objective = 0
        k = 0
        sum_age = 0
        sum_violation = 0

        while k < 30000:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            delay, sleep_time, video_chunk_size = net_env.get_video_chunk(sensor_selection)

            # time_stamp += delay  # in ms
            # time_stamp += sleep_time  # in ms

            # for n in range(0, A_DIM):
            #     violation[n] = 0
            #     if n == sensor_selection:
            #         age[n, k] = delay
            #     else:
            #         age[n, k] = age[n, k - 1] + delay
            #     if age[n, k] > tau[n]:
            #         violation[n] += 1
            # sum_age = np.sum(age[:, :])
            # sum_violation = np.sum(violation)
            # expected_age = sum_age / (k * A_DIM)

            sum_age_before = np.sum(age[:, k])
            current_violation = 0
            for n in range(0, A_DIM):
                # for k in range(1, TRAIN_SEQ_LEN):
                if n == sensor_selection:
                    # print (j)
                    # time.sleep(2)
                    dummy = int(j[n])
                    j[n] += 1
                    age[n, k] = delay
                    anis[n, dummy] = age[n, k]
                    # violation[n] = 0
                else:
                    age[n, k] = age[n, k - 1] + delay
                    dummy = int(j[n])
                    anis[n, dummy] = age[n, k]
                if age[n, k] > tau[n]:
                    violation[n] += 1
                    current_violation = current_violation + (10 - n / 10)
                    violation_n_k[n, k] += 1

            prob_violation = violation / (k + 1)
            # print violation_n
            # time.sleep(2)

            for n in range(0, A_DIM):
                # expected_age[n] = gamma[n] * np.sum((anis[n, :int(j[n]) + 1]) / (int(j[n]) + 1))
                expected_age_n[n] = np.sum(age[n, :]) / (k + 1)
                if violation_n[n] > epsilon[n]:
                    hamza[n] = 1
                else:
                    hamza[n] = 0

            expected_age = np.sum(expected_age_n[:]) / A_DIM
            # prob_violation = violation / k

            # reward = (-np.sum(age[:, k]) - lamba * np.sum(violation_n_k[:, k]) - mu * np.sum(hamza[:])) / 100
            reward = (-np.sum(age[:, k]) - lamba * current_violation - mu * np.sum(hamza[:])) / 100

            sum_age += np.sum(age)

            if k == 29999:
                for n in range(0, A_DIM):
                    violation_n[n] = 1000 * (10 - n / 10) * violation[n] / (k + 1)
                sum_age = sum_age / ((k + 1) * A_DIM)
                sum_violation = np.sum(violation_n)
                print(sum_age + sum_violation)
                print(100 * violation[:] / (k + 1))
                print(expected_age_n[:])

            r_batch.append(reward)

            log_file.write(str(time_stamp) + '\t' +
                           str(PACKET_SIZE[sensor_selection]) + '\t' +
                           str(delay) + '\t' +
                           str(reward) + '\t' +
                           str(age[0, k]) + '\t' +
                           str(age[1, k]) + '\t' +
                           str(age[2, k]) + '\t' +
                           str(age[3, k]) + '\t' +
                           str(age[4, k]) + '\t' +
                           str(age[5, k]) + '\t' +
                           str(age[6, k]) + '\t' +
                           str(age[7, k]) + '\t' +
                           str(age[8, k]) + '\t' +
                           str(age[9, k]) + '\n')
            log_file.flush()

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            state[0, -1] = float(age[0, k]) / M_IN_K
            state[1, -1] = float(age[1, k]) / M_IN_K
            state[2, -1] = float(age[2, k]) / M_IN_K
            state[3, -1] = float(age[3, k]) / M_IN_K
            state[4, -1] = float(age[4, k]) / M_IN_K
            state[5, -1] = float(age[5, k]) / M_IN_K
            state[6, -1] = float(age[6, k]) / M_IN_K
            state[7, -1] = float(age[7, k]) / M_IN_K
            state[8, -1] = float(age[8, k]) / M_IN_K
            state[9, -1] = float(age[9, k]) / M_IN_K
            # state[10, -1] = float(PACKET_SIZE[0]) / float(PACKET_SIZE[9])
            # state[11, -1] = float(PACKET_SIZE[1]) / float(PACKET_SIZE[9])
            # state[12, -1] = float(PACKET_SIZE[2]) / float(PACKET_SIZE[9])
            # state[13, -1] = float(PACKET_SIZE[3]) / float(PACKET_SIZE[9])
            # state[14, -1] = float(PACKET_SIZE[4]) / float(PACKET_SIZE[9])
            # state[15, -1] = float(PACKET_SIZE[5]) / float(PACKET_SIZE[9])
            # state[16, -1] = float(PACKET_SIZE[6]) / float(PACKET_SIZE[9])
            # state[17, -1] = float(PACKET_SIZE[7]) / float(PACKET_SIZE[9])
            # state[18, -1] = float(PACKET_SIZE[8]) / float(PACKET_SIZE[9])
            # state[19, -1] = float(PACKET_SIZE[9]) / float(PACKET_SIZE[9])
            state[10, -1] = float(delay) / 100
            state[11, -1] = float(PACKET_SIZE[sensor_selection]) / (100 * float(delay) * float(PACKET_SIZE[9]))

            # compute action probability vector
            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
            # action_cumsum = np.cumsum(action_prob)
            sensor_selection = (age[:, k]).argmax()
            # (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states

            entropy_record.append(a3c.compute_entropy(action_prob[0]))

            time_stamp += 1

            # log time_stamp, bit_rate, buffer_size, reward
            # if end_of_video:
            #     del s_batch[:]
            #     del a_batch[:]
            #     del r_batch[:]
            #     del entropy_record[:]
            #     k = 0
            #     for n in range(0, A_DIM):
            #         violation[n] = 0
            #         age[n, :] = 0
            #     sensor_selection = DEFAULT_SELECTION
            #     log_file.write('\n')  # so that in the log we know where video ends

            s_batch.append(state)

            action_vec = np.zeros(A_DIM)
            action_vec[sensor_selection] = 1
            a_batch.append(action_vec)

            # log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx]
            # log_file = open(log_path, 'wb')

            k += 1
def agent(agent_id, all_cooked_time, all_cooked_bw, all_file_names, video_size_file,
          net_params_queue, exp_queue):
    net_env = env.Environment(all_cooked_time=all_cooked_time,
                              all_cooked_bw=all_cooked_bw,
                              random_seed=agent_id,
                              VIDEO_SIZE_FILE=video_size_file,
                              Debug=False)

    with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        # initial synchronization of the network parameters from the coordinator
        actor_net_params, critic_net_params = net_params_queue.get()
        actor.set_network_params(actor_net_params)
        critic.set_network_params(critic_net_params)

        bit_rate = DEFAULT_QUALITY
        target_buffer = DEFAULT_QUALITY
        latency_limit = 4

        index = 1
        action_vec = np.zeros(A_DIM)
        action_vec[index] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        video_count = 0
        reward_all_sum = 0
        reward_all = 0
        reward = 0
        switch_num = 0

        SMOOTH_PENALTY = 0.01
        REBUF_PENALTY = 1.5
        LANTENCY_PENALTY = 0.01
        BITRATE_REWARD = 0.001
        SKIP_PENALTY = 1

        epoch = 0
        n = 0
        state = np.array(s_batch[-1], copy=True)
        frame_time_len = 0.04
        last_bit_rate = DEFAULT_QUALITY

        while True:  # experience video streaming forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            time, time_interval, send_data_size, chunk_len, \
                rebuf, buffer_size, play_time_len, end_delay, \
                cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \
                buffer_flag, cdn_flag, skip_flag, end_of_video = \
                net_env.get_video_frame(bit_rate, target_buffer, latency_limit)

            # # QOE setting
            # if end_delay <= 1.0:
            #     LANTENCY_PENALTY = 0.005
            # else:
            #     LANTENCY_PENALTY = 0.01

            reward_frame = 0
            epoch += 1

            if not cdn_flag:
                reward_frame = frame_time_len * float(BIT_RATE[bit_rate]) * BITRATE_REWARD \
                               - REBUF_PENALTY * rebuf \
                               - LANTENCY_PENALTY * end_delay \
                               - SKIP_PENALTY * skip_frame_time_len
            else:
                reward_frame = -(REBUF_PENALTY * rebuf)
            reward += reward_frame

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            # this should be S_INFO number of terms
            state[0, -1] = buffer_size * 0.1
            state[1, -1] = send_data_size * 0.00001
            state[2, -1] = time_interval * 10  # kilo byte / ms
            state[3, -1] = end_delay * 0.1  # 10 sec
            state[4, -1] = rebuf  # mega byte

            if decision_flag and not end_of_video:
                reward_frame = -1 * SMOOTH_PENALTY * (abs(BIT_RATE[bit_rate] - BIT_RATE[last_bit_rate]) / 1000)
                reward += reward_frame
                last_bit_rate = bit_rate

                r_batch.append(reward)
                reward = 0

                # compute action probability vector
                action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))
                action_cumsum = np.cumsum(action_prob)
                temp = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
                index = (action_cumsum > temp).argmax()
                bit_rate = ACTION_SAPCE[index][0]
                target_buffer = ACTION_SAPCE[index][1]
                latency_limit = ACTION_SAPCE[index][2]
                # Note: we need to discretize the probability into 1/RAND_RANGE steps,
                # because there is an intrinsic discrepancy in passing single state and batch states

                entropy_record.append(a3c.compute_entropy(action_prob[0]))

                # report experience to the coordinator
                if len(r_batch) >= TRAIN_SEQ_LEN:
                    exp_queue.put([s_batch[1:],  # ignore the first chunk
                                   a_batch[1:],  # since we don't have
                                   r_batch[1:],  # control over it
                                   end_of_video,
                                   {'entropy': entropy_record}])

                    # synchronize the network parameters from the coordinator
                    actor_net_params, critic_net_params = net_params_queue.get()
                    actor.set_network_params(actor_net_params)
                    critic.set_network_params(critic_net_params)

                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]
                    del entropy_record[:]

                s_batch.append(state)

                action_vec = np.zeros(A_DIM)
                action_vec[index] = 1
                a_batch.append(action_vec)

            reward_all += reward_frame

            # store the state and action into batches
            if end_of_video:
                r_batch.append(reward)

                reward_all_sum += reward_all / 20
                video_count += 1
                if video_count >= len(all_file_names):
                    n += 1
                    video_count = 0
                    print(n, "agent_id ", agent_id, "reward_all_sum:", reward_all_sum)
                    w.writerow([n, reward_all_sum])
                    out.flush()
                    reward_all_sum = 0
                    net_env = env.Environment(all_cooked_time=all_cooked_time,
                                              all_cooked_bw=all_cooked_bw,
                                              random_seed=epoch,
                                              VIDEO_SIZE_FILE=video_size_file,
                                              Debug=False)
                    if n == NUM_EPOCH:
                        break

                reward_all = 0
                reward = 0
                switch_num = 0

                bit_rate = DEFAULT_QUALITY  # use the default action here
                target_buffer = DEFAULT_QUALITY

                action_vec = np.zeros(A_DIM)
                action_vec[bit_rate] = 1

                s_batch.append(np.zeros((S_INFO, S_LEN)))
                a_batch.append(action_vec)
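
# ACTION_SAPCE (sic) above maps one discrete actor output to a
# (bit_rate, target_buffer, latency_limit) triple. The sketch below is a hypothetical
# way such a joint action table could be built; the level values are assumptions,
# not the original configuration.
import itertools

BIT_RATE_LEVELS = [0, 1, 2, 3]   # assumed bitrate indices
TARGET_BUFFERS = [0, 1]          # assumed target-buffer settings
LATENCY_LIMITS = [2, 4]          # assumed latency limits (seconds)

# One flat index per combination, which is what the actor's softmax selects above.
ACTION_SAPCE = list(itertools.product(BIT_RATE_LEVELS, TARGET_BUFFERS, LATENCY_LIMITS))

bit_rate, target_buffer, latency_limit = ACTION_SAPCE[5]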
def main():
    np.random.seed(RANDOM_SEED)

    assert len(VIDEO_BIT_RATE) == A_DIM

    # Originally defined in env.py
    mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)

    with tf.Session(config=session_conf) as sess:
        actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)
        critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        if NN_MODEL is not None:  # NN_MODEL is the path to file
            saver.restore(sess, NN_MODEL)

        last_bit_rate = DEFAULT_QUALITY
        bit_rate = DEFAULT_QUALITY
        action = bitrate_to_action(bit_rate, mask)
        last_action = action

        s_batch = [np.zeros((S_INFO, S_LEN))]
        entropy_record = []

        video_chunks_sent = 0
        video_num_chunks = 43200  # 24 hours of video. Is this an acceptable proxy for never-ending video?

        puffer_sock = start_ipc_client()

        while True:  # serve video forever
            # the action is from the last decision
            # this is to make the framework similar to the real
            video_chunk_remain = video_num_chunks - video_chunks_sent

            delay, buffer_size, \
                rebuf, video_chunk_size, \
                next_video_chunk_size = \
                get_puffer_info(puffer_sock)

            reward = VIDEO_BIT_RATE[action] / M_IN_K \
                     - REBUF_PENALTY * rebuf \
                     - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[action] -
                                               VIDEO_BIT_RATE[last_action]) / M_IN_K

            last_bit_rate = bit_rate
            last_action = action

            # Add the average audio size to each video chunk to improve throughput estimates.
            # This is necessary because the original Pensieve code does not consider audio, and
            # no simple solution exists given that our audio and video chunks are on different
            # time scales.
            video_chunk_size += AVG_AUDIO_SIZE_BYTES
            for idx in xrange(len(next_video_chunk_size)):
                next_video_chunk_size[idx] = next_video_chunk_size[idx] + AVG_AUDIO_SIZE_BYTES

            # retrieve previous state
            if len(s_batch) == 0:
                state = [np.zeros((S_INFO, S_LEN))]
            else:
                state = np.array(s_batch[-1], copy=True)

            # dequeue history record
            state = np.roll(state, -1, axis=1)

            if delay == 0:  # no division by zero
                delay = 1

            # this should be S_INFO number of terms
            state[0, -1] = VIDEO_BIT_RATE[action] / float(np.max(VIDEO_BIT_RATE))  # last quality
            state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
            state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K  # kilo byte / ms; this is really just throughput
            state[3, -1] = float(delay) / M_IN_K
            state[4, -1] = video_chunk_remain / float(video_num_chunks)
            state[5, :] = -1
            nxt_chnk_cnt = 0
            for i in xrange(A_DIM):
                if mask[i] == 1:
                    state[5, i] = next_video_chunk_size[nxt_chnk_cnt] / M_IN_B
                    nxt_chnk_cnt += 1
            assert nxt_chnk_cnt == np.sum(mask)
            state[6, -A_DIM:] = mask

            action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN)))

            # the action probability should correspond to number of bit rates
            assert len(action_prob[0]) == np.sum(mask)

            action_cumsum = np.cumsum(action_prob)
            bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax()
            # Note: we need to discretize the probability into 1/RAND_RANGE steps,
            # because there is an intrinsic discrepancy in passing single state and batch states
            action = bitrate_to_action(bit_rate, mask)

            # Now I have my action! Send this action back to the Puffer server over IPC
            send_puffer_next_action(puffer_sock, bit_rate)

            s_batch.append(state)
            entropy_record.append(a3c.compute_entropy(action_prob[0]))
def do_POST(self): content_length = int(self.headers['Content-Length']) env_post_data = json.loads(self.rfile.read(content_length)) # mlog(fnc="do_POST()", msg="POST req data: Last request - {}, Last quality - {}, Rebuffer Time - {}".format( # post_data['lastRequest'], post_data['lastquality'], float(post_data['RebufferTime'] - self.input_dict['last_total_rebuf']))) send_data = "" if ('pastThroughput' in env_post_data): # @Hongzi: this is just the summary of throughput/quality at the end of the load # so we don't want to use this information to send back a new quality mlog(fnc="do_POST()", msg="Past throughput is present in post_data, \ not using this information to send back quality") else: # Get params according to rl_test.py in original Pensieve code delay = env_post_data["delay"] sleep_time = env_post_data["sleep_time"] buffer_size = env_post_data["buffer_size"] rebuf = env_post_data["rebuf"] video_chunk_size = env_post_data["video_chunk_size"] next_video_chunk_sizes = env_post_data[ "next_video_chunk_sizes"] end_of_video = env_post_data["end_of_video"] video_chunk_remain = env_post_data["video_chunk_remain"] # Get additional params to differentiate between hotspot y/n cases bit_rate = env_post_data["bit_rate"] last_bit_rate = env_post_data["last_bit_rate"] is_last_action_prefetch = env_post_data[ "is_last_action_prefetch"] is_prefetch_hotspot = env_post_data["is_prefetch_hotspot"] Request_Handler.time_stamp += delay # in ms Request_Handler.time_stamp += sleep_time # in ms # rebuffer_time = float(post_data['RebufferTime'] - self.input_dict['last_total_rebuf']) # # --linear reward-- # reward = VIDEO_BIT_RATE[post_data['lastquality']] / M_IN_K \ # - REBUF_PENALTY * rebuffer_time / M_IN_K \ # - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[post_data['lastquality']] - # self.input_dict['last_bit_rate']) / M_IN_K # --log reward-- # log_bit_rate = np.log(VIDEO_BIT_RATE[post_data['lastquality']] / float(VIDEO_BIT_RATE[0])) # log_last_bit_rate = np.log(self.input_dict['last_bit_rate'] / float(VIDEO_BIT_RATE[0])) # reward = log_bit_rate \ # - 4.3 * rebuffer_time / M_IN_K \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # --hd reward-- # reward = BITRATE_REWARD[post_data['lastquality']] \ # - 8 * rebuffer_time / M_IN_K - np.abs(BITRATE_REWARD[post_data['lastquality']] - BITRATE_REWARD_MAP[self.input_dict['last_bit_rate']]) # Linear reward reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # self.input_dict['last_bit_rate'] = VIDEO_BIT_RATE[post_data['lastquality']] # self.input_dict['last_total_rebuf'] = post_data['RebufferTime'] self.r_batch.append(reward) # custom: append last state if Request_Handler.train_counter > 0: if is_last_action_prefetch == 1: self.s_batch.append(Request_Handler.last_hotspot_state) else: self.s_batch.append(Request_Handler.last_normal_state) # retrieve previous state if len(self.s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(self.s_batch[-1], copy=True) # compute bandwidth measurement # video_chunk_fetch_time = post_data['delay'] # video_chunk_size = post_data['lastChunkSize'] # compute number of video chunks left # video_chunk_remain = TOTAL_VIDEO_CHUNKS - post_data['videoChunkCount'] # dequeue history record state = np.roll(state, -1, axis=1) # print "roll: {}, shape: {}".format(type(state), state.shape) # next_video_chunk_sizes = [] # for i in xrange(A_DIM): # next_video_chunk_sizes.append(get_chunk_size(i, 
post_data['nextVideoChunkIndex'])) # this should be S_INFO number of terms # try: # state[0, -1] = VIDEO_BIT_RATE[post_data['lastquality']] / float(np.max(VIDEO_BIT_RATE)) # state[1, -1] = post_data['buffer'] / BUFFER_NORM_FACTOR # state[2, -1] = float(video_chunk_size) / float(video_chunk_fetch_time) / M_IN_K # kilo byte / ms # state[3, -1] = float(video_chunk_fetch_time) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec # state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) # print "Video bitrate: {}".format(state[0, -1]) # print "Buffer: {}".format(state[1, -1]) # print "Throughput: {}".format(state[2, -1]) # print "Download duration: {}".format(state[3, -1]) # print "Next video chunk sizes: {}".format(state[4, :A_DIM]) # print "Video chunks remaining: {}".format(state[5, -1]) # print "\n" # except ZeroDivisionError: # # this should occur VERY rarely (1 out of 3000), should be a dash issue # # in this case we ignore the observation and roll back to an eariler one # if len(self.s_batch) == 0: # state = [np.zeros((S_INFO, S_LEN))] # else: # state = np.array(self.s_batch[-1], copy=True) # this should be S_INFO number of terms try: state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = float( delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float( CHUNK_TIL_VIDEO_END_CAP) except ZeroDivisionError: # this should occur VERY rarely (1 out of 3000), should be a dash issue # in this case we ignore the observation and roll back to an eariler one if len(self.s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(self.s_batch[-1], copy=True) # log wall_time, bit_rate, buffer_size, rebuffer_time, video_chunk_size, download_time, reward # self.log_file.write(str(time.time()) + '\t' + # str(VIDEO_BIT_RATE[post_data['lastquality']]) + '\t' + # str(post_data['buffer']) + '\t' + # str(rebuffer_time / M_IN_K) + '\t' + # str(video_chunk_size) + '\t' + # str(video_chunk_fetch_time) + '\t' + # str(reward) + '\n') # self.log_file.flush() # print "state construct: {}, shape: {}".format(type(state), state.shape) action_prob = self.actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states self.entropy_record.append(a3c.compute_entropy(action_prob[0])) # send data to html side # send_data = str(bit_rate) send_data = json.dumps({"bitrate": bit_rate}) mlog(fnc="do_POST()", msg="Bitrate decision: {}".format(bit_rate)) self.send_response(200) self.send_header('Content-Type', 'text/plain') self.send_header('Content-Length', len(send_data)) self.send_header('Access-Control-Allow-Origin', "*") self.end_headers() self.wfile.write(send_data) # record [state, action, reward] # put it here after training, notice there is a shift in reward storage if is_prefetch_hotspot == 1: Request_Handler.last_hotspot_state = state Request_Handler.prefetch_decisions.append(0) else: 
Request_Handler.last_normal_state = state Request_Handler.prefetch_decisions.append(1) # self.s_batch.append(state) # print "batch append: {}, shape: {}".format(type(state), state.shape) Request_Handler.train_counter += 1
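For reference, a minimal client-side sketch (not part of the original code) of the request/response cycle this handler expects: the JSON keys mirror those read in do_POST() above, while the URL, port, and example values are assumptions.

import json
try:
    from urllib.request import Request, urlopen   # Python 3
except ImportError:
    from urllib2 import Request, urlopen           # Python 2

def request_bitrate(server_url="http://localhost:8333"):
    # Example payload; every value here is illustrative, only the key names come
    # from the handler above.
    payload = {
        "delay": 120.0,                 # last chunk download time (ms)
        "sleep_time": 0.0,
        "buffer_size": 8.5,             # seconds of buffered video
        "rebuf": 0.0,
        "video_chunk_size": 450000,     # bytes
        "next_video_chunk_sizes": [181801, 450283, 668286, 1034108, 1728879, 2354772],
        "end_of_video": 0,
        "video_chunk_remain": 30,
        "bit_rate": 2,
        "last_bit_rate": 1,
        "is_last_action_prefetch": 0,
        "is_prefetch_hotspot": 0,
    }
    body = json.dumps(payload).encode("utf-8")
    req = Request(server_url, data=body,
                  headers={"Content-Type": "application/json",
                           "Content-Length": str(len(body))})
    resp = urlopen(req).read()
    # The handler replies with json.dumps({"bitrate": ...})
    return json.loads(resp)["bitrate"]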
def agent(agent_id, net_params_queue, exp_queue): net_env = innovation_env.Environment(random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_vmaf = -1 bit_rate = DEFAULT_QUALITY last_rtt = -1 action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] time_stamp = 0 index = 1 while True: # experience video streaming forever _norm_bitrate = VIDEO_BIT_RATE[bit_rate] delay, loss, recv_bitrate, rtt, throughput, limbo_bytes_len = \ net_env.get_video_chunk(bit_rate) rtt = float(rtt) / float(1000) if last_rtt < 0: last_rtt = rtt _norm_send_bitrate = bit_rate / A_DIM _queuing_delay = abs(rtt - last_rtt) _norm_recv_bitrate = min( float(recv_bitrate) / delay / BUFFER_NORM_FACTOR, 1.0) time_stamp += delay # in ms vmaf = net_env.get_vmaf(bit_rate) if last_vmaf < 0: last_vmaf = vmaf #_normalized_bitrate = (_norm_bitrate - BITRATE_MIN) / (BITRATE_MAX - BITRATE_MIN) _vmaf_reward = (vmaf / _norm_bitrate) * BITRATE_MIN reward = vmaf - 0.2 * _norm_send_bitrate - 1.0 / DELAY_GRADIENT_MAX * \ min(_queuing_delay, DELAY_GRADIENT_MAX) - \ 1.0 * abs(last_vmaf - vmaf) r_batch.append(reward) last_vmaf = vmaf last_rtt = rtt log_file.write( str(time_stamp) + '\t' + str(_norm_bitrate) + '\t' + str(recv_bitrate) + '\t' + str(limbo_bytes_len) + '\t' + str(rtt) + '\t' + str(vmaf) + '\t' + str(reward) + '\n') log_file.flush() if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) state[0, -1] = _norm_send_bitrate # last quality state[1, -1] = _norm_recv_bitrate # kilo byte / ms state[2, -1] = _queuing_delay # max:500ms state[3, -1] = float(loss) # changed loss # test:add fft feature _fft = np.fft.fft(state[1]) state[4] = _fft.real state[5] = _fft.imag state[6, -1] = net_env.get_single_image() action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) #print 'state',state[6] #print 'action',action_prob[0] action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() entropy_record.append(a3c.compute_entropy(action_prob[0])) # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN: exp_queue.put([ s_batch[:], # ignore the first chuck a_batch[:], # since we don't have the r_batch[:], # control over it # end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] # if index % MODEL_TEST_INTERVAL == 0 and agent_id == 0: # print 'start test' # test(actor,index) index += 1 # so that in the log we know where video ends log_file.write('\n') s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
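A standalone sketch of the VMAF-based reward and the FFT feature rows this agent uses. DELAY_GRADIENT_MAX and the sample inputs are assumed values for illustration; the weights (0.2 on the normalized send bitrate, 1.0 on VMAF smoothness) follow the expression in the loop above.

import numpy as np

DELAY_GRADIENT_MAX = 0.5   # assumed cap on the queuing-delay term (seconds)

def vmaf_reward(vmaf, last_vmaf, norm_send_bitrate, queuing_delay):
    # Quality (VMAF) minus send-rate cost, queuing-delay cost, and quality jitter.
    return (vmaf
            - 0.2 * norm_send_bitrate
            - (1.0 / DELAY_GRADIENT_MAX) * min(queuing_delay, DELAY_GRADIENT_MAX)
            - 1.0 * abs(last_vmaf - vmaf))

def fft_feature_rows(recv_bitrate_history):
    # state[4] / state[5] in the agent hold the real and imaginary parts of the FFT
    # of the received-bitrate history row.
    spectrum = np.fft.fft(np.asarray(recv_bitrate_history, dtype=float))
    return spectrum.real, spectrum.imag

if __name__ == "__main__":
    r = vmaf_reward(vmaf=82.0, last_vmaf=78.0, norm_send_bitrate=0.5, queuing_delay=0.12)
    print("reward = {:.2f}".format(r))
    real, imag = fft_feature_rows([0.3, 0.4, 0.35, 0.5, 0.45, 0.42, 0.4, 0.38])
    print("fft real part: {}".format(np.round(real, 3)))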
def main(): np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM if not os.path.exists(SUMMARY_DIR): os.makedirs(SUMMARY_DIR) log_path = LOG_FILE + '_sim_0' log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters nn_model = NN_MODEL if nn_model is not None: # nn_model is the path to file saver.restore(sess, nn_model) print("Model restored.") time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 delay_file = open(DATA_PATH + '/lastdownloadtime0') #sleep_file = open(DATA_PATH + '/rebufftime0') buffer_size_file = open(DATA_PATH + '/buffer0') rebuf_file = open(DATA_PATH + '/rebufftime0') video_chunk_size_file = open(DATA_PATH + '/chunk_size0') video_chunk_remain_file = open(DATA_PATH + '/m_segmentleft0') time_file = open(DATA_PATH + '/time0') while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real with open(DATA_PATH + '/permission0') as enable: key = enable.read() if key == '1': output_file = open(DATA_PATH + '/predict0', 'a') file_permission = open(DATA_PATH + '/permission0', 'a') delay = delay_file.readline().split('\n')[0] delay = float(delay) * 1000 #in ms sleep_time = 0.0 #float(sleep_file.readline().split('\n')[0]) buffer_size = float( buffer_size_file.readline().split('\n')[0]) buffer_size = max(buffer_size, 0) rebuf = float(rebuf_file.readline().split('\n')[0]) video_chunk_size = float( video_chunk_size_file.readline().split('\n')[0]) next_video_chunk_sizes = np.multiply(VIDEO_BIT_RATE, 500) video_chunk_remain = float( video_chunk_remain_file.readline().split('\n')[0]) currTime = time_file.readline().split('\n')[0] if video_chunk_remain == 0: end_of_video = 1 else: end_of_video = 0 time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K r_batch.append(reward) last_bit_rate = bit_rate # log time_stamp, bit_rate, buffer_size, reward #log_file.write(str(time_stamp / M_IN_K) + '\t' + # str(VIDEO_BIT_RATE[bit_rate]) + '\t' + # str(buffer_size) + '\t' + # str(rebuf) + '\t' + # str(video_chunk_size) + '\t' + # str(delay) + '\t' + # str(reward) + '\n') #log_file.flush() # log time_stamp, bit_rate, buffer_size, reward log_file.write( str(currTime) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = 
float( delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float( CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict( np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() output_file.write( str(VIDEO_BIT_RATE[int(bit_rate)] * 1000) + '\n') file_permission.write('0\n') output_file.close() file_permission.close() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] print "video count", video_count video_count += 1 log_path = LOG_FILE + '_sim' + '_' + str(video_count) log_file = open(log_path, 'wb')
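A hypothetical producer-side sketch of the file handshake this main() polls: the player appends one measurement line per chunk to each file under DATA_PATH, raises the permission flag, then waits for the controller to clear it and append a bitrate to predict0. The path, file-writing convention, and polling interval are assumptions, not part of the original player.

import os
import time

DATA_PATH = "./data"   # assumed location of the shared files

def publish_chunk_stats(delay_s, buffer_s, rebuf_s, chunk_bytes, chunks_left, now):
    # One new line per chunk in each measurement file, matching the readline()
    # consumption pattern in main() above.
    rows = {
        "lastdownloadtime0": delay_s,
        "buffer0": buffer_s,
        "rebufftime0": rebuf_s,
        "chunk_size0": chunk_bytes,
        "m_segmentleft0": chunks_left,
        "time0": now,
    }
    for name, value in rows.items():
        with open(os.path.join(DATA_PATH, name), "a") as f:
            f.write(str(value) + "\n")
    with open(os.path.join(DATA_PATH, "permission0"), "w") as f:
        f.write("1")   # signal that a new observation is ready

def wait_for_bitrate(poll_interval=0.05):
    # Block until the controller appends '0' to the permission file, then return the
    # last bitrate (in bps) it appended to predict0.
    while True:
        with open(os.path.join(DATA_PATH, "permission0")) as f:
            if f.read().strip().endswith("0"):
                break
        time.sleep(poll_interval)
    with open(os.path.join(DATA_PATH, "predict0")) as f:
        return float(f.read().strip().split("\n")[-1])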
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) sensor_selection = DEFAULT_SELECTION action_vec = np.zeros(A_DIM) action_vec[sensor_selection] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] time_stamp = 0 k = 0 sum_age = 0 sum_violation = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, video_chunk_size = net_env.get_video_chunk( sensor_selection) max_age = (age[:, k]).argmax() sum_age_before = np.sum(age[:, k]) current_violation = 0 for n in range(0, A_DIM): #for k in range (1,TRAIN_SEQ_LEN): if n == sensor_selection: age[n, k] = delay else: age[n, k] = age[n, k - 1] + delay if age[n, k] > tau[n]: current_violation += 1 for n in range(0, A_DIM): expected_age_n[n] = np.sum(age[n, :]) / ((k + 1)) expected_age = np.sum(expected_age_n[:]) / A_DIM reward = (-np.sum(age[:, k]) - lamba * current_violation) / 100 r_batch.append(reward) if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = float(age[0, k]) / M_IN_K state[1, -1] = float(age[1, k]) / M_IN_K state[2, -1] = float(age[2, k]) / M_IN_K state[3, -1] = float(age[3, k]) / M_IN_K state[4, -1] = float(age[4, k]) / M_IN_K state[5, -1] = float(age[5, k]) / M_IN_K state[6, -1] = float(age[6, k]) / M_IN_K state[7, -1] = float(age[7, k]) / M_IN_K state[8, -1] = float(age[8, k]) / M_IN_K state[9, -1] = float(age[9, k]) / M_IN_K #state[10, -1] = float(PACKET_SIZE[0])/float(PACKET_SIZE[9]) #state[11, -1] = float(PACKET_SIZE[1])/float(PACKET_SIZE[9]) #state[12, -1] = float(PACKET_SIZE[2])/float(PACKET_SIZE[9]) #state[13, -1] = float(PACKET_SIZE[3])/float(PACKET_SIZE[9]) #state[14, -1] = float(PACKET_SIZE[4])/float(PACKET_SIZE[9]) #state[15, -1] = float(PACKET_SIZE[5])/float(PACKET_SIZE[9]) #state[16, -1] = float(PACKET_SIZE[6])/float(PACKET_SIZE[9]) #state[17, -1] = float(PACKET_SIZE[7])/float(PACKET_SIZE[9]) #state[18, -1] = float(PACKET_SIZE[8])/float(PACKET_SIZE[9]) #state[19, -1] = float(PACKET_SIZE[9])/float(PACKET_SIZE[9]) state[10, -1] = float(delay) / 100 state[11, -1] = float(PACKET_SIZE[sensor_selection]) / ( 100 * float(delay) * float(PACKET_SIZE[9])) log_file.write( str(time_stamp) + '\t' + str(reward) + '\t' + str(age[:, k]) + '\t' + str(expected_age_n) + '\n') log_file.flush() # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) sensor_selection = ( action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() entropy_record.append(a3c.compute_entropy(action_prob[0])) time_stamp += 1 # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN: #or end_of_video: 
exp_queue.put([ s_batch[1:], # ignore the first chunk a_batch[1:], # since we don't have the r_batch[1:], # control over it True, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[sensor_selection] = 1 a_batch.append(action_vec) k += 1
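A compact sketch of the age-of-information bookkeeping behind this reward: the selected sensor's age resets to the transmission delay, every other sensor's age grows by the same delay, and ages above the per-sensor deadline tau count as violations. A_DIM, tau, and the lamba penalty weight below are assumed values.

import numpy as np

A_DIM = 10
tau = np.full(A_DIM, 400.0)     # assumed per-sensor age deadlines (ms)
lamba = 50.0                    # assumed violation penalty weight (spelled as in the agent)

def step_age(prev_age, sensor_selection, delay):
    # Every sensor ages by the transmission delay; the scheduled one is refreshed.
    age = prev_age + delay
    age[sensor_selection] = delay
    violations = int(np.sum(age > tau))
    reward = (-np.sum(age) - lamba * violations) / 100.0
    return age, violations, reward

if __name__ == "__main__":
    age = np.zeros(A_DIM)
    age, violations, reward = step_age(age, sensor_selection=3, delay=120.0)
    print("ages: {} violations: {} reward: {:.2f}".format(age, violations, reward))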
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # -- log scale reward -- # log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[-1])) # log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[-1])) # reward = log_bit_rate \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) # -- HD reward -- # reward = HD_REWARD[bit_rate] \ # - REBUF_PENALTY * rebuf \ # - SMOOTH_PENALTY * np.abs(HD_REWARD[bit_rate] - HD_REWARD[last_bit_rate]) r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum( video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: 
exp_queue.put([ s_batch[1:], # ignore the first chunk a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write( '\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
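The stochastic action selection shared by these agents, isolated as a sketch: the policy output is turned into a CDF and compared against a uniform draw discretized into 1/RAND_RANGE steps, which samples from the categorical policy distribution while ignoring probabilities below that resolution. RAND_RANGE here is an assumed value.

import numpy as np

RAND_RANGE = 1000   # assumed discretization of the uniform draw

def sample_action(action_prob):
    action_cumsum = np.cumsum(action_prob)
    threshold = np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)
    # argmax returns the first index whose cumulative probability exceeds the draw,
    # i.e. a sample from the distribution defined by action_prob.
    return int((action_cumsum > threshold).argmax())

if __name__ == "__main__":
    probs = np.array([0.05, 0.15, 0.50, 0.20, 0.07, 0.03])
    counts = np.bincount([sample_action(probs) for _ in range(10000)], minlength=len(probs))
    print("empirical frequencies: {}".format(counts / 10000.0))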
def main(): args = parser.parse_args() if args.lin: qoe_metric = 'results_lin' elif args.log: qoe_metric = 'results_log' else: print('Please select the QoE Metric!') if args.FCC: dataset = 'fcc' elif args.HSDPA: dataset = 'HSDPA' elif args.Oboe: dataset = 'Oboe' else: print('Please select the dataset!') dataset_path = './traces_' + dataset + '/' Log_file_path = './' + qoe_metric + '/' + dataset + '/log_sim_rl' np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM # if not os.path.exists(SUMMARY_DIR): # os.makedirs(SUMMARY_DIR) all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(dataset_path) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) log_path = Log_file_path + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb') with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters nn_model = NN_MODEL if nn_model is not None: # nn_model is the path to file saver.restore(sess, nn_model) print("Model restored.") time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty if qoe_metric == 'results_lin': REBUF_PENALTY = 4.3 reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K else: REBUF_PENALTY = 2.66 log_bit_rate = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[0])) log_last_bit_rate = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[0])) reward = log_bit_rate \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(log_bit_rate - log_last_bit_rate) r_batch.append(reward) last_bit_rate = bit_rate # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = 
np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] print "video count", video_count video_count += 1 if video_count >= len(all_file_names): break log_path = Log_file_path + '_' + all_file_names[net_env.trace_idx] log_file = open(log_path, 'wb')
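Side-by-side sketch of the two QoE formulations selected by --lin and --log above, with the rebuffer penalties (4.3 and 2.66) taken from the loop; the bitrate ladder and SMOOTH_PENALTY value below are assumptions for illustration.

import numpy as np

VIDEO_BIT_RATE = [300, 750, 1200, 1850, 2850, 4300]  # assumed ladder, Kbps
M_IN_K = 1000.0
SMOOTH_PENALTY = 1.0                                  # assumed smoothness weight

def linear_qoe(bit_rate, last_bit_rate, rebuf):
    # Bitrate in Mbps minus rebuffering and bitrate-switch penalties.
    return (VIDEO_BIT_RATE[bit_rate] / M_IN_K
            - 4.3 * rebuf
            - SMOOTH_PENALTY * abs(VIDEO_BIT_RATE[bit_rate]
                                   - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K)

def log_qoe(bit_rate, last_bit_rate, rebuf):
    # Same structure, but quality is the log ratio to the lowest rung.
    log_br = np.log(VIDEO_BIT_RATE[bit_rate] / float(VIDEO_BIT_RATE[0]))
    log_last_br = np.log(VIDEO_BIT_RATE[last_bit_rate] / float(VIDEO_BIT_RATE[0]))
    return log_br - 2.66 * rebuf - SMOOTH_PENALTY * abs(log_br - log_last_br)

if __name__ == "__main__":
    print("linear: {:.3f}".format(linear_qoe(4, 2, rebuf=0.5)))
    print("log:    {:.3f}".format(log_qoe(4, 2, rebuf=0.5)))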
def main(): np.random.seed(RANDOM_SEED) assert len(VIDEO_BIT_RATE) == A_DIM all_cooked_time, all_cooked_bw, _ = load_trace.load_trace() #print(all_cooked_bw) if not os.path.exists(SUMMARY_DIR): os.makedirs(SUMMARY_DIR) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw) with tf.Session() as sess, open(LOG_FILE, 'w') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) summary_ops, summary_vars = a3c.build_summaries() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # training monitor saver = tf.train.Saver() # save neural net parameters # restore neural net parameters nn_model = NN_MODEL if nn_model is not None: # nn_model is the path to file saver.restore(sess, nn_model) print("Model restored.") epoch = 0 time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] actor_gradient_batch = [] critic_gradient_batch = [] while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video,video_chunk_counter,throughput, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) #print(net_env.get_video_chunk(bit_rate)) time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smooth penalty reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # print(state) # dequeue history record state = np.roll(state, -1, axis=1) print('state', state) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array( next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum( video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) #rand=np.random.randint(1,RAND_RANGE)/ float(RAND_RANGE) #print(action_cumsum,action_cumsum>rand,(action_cumsum>rand).argmax()) #print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)) #print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states print( '[%d]:download time %.2fms,chunk size %d,buffer=%.2fs,bitrate=%d' % (video_chunk_counter, delay, video_chunk_size, buffer_size, last_bit_rate)) entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, 
buffer_size, reward log_file.write( str(time_stamp) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() if len(r_batch ) >= TRAIN_SEQ_LEN or end_of_video: # do training once actor_gradient, critic_gradient, td_batch = \ a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0), # ignore the first chuck a_batch=np.vstack(a_batch[1:]), # since we don't have the r_batch=np.vstack(r_batch[1:]), # control over it terminal=end_of_video, actor=actor, critic=critic) td_loss = np.mean(td_batch) actor_gradient_batch.append(actor_gradient) critic_gradient_batch.append(critic_gradient) print("====") print("Epoch", epoch) print("TD_loss", td_loss, "Avg_reward", np.mean(r_batch), "Avg_entropy", np.mean(entropy_record)) print("====") summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: td_loss, summary_vars[1]: np.mean(r_batch), summary_vars[2]: np.mean(entropy_record) }) writer.add_summary(summary_str, epoch) writer.flush() entropy_record = [] if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE: assert len(actor_gradient_batch) == len( critic_gradient_batch) # assembled_actor_gradient = actor_gradient_batch[0] # assembled_critic_gradient = critic_gradient_batch[0] # assert len(actor_gradient_batch) == len(critic_gradient_batch) # for i in xrange(len(actor_gradient_batch) - 1): # for j in xrange(len(actor_gradient)): # assembled_actor_gradient[j] += actor_gradient_batch[i][j] # assembled_critic_gradient[j] += critic_gradient_batch[i][j] # actor.apply_gradients(assembled_actor_gradient) # critic.apply_gradients(assembled_critic_gradient) for i in range(len(actor_gradient_batch)): actor.apply_gradients(actor_gradient_batch[i]) critic.apply_gradients(critic_gradient_batch[i]) actor_gradient_batch = [] critic_gradient_batch = [] epoch += 1 if epoch % MODEL_SAVE_INTERVAL == 0: # Save the neural net parameters to disk. save_path = saver.save( sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) + ".ckpt") print("Model saved in file: %s" % save_path) del s_batch[:] del a_batch[:] del r_batch[:] if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
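A schematic of the gradient-batching pattern in this training loop: gradients are computed once per experience window, buffered, and only applied once GRADIENT_BATCH_SIZE windows have accumulated, each applied individually rather than summed. The apply_fn callable below is a placeholder standing in for the networks' apply_gradients methods, not the real a3c API.

GRADIENT_BATCH_SIZE = 16   # assumed batch size

class GradientBatcher(object):
    def __init__(self, apply_fn, batch_size=GRADIENT_BATCH_SIZE):
        self.apply_fn = apply_fn
        self.batch_size = batch_size
        self.buffered = []

    def add(self, gradient):
        self.buffered.append(gradient)
        if len(self.buffered) >= self.batch_size:
            # Apply each buffered gradient one by one, as the loop above does,
            # rather than assembling them into a single summed update.
            for g in self.buffered:
                self.apply_fn(g)
            self.buffered = []

if __name__ == "__main__":
    applied = []
    batcher = GradientBatcher(apply_fn=applied.append, batch_size=4)
    for step in range(10):
        batcher.add({"step": step})
    print("updates applied: {}".format(len(applied)))   # 8: two full batches of 4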
def run(port=8333, log_file_path=LOG_FILE): np.random.seed(RANDOM_SEED) with tf.Session() as sess, open(log_file_path, 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=S_DIM, action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=S_DIM, learning_rate=CRITIC_LR_RATE) summary_ops, summary_vars = a3c.build_summaries() sess.run(tf.initialize_all_variables()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) saver = tf.train.Saver() # save neural net parameters #restore neural net parameters nn_model = NN_MODEL if nn_model is not None: # nn_model is the path to file saver.restore(sess, nn_model) print("Model restored.") init_action = np.zeros(A_DIM) #by default we simply use the first lambda init_action[DEFAULT_LAMBDA] = 0 s_batch = [np.zeros(S_DIM)] a_batch = [init_action] r_batch = [] entropy_record = [] #this is for training actor_gradient_batch = [] #this is for training critic_gradient_batch = [] #this is for training last_lambda = DEFAULT_LAMBDA epoch = 0 end_of_training = False # Create a TCP/IP socket sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # Bind the socket to the port server_address = ('localhost', port) print >> sys.stderr, 'starting up on %s port %s' % server_address sock.bind(server_address) # Listen for incoming connections sock.listen(5) count = 0 while True: # Wait for a connection print >> sys.stderr, 'waiting for a connection' connection, addr = sock.accept() print 'Connected with ' + addr[0] + ':' + str(addr[1]) # Receive the json file # json file format: # 'reward': float # 'state': array = '{"state": ["1", "3", "4", ...]}' #numBytes = sys.getsizeof(int) #print ("size to receive: " + str(numBytes)) size = connection.recv(4) size = struct.unpack('!i', size)[0] print >> sys.stderr, 'received "%s"' % size data = connection.recv(size) jsonData = json.loads(data) print jsonData #to receive reward reward = float(jsonData['reward']) if (count > 0): r_batch.append(reward) else: r_batch.append(0.0) count = count + 1 #to receive state stateArray = jsonData['state'] state = np.array(stateArray) print(state) #to compute action action_prob = actor.predict(np.reshape(state, (1, S_DIM))) print("action_prob: ") print(action_prob) action_cumsum = np.cumsum(action_prob) print("action_cumsum: ") print(action_cumsum) print("comparison: ") print(action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)) selectedLambda = action_prob.argmax() #selectedLambda = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() print >> sys.stderr, 'selectedLambda "%s"' % selectedLambda #to update entropy entropy_record.append(a3c.compute_entropy(action_prob[0])) #TODO #to update and apply gradient if len(r_batch) >= TRAIN_SEQ_LEN: actor_gradient, critic_gradient, td_batch = \ a3c.compute_gradients(s_batch=np.stack(s_batch[1:], axis=0), a_batch=np.vstack(a_batch[1:]), r_batch=np.vstack(r_batch[1:]), terminal=end_of_training, actor=actor, critic=critic) td_loss = np.mean(td_batch) print("td_loss: ") print(td_loss) print("actor_gradient: ") print(actor_gradient) print("critic_gradient: ") print(critic_gradient) actor_gradient_batch.append(actor_gradient) critic_gradient_batch.append(critic_gradient) entropy_record = [] print("len(actor_gradient_batch) = ") print len(actor_gradient_batch) if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE: print("GRADIENT_BATCH_SIZE reached") assert len(actor_gradient_batch) == len( critic_gradient_batch) for i in xrange(len(actor_gradient_batch)): print("###################" 
+ str(i) + "###################") print(actor_gradient_batch[i]) print(critic_gradient_batch[i]) actor.apply_gradients(actor_gradient_batch[i]) critic.apply_gradients(critic_gradient_batch[i]) actor_gradient_batch = [] critic_gradient_batch = [] avg_reward = np.mean(r_batch) summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: td_loss, summary_vars[1]: avg_reward }) writer.add_summary(summary_str, epoch) writer.flush() log_file.write( str(datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S')) + '\t' + str(epoch) + '\t' + str(avg_reward) + '\t' + str(td_loss) + '\n') log_file.flush() epoch += 1 if epoch % MODEL_SAVE_INTERVAL == 0: # save the neural net parameters to disk. save_path = saver.save( sess, "./nn_model_ep_" + str(epoch) + ".ckpt") print("Model saved in file: %s" % save_path) if epoch == MAX_EPOCH: end_of_training = True del s_batch[:] del a_batch[:] del r_batch[:] s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[selectedLambda] = 1 a_batch.append(action_vec) #to send back action print >> sys.stderr, 'sending data back to the client' connection.sendall(struct.pack('!i', selectedLambda)) last_lambda = selectedLambda connection.close() sock.close()
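A client-side sketch of the wire format run() expects: a 4-byte big-endian length prefix followed by a JSON object with 'reward' and 'state', answered by a single 4-byte big-endian integer carrying the selected lambda index. The host, port, and example state are assumptions.

import json
import socket
import struct

def query_lambda(reward, state, host="localhost", port=8333):
    payload = json.dumps({"reward": reward,
                          "state": [float(x) for x in state]}).encode("utf-8")
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect((host, port))
        sock.sendall(struct.pack("!i", len(payload)))   # length prefix, network byte order
        sock.sendall(payload)
        selected = struct.unpack("!i", sock.recv(4))[0]  # server replies with the lambda index
    finally:
        sock.close()
    return selected

if __name__ == "__main__":
    print("selected lambda index: {}".format(query_lambda(0.7, [1, 3, 4, 2])))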
def agent(agent_id, all_cooked_time, all_cooked_bw, net_params_queue, exp_queue): net_env = env.Environment(time=all_cooked_time, bandwidth=all_cooked_bw, random_seed=agent_id) with tf.Session() as sess, open(LOG_FILE + '_agent_' + str(agent_id), 'wb') as log_file: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) # initial synchronization of the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] # need to initialize, and get before simulation step track_index = [] hm = head_movement.move_prediction() time_stamp = 0 while True: # experience video streaming forever # the action is from the last decision # this is to make the framework similar to the real # xgw 20180918: need to modify here estimate_track_index = hm.get_head_movement_prediction() # actual_track_index = hm.get_head_movement_current() actual_track_index = [2, 3, 5, 6] delay, rebuf, buffer_size, sleep_time, video_chunk_size, end_of_video = \ net_env.get_video_chunk(bit_rate, estimate_track_index) time_stamp += delay # in ms time_stamp += sleep_time # in ms # -- linear reward -- # reward is video quality - rebuffer penalty - smoothness # xgw 20180918: the reward should eventually also account for quality consistency within the viewport # and for the buffer; since viewport quality consistency is governed by the head-movement prediction error, # it is unclear whether a separate "quality consistency" term belongs here, # or how to model the QP as the first input reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K # bit_rate_log_reward = np.log((bit_rate + 1) / A_DIM) * BIT_RATE_REWARD_PARAMETER # smooth_p = np.exp(np.abs(last_bit_rate - bit_rate) / A_DIM) * SMOOTH_PENALTY # reward = bit_rate - REBUF_PENALTY * rebuf - smooth_p r_batch.append(reward) last_bit_rate = bit_rate # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] else: state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms # state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality # state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 6 sec # state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms # state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec # state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte # state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) state[0, -1] = float(video_chunk_size) / float( delay) / M_IN_K # kilo byte / ms state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 6 sec state[2, :4] = np.array(actual_track_index) state[3, -1] = VIDEO_BIT_RATE[bit_rate] / float( np.max(VIDEO_BIT_RATE)) # last chunk's bitrate # compute action probability vector action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) /
float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy in passing single state and batch states entropy_record.append(a3c.compute_entropy(action_prob[0])) # log time_stamp, bit_rate, buffer_size, reward log_file.write('time_stamp: ' + str(time_stamp) + '\t' + 'VIDEO_BIT_RATE: ' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + 'buffer_size: ' + str(buffer_size) + '\t' + 'rebuf: ' + str(rebuf) + '\t' + 'video_chunk_size: ' + str(video_chunk_size) + '\t' + 'delay: ' + str(delay) + '\t' + 'avg throughput: ' + str(video_chunk_size / delay) + '\t' + 'reward: ' + str(reward) + '\n') log_file.flush() # report experience to the coordinator if len(r_batch) >= TRAIN_SEQ_LEN or end_of_video: exp_queue.put([ s_batch[1:], # ignore the first chunk a_batch[1:], # since we don't have the r_batch[1:], # control over it end_of_video, { 'entropy': entropy_record } ]) # synchronize the network parameters from the coordinator actor_net_params, critic_net_params = net_params_queue.get() actor.set_network_params(actor_net_params) critic.set_network_params(critic_net_params) del s_batch[:] del a_batch[:] del r_batch[:] del entropy_record[:] log_file.write( '\n') # so that in the log we know where video ends # store the state and action into batches if end_of_video: last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) else: s_batch.append(state) action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 a_batch.append(action_vec)
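A compact sketch of the rolling state window this viewport-aware agent maintains: each step shifts the S_INFO x S_LEN history left by one column and overwrites the newest column plus the tile-index row. The dimensions, bitrate ladder, and normalization constants below are assumptions mirroring the agent's usage.

import numpy as np

S_INFO, S_LEN = 4, 8                                  # assumed state dimensions
BUFFER_NORM_FACTOR = 10.0
M_IN_K = 1000.0
VIDEO_BIT_RATE = [300, 750, 1200, 1850, 2850, 4300]   # assumed ladder, Kbps

def update_state(prev_state, chunk_size, delay_ms, buffer_size, track_index, bit_rate):
    state = np.roll(np.array(prev_state, copy=True), -1, axis=1)   # drop the oldest column
    state[0, -1] = float(chunk_size) / float(delay_ms) / M_IN_K    # throughput, KB/ms
    state[1, -1] = buffer_size / BUFFER_NORM_FACTOR
    state[2, :len(track_index)] = np.array(track_index)            # viewport tile indices
    state[3, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE))  # last quality
    return state

if __name__ == "__main__":
    s = update_state(np.zeros((S_INFO, S_LEN)), chunk_size=900000, delay_ms=300.0,
                     buffer_size=4.0, track_index=[2, 3, 5, 6], bit_rate=3)
    print(np.round(s, 3))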
def main(): # run_id = '0' rnd_ratio = 0.8 if len(sys.argv) > 1: run_id = sys.argv[1] else: run_id = '0' seed = RANDOM_SEED + int(run_id) np.random.seed(seed) assert len(VIDEO_BIT_RATE) == A_DIM if not os.path.exists(SUMMARY_DIR): os.makedirs(SUMMARY_DIR) if not os.path.exists(TRANS_DIR): os.makedirs(TRANS_DIR) all_cooked_time, all_cooked_bw, all_file_names = load_trace.load_trace(cooked_trace_folder=TRACE_DIR) net_env = env.Environment(all_cooked_time=all_cooked_time, all_cooked_bw=all_cooked_bw, random_seed=seed) log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] + '_' + str(run_id) log_file = open(log_path, 'wb') trans_path = TRANS_FILE + '_' + all_file_names[net_env.trace_idx] + '_' + str(run_id) trans_file = open(trans_path, 'wb') last_action = deque(maxlen=2) last_action.append(1) last_action.append(1) with tf.Session() as sess: actor = a3c.ActorNetwork(sess, state_dim=[S_INFO, S_LEN], action_dim=A_DIM, learning_rate=ACTOR_LR_RATE) critic = a3c.CriticNetwork(sess, state_dim=[S_INFO, S_LEN], learning_rate=CRITIC_LR_RATE) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() # save neural net parameters # restore neural net parameters nn_model = NN_MODEL if nn_model is not None: # nn_model is the path to file saver.restore(sess, nn_model) print("Model restored.") time_stamp = 0 last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch = [np.zeros((S_INFO, S_LEN))] a_batch = [action_vec] r_batch = [] entropy_record = [] video_count = 0 while True: # serve video forever # the action is from the last decision # this is to make the framework similar to the real delay, sleep_time, buffer_size, rebuf, \ video_chunk_size, next_video_chunk_sizes, \ end_of_video, video_chunk_remain = \ net_env.get_video_chunk(bit_rate) time_stamp += delay # in ms time_stamp += sleep_time # in ms # reward is video quality - rebuffer penalty - smoothness reward = VIDEO_BIT_RATE[bit_rate] / M_IN_K \ - REBUF_PENALTY * rebuf \ - SMOOTH_PENALTY * np.abs(VIDEO_BIT_RATE[bit_rate] - VIDEO_BIT_RATE[last_bit_rate]) / M_IN_K r_batch.append(reward) last_bit_rate = bit_rate # log time_stamp, bit_rate, buffer_size, reward log_file.write(str(time_stamp / M_IN_K) + '\t' + str(VIDEO_BIT_RATE[bit_rate]) + '\t' + str(buffer_size) + '\t' + str(rebuf) + '\t' + str(video_chunk_size) + '\t' + str(delay) + '\t' + str(reward) + '\n') log_file.flush() # retrieve previous state if len(s_batch) == 0: state = [np.zeros((S_INFO, S_LEN))] old_state = np.zeros((S_INFO, S_LEN), dtype=np.float64) else: state = np.array(s_batch[-1], copy=True) old_state = np.array(s_batch[-1], copy=True) # dequeue history record state = np.roll(state, -1, axis=1) # this should be S_INFO number of terms state[0, -1] = VIDEO_BIT_RATE[bit_rate] / float(np.max(VIDEO_BIT_RATE)) # last quality state[1, -1] = buffer_size / BUFFER_NORM_FACTOR # 10 sec state[2, -1] = float(video_chunk_size) / float(delay) / M_IN_K # kilo byte / ms state[3, -1] = float(delay) / M_IN_K / BUFFER_NORM_FACTOR # 10 sec state[4, :A_DIM] = np.array(next_video_chunk_sizes) / M_IN_K / M_IN_K # mega byte state[5, -1] = np.minimum(video_chunk_remain, CHUNK_TIL_VIDEO_END_CAP) / float(CHUNK_TIL_VIDEO_END_CAP) action_prob = actor.predict(np.reshape(state, (1, S_INFO, S_LEN))) action_cumsum = np.cumsum(action_prob) bit_rate = (action_cumsum > np.random.randint(1, RAND_RANGE) / float(RAND_RANGE)).argmax() # Note: we need to discretize the probability into 1/RAND_RANGE steps, # because there is an intrinsic discrepancy 
in passing single state and batch states if np.random.random() < rnd_ratio: bit_rate = np.random.randint(0, A_DIM) print "random action", bit_rate send_data = str(bit_rate) trans_file.write('|'.join([str(list(old_state.reshape(-1))), str(list(action_prob.reshape(-1))), str(list(state.reshape(-1))), str(reward), str(send_data)])) trans_file.write('\n') trans_file.flush() # print 'state', list(old_state.reshape(-1)) # print 'action', last_action[0] # print 'reward', reward last_action.append(send_data) s_batch.append(state) entropy_record.append(a3c.compute_entropy(action_prob[0])) if end_of_video: log_file.write('\n') log_file.close() trans_file.write('\n') trans_file.close() last_bit_rate = DEFAULT_QUALITY bit_rate = DEFAULT_QUALITY # use the default action here del s_batch[:] del a_batch[:] del r_batch[:] action_vec = np.zeros(A_DIM) action_vec[bit_rate] = 1 s_batch.append(np.zeros((S_INFO, S_LEN))) a_batch.append(action_vec) entropy_record = [] print "video count", video_count, all_file_names[net_env.trace_idx] video_count += 1 if video_count > len(all_file_names): break log_path = LOG_FILE + '_' + all_file_names[net_env.trace_idx] + '_' + str(run_id) log_file = open(log_path, 'wb') trans_path = TRANS_FILE + '_' + all_file_names[net_env.trace_idx] + '_' + str(run_id) trans_file = open(trans_path, 'wb')
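A sketch of reading back the transition records written above: each line holds five '|'-separated fields (flattened previous state, action probabilities, flattened new state, reward, chosen action), the first three being Python list literals. The file name and state dimensions below are assumed.

import ast
import numpy as np

S_INFO, S_LEN = 6, 8   # assumed dimensions matching the agent's state shape

def parse_transition(line):
    old_state, action_prob, new_state, reward, action = line.rstrip("\n").split("|")
    return {
        "old_state": np.array(ast.literal_eval(old_state)).reshape(S_INFO, S_LEN),
        "action_prob": np.array(ast.literal_eval(action_prob)),
        "new_state": np.array(ast.literal_eval(new_state)).reshape(S_INFO, S_LEN),
        "reward": float(reward),
        "action": int(action),
    }

def load_transitions(path):
    # Blank separator lines written at the end of each video are skipped.
    with open(path) as f:
        return [parse_transition(line) for line in f if line.strip()]

# Usage (assumed file name): transitions = load_transitions("./transitions/trans_trace_0")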