Example #1
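For reference, the listing appears to rely roughly on the following imports. The MXNet/NumPy ones are standard; the project-local modules and hyper-parameter constants are assumptions about the surrounding project and are not shown in the original.

import os
import time as tm

import numpy as np
import mxnet as mx
from mxnet import nd, autograd, init
from mxnet.gluon import Trainer, loss

# Assumed project-local pieces referenced below (exact module paths unknown):
# ReplayBuffer, dueling_dqn, try_gpu, getNewestFile, and the constants
# BUFFER_MAX, FRAME_SKIP, N_STEP, GPU_INDEX, DTYPE, MODEL_FILE, LEARNING_RATE,
# EPSILON_START, EPSILON_MIN, EPSILON_DECAY, BIT_RATE, REBUF_PENALTY, SKIP_PENALTY,
# SMOOTH_PENALTY, TRAIN_PER_STEP, MULTI_STEP, TIME_FORMAT, IS_DUELING, ACTION_NUM.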
class Algorithm:
     def __init__(self):
         self.replay_buffer = ReplayBuffer(buffer_size=BUFFER_MAX, past_frame_len=FRAME_SKIP, multi_step=N_STEP)

     # Initialization
     def Initial(self):
         # Initialize your session/models here and
         # restore neural-net parameters if available.
         # self.buffer_size = 0
         self.ctx = try_gpu(GPU_INDEX)
         self.frame_cnt = 0
         self.train_count = 0
         self.loss_sum = 0

         self.q_count = 0
         self.q_sum = 0
         self.dtype = DTYPE
         INPUT_SAMPLE = nd.random_uniform(low=0, high=1, shape=(1, FRAME_SKIP, 11), ctx=self.ctx, dtype=self.dtype)
         self.target_net = self.get_net(INPUT_SAMPLE)
         self.policy_net = self.get_net(INPUT_SAMPLE)

         if MODEL_FILE is not None:
             print('%s: read trained results from [%s]' % (tm.strftime("%Y-%m-%d %H:%M:%S"), MODEL_FILE))
             self.policy_net.load_params(MODEL_FILE, ctx=self.ctx)
         self.update_target_net()
         # RMSProp optimizer
         self.trainer = Trainer(self.policy_net.collect_params(),
                                optimizer=mx.optimizer.RMSProp(LEARNING_RATE, 0.95, 0.95))
         self.loss_func = loss.L2Loss()

         self.epsilon = EPSILON_START
         self.epsilon_min = EPSILON_MIN
         self.epsilon_rate = (EPSILON_START - EPSILON_MIN) / EPSILON_DECAY
         self.rng = np.random.RandomState(int(tm.time() * 1000) % 100000000)


     def update_target_net(self):
         self.copy_params(self.policy_net, self.target_net)
         return

     def calculate_reward(self, end_of_video, cdn_flag, rebuf, end_delay, skip_frame_time_len, decision_flag, bitrate, last_bitrate, frame_time_len):
         if end_of_video <= 1.0:
             LATENCY_PENALTY = 0.005
         else:
             LATENCY_PENALTY = 0.01
         if not cdn_flag:
             reward_frame = (frame_time_len * float(BIT_RATE[bitrate]) / 1000
                             - REBUF_PENALTY * rebuf
                             - LATENCY_PENALTY * end_delay
                             - SKIP_PENALTY * skip_frame_time_len)
         else:
             reward_frame = -(REBUF_PENALTY * rebuf)
         if decision_flag or end_of_video:
             reward_frame += -1 * SMOOTH_PENALTY * (abs(BIT_RATE[bitrate] - BIT_RATE[last_bitrate]) / 1000)
         return reward_frame
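
     # Worked example (illustrative only; the BIT_RATE table below is an assumption, not from the
     # original): with BIT_RATE = [500, 850, 1200, 1850] kbps, frame_time_len = 0.04 s, bitrate = 2,
     # last_bitrate = 0, rebuf = 0, end_delay = 1.0 s, skip_frame_time_len = 0, cdn_flag = False,
     # decision_flag = True and end_of_video = 0, the per-frame reward is
     #     0.04 * 1200 / 1000 - 0.005 * 1.0 - SMOOTH_PENALTY * (1200 - 500) / 1000,
     # i.e. the bitrate utility minus the latency and smoothness penalties (the rebuffer and
     # skip terms vanish because rebuf and skip_frame_time_len are zero here).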

     def run_frame(self,time, time_interval, send_data_size, chunk_len, \
                rebuf, buffer_size, play_time_len, end_delay, \
                cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len, decision_flag, \
                buffer_flag, cdn_flag, skip_flag, end_of_video,action,last_action,frame_time_len):

         bitrate, target_buffer, latency = self.action_to_submit(action)
         last_bitrate,_,_ = self.action_to_submit(last_action)
         reward_frame = self.calculate_reward(end_of_video,cdn_flag,rebuf,end_delay,skip_frame_time_len,decision_flag,bitrate,last_bitrate,frame_time_len)
         self.replay_buffer.insert_sample(time_interval, send_data_size, chunk_len, rebuf, buffer_size, play_time_len,end_delay, cdn_newest_id, download_id, cdn_has_frame, skip_frame_time_len,decision_flag, buffer_flag, cdn_flag, skip_flag, end_of_video, reward_frame, action)
         st = self.replay_buffer.get_current_state()
         st = nd.array(st, ctx=self.ctx, dtype=self.dtype).reshape((1, FRAME_SKIP, -1))
         action, max_q = self.choose_action(False, False, st)
         bit_rate, target_buffer, latency_limit = self.action_to_submit(action)
         self.frame_cnt += 1
         if max_q is not None:
             self.q_count += 1
             self.q_sum += max_q
         if self.frame_cnt % TRAIN_PER_STEP == 0:
             state, s_, actions, rewards = self.replay_buffer.get_batch(16)
             loss = self.train_policy_net(state, actions, rewards, s_)
             self.train_count += 1
             self.loss_sum += loss
         # FIXME: should these statistics be reset to zero at the end of each video?
         if end_of_video:
             average_loss = self.loss_sum / (self.train_count + 0.0001)
             average_q = self.q_sum / (self.q_count + 0.000001)
             self.loss_sum = 0
             self.train_count = 0
             self.q_count = 0
             self.q_sum = 0
         else:
             average_loss = 0
             average_q = 0
         return reward_frame,bit_rate, target_buffer, latency_limit,action,average_loss,average_q

     def train_policy_net(self, states, actions, rewards, next_states):
         batch_size = actions.shape[0]
         states = nd.array(states, ctx=self.ctx, dtype=self.dtype)
         actions = nd.array(actions[:, 0], ctx=self.ctx)
         rewards = nd.array(rewards[:, 0], ctx=self.ctx)
         next_states = nd.array(next_states, ctx=self.ctx, dtype=self.dtype)

         # Bootstrap value from the target network: max_a Q_target(s', a)
         next_qs = self.target_net(next_states)
         next_q_out = nd.max(next_qs, axis=1)

         # N-step target with a hard-coded discount factor of 0.99
         target = rewards + next_q_out * 0.99 ** MULTI_STEP

         with autograd.record():
             current_qs = self.policy_net(states)
             current_q = nd.pick(current_qs, actions, 1)
             loss = self.loss_func(current_q, target)
         loss.backward()
         self.trainer.step(batch_size)
         total_loss = loss.mean().asscalar()
         return total_loss
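
     # Restating the update above for clarity (not from the original): the regression target is
     #     target_i = R_i + 0.99**MULTI_STEP * max_a Q_target(s'_i, a),
     # where R_i is presumably the N-step return accumulated by the replay buffer
     # (constructed with multi_step=N_STEP) and Q_target is the periodically synced target network.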

     def save_params_to_file(self,model_path,mark):
         time_mark = tm.time()
         filename = model_path + '/net_' + str(mark) + '_' + str(time_mark) + '.model'
         self.policy_net.save_params(filename)
         print(tm.strftime(TIME_FORMAT), 'model saved successfully:', filename)
         # Keep at most five saved models, deleting older ones
         # (assumes getNewestFile returns file names sorted newest first)
         files = getNewestFile(model_path)
         if len(files) > 5:
             tmp = files[5:]
             for f in tmp:
                 if os.path.exists(model_path + "/" + f):
                     os.remove(model_path + "/" + f)
                     print(f + " is deleted.")

     def get_net(self, input_sample):
         # Select the architecture, initialize it, and run one forward pass to allocate parameters
         net = dueling_dqn.DuelingDQN() if IS_DUELING else dueling_dqn.OriginDQN()
         net.initialize(init.Xavier(), ctx=self.ctx)
         net(input_sample)
         return net

     def choose_action(self, random_action, testing, st):
         self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_rate)
         max_q = None
         random_num = self.rng.rand()
         if random_action or ((not testing) and random_num < self.epsilon):
             action = self.rng.randint(0,ACTION_NUM)
         else:
             out = self.policy_net(st)
             max_index = nd.argmax(out, axis=1)
             action = int(max_index.asscalar())
             max_q = out[0, action].asscalar()
         return action, max_q

     def action_to_submit(self,action):
         bit_rate = action % 4
         target_buffer = action // 4
         latency_limit = 4
         return bit_rate, target_buffer, latency_limit
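
     # The flat action index packs the two discrete decisions: bit_rate = action % 4 and
     # target_buffer = action // 4, with latency_limit fixed at 4. Assuming ACTION_NUM = 8
     # (4 bitrate levels x 2 target-buffer levels; the count is an assumption, not stated in
     # the original), the mapping is:
     #   action:         0  1  2  3  4  5  6  7
     #   bit_rate:       0  1  2  3  0  1  2  3
     #   target_buffer:  0  0  0  0  1  1  1  1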


     # Define your algorithm (DQN-based run)
     def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
             S_end_delay, S_decision_flag, S_buffer_flag,S_cdn_flag,S_skip_time,
             end_of_video, cdn_newest_id,download_id,cdn_has_frame,IntialVars):
         # state = np.empty(shape=(len(S_time_interval),11),dtype=np.float32)
         S_end_of_video = [0] * FRAME_SKIP
         S_end_of_video[-1] = end_of_video
         state = [S_time_interval[-FRAME_SKIP:],S_send_data_size[-FRAME_SKIP:],S_chunk_len[-FRAME_SKIP:], S_buffer_size[-FRAME_SKIP:], S_rebuf[-FRAME_SKIP:],
                      S_end_delay[-FRAME_SKIP:],  S_play_time_len[-FRAME_SKIP:],S_decision_flag[-FRAME_SKIP:], S_cdn_flag[-FRAME_SKIP:],S_skip_time[-FRAME_SKIP:],S_end_of_video]

         state = nd.array(state,dtype=self.dtype).transpose((1,0)).reshape((1,FRAME_SKIP,-1))
         # print(state.shape)

         action, max_q = self.choose_action(False,True,state)
         # print(action)
         bit_rate, target_buffer, latency_limit = self.action_to_submit(action)
         print(bit_rate, target_buffer, latency_limit)

         return bit_rate, target_buffer, latency_limit

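     # NOTE: this second definition of run() overrides the DQN-based run() above, so only the
     # BBA logic below is actually used at runtime.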
     def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len,
             S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, S_skip_time, end_of_video, cdn_newest_id,
             download_id, cdn_has_frame, IntialVars):

         # If you choose the machine-learning approach
         '''state = []
         state[0] = ...
         state[1] = ...
         state[2] = ...
         state[3] = ...
         state[4] = ...
         decision = actor.predict(state).argmax()
         bit_rate, target_buffer = decision // 4, decision % 4 .....
         return bit_rate, target_buffer'''

         # If you choose BBA
         RESERVOIR = 0.5
         CUSHION = 1.5

         if S_buffer_size[-1] < RESERVOIR:
             bit_rate = 0
         elif S_buffer_size[-1] >= RESERVOIR + CUSHION and S_buffer_size[-1] < CUSHION + CUSHION:
             bit_rate = 2
         elif S_buffer_size[-1] >= CUSHION + CUSHION:
             bit_rate = 3
         else:
             bit_rate = 1

         target_buffer = 0
         latency_limit = 4

         return bit_rate, target_buffer, latency_limit

     def get_params(self):
     # get your params
        your_params = []
        return your_params

     def copy_params(self, src_net, dst_net):
         # Copy every parameter from src_net to dst_net, matching names after stripping the source prefix
         ps_src = src_net.collect_params()
         ps_dst = dst_net.collect_params()
         prefix_length = len(src_net.prefix)
         for k, v in ps_src.items():
             k = k[prefix_length:]
             v_dst = ps_dst.get(k)
             v_dst.set_data(v.data())
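
A quick standalone sketch (not part of the original; the numeric values are assumptions) of the linear epsilon schedule that choose_action() applies on every call:

EPSILON_START, EPSILON_MIN, EPSILON_DECAY = 1.0, 0.05, 100000  # assumed example values
epsilon_rate = (EPSILON_START - EPSILON_MIN) / EPSILON_DECAY
for step in (0, 1000, 50000, 100000, 200000):
    eps = max(EPSILON_MIN, EPSILON_START - epsilon_rate * step)
    print(step, round(eps, 4))  # epsilon decays linearly, then stays at EPSILON_MIN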