def testEnv():
    env = Env()
    channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel,
    # i.e. no collisions or idle time slots
    for iteration in range(config.Iterations):
        for t in range(config.TimeSlots):
            initialState = env.reset()
            for user in range(config.N):
                action = slottedAlohaProtocol()
                env.step(action=action, user=user)
                # each user changes the inner state of the environment; the environment uses this
                # inner state to keep track of the channels and the ACK signals for each user
            nextStateForEachUser, rewardForEachUser = env.getNextState()
            # a reward of one means that a packet was successfully delivered over the channel;
            # the sum is therefore at most the number of channels -> config.K
            channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
    # measuring the expected value
    channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
    print("Channel Utilization average {}".format(channelThroughPut))
    ToPlotX = range(config.Iterations * config.TimeSlots)
    ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
    plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
               xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")


# def testTimeEnv():
#     env = TimeDependentEnv()
#     channelThroughPut = 0  # fraction of time that packets are successfully delivered over the channel,
#     # i.e. no collisions or idle time slots
#     for iteration in range(config.Iterations):
#         TimeSPU = env.reset()
#         for t in range(config.TimeSlots):
#             env.resetTimeStep()
#             # reset the internal state of the environment,
#             # which keeps track of the users' actions throughout the time step
#             for user in range(config.N):
#                 action = slottedAlohaProtocol()
#                 env.step(action=action, user=user)
#                 # each user changes the inner state of the environment; the environment uses this
#                 # inner state to keep track of the channels and the ACK signals for each user
#             nextStateForEachUser, rewardForEachUser = env.tstep(timestep=t)
#             # a reward of one means that a packet was successfully delivered over the channel;
#             # the sum is therefore at most the number of channels -> config.K
#             channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
#     # measuring the expected value
#     channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
#     print("Channel Utilization average {}".format(channelThroughPut))
#     ToPlotX = range(config.Iterations * config.TimeSlots)
#     ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
#     plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
#                xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")
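# For reference, a minimal sketch of the transmit rule that slottedAlohaProtocol() is assumed
# to implement above: in slotted Aloha every user independently transmits in a slot with a
# fixed probability and otherwise stays idle. The helper name and the probability value are
# illustrative assumptions, not the project's actual implementation.
def slottedAlohaProtocolSketch(transmitProbability=0.5):
    # return 1 (transmit in this time slot) with probability transmitProbability, else 0 (stay idle);
    # if the environment expects a channel index instead, the same Bernoulli draw would be
    # followed by a uniform choice among the config.K channels
    return 1 if np.random.rand() < transmitProbability else 0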
def transcate_DDPG(self):
    BATCH_SIZE = 32
    total_steps = 0  # step counter; one day is one step
    profit_list = []  # total profit of each game
    profitAdvanced_list = []
    actions = 2  # number of actions
    s_dim = 87
    a_dim = 1
    brain = DDPG(
        a_dim=a_dim,
        s_dim=s_dim,
        a_bound=1.,
        LR_A=0.001,
        LR_C=0.001,
        GAMMA=.99,
        TAU=0.01,
        # replacement=REPLACEMENT,
    )
    gameNum = 0  # number of games played
    ex_steps = 500  # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]  # store each reward, used to compute the baseline
    Loss_list = []  # store the loss values during training
    wait_list = []  # waiting days of each of the N games
    gameSplit = 5000  # plot every this many games
    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0
        # one game (episode)
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = []
        wait_day = []  # record on which days the agent waited in this game
        while today < self.routeline[-1] and terminal == False:
            # a new order is generated (once there are already 10 orders, no new order is received here)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True
            # iterate over the self.orders dict (i.e. state[0]) and handle each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))
            # the neural network chooses the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                # end_date = random.randrange(env.getTodayIndex(), 87, 1)
                end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model = brain.choose_action(state_tf)
                # print(action_model)
            wait_day.append(env.getTodayIndex())
            # order dict, history curve, reward
            reward = env.getReward(action_model)
            tao_reward.append(reward)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            state_ = env.getNextState(action_model)
            if len(state_) == 1:
                state_ = copy.deepcopy(state)
            brain.store_transition(state, action_model, reward, state_)
            # profitAdvanced_list.append(td_error[0][0])
            if brain.pointer > brain.MEMORY_CAPACITY:
                # print(b_s_)
                brain.learn()
            total_steps += 1
            if terminal:
                # wait_list.append(wait_day[-1])
                # loss = brain.learn()
                # Loss_list.append(loss)
                break
            # step: advance by one day
            env.nextStep()
        # total profit of this game
        epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("TD_Error:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps, profit_list,
                          profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")
        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
        if len(profit_list) >= 500:
            profit_list.clear()
            wait_list.clear()
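# Note on the action space: DDPG(a_dim=1, a_bound=1.) produces a continuous action in [-1, 1],
# while the trading environment above only distinguishes "wait" (0) from "finish the order" (1).
# The discretization is not shown in this file; the helper below is a plausible mapping given
# purely as an assumption (name and threshold are illustrative, not the project's actual code).
def ddpgOutputToDiscreteAction(continuousAction, threshold=0.0):
    # continuousAction: scalar or 1-element array in [-1, 1] returned by the DDPG actor
    # returns 1 (finish the order) if the actor output exceeds the threshold, else 0 (keep waiting)
    value = float(np.asarray(continuousAction).reshape(-1)[0])
    return 1 if value > threshold else 0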
def transcate_AC(self):
    total_steps = 0  # step counter; one day is one step
    profit_list = []  # total profit of each game
    profitAdvanced_list = []
    actions = 2  # number of actions
    brain = ActorCritic(
        n_actions=2,
        n_features=87,
        LR_A=0.001,
        LR_C=0.01,
        reward_decay=1.,
        prob_clip=0.,
    )
    gameNum = 0  # number of games played
    ex_steps = 500  # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]  # store each reward, used to compute the baseline
    Loss_list = []  # store the loss values during training
    wait_list = []  # waiting days of each of the N games
    gameSplit = 500  # plot every this many games
    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0
        # one game (episode)
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = []
        wait_day = []  # record on which days the agent waited in this game
        while today < self.routeline[-1] and terminal == False:
            # a new order is generated (once there are already 10 orders, no new order is received here)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True
            # iterate over the self.orders dict (i.e. state[0]) and handle each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))
            # the neural network chooses the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                end_date = random.randrange(env.getTodayIndex(), 87, 1)
                # end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model, p = brain.choose_action(state_tf, env.getTodayIndex())
                tao_prob.append(p)
            if action_model == 0:
                action_finishOrder = [1, 0]
            else:
                action_finishOrder = [0, 1]
            wait_day.append(env.getTodayIndex())
            # order dict, history curve, reward
            reward = env.getReward(action_model)
            tao_reward.append(reward)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            state_ = env.getNextState(action_model)
            # print(state_tf)
            # print(state_)
            td_error = brain.criticLearn(state_tf, reward, state_)
            baseline = td_error
            profitAdvanced_list.append(td_error[0][0])
            loss = brain.actorLearn(state_tf, action_model, td_error)
            # print(loss)
            Loss_list.append(loss)
            # store the record in the replay memory
            # print("this is store arg:", state_tf, ";", action_model, ";", reward, ";", env.getTodayIndex())
            # brain.store_transition(state_tf, action_model, reward, env.getTodayIndex())
            # print(action_model)
            total_steps += 1
            if terminal:
                wait_list.append(wait_day[-1])
                break
            # step: advance by one day
            env.nextStep()
        # total profit of this game
        # epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("TD_Error:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps, profit_list,
                          profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")
        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
            profit_list.clear()
            wait_list.clear()
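# The critic/actor updates above are driven by a one-step TD error used as the advantage.
# The ActorCritic class is not shown in this file; the sketch below, assuming a standard TD(0)
# actor-critic with value estimates V(s) and V(s'), only illustrates the quantities that
# criticLearn() and actorLearn() are expected to compute (name and signature are hypothetical).
def tdActorCriticTargets(reward, vState, vNextState, actionProb, gamma=1.0):
    # one-step TD error: delta = r + gamma * V(s') - V(s); gamma defaults to 1.0 to match
    # reward_decay=1. used when constructing ActorCritic above
    tdError = reward + gamma * vNextState - vState
    # actor objective for the taken action: minimize -log(pi(a|s)) * delta,
    # i.e. a policy-gradient step weighted by the baseline-corrected advantage
    actorLoss = -np.log(actionProb) * tdError
    return tdError, actorLoss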