def run(self):
    ep = 0
    while self.g_ep.value < 100:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = [0. for _ in range(self.agent_num)]
        for step in range(1000):
            # agent 0 (the influencer) acts first; its action is appended to the
            # observations of every other agent
            s0 = s[0]
            a0, prob0 = self.lnet[0].choose_action(v_wrap(s0[None, :]), True)
            a0 = [a0]
            s = [np.concatenate((s[i], np.array(a0)), -1) for i in range(1, self.agent_num)]
            s = [s0] + s
            a = [self.lnet[i].choose_action(v_wrap(s[i][None, :]), True) for i in range(1, self.agent_num)]
            prob = [elem[1] for elem in a]
            a = a0 + [elem[0] for elem in a]
            s_, r, done, _ = self.env.step(a, need_argmax=False)
            ep_r = [ep_r[i] + r[i] for i in range(self.agent_num)]
            # add the counterfactual influence bonus to the influencer's reward
            x = self._influencer_reward(r[0], self.lnet[1:], prob0, a0, s[1:], prob)
            r = [float(i) for i in r]
            r[0] += x.numpy()
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)
            if step % 5 == 0 and step != 0:
                # update the global nets and assign the new weights to the local nets
                _s0 = s_[0]
                a0 = self.lnet[0].choose_action(v_wrap(_s0[None, :]), False)
                a0 = [a0]
                _s = [np.concatenate((s_[i], np.array(a0)), -1) for i in range(1, self.agent_num)]
                _s = [_s0] + _s
                done = [False for _ in range(self.agent_num)]
                for i in range(self.agent_num):
                    push_and_pull(self.opt[i], self.lnet[i], self.gnet[i], done[i], _s[i],
                                  buffer_s, buffer_a, buffer_r, self.GAMMA, i)
                for i in range(self.agent_num):
                    self.scheduler_lr[i].step()
                buffer_s, buffer_a, buffer_r = [], [], []
            s = s_
        print('ep%d' % ep, self.name, sum(ep_r))
        ep += 1
        if self.name == "w00":
            self.sender.send([sum(ep_r), ep])
    self.res_queue.put(None)
def expand_dispatch(self, dis=1):
    # roll the job out with a fixed dispatch action `dis`; note this relies on a
    # module-level `net` only to size the action vector
    job = copy.deepcopy(self)
    s = self.state
    while True:
        a = v_wrap(np.repeat(dis, net.choose_action(v_wrap(s)).size))
        s_, r, done = self.step(a.numpy())
        s = s_
        if done:
            break
    return job, self.total_time
def expand(self, net):
    # snapshot the job, then roll the original to completion with `net`
    job = copy.deepcopy(self)
    s = self.state
    while True:
        a = v_wrap(net.choose_action(v_wrap(s)))
        s_, r, done = self.step(a.numpy())
        s = s_
        if done:
            break
    return job, self.total_time
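# Hypothetical usage sketch of expand() above. The names Net and job_shop_env are
# taken from the training scripts further below; expand() deep-copies the job as a
# snapshot, rolls the original to completion with the given policy net, and returns
# the snapshot together with the rollout's total time (makespan).
net = Net(16, 5)
job = job_shop_env("la11")
snapshot, makespan = job.expand(net)
print("rollout makespan:", makespan)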
def run(self):
    total_step = 1
    while self.g_ep.value < self.args.MAXEPS:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(self.args.MAXSTEP):
            if self.name == 'worker0':
                self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            s_, r, done, _ = self.env.step(a.clip(-2, 2))
            if t == self.args.MAXSTEP - 1:
                done = True
            ep_r += r
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r + 8.1) / 8.1)
            if total_step % self.args.updateperiod == 0 or done:
                push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                              buffer_s, buffer_a, buffer_r, self.args.gamma)
                buffer_s, buffer_a, buffer_r = [], [], []
                if done:
                    print('*')
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                    break
            s = s_
            total_step += 1
    self.res_queue.put(None)
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        while True:
            if self.name == 'w00':
                self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            s_, r, done, _ = self.env.step(a)
            if done:
                r = -1
            ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # sync: update the global net and assign its weights to the local net
                push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                              buffer_s, buffer_a, buffer_r, GAMMA)
                buffer_s, buffer_a, buffer_r = [], [], []
                if done:
                    # episode finished: record and report
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                    break
            s = s_
            total_step += 1
    self.res_queue.put(None)
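# The workers in this file call three helpers that are not shown here: v_wrap,
# push_and_pull, and record. The sketch below is modeled on the widely used
# pytorch-A3C utilities with the signature used by the single-agent workers above
# (the multi-agent and Grid2Op workers below call extended variants). Treat it as
# an assumption about this project's helpers, not the project's actual code.
import numpy as np
import torch


def v_wrap(np_array, dtype=np.float32):
    # convert a numpy array to a torch tensor, casting to float32 by default
    if np_array.dtype != dtype:
        np_array = np_array.astype(dtype)
    return torch.from_numpy(np_array)


def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    # bootstrap the value of the last state (0 if the episode terminated)
    if done:
        v_s_ = 0.
    else:
        v_s_ = lnet.forward(v_wrap(s_[None, :]))[-1].data.numpy()[0, 0]

    # n-step discounted returns, computed backwards through the reward buffer
    buffer_v_target = []
    for r in br[::-1]:
        v_s_ = r + gamma * v_s_
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()

    loss = lnet.loss_func(
        v_wrap(np.vstack(bs)),
        v_wrap(np.array(ba), dtype=np.int64) if ba[0].dtype == np.int64 else v_wrap(np.vstack(ba)),
        v_wrap(np.array(buffer_v_target)[:, None]))

    # compute local gradients and apply them to the global network
    opt.zero_grad()
    loss.backward()
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
    opt.step()

    # pull the updated global parameters back into the local network
    lnet.load_state_dict(gnet.state_dict())


def record(global_ep, global_ep_r, ep_r, res_queue, name):
    # update the shared episode counter and a moving average of episode reward
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(name, "Ep:", global_ep.value, "| Ep_r: %.0f" % global_ep_r.value)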
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        for t in range(MAX_EP_STEP):
            if self.name == 'w0':
                self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            s_, r, done, _ = self.env.step(a.clip(-2, 2))
            if t == MAX_EP_STEP - 1:
                done = True
            ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append((r + 8.1) / 8.1)    # normalize the reward
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # sync: update the global net and assign its weights to the local net
                push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                              buffer_s, buffer_a, buffer_r, GAMMA)
                buffer_s, buffer_a, buffer_r = [], [], []
                if done:
                    # episode finished: record and report
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                    break
            s = s_
            total_step += 1
    self.res_queue.put(None)
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        s = self.env.reset()
        buffer_s, buffer_action, buffer_reward = [], [], []
        ep_r = 0.
        while True:
            if self.name == 'w00':
                self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            s_, r, done, _ = self.env.step(a)
            if done:
                r = -1
            ep_r += r
            buffer_action.append(a)
            buffer_s.append(s)
            buffer_reward.append(r)
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # sync: update the global net and assign its weights to the local net
                push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                              buffer_s, buffer_action, buffer_reward, GAMMA)
                buffer_s, buffer_action, buffer_reward = [], [], []
                if done:
                    # episode finished: record and report
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                    break
            s = s_
            total_step += 1
    self.res_queue.put(None)
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        try:
            # choose a map ID from the 5 maps in the Maps folder at random
            mapID = np.random.randint(1, 6)
            # choose a random initial position for the DQN agent on the x and y axes
            posID_x = np.random.randint(MAP_MAX_X)
            posID_y = np.random.randint(MAP_MAX_Y)
            # request to initialize the map, the initial position, the initial energy,
            # and the maximum number of steps of the DQN agent
            request = ("map" + str(mapID) + "," + str(posID_x) + "," + str(posID_y) + ",50,100")
            # send the request to the game environment (GAME_SOCKET_DUMMY.py)
            self.env.send_map_info(request)

            self.env.reset()
            s = self.env.get_state()
            print(s.shape)
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            for t in range(MAX_EP_STEP):
                a = self.lnet.choose_action(v_wrap(s[None, :]), .5)
                self.env.step(str(a))
                s_ = self.env.get_state()
                r = self.env.get_reward()
                done = self.env.check_terminate()
                if t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # sync: update the global net and assign its weights to the local net
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    if done:
                        # episode finished: record and report
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                        break
                s = s_
                total_step += 1
        except Exception as e:
            import traceback
            traceback.print_exc()
            break
    self.res_queue.put(None)
def _influencer_reward(self, e, nets, prob0, a0, s, p_a):
    # counterfactual actions: every action agent 0 did not take
    a_cf = []
    for i in range(self.action_dim):
        if i != a0[0]:
            a_cf.append(i)
    p_cf = []
    # build counterfactual states by replacing agent 0's action in each other
    # agent's observation with each counterfactual action
    s_cf = np.array([[np.concatenate((s[i][:-1], np.array([a_cf[j]])), -1)
                      for j in range(self.action_dim - 1)]
                     for i in range(self.agent_num - 1)])
    for i in range(len(nets)):
        # marginalise the counterfactual policies over agent 0's action distribution
        _a = [torch.mul(nets[i].choose_action(v_wrap(s_cf[i][None, :]), True)[1][0], prob0)]
        _a = torch.sum(_a[0], -2)
        x = p_a[i][0]
        y = _a.detach()
        # KL divergence between the actual conditional policy and the counterfactual marginal
        p_cf.append(torch.nn.functional.kl_div(torch.log(x), y, reduction="sum"))
    return e + 50 * self._sum(p_cf) / len(p_cf)
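# A minimal numeric sketch (made-up distributions) of the influence term computed by
# _influencer_reward above: marginalise another agent's counterfactual policies over
# agent 0's action distribution, then measure how far the policy conditioned on agent
# 0's actual action is from that counterfactual marginal, mirroring the
# torch.log(x) / kl_div(..., reduction="sum") pattern used above.
import torch
import torch.nn.functional as F

prob0 = torch.tensor([0.2, 0.5, 0.3])              # agent 0's policy over 3 actions
p1_given_a0 = torch.tensor([[0.6, 0.3, 0.1],       # agent 1's policy given a0 = 0
                            [0.1, 0.8, 0.1],       # agent 1's policy given a0 = 1
                            [0.3, 0.3, 0.4]])      # agent 1's policy given a0 = 2
marginal = (p1_given_a0 * prob0[:, None]).sum(0)   # counterfactual marginal over a1
actual = p1_given_a0[1]                            # agent 0 actually took action 1
influence = F.kl_div(torch.log(actual), marginal, reduction="sum")
print(influence)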
def run(self):
    ep = 0
    while self.g_ep.value < 100:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = [0. for _ in range(self.agent_num)]
        for step in range(1000):
            a = [self.lnet[i].choose_action(v_wrap(s[i][None, :])) for i in range(self.agent_num)]
            s_, r, done, _ = self.env.step(a, need_argmax=False)
            ep_r = [ep_r[i] + r[i] for i in range(self.agent_num)]
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)
            if step % 5 == 0:
                # update the global nets and assign the new weights to the local nets
                done = [False for _ in range(self.agent_num)]
                for i in range(self.agent_num):
                    push_and_pull(self.opt[i], self.lnet[i], self.gnet[i], done[i], s_[i],
                                  buffer_s, buffer_a, buffer_r, self.GAMMA, i)
                for i in range(self.agent_num):
                    self.scheduler_lr[i].step()
                buffer_s, buffer_a, buffer_r = [], [], []
            s = s_
        print('ep%d' % ep, self.name, sum(ep_r))
        ep += 1
        if self.name == "w00":
            self.sender.send([sum(ep_r), ep])
    self.res_queue.put(None)
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        s = np.transpose(self.env.reset(), (2, 0, 1)) / 255.0
        lives = self.lives_sum
        buffer_s, buffer_a, buffer_r = [], [], []
        self.ep_r = 0.
        actions = []
        while True:
            total_step += 1
            self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            actions.append(str(a))
            s_, r, done, info = self.env.step(a)
            s_ = np.transpose(s_, (2, 0, 1)) / 255.0
            livesLeft = info['ale.lives']
            # punish the agent every time it loses a life
            if livesLeft != lives:
                r = DIE_PENALTY
                lives = livesLeft
            self.ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # sync: update the global net and assign its weights to the local net
                push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                              buffer_s, buffer_a, buffer_r, GAMMA)
                buffer_s, buffer_a, buffer_r = [], [], []
                if done:
                    # episode finished: record and report
                    record(self.g_ep, self.ep_r, self.res_queue, self.name, self.lives_sum, DIE_PENALTY)
                    break
            s = s_
    self.res_queue.put(None)
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        s = self.env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        while True:
            if self.name == 'w0':
                self.env.render()
            a = self.lnet.choose_action(v_wrap(s[None, :]))
            action = np.zeros(N_A)    # one-hot encode the chosen discrete action
            action[a] = 1
            s_, r, done, _ = self.env.step(action)
            ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)
            if done:
                break
            s = s_
            total_step += 1
def run(self):
    total_step = 1
    while self.g_ep.value < MAX_EP:
        # pick a video according to its request probability ----------------------
        while True:
            video_random = random.random()
            videoName = ""
            for i in range(len(self.videoList)):
                if video_random < self.videoList[i][1]:
                    videoName = self.videoList[i - 1][0]
                    break
            if videoName == "":
                videoName = self.videoList[-1][0]
            else:
                break
        # -------------------------------------------------------------------------
        busyList = self.get_busyTrace()
        bandwidth_fileName, rtt = self.getBandwidthFile()
        reqBI = self.client.init(videoName, bandwidth_fileName, rtt, self.bwType)

        # mask: randomly mark 1-5 bitrate versions as cached ----------------------
        mask = [1] * A_DIM
        randmCachedBICount = random.randint(1, 5)
        BI = [0, 1, 2, 3, 4]
        randomCachedBI = random.sample(BI, randmCachedBICount)
        for bIndex in range(5):
            if bIndex not in randomCachedBI:
                mask[bIndex] = 0
        # -------------------------------------------------------------------------

        segNum = 0
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        busy = busyList[segNum % len(busyList)]
        state_ = np.zeros(S_LEN)
        state = state_.copy()
        # state = [reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, busy, mask]
        reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum, busy = [0] * 10

        # start one episode --------------------------------------------------------
        while True:
            if sum(mask) == 1:
                a = mask.index(1)
                break

            a, logits = self.lnet.choose_action(mask, v_wrap(state[None, :]))

            # occasional debug print -------------------------------------------------
            if platform.system() == "Linux":
                if random.randint(0, 1000) == 1:
                    print("reqb=", reqBitrate, "lb=", lastBitrate, "buffer=", int(buffer),
                          "hT=", int(hThroughput), "mT=", int(mThroughput), "busy=", round(busy, 2),
                          "mask=", mask, "action=", a, "reqBI=", reqBI,
                          "reward=", round(reward, 2), "logits=", logits)
            else:
                print("reqb=", reqBitrate, "lb=", round(lastBitrate, 2), "buffer=", int(buffer),
                      "hT=", int(hThroughput), "mT=", int(mThroughput), "busy=", round(busy, 2),
                      "mask=", mask, "action=", a, "reqBI=", reqBI,
                      "reward=", round(reward, 2), "logits=", logits)
            # -------------------------------------------------------------------------

            busy = busyList[segNum % len(busyList)]
            # action 5 means fetching from the origin server (cache miss);
            # any other action serves a cached version (cache hit)
            hitFlag = a != 5
            reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum = \
                self.client.run(a, busy, hitFlag)

            state_[0] = reqBitrate / BITRATES[-1]
            state_[1] = lastBitrate / BITRATES[-1]
            state_[2] = (buffer / 1000 - 30) / 10
            state_[3] = (hThroughput - throughput_mean) / throughput_std
            state_[4] = (mThroughput - throughput_mean) / throughput_std
            print(state)
            # state_[5] = (busy - busy_mean) / busy_std

            reward = reward / 5
            ep_r += reward
            buffer_a.append(a)
            buffer_s.append(state)
            buffer_r.append(reward)
            if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                # sync: update the global net and assign its weights to the local net
                push_and_pull(self.opt, self.lnet, self.gnet, done, state_,
                              buffer_s, buffer_a, buffer_r, GAMMA)
                buffer_s, buffer_a, buffer_r = [], [], []
                if done:
                    # episode finished: record and report
                    record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                    break
            state = state_.copy()
            total_step += 1
    self.res_queue.put(None)
    print("end run")
        # tail of the actor-critic loss: td and logits come from the forward pass above
        c_loss = td.pow(2)                       # critic (value) loss
        probs = F.softmax(logits, dim=1)
        m = self.distribution(probs)
        exp_v = m.log_prob(a) * td.detach().squeeze()
        a_loss = -exp_v                          # actor (policy) loss
        # entropy regularization ---
        log_probs = F.log_softmax(logits, dim=1)
        entropy = -(log_probs * probs).sum(1)
        a_loss -= 0.5 * entropy
        # entropy regularization ---
        total_loss = (c_loss + a_loss).mean()
        return total_loss


if __name__ == '__main__':
    # An instance of your model.
    lnet = Net()
    lnet.load_state_dict(
        torch.load('../../data/RL_model/2019-06-28_10-18-56/model/233293.pkl'))

    # An example input you would normally provide to your model's forward() method.
    data_list = [0.33333333, 0.33333333, -0.91149055, 0.00101571, -0.19378804]
    data = np.array(data_list)
    print(data_list)

    # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
    traced_script_module = torch.jit.trace(lnet, v_wrap(data[None, :]))
    traced_script_module.save("../../data/RL_model/2019-06-28_10-18-56/model.pt")
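# A minimal usage sketch showing how the traced ScriptModule saved above could be
# loaded back and run without the original Net class definition (the path and the
# example input are the ones used above).
import torch

loaded = torch.jit.load("../../data/RL_model/2019-06-28_10-18-56/model.pt")
example = torch.tensor([[0.33333333, 0.33333333, -0.91149055, 0.00101571, -0.19378804]])
print(loaded(example))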
        return job, self.total_time    # tail of expand(), as defined above


timee = []
enet = Net(16, 5)       # evaluation/online network
tnet = Net(16, 5)       # target network
# net.explore = False
job = job_shop_env("la11")
s = job.state
for i in range(50):
    ot = 1222           # reference makespan for instance la11
    j = 0
    while True:
        j += 1
        job, pre = job.expand(enet)
        a = v_wrap(enet.choose_action(v_wrap(s)))
        s_, r, done = job.step(a.numpy())
        # shape the reward by how close the rollout makespan is to the reference
        if abs(pre - ot) == 0:
            r = 100 + r
        else:
            r = 100 / abs(pre - ot) + r
        opt = SharedAdam(enet.parameters(), lr=0.000001)
        _, q_t = tnet.forward(v_wrap(s))
        loss = enet.loss_func(v_wrap(s), a, v_wrap(r) + 0.1 * q_t)
        if j % 2 == 0:
            tnet.load_state_dict(enet.state_dict())    # refresh the target network
        opt.zero_grad()
        loss.backward()
        opt.step()
            s = s_
            if done:
                break
        return job, self.total_time    # tail of expand(), as defined above


net = Net(16, 5)
# net.explore = False
job = job_shop_env(part=0)
s = job.state
for i in range(50):
    ot = optt(job)      # reference makespan for this instance
    while True:
        job, pre = job.expand(net)
        a = v_wrap(net.choose_action(v_wrap(s)))
        s_, r, done = job.step(a.numpy())
        # shape the reward by how close the rollout makespan is to the reference
        if abs(pre - ot) == 0:
            r = 10 + r
        else:
            r = 10 / abs(pre - ot) + r
        opt = SharedAdam(net.parameters(), lr=0.00001)
        loss = net.loss_func(v_wrap(s), a, v_wrap(r))
        opt.zero_grad()
        loss.backward()
        opt.step()
        s = s_
        if done:
            print(job.total_time)
def run(self):
    ptitle('Training Agent: {}'.format(self.rank))
    config = self.config
    check_point_episodes = config["check_point_episodes"]
    check_point_folder = os.path.join(config["check_point_folder"], config["env"])
    setup_worker_logging(self.log_queue)

    self.env = create_env(config["env"], self.seed)
    observation_space = self.env.observation_space
    action_space = IdToAct(self.env.action_space)
    with open(os.path.join("data", f"{config['env']}_action_space.npz"), 'rb') as f:
        archive = np.load(f)
        action_space.init_converter(all_actions=archive[archive.files[0]])
    self.action_space = action_space
    all_actions = np.array(action_space.all_actions)

    self.local_net = Net(self.state_size, self.action_mappings, self.action_line_mappings)  # local network
    self.local_net = cuda(self.gpu_id, self.local_net)

    total_step = 1
    l_ep = 0
    while self.g_ep.value < self.num_episodes:
        self.print(f"{self.env.name} - {self.env.chronics_handler.get_name()}")
        if isinstance(self.env, MultiMixEnvironment):
            obs = self.env.reset(random=True)
        else:
            obs = self.env.reset()
        maintenance_list = obs.time_next_maintenance + obs.duration_next_maintenance

        s = self.convert_obs(observation_space, obs)
        s = v_wrap(s[None, :])
        s = cuda(self.gpu_id, s)

        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0.
        ep_step = 0
        ep_agent_num_dmd = 0
        ep_agent_num_acts = 0
        while True:
            rho = obs.rho.copy()
            rho[rho == 0.0] = 1.0
            lines_overload = rho > config["danger_threshold"]

            expert_act = expert_rules(self.name, maintenance_list, ep_step, action_space, obs)
            if expert_act is not None:
                a = np.where(all_actions == expert_act)[0][0]
                choosen_actions = np.array([a])
            elif not np.any(lines_overload):
                # no line is in danger: do nothing
                choosen_actions = np.array([0])
            else:
                lines_overload = cuda(self.gpu_id, torch.tensor(lines_overload.astype(int)).float())
                attention = torch.matmul(lines_overload.reshape(1, -1), self.action_line_mappings)
                attention[attention > 1] = 1
                choosen_actions = self.local_net.choose_action(s, attention, self.g_num_candidate_acts.value)
                ep_agent_num_dmd += 1

            obs_previous = obs
            a, obs_forecasted, obs_do_nothing = forecast_actions(
                choosen_actions, self.action_space, obs, min_threshold=0.95)

            logging.info(f"{self.name}_act|||{a}")

            act = self.action_space.convert_act(a)
            obs, r, done, info = self.env.step(act)
            r = lreward(a, self.env, obs_previous, obs_do_nothing, obs_forecasted, obs,
                        done, info, threshold_safe=0.85)

            # progress markers: +/- for rewarded/punished agent actions, * for an
            # unrewarded candidate, x/o/0 for the do-nothing cases
            if a > 0:
                if r > 0:
                    print("+", end="")
                elif r < 0:
                    print("-", end="")
                elif len(choosen_actions) > 0:
                    print("*", end="")
                else:
                    print("x", end="")
            else:
                if len(choosen_actions) > 0:
                    print("o", end="")
                else:
                    print("0", end="")

            if r > 0:
                ep_agent_num_acts += 1

            s_ = self.convert_obs(observation_space, obs)
            s_ = v_wrap(s_[None, :])
            s_ = cuda(self.gpu_id, s_)

            ep_r += r
            buffer_a.append(a)
            buffer_s.append(s)
            buffer_r.append(r)

            if total_step % self.update_global_iter == 0 or done:
                # sync: update the global net and assign its weights to the local net
                buffer_a = cuda(self.gpu_id, torch.tensor(buffer_a, dtype=torch.long))
                buffer_s = cuda(self.gpu_id, torch.cat(buffer_s))
                push_and_pull(self.opt, self.local_net, check_point_episodes, check_point_folder,
                              self.g_ep, l_ep, self.name, self.rank, self.global_net, done, s_,
                              buffer_s, buffer_a, buffer_r, self.gamma, self.gpu_id)
                buffer_s, buffer_a, buffer_r = [], [], []

                if done:
                    # episode finished: record and report
                    print("")
                    record(config["starting_num_candidate_acts"], config["num_candidate_acts_decay_iter"],
                           self.g_ep, self.g_step, self.g_num_candidate_acts, self.g_ep_r, ep_r,
                           self.res_queue, self.name, ep_step, ep_agent_num_dmd, ep_agent_num_acts)
                    break

            s = s_
            total_step += 1
            ep_step += 1
        l_ep += 1
    self.res_queue.put(None)
opt = SharedAdam(gnet.parameters(), lr=5e-3)    # global optimizer
global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

# parallel training
workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())]
[w.start() for w in workers]
res = []    # record episode rewards to plot
while True:
    r = res_queue.get()
    if r is not None:
        res.append(r)
    else:
        break
[w.join() for w in workers]

import matplotlib.pyplot as plt
plt.plot(res)
plt.ylabel('Moving average ep reward')
plt.xlabel('Step')
plt.show()

# watch the trained global network play a few episodes
for _ in range(3):
    s = env.reset()
    while True:
        env.render()
        a = gnet.choose_action(v_wrap(s[None, :]))
        s_, r, done, _ = env.step(a)
        s = s_
        if done:
            break
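# The scripts above use SharedAdam as the global optimizer, but its definition is
# not shown here. The sketch below is modeled on the common pytorch-A3C shared
# optimizer: an Adam whose per-parameter state lives in shared memory so every
# worker process updates the same moment estimates. Treat it as an assumption about
# the project's implementation, not the implementation itself.
import torch


class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # eagerly create the optimizer state and move it into shared memory
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()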
def run(self):
    self.getBandwidthFileList()
    r_avg_sum = 0
    resDir = self.modelDir + "/test_res_" + str(self.traceIndex)
    if not os.path.exists(resDir):
        os.makedirs(resDir)
    resFile = open(resDir + "/" + self.policy + "_" + self.bwType + ".txt", 'w')

    for bandwidth_file in self.bandwidth_fileList:
        bandwidth_fileName = bandwidth_file[0]
        rtt = bandwidth_file[1]
        bwType = bandwidth_file[2]
        busyList = self.get_busyTrace()

        # get video ---------------------------------------------------------------
        videoName = ""
        if PUREFLAF:
            video_random = random.random()
            for i in range(len(self.videoList)):
                if video_random < self.videoList[i][1]:
                    videoName = self.videoList[i - 1][0]
                    break
            if videoName == "":
                videoName = self.videoList[-1][0]
        else:
            if self.videoTraceIndex == len(self.videoTraceList):
                self.videoTraceIndex = 0
            videoName = self.videoTraceList[self.videoTraceIndex]
        # ---------------------------------------------------------------------------

        reqBI = self.client.init(videoName=videoName, bandwidthFileName=bandwidth_fileName,
                                 rtt=rtt, bwType=bwType)

        # mask: which bitrate versions are cached -----------------------------------
        mask = [1] * A_DIM
        for bIndex in range(5):
            if PUREFLAF:
                mask[bIndex] = 0
            else:
                if videoName + "_" + str(bIndex + 1) not in self.cachedList:
                    mask[bIndex] = 0.
        # ---------------------------------------------------------------------------

        state_ = np.zeros(S_LEN)
        state = state_.copy()
        # state = [lastBitrate, buffer, hThroughput, mThroughput, busy, mask]

        # start testing ---------------------------------------------------------------
        total_step = 0
        segNum = 0
        r_sum = 0
        reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum, busy = [0] * 10
        while True:
            if sum(mask) == 1:
                a = mask.index(1)
            else:
                if self.policy == "no_policy":
                    a = 5
                elif self.policy == "RL":
                    a, logits = self.lnet.choose_action(mask, v_wrap(state[None, :]))
                    self.saveLogits(videoName, logits)
                elif self.policy == "lower":
                    a = self.choose_action_lower(mask, reqBI)
                elif self.policy == "closest":
                    a = self.choose_action_closest(mask, reqBI)
                elif self.policy == "highest":
                    a = self.choose_action_highest(mask, reqBI)
                elif self.policy == "prefetch":
                    a = self.choose_action_prefetch(mask, reqBI)
                else:
                    print("unknown policy")
                    return

            busy = busyList[segNum % len(busyList)]
            # action 5 means fetching from the origin server (cache miss);
            # any other action serves a cached version (cache hit)
            hitFlag = a != 5
            reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum = \
                self.client.run(a, busy, hitFlag)

            state_[0] = reqBitrate / BITRATES[-1]
            state_[1] = lastBitrate / BITRATES[-1]
            state_[2] = (buffer / 1000 - 30) / 10
            state_[3] = (hThroughput - throughput_mean) / throughput_std
            state_[4] = (mThroughput - throughput_mean) / throughput_std

            reward = reward / 5
            r_sum += reward
            total_step += 1
            if done:
                break
            state = state_.copy()
        # end testing -----------------------------------------------------------------

        r_avg = r_sum / total_step
        r_avg_sum += r_avg
        resFile.write(str(r_avg) + "\n")
        resFile.flush()
        print(self.bwType, self.policy, videoName,
              self.bandwidth_fileList.index(bandwidth_file), "/", len(self.bandwidth_fileList), r_avg)
        self.videoTraceIndex += 1
def generate_saliency(observation, game_step, net, game_tracker, record_json_dir,
                      actual_action, actual_probs, opponent_action, opponent_id,
                      new_val=True):
    """Generate saliency data for a given observation.

    observation     -- the observation to be used
    game_step       -- the current timestep of the game, needed because of an
                       off-by-one in the observation's counter
    net             -- the network to generate saliency from
    game_tracker    -- needed for generate_NN_input
    record_json_dir -- output directory for saliency data files; should be kept
                       with the data from env.render
    actual_action   -- action decided from the actor distribution by the
                       unmodified observation
    actual_probs    -- action distribution produced by the unmodified observation
    opponent_action -- the action the opponent took at this time step
    opponent_id     -- the id of the opponent
    new_val         -- if False, uses passage/wall modifications; otherwise uses
                       a previously unknown tile value
    """
    data = {}
    data['step'] = game_step
    # the actual action has to be passed in from outside because the agent samples
    # it from the probability distribution in the main code
    data['actual_action'] = actual_action
    data['actual_probs'] = actual_probs
    # data['actual_terminal_prediction'] = actual_terminal_prediction  # ignoring the critic for now
    # the fields below are populated by the algorithm
    data['mods'] = []
    data['actions'] = []
    data['opponent_action'] = opponent_action
    data['opponent_id'] = opponent_id

    board_size = len(observation['board'][0])    # assuming the board is square
    for i in range(board_size):
        mod_list = []
        action_list = []
        for j in range(board_size):
            mod_observation = deepcopy(observation)
            # remove the player from the 'alive' channel as well, to truly remove the data
            if mod_observation['board'][i][j] >= 10:
                mod_observation['alive'].remove(mod_observation['board'][i][j])
            # replace this tile's value with a new value
            if new_val:
                mod_observation['board'][i][j] = 14    # first value without an existing definition
            else:
                # if the tile is a passage, make it a wall; otherwise make it a passage,
                # which prevents "no change" from occurring on passages
                mod_observation['board'][i][j] = 1 if mod_observation['board'][i][j] == 0 else 0
            # there will never be a bomb on this tile; possible values are 0, 1, and 14
            mod_observation['bomb_blast_strength'][i][j] = 0
            mod_observation['bomb_life'][i][j] = 0

            this_state = generate_NN_input(10, mod_observation, game_step, game_tracker)
            m_this_state = v_wrap(this_state).unsqueeze(0)
            this_action, _, _, this_probs, this_terminal_prediction = net.choose_action(m_this_state)
            diffs = [this_probs[k] - actual_probs[k] for k in range(len(this_probs))]
            mod_list.append(deepcopy(diffs))    # deepcopy to avoid aliasing
            action_list.append(this_action)
        data['mods'].append(deepcopy(mod_list))
        data['actions'].append(deepcopy(action_list))

    with open(f"{record_json_dir}/d{game_step:03d}.json", 'w') as f:
        # MyEncoder cleans numpy data into types json will serialize;
        # json.dumps won't serialize numpy data by default
        f.write(json.dumps(data, cls=MyEncoder))
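# MyEncoder is referenced above but not defined in this snippet. A plausible sketch
# (an assumption, not the project's actual class) is a json.JSONEncoder subclass
# that converts numpy scalars and arrays into plain Python types so json.dumps can
# serialize the saliency data.
import json
import numpy as np


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)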
env = gym.make(env_id)    # Pendulum / VibrationEnv
input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]
env.reset()
s_, r, done, _ = env.step([1])
print(s_)

# note: `sys` shadows the standard-library module of the same name
sys = TCN(input_size, output_size, num_channels=[25, 25, 30], kernel_size=8, dropout=0)

# a TCN consumes input of shape (batch, channels, sequence_length)
s_ = s_.reshape([batch, input_size, seq_len])
mu, sigma, values = sys(v_wrap(s_))
print(mu, sigma, values)

a = sys.choose_action(v_wrap(s_))
print(a)