Example no. 1
    def run(self):
        ep = 0
        while self.g_ep.value < 100:
            # total_step = 1
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = [0. for i in range(self.agent_num)]
            for step in range(1000):
                # print(ep)
                # if self.name == 'w00' and self.g_ep.value%10 == 0:
                #     path = "/Users/xue/Desktop/temp/temp%d"%self.g_ep.value
                #     if not os.path.exists(path):
                #         os.mkdir(path)
                #     self.env.render(path)
                s0 = s[0]
                a0, prob0 = self.lnet[0].choose_action(v_wrap(s0[None, :]), True)
                a0 = [a0]
                s = [np.concatenate((s[i],np.array(a0)),-1) for i in range(1, self.agent_num)]
                s = [s0] + s
                a = [self.lnet[i].choose_action(v_wrap(s[i][None, :]), True) for i in range(1, self.agent_num)]
                prob = [elem[1] for elem in a]
                a = a0 + [elem[0] for elem in a]
                s_, r, done, _ = self.env.step(a,need_argmax=False)
                # print(a)
                # if done[0]: r = -1
                ep_r = [ep_r[i] + r[i] for i in range(self.agent_num)]
                x = self._influencer_reward(r[0], self.lnet[1:], prob0, a0, s[1:], prob)
                r = [float(i) for i in r]
                r[0] += x.numpy()
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if step % 5 == 0 and step != 0:  # update global and assign to local net
                    _s0 = s_[0]
                    a0 = self.lnet[0].choose_action(v_wrap(_s0[None, :]), False)
                    a0 = [a0]
                    _s = [np.concatenate((s_[i], np.array(a0)), -1) for i in range(1, self.agent_num)]
                    _s = [_s0] + _s
                    # sync
                    done = [False for i in range(self.agent_num)]
                    [push_and_pull(self.opt[i], self.lnet[i], self.gnet[i], done[i],
                                   _s[i], buffer_s, buffer_a, buffer_r, self.GAMMA, i)
                                                    for i in range(self.agent_num)]
                    [self.scheduler_lr[i].step() for i in range(self.agent_num)]
                    buffer_s, buffer_a, buffer_r = [], [], []
                # if ep == 999:  # done and print information
                #     record(self.g_ep, self.g_ep_r, sum(ep_r), self.res_queue, self.name)
                #     break
                s = s_
                # total_step += 1
            print('ep%d' % ep, self.name, sum(ep_r))
            ep += 1
            if self.name == "w00":
                self.sender.send([sum(ep_r), ep])
        self.res_queue.put(None)
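Every snippet in this listing converts NumPy observations to tensors through a small v_wrap helper before calling choose_action; the helper itself is never shown. A minimal sketch, assuming the usual NumPy-to-torch conversion used by the common pytorch-A3C utility layout these workers follow:

import numpy as np
import torch

def v_wrap(np_array, dtype=np.float32):
    # Cast to the expected dtype, then wrap the array as a torch tensor.
    if np_array.dtype != dtype:
        np_array = np_array.astype(dtype)
    return torch.from_numpy(np_array)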
    def expand_dispatch(self, dis=1):
        # Roll the environment forward using a constant action `dis`.
        # `net` is assumed to be defined in the enclosing scope; it is only
        # used here to infer the size of the action vector.
        job = copy.deepcopy(self)
        s = self.state
        while True:
            a = v_wrap(np.repeat(dis, net.choose_action(v_wrap(s)).size))
            s_, r, done = self.step(a.numpy())
            s = s_
            if done:
                # print(self.total_time)
                break
        return job, self.total_time

    def expand(self, net):
        # Roll the environment forward using actions chosen by `net`.
        job = copy.deepcopy(self)
        s = self.state
        while True:
            a = v_wrap(net.choose_action(v_wrap(s)))
            s_, r, done = self.step(a.numpy())
            s = s_
            if done:
                # print(self.total_time)
                break
        return job, self.total_time
    def run(self):
        total_step = 1
        while self.g_ep.value < self.args.MAXEPS:
            # print(self.g_ep.value)
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0
            for t in range(self.args.MAXSTEP):
                if self.name == 'worker0':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                s_, r, done, _ = self.env.step(a.clip(-2, 2))
                if t == self.args.MAXSTEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8.1) / 8.1)

                if total_step % self.args.updateperiod == 0 or done:
                    # print(total_step)
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r,
                                  self.args.gamma)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    if done:
                        print('*')
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break
                s = s_
                total_step += 1
        self.res_queue.put(None)
Example no. 5
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            while True:
                if self.name == 'w00':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                s_, r, done, _ = self.env.step(a)
                if done: r = -1
                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    if done:
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break

                s = s_
                total_step += 1
        self.res_queue.put(None)
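Examples no. 5 through no. 8 share the same push_and_pull(opt, lnet, gnet, done, s_, buffer_s, buffer_a, buffer_r, GAMMA) call: it bootstraps n-step value targets from the reward buffer, backpropagates the local actor-critic loss, copies the local gradients onto the shared global net, steps the shared optimizer, and pulls the updated global weights back into the local net. A sketch under those assumptions, following the widely used pytorch-A3C reference utilities (the multi-agent variants in Examples no. 1 and no. 10 use a modified signature with an extra index):

import numpy as np

def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    # Bootstrap the value of the last state (0 if the episode ended).
    if done:
        v_s_ = 0.
    else:
        v_s_ = lnet.forward(v_wrap(s_[None, :]))[-1].data.numpy()[0, 0]

    # n-step discounted targets, computed backwards over the reward buffer.
    buffer_v_target = []
    for r in br[::-1]:
        v_s_ = r + gamma * v_s_
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()

    loss = lnet.loss_func(
        v_wrap(np.vstack(bs)),
        v_wrap(np.array(ba), dtype=np.int64) if ba[0].dtype == np.int64 else v_wrap(np.vstack(ba)),
        v_wrap(np.array(buffer_v_target)[:, None]))

    # Push local gradients to the shared global net, then pull the new weights.
    opt.zero_grad()
    loss.backward()
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
    opt.step()
    lnet.load_state_dict(gnet.state_dict())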
Example no. 6
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            for t in range(MAX_EP_STEP):
                if self.name == 'w0':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                s_, r, done, _ = self.env.step(a.clip(-2, 2))
                if t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append((r + 8.1) / 8.1)  # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    # sync
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []

                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break
                s = s_
                total_step += 1

        self.res_queue.put(None)
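The other shared helper is record: it increments the global episode counter, maintains a moving average of the episode reward, and pushes it onto the result queue that the main process reads for plotting. A sketch, again assuming the common pytorch-A3C utility layout rather than any one project's exact code:

def record(global_ep, global_ep_r, ep_r, res_queue, name):
    with global_ep.get_lock():
        global_ep.value += 1
    with global_ep_r.get_lock():
        if global_ep_r.value == 0.:
            global_ep_r.value = ep_r
        else:
            # Exponential moving average of the episode reward.
            global_ep_r.value = global_ep_r.value * 0.99 + ep_r * 0.01
    res_queue.put(global_ep_r.value)
    print(name, "Ep:", global_ep.value, "| Ep_r: %.0f" % global_ep_r.value)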
Example no. 7
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()
            buffer_s, buffer_action, buffer_reward = [], [], []
            ep_r = 0.
            while True:
                if self.name == 'w00':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                s_, r, done, _ = self.env.step(a)
                if done: r = -1
                ep_r += r
                buffer_action.append(a)
                buffer_s.append(s)
                buffer_reward.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    # sync
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_action, buffer_reward,
                                  GAMMA)
                    buffer_s, buffer_action, buffer_reward = [], [], []

                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.name)
                        break
                s = s_
                total_step += 1
        self.res_queue.put(None)
Example no. 8
    def run(self):
        total_step = 1

        while self.g_ep.value < MAX_EP:
            try:
                mapID = np.random.randint(
                    1,
                    6)  # Randomly choose a map ID from the 5 maps in the Maps folder
                posID_x = np.random.randint(
                    MAP_MAX_X
                )  # Randomly choose the DQN agent's initial position on the X-axis
                posID_y = np.random.randint(
                    MAP_MAX_Y
                )  # Randomly choose the DQN agent's initial position on the Y-axis
                # Build a request that initializes the map, the initial position,
                # the initial energy, and the maximum number of steps of the DQN agent
                request = ("map" + str(mapID) + "," + str(posID_x) + "," +
                           str(posID_y) + ",50,100")
                # Send the request to the game environment (GAME_SOCKET_DUMMY.py)
                self.env.send_map_info(request)
                self.env.reset()
                s = self.env.get_state()
                print(s.shape)
                buffer_s, buffer_a, buffer_r = [], [], []
                ep_r = 0.
                for t in range(MAX_EP_STEP):
                    a = self.lnet.choose_action(v_wrap(s[None, :]), .5)
                    self.env.step(str(a))
                    s_ = self.env.get_state()
                    r = self.env.get_reward()
                    done = self.env.check_terminate()

                    if t == MAX_EP_STEP - 1:
                        done = True
                    ep_r += r
                    buffer_a.append(a)
                    buffer_s.append(s)
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                        # sync
                        push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                      buffer_s, buffer_a, buffer_r, GAMMA)
                        buffer_s, buffer_a, buffer_r = [], [], []

                        if done:  # done and print information
                            record(self.g_ep, self.g_ep_r, ep_r,
                                   self.res_queue, self.name)
                            break
                    s = s_
                    total_step += 1

            except Exception as e:
                import traceback
                traceback.print_exc()
                break

        self.res_queue.put(None)
Example no. 9
    def _influencer_reward(self, e, nets, prob0, a0, s, p_a):
        # Counterfactual actions: every action the influencer (agent 0) did not take.
        a_cf = []
        for i in range(self.action_dim):
            if i != a0[0]:
                a_cf.append(i)
        p_cf = []
        # Counterfactual states: replace the influencer action appended to each
        # follower observation with each counterfactual action in turn.
        s_cf = np.array([[np.concatenate((s[i][:-1], np.array([a_cf[j]])), -1)
                          for j in range(self.action_dim - 1)]
                         for i in range(self.agent_num - 1)])
        for i in range(len(nets)):
            # _a = [nets[i].choose_action(v_wrap(s_cf[i][None, :]), True)[1] for i in range(self.agent_num-1)]
            # temp = nets[i].choose_action(v_wrap(s_cf[i][None, :]), True)[1][0]
            _a = [torch.mul(nets[i].choose_action(v_wrap(s_cf[i][None, :]), True)[1][0], prob0)]
            _a = torch.sum(_a[0], -2)
            x = p_a[i][0]
            y = _a.detach()
            p_cf.append(torch.nn.functional.kl_div(torch.log(x), y, reduction="sum"))
            # l = scipy.stats.entropy(x.numpy(), y.numpy())
        return e + 50 * self._sum(p_cf) / len(p_cf)
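The influence term hinges on torch.nn.functional.kl_div, whose convention is easy to misread: the first argument must already be log-probabilities, the second plain probabilities, and the result is KL(target || input), i.e. KL(y || x) in the code above. A small self-contained check of that convention (the probability vectors are arbitrary illustration values, not taken from the example):

import torch
import torch.nn.functional as F

x = torch.tensor([0.5, 0.3, 0.2])  # stands in for the conditional policy p_a[i][0]
y = torch.tensor([0.6, 0.3, 0.1])  # stands in for the marginalized policy _a.detach()

# F.kl_div(log_input, target) = sum(target * (log(target) - log_input)) = KL(y || x)
kl = F.kl_div(torch.log(x), y, reduction="sum")
manual = (y * (y.log() - x.log())).sum()
assert torch.allclose(kl, manual)
print(float(kl))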
Example no. 10
    def run(self):
        ep = 0
        while self.g_ep.value < 100:
            # total_step = 1
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = [0. for i in range(self.agent_num)]
            for step in range(1000):
                # print(ep)
                # if self.name == 'w00' and self.g_ep.value%10 == 0:
                #     path = "/Users/xue/Desktop/temp/temp%d"%self.g_ep.value
                #     if not os.path.exists(path):
                #         os.mkdir(path)
                #     self.env.render(path)
                a = [self.lnet[i].choose_action(v_wrap(s[i][None, :])) for i in range(self.agent_num)]
                s_, r, done, _ = self.env.step(a, need_argmax=False)
                # print(a)
                # if done[0]: r = -1
                ep_r = [ep_r[i] + r[i] for i in range(self.agent_num)]
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if step % 5 == 0:  # update global and assign to local net
                    # sync
                    done = [False for i in range(self.agent_num)]
                    [push_and_pull(self.opt[i], self.lnet[i], self.gnet[i], done[i],
                                   s_[i], buffer_s, buffer_a, buffer_r, self.GAMMA, i)
                     for i in range(self.agent_num)]
                    [self.scheduler_lr[i].step() for i in range(self.agent_num)]
                    buffer_s, buffer_a, buffer_r = [], [], []
                # if ep == 999:  # done and print information
                #     record(self.g_ep, self.g_ep_r, sum(ep_r), self.res_queue, self.name)
                #     break
                s = s_
                # total_step += 1
            print('ep%d' % ep, self.name, sum(ep_r))
            ep += 1
            if self.name == "w00":
                self.sender.send([sum(ep_r), ep])
        self.res_queue.put(None)
Example no. 11
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = np.transpose(self.env.reset(), (2, 0, 1)) / 255.0
            lives = self.lives_sum
            buffer_s, buffer_a, buffer_r = [], [], []
            self.ep_r = 0.
            actions = []
            while True:
                total_step += 1
                self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                # a = np.random.randint(low = 0, high = 8)
                actions.append(str(a))
                s_, r, done, info = self.env.step(a)
                s_ = np.transpose(s_, (2, 0, 1)) / 255.0
                livesLeft = info[
                    'ale.lives']  # punish every time the agent loses a life
                if livesLeft != lives:
                    r = DIE_PENALTY
                    lives = livesLeft
                self.ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    # sync
                    # if self.name == 'w0':
                    #     self.env.render()
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []

                    if done:  # done and print information
                        record(self.g_ep, self.ep_r, self.res_queue, self.name,
                               self.lives_sum, DIE_PENALTY)
                        break
                s = s_
        self.res_queue.put(None)
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            s = self.env.reset()
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            while True:
                if self.name == 'w0':
                    self.env.render()
                a = self.lnet.choose_action(v_wrap(s[None, :]))
                action = np.zeros((N_A))
                action[a] = 1
                # print(self.name,action)
                s_, r, done, _ = self.env.step(action)
                #if done: r = -1
                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if done:  # update global and assign to local net
                    break
                s = s_
                total_step += 1
Example no. 13
    def run(self):
        total_step = 1
        while self.g_ep.value < MAX_EP:
            # get video -----------------------------
            while True:
                video_random = random.random()
                videoName = ""
                for i in range(len(self.videoList)):
                    if video_random < self.videoList[i][1]:
                        videoName = self.videoList[i - 1][0]
                        break
                if videoName == "":
                    videoName = self.videoList[-1][0]
                else:
                    break

            # get video -----------------------------
            busyList = self.get_busyTrace()
            bandwidth_fileName, rtt = self.getBandwidthFile()
            reqBI = self.client.init(videoName, bandwidth_fileName, rtt, self.bwType)
            # mask---------------------------
            mask = [1] * A_DIM
            randmCachedBICount = random.randint(1, 5)
            BI = [0,1,2,3,4]
            randomCachedBI = random.sample(BI, randmCachedBICount)
            for bIndex in range(5):
                if bIndex not in randomCachedBI:
                    mask[bIndex] = 0
            # mask---------------------------
            segNum = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.

            busy = busyList[segNum%len(busyList)]
            state_ = np.zeros(S_LEN)
            state = state_.copy() #state =[reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, busy, mask]

            reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum, busy = [0] * 10
            # start one epoch **********************************
            while True:
                if sum(mask) == 1:
                    a = mask.index(1)
                    break
                # lnet.choose_action ****************************
                a, logits = self.lnet.choose_action(mask, v_wrap(state[None, :]))
                # lnet.choose_action ****************************
                # print --------------------------------------------
                if platform.system() == "Linux":
                    if random.randint(0,1000) == 1:
                        print("reqb=", reqBitrate, "lb=", lastBitrate, "buffer=", int(buffer), "hT=", int(hThroughput), "mT=", int(mThroughput), "busy=", round(busy, 2),
                              "mask=",mask, "action=", a, "reqBI=", reqBI, "reward=",round(reward,2), "logits=", logits)
                else:
                    print("reqb=", reqBitrate, "lb=", round(lastBitrate,2), "buffer=", int(buffer), "hT=", int(hThroughput), "mT=", int(mThroughput), "busy=", round(busy, 2),
                      "mask=",mask, "action=", a, "reqBI=", reqBI, "reward=",round(reward,2), "logits=", logits)
                # print --------------------------------------------
                busy = busyList[segNum % len(busyList)]
                # client.run ****************************
                if a == 5:
                    hitFlag = False
                else:
                    hitFlag = True
                reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum = self.client.run(a, busy, hitFlag)
                # client.run ****************************
                state_[0] = reqBitrate / BITRATES[-1]
                state_[1] = lastBitrate / BITRATES[-1]
                state_[2] = (buffer/1000 - 30) / 10
                state_[3] = (hThroughput - throughput_mean) / throughput_std
                state_[4] = (mThroughput - throughput_mean) / throughput_std
                print(state)
                # state_[5] = (busy - busy_mean) / busy_std
                reward = reward / 5

                ep_r += reward
                buffer_a.append(a)
                buffer_s.append(state)
                buffer_r.append(reward)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    # sync
                    push_and_pull(self.opt, self.lnet, self.gnet, done, state_, buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    if done:  # done and print information
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                        break
                state = state_.copy()
                total_step += 1

        self.res_queue.put(None)
        print("end run")
Example no. 14
File: pt.py Project: lfbmpaper/lfbm
        c_loss = td.pow(2)  # value_loss = c_loss
        probs = F.softmax(logits, dim=1)
        m = self.distribution(probs)
        exp_v = m.log_prob(a) * td.detach().squeeze()
        a_loss = -exp_v  # policy_loss = a_loss
        # entropy regularization ---
        log_probs = F.log_softmax(logits, dim=1)
        entropy = -(log_probs * probs).sum(1)
        a_loss -= 0.5 * entropy
        # entropy regularization ---
        total_loss = (c_loss + a_loss).mean()
        return total_loss
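The fragment above starts mid-way through a loss function: logits, values, and td come from the network's forward pass and the bootstrapped value targets. For context, this is how the enclosing method typically looks in this family of A3C nets; the 0.5 entropy coefficient matches the snippet, everything else is the standard layout (self.distribution is assumed to be torch.distributions.Categorical), not code taken from pt.py itself:

import torch.nn.functional as F

def loss_func(self, s, a, v_t):
    self.train()
    logits, values = self.forward(s)   # actor logits and critic value V(s)
    td = v_t - values                  # bootstrapped target minus value estimate
    c_loss = td.pow(2)                 # value_loss = c_loss

    probs = F.softmax(logits, dim=1)
    m = self.distribution(probs)
    exp_v = m.log_prob(a) * td.detach().squeeze()
    a_loss = -exp_v                    # policy_loss = a_loss

    # entropy regularization ---
    log_probs = F.log_softmax(logits, dim=1)
    entropy = -(log_probs * probs).sum(1)
    a_loss -= 0.5 * entropy
    # entropy regularization ---

    total_loss = (c_loss + a_loss).mean()
    return total_loss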


if __name__ == '__main__':

    # An instance of your model.
    lnet = Net()
    lnet.load_state_dict(
        torch.load('../../data/RL_model/2019-06-28_10-18-56/model/233293.pkl'))

    # An example input you would normally provide to your model's forward() method.

    data_list = [0.33333333, 0.33333333, -0.91149055, 0.00101571, -0.19378804]

    data = np.array(data_list)
    print(data_list)
    # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
    traced_script_module = torch.jit.trace(lnet, v_wrap(data[None, :]))
    traced_script_module.save(
        "../../data/RL_model/2019-06-28_10-18-56/model.pt")
        
timee = []
enet = Net(16,5)
tnet = Net(16,5)
#net.explore = False
job = job_shop_env("la11")
s = job.state

for i in range(50):
    ot = 1222  # reference (optimal) makespan for the "la11" instance
    j = 0
    while True:
        j += 1
        # Roll out from the current state to estimate the resulting makespan.
        job, pre = job.expand(enet)
        a = v_wrap(enet.choose_action(v_wrap(s)))
        s_, r, done = job.step(a.numpy())
        # Shape the reward by how close the rollout makespan is to `ot`.
        if abs(pre - ot) == 0:
            r = 100 + r
        else:
            r = 100 / abs(pre - ot) + r

        # Note: the optimizer is re-created on every step, so Adam's moment
        # estimates are reset each time; kept as in the original snippet.
        opt = SharedAdam(enet.parameters(), lr=0.000001)
        _, q_t = tnet.forward(v_wrap(s))
        loss = enet.loss_func(v_wrap(s), a, v_wrap(r) + 0.1 * q_t)
        if j % 2 == 0:
            # Periodically sync the target net with the online net.
            tnet.load_state_dict(enet.state_dict())
        opt.zero_grad()
        loss.backward()
        opt.step()
        s = s_
        if done:
            # print(self.total_time)
            break

net = Net(16,5)
#net.explore = False
job = job_shop_env(part=0)
s = job.state
for i in range(50):
    ot = optt(job)  # reference (optimal) makespan for this instance
    while True:
        job, pre = job.expand(net)
        a = v_wrap(net.choose_action(v_wrap(s)))
        s_, r, done = job.step(a.numpy())
        if abs(pre - ot) == 0:
            r = 10 + r
        else:
            r = 10 / abs(pre - ot) + r

        opt = SharedAdam(net.parameters(), lr=0.00001)
        loss = net.loss_func(v_wrap(s), a, v_wrap(r))
        opt.zero_grad()
        loss.backward()
        opt.step()
        s = s_
        if done:
            print(job.total_time)
            break  # without this the inner loop never ends once the episode is done
Example no. 17
    def run(self):
        ptitle('Training Agent: {}'.format(self.rank))
        config = self.config
        check_point_episodes = config["check_point_episodes"]
        check_point_folder = os.path.join(config["check_point_folder"],
                                          config["env"])
        setup_worker_logging(self.log_queue)

        self.env = create_env(config["env"], self.seed)
        observation_space = self.env.observation_space
        action_space = IdToAct(self.env.action_space)
        with open(os.path.join("data", f"{config['env']}_action_space.npz"),
                  'rb') as f:
            archive = np.load(f)
            action_space.init_converter(all_actions=archive[archive.files[0]])

        self.action_space = action_space
        all_actions = np.array(action_space.all_actions)

        self.local_net = Net(self.state_size, self.action_mappings,
                             self.action_line_mappings)  # local network
        self.local_net = cuda(self.gpu_id, self.local_net)

        total_step = 1
        l_ep = 0
        while self.g_ep.value < self.num_episodes:
            self.print(
                f"{self.env.name} - {self.env.chronics_handler.get_name()}")
            if isinstance(self.env, MultiMixEnvironment):
                obs = self.env.reset(random=True)
            else:
                obs = self.env.reset()

            maintenance_list = obs.time_next_maintenance + obs.duration_next_maintenance

            s = self.convert_obs(observation_space, obs)
            s = v_wrap(s[None, :])
            s = cuda(self.gpu_id, s)

            buffer_s, buffer_a, buffer_r = [], [], []
            ep_r = 0.
            ep_step = 0
            ep_agent_num_dmd = 0
            ep_agent_num_acts = 0
            while True:
                rho = obs.rho.copy()
                rho[rho == 0.0] = 1.0
                lines_overload = rho > config["danger_threshold"]

                expert_act = expert_rules(self.name, maintenance_list, ep_step,
                                          action_space, obs)

                if expert_act is not None:
                    a = np.where(all_actions == expert_act)[0][0]
                    choosen_actions = np.array([a])
                    #print(f"Expert act: {a}")
                elif not np.any(lines_overload):
                    choosen_actions = np.array([0])
                else:
                    lines_overload = cuda(
                        self.gpu_id,
                        torch.tensor(lines_overload.astype(int)).float())
                    attention = torch.matmul(lines_overload.reshape(1, -1),
                                             self.action_line_mappings)
                    attention[attention > 1] = 1
                    choosen_actions = self.local_net.choose_action(
                        s, attention, self.g_num_candidate_acts.value)
                    ep_agent_num_dmd += 1

                obs_previous = obs
                a, obs_forecasted, obs_do_nothing = forecast_actions(
                    choosen_actions,
                    self.action_space,
                    obs,
                    min_threshold=0.95)

                logging.info(f"{self.name}_act|||{a}")
                act = self.action_space.convert_act(a)

                obs, r, done, info = self.env.step(act)

                r = lreward(a,
                            self.env,
                            obs_previous,
                            obs_do_nothing,
                            obs_forecasted,
                            obs,
                            done,
                            info,
                            threshold_safe=0.85)

                if a > 0:
                    if r > 0:
                        print("+", end="")
                    elif r < 0:
                        print("-", end="")
                    elif len(choosen_actions) > 0:
                        print("*", end="")
                    else:
                        print("x", end="")
                else:
                    if len(choosen_actions) > 0:
                        print("o", end="")
                    else:
                        print("0", end="")

                if r > 0:
                    ep_agent_num_acts += 1

                s_ = self.convert_obs(observation_space, obs)
                s_ = v_wrap(s_[None, :])
                s_ = cuda(self.gpu_id, s_)

                ep_r += r
                buffer_a.append(a)
                buffer_s.append(s)
                buffer_r.append(r)

                if total_step % self.update_global_iter == 0 or done:  # update global and assign to local net
                    # sync

                    # if len(buffer_r) > 0 and np.mean(np.abs(buffer_r)) > 0:
                    buffer_a = cuda(self.gpu_id,
                                    torch.tensor(buffer_a, dtype=torch.long))
                    buffer_s = cuda(self.gpu_id, torch.cat(buffer_s))
                    push_and_pull(self.opt, self.local_net,
                                  check_point_episodes, check_point_folder,
                                  self.g_ep, l_ep, self.name, self.rank,
                                  self.global_net, done, s_, buffer_s,
                                  buffer_a, buffer_r, self.gamma, self.gpu_id)

                    buffer_s, buffer_a, buffer_r = [], [], []

                    if done:  # done and print information
                        print("")
                        record(config["starting_num_candidate_acts"],
                               config["num_candidate_acts_decay_iter"],
                               self.g_ep, self.g_step,
                               self.g_num_candidate_acts, self.g_ep_r, ep_r,
                               self.res_queue, self.name, ep_step,
                               ep_agent_num_dmd, ep_agent_num_acts)
                        break
                s = s_
                total_step += 1
                ep_step += 1
            l_ep += 1
        self.res_queue.put(None)
Example no. 18
    opt = SharedAdam(gnet.parameters(), lr=5e-3)  # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # parallel training
    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i) for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    res = []                    # record episode reward to plot
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
        else:
            break
    [w.join() for w in workers]

    import matplotlib.pyplot as plt
    plt.plot(res)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.show()


    for _ in range(3):
        s = env.reset()
        while True:
            env.render()
            a = gnet.choose_action(v_wrap(s[None, :]))
            s_, r, done, _ = env.step(a)
            s = s_
            if done:
                break
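The main script above builds a SharedAdam optimizer over the global net's parameters. For the cross-process updates in push_and_pull to work, the optimizer's state tensors must live in shared memory; a minimal sketch of such an Adam subclass, following the usual pattern these examples rely on (with newer PyTorch versions the 'step' entry may need to be a zero tensor):

import torch

class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Pre-allocate the Adam state and move it into shared memory so every
        # worker process updates the same moment estimates.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()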
Example no. 19
    def run(self):
        self.getBandwidthFileList()
        # print("bandwidth file count=", len(self.bandwidth_fileList))
        r_avg_sum = 0
        resDir = self.modelDir + "/test_res_" + str(self.traceIndex)
        if not os.path.exists(resDir):
            os.makedirs(resDir)
        resFile = open(resDir + "/" + self.policy + "_" + self.bwType + ".txt", 'w')
        for bandwidth_file in self.bandwidth_fileList:
            bandwidth_fileName = bandwidth_file[0]
            rtt = bandwidth_file[1]
            bwType = bandwidth_file[2]
            busyList = self.get_busyTrace()
            # get video -----------------------------
            videoName = ""
            if PUREFLAF:
                video_random = random.random()
                for i in range(len(self.videoList)):
                    if video_random < self.videoList[i][1]:
                        videoName = self.videoList[i - 1][0]
                        break
                if videoName == "":
                    videoName = self.videoList[-1][0]
            else:
                if self.videoTraceIndex == len(self.videoTraceList):
                    self.videoTraceIndex = 0
                videoName = self.videoTraceList[self.videoTraceIndex]

            # print(videoName)
            # get video ----------------------------
            reqBI = self.client.init(videoName=videoName, bandwidthFileName=bandwidth_fileName, rtt=rtt, bwType=bwType)
            # mask---------------------------
            mask = [1] * A_DIM
            for bIndex in range(5):
                if PUREFLAF:
                    mask[bIndex] = 0
                else:
                    if videoName + "_" + str(bIndex + 1) not in self.cachedList:
                        mask[bIndex] = 0.
            # mask---------------------------
            state_ = np.zeros(S_LEN)
            state = state_.copy()  # state = [lastBitrate, buffer, hThroughput, mThroughput, busy, mask]
            # start testing ------------------------------
            total_step = 0
            segNum = 0
            r_sum = 0
            reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum, busy = [0] * 10
            while True:
                if sum(mask) == 1:
                    a = mask.index(1)
                else:
                    if self.policy == "no_policy":
                        a = 5
                    elif self.policy == "RL":
                        a, logits = self.lnet.choose_action(mask, v_wrap(state[None, :]))
                        self.saveLogits(videoName, logits)
                    elif self.policy == "lower":
                        a = self.choose_action_lower(mask, reqBI)
                    elif self.policy == "closest":
                        a = self.choose_action_closest(mask, reqBI)
                    elif self.policy == "highest":
                        a = self.choose_action_highest(mask, reqBI)
                    elif self.policy == "prefetch":
                        a = self.choose_action_prefetch(mask, reqBI)
                    else:
                        print("想啥呢")
                        return

                # if random.randint(0, 1000) == 1:
                #     print("reqb=", reqBitrate, "lb=", lastBitrate, "buffer=", int(buffer), "hT=", int(hThroughput),
                #           "mT=", int(mThroughput), "busy=", round(busy, 2),
                #           "mask=", mask, "action=", a, "reqBI=", reqBI, "reward=", round(reward, 2), "logits=", logits)

                busy = busyList[segNum % len(busyList)]
                if a == 5:
                    hitFlag = False
                else:
                    hitFlag = True

                reqBitrate, lastBitrate, buffer, hThroughput, mThroughput, reward, reqBI, done, segNum = self.client.run(a, busy, hitFlag)

                state_[0] = reqBitrate / BITRATES[-1]
                state_[1] = lastBitrate / BITRATES[-1]
                state_[2] = (buffer/1000 - 30) / 10
                state_[3] = (hThroughput - throughput_mean) / throughput_std
                state_[4] = (mThroughput - throughput_mean) / throughput_std
                reward = reward / 5

                r_sum += reward
                total_step += 1
                if done:
                    break
                state = state_.copy()
            # end testing ------------------------------
            r_avg = r_sum / total_step
            r_avg_sum += r_avg
            resFile.write(str(r_avg) + "\n")
            resFile.flush()
            print(self.bwType, self.policy, videoName, self.bandwidth_fileList.index(bandwidth_file),"/",len(self.bandwidth_fileList), r_avg)
            self.videoTraceIndex += 1
Example no. 20
def generate_saliency(observation,
                      game_step,
                      net,
                      game_tracker,
                      record_json_dir,
                      actual_action,
                      actual_probs,
                      opponent_action,
                      opponent_id,
                      new_val=True):
    """Generate saliency data for a given observation

    observation     -- the observation to be used
    game_step       -- the current timestep of the game, needed because of an off-by-one in the observation's counter
    net             -- the network to generate saliency from
    game_tracker    -- Needed for generate_NN_input
    record_json_dir -- output directory for saliency data files. Should be kept with the data from env.render
    actual_action   -- action decided from actor distribution by the unmodified observation
    actual_probs    -- action distribution produced by the unmodified observation
    opponent_action -- the action the opponent took at this time step
    opponent_id     -- the id of the opponent
    new_val         -- if false uses passage/wall modifications, otherwise uses previously unknown value modifications
    """

    data = {}
    data['step'] = game_step
    data[
        'actual_action'] = actual_action  #have to pass actual action in from outside the func because the agent selects an action from the prob distribution in the main code
    data['actual_probs'] = actual_probs
    # data['actual_terminal_prediction'] = actual_terminal_predicton #ignoring critic for now. TODO?

    #below gets populated by the algorithm
    data['mods'] = []
    data['actions'] = []
    data['opponent_action'] = opponent_action
    data['opponent_id'] = opponent_id
    # data['predictions'] = [] #again, ignoring critic so ignoring collection of critic value changes.

    board_size = len(observation['board'][0])  #assuming board is square
    for i in range(board_size):

        mod_list = []
        action_list = []
        # prediction_list = []

        for j in range(board_size):
            mod_observation = deepcopy(observation)

            #remove the player from the 'alive' channel as well just to truly remove the data.
            if mod_observation['board'][i][j] >= 10:
                mod_observation['alive'].remove(mod_observation['board'][i][j])

            #replace this tile's value with a new value.
            if new_val:
                mod_observation['board'][i][
                    j] = 14  #first number that doesn't have a definition already
            else:
                #if the tile here is a passage, make it a wall. Otherwise, make this tile a passage.
                #prevents no change from occurring in passages
                mod_observation['board'][i][
                    j] = 1 if mod_observation['board'][i][j] == 0 else 0

            #won't ever be a bomb in this tile. possible values are 0, 1, and 14
            mod_observation['bomb_blast_strength'][i][j] = 0
            mod_observation['bomb_life'][i][j] = 0

            this_state = generate_NN_input(10, mod_observation, game_step,
                                           game_tracker)
            m_this_state = v_wrap(this_state).unsqueeze(0)
            this_action, _, _, this_probs, this_terminal_prediction = net.choose_action(
                m_this_state)

            diffs = [
                this_probs[k] - actual_probs[k] for k in range(len(this_probs))
            ]

            mod_list.append(
                deepcopy(diffs))  # Have to deepcopy to avoid pointer hell.
            action_list.append(this_action)
            # prediction_list.append(this_terminal_prediction)

        data['mods'].append(deepcopy(mod_list))
        data['actions'].append(deepcopy(action_list))
        # data['predictions'].append(deepcopy(prediction_list))

    with open(f"{record_json_dir}/d{game_step:03d}.json", 'w') as f:
        #Uses MyEncoder to properly clean numpy data to types which json will serialize.
        #json.dumps won't serialize numpy data by default
        f.write(json.dumps(data, cls=MyEncoder))
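The function serializes its results with json.dumps(data, cls=MyEncoder); MyEncoder itself is not shown. A plausible sketch, assuming (as the comment says) it only exists to convert NumPy scalars and arrays into types the json module can serialize:

import json
import numpy as np

class MyEncoder(json.JSONEncoder):
    # Convert NumPy types to Python built-ins so json.dumps can handle them.
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(MyEncoder, self).default(obj)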
Example no. 21
env = gym.make(env_id)  #Pendulum  VibrationEnv


input_size = env.observation_space.shape[0]
output_size = env.action_space.shape[0]

env.reset()

s_, r, done, _ = env.step([1])

# s_ = torch.from_numpy(s_.reshape([batch, input_size,seq_len])).float()
# s_ = s_.reshape([1, input_size,-1]).float()
# s_ = s_.float()
print(s_)

sys = TCN(input_size, output_size, num_channels = [25, 25, 30], kernel_size = 8, dropout = 0)

# lstm_input = torch.randn(1, input_size, 1)
# mu, sigma, values  = sys(lstm_input)
# print(lstm_input)
# print(mu, sigma, values)
s_ = s_.reshape([batch, input_size, seq_len])
v_wrap(s_)

mu, sigma, values  = sys(v_wrap(s_))
print(mu, sigma, values)


# a = sys.choose_action(v_wrap(s_[None, :]))
a = sys.choose_action(v_wrap(s_))
print(a)
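sys.choose_action is not shown here; for a continuous-control net like this one it typically samples an action from a Normal distribution parameterized by the mu and sigma heads printed above. A minimal sketch of that sampling step (the helper name and shapes are illustrative, not taken from the TCN class):

import torch
from torch.distributions import Normal

def sample_continuous_action(mu, sigma):
    # mu, sigma: tensors produced by the network's forward pass.
    m = Normal(mu.view(-1).data, sigma.view(-1).data)
    return m.sample().numpy()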