Example #1
     reward = reward_step
     if stepIdx > 100:  # store transitions once stepIdx exceeds 100
         s, a, r = PG.store_transition(observation_step, action, reward)
     if stepIdx % 6 == 0 and stepIdx > 100:  # update the policy every 6 steps
         PG.learn()
 for k in range(len(observation)):
     ss = observation[k].copy()
     ss.extend(matrixOfChanAlloc.copy().reshape(
         1, nOfenb * nOfchannel).tolist()[0])  # request + channel occupancy
     # print(ss)
     observation_step = np.array(ss).reshape(
         nOfenb * nOfchannel + 4, 1).ravel()  # reshape to the network's input dimension
     print("observation_step: ", observation_step)
     if observation_step[1] > 0:  # valid request: RNTI greater than 0
         action = PG.choose_action1(observation_step,
                                    matrixOfChanAlloc, stepIdx)
         if action < 12:  # valid channel-allocation action
             action_list.append(observation_step[0])
             action_list.append(observation_step[1])
             action_list.append(action)
         else:
             action_list.append(0)
             action_list.append(0)
             action_list.append(0)
         reward = 0  # reward stays 0 until the episode ends
         if k == len(observation) - 1 or observation[k + 1][1] == 0:
             # end of the big step: build the composite action from action_list
             d = ()
             for b in range(len(action_list)):
                 d += (spaces.Discrete(int(action_list[b])), )
             action_ = spaces.Tuple(d)
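
For reference, here is a minimal, self-contained sketch of the Tuple-of-Discrete construction that closes Example #1, assuming the standard gym.spaces API. The action_list values are illustrative only, and zero entries are skipped because gym's Discrete requires a positive size.

from gym import spaces

action_list = [3, 7, 2]  # hypothetical entries, e.g. RNTI, request size, chosen channel
d = tuple(spaces.Discrete(int(v)) for v in action_list if int(v) > 0)
action_ = spaces.Tuple(d)
print(action_)  # e.g. Tuple(Discrete(3), Discrete(7), Discrete(2))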
Example #2
                    ss = []
                    for a in observation:
                        for b in a:
                            ss.append(b)

                    # request + channel occupancy
                    ss.extend(matrixOfChanAlloc.copy().reshape(
                        1, nOfenb * nOfchannel).tolist()[0])

                    # reshape to the dimensionality the network input expects
                    observation_step = np.array(ss).reshape(
                        nOfenb * nOfchannel + sizeperq * len(observation),
                        1).ravel()
                    print("observation_step: ", observation_step)
                    # a request is valid only if its RNTI is greater than 0
                    if observation_step[k * sizeperq + 1] > 0:
                        # choose an action for this request
                        action = PG.choose_action1(
                            observation_step, matrixOfChanAlloc, observation[k][0])

                        if action < nOfchannel:  # valid action
                            observation[k][4] = action  # update the state
                            # store the allocation decision in action_list
                            addaction(observation[k][0], observation[k][1],
                                      action, action_list)
                        else:
                            addaction(0, 0, 0, action_list)  # no-op action

                    reward = 0  # reward stays 0 while the episode has not ended
                    # once stepIdx exceeds 100, start learning: store the transition
                    if stepIdx > 100 and k < numue - 1:
                        s, a, r = PG.store_transition(
                            observation_step,
                            action + observation[k][0] * nOfchannel, reward)
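
For context, here is a minimal, self-contained sketch of how the network input is assembled in both examples: flatten the per-UE observations, append the flattened channel-allocation matrix, and reshape to a 1-D vector. The dimensions and observation rows below are illustrative assumptions, not values taken from the original code.

import numpy as np

nOfenb, nOfchannel, sizeperq = 2, 4, 5                  # assumed dimensions
observation = [[0, 101, 3, 0, -1], [1, 102, 1, 0, -1]]  # hypothetical per-UE rows
matrixOfChanAlloc = np.zeros((nOfenb, nOfchannel))      # current channel occupancy

ss = [b for a in observation for b in a]                # flatten the requests
ss.extend(matrixOfChanAlloc.reshape(1, nOfenb * nOfchannel).tolist()[0])  # + occupancy
observation_step = np.array(ss).reshape(
    nOfenb * nOfchannel + sizeperq * len(observation), 1).ravel()
print(observation_step.shape)                           # (18,)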