Exemple #1
0
def experiment(variant):
    """Assemble the multi-agent PRG training setup on a matrix game and train.

    For every agent a softmax policy and two Q-networks are created, each
    paired with a deep copy that is handed to the trainer as its target
    network. Evaluation uses the argmax of the policy; exploration wraps that
    same argmax policy in an epsilon-greedy strategy.

    Args:
        variant: Configuration mapping. The keys read here are
            ``policy_kwargs``, ``qf_kwargs``, ``replay_buffer_size``,
            ``trainer_kwargs`` and ``algorithm_kwargs``.

    Note:
        The game name is read from ``args.exp_name``. ``args`` is not defined
        inside this function, so it has to exist at module level.
    """
    from matrix_game import MatrixGame

    expl_env = MatrixGame(game_name=args.exp_name)
    eval_env = MatrixGame(game_name=args.exp_name)

    num_agent = eval_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # Both critics of an agent share this input width.
    # NOTE(review): looks like "every agent's observation plus the other
    # agents' actions" -- confirm against PRGDiscreteTrainer.
    critic_input_size = obs_dim * num_agent + action_dim * (num_agent - 1)

    policy_n, target_policy_n = [], []
    qf1_n, target_qf1_n = [], []
    qf2_n, target_qf2_n = [], []
    eval_policy_n, expl_policy_n = [], []

    for _ in range(num_agent):
        # Actor and its target copy.
        actor = SoftmaxMlpPolicy(input_size=obs_dim,
                                 output_size=action_dim,
                                 **variant['policy_kwargs'])
        policy_n.append(actor)
        target_policy_n.append(copy.deepcopy(actor))

        # Two independently constructed critics, each with a target copy.
        critic_a = FlattenMlp(input_size=critic_input_size,
                              output_size=action_dim,
                              **variant['qf_kwargs'])
        qf1_n.append(critic_a)
        target_qf1_n.append(copy.deepcopy(critic_a))

        critic_b = FlattenMlp(input_size=critic_input_size,
                              output_size=action_dim,
                              **variant['qf_kwargs'])
        qf2_n.append(critic_b)
        target_qf2_n.append(copy.deepcopy(critic_b))

        # Greedy policy for evaluation, epsilon-greedy wrapper for exploration.
        greedy = ArgmaxDiscretePolicy(actor)
        eval_policy_n.append(greedy)
        expl_policy_n.append(
            PolicyWrappedWithExplorationStrategy(
                EpsilonGreedy(expl_env.action_space),
                greedy,
            ))

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    trainer = PRGDiscreteTrainer(env=expl_env,
                                 qf1_n=qf1_n,
                                 target_qf1_n=target_qf1_n,
                                 qf2_n=qf2_n,
                                 target_qf2_n=target_qf2_n,
                                 policy_n=policy_n,
                                 target_policy_n=target_policy_n,
                                 **variant['trainer_kwargs'])

    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Exemple #2
0
    def run_for_all_mode(self, bw, un):
        """Simulate ``un`` WoLF agents on the matrix game for 2000 episodes.

        Each episode snapshots every user's queue state, lets every agent pick
        an action, plays one ``MatrixGame`` step, feeds the results back into
        the queues and lets the agents observe their own reward. Every 50th
        episode (except episode 0) the GPD parameters of each queue are refit
        from that queue's ``Q`` history via ``self.gpdaa.gpd``.

        Args:
            bw: Value forwarded to ``MatrixGame`` as ``BW``.
            un: Number of users; one agent and one queue are created per user.

        Returns:
            The summed ``cost_local`` of the final episode.

        Side effects:
            Prints progress, shows a matplotlib plot of the summed reward per
            episode (``plt.show()``), and prints the last summed reward.
        """
        nb_episode = 2000
        actions = np.arange(8)
        user_num = un

        # First argument of every QueueRelay, cycling through five presets.
        # (Original note, translated: "cycles required per bit, 70~800
        # cycles/bit" -- it does not obviously describe these values; TODO
        # confirm.)
        lambda_cycle = (0.001, 0.01, 0.1, 0.001, 0.01)
        lambda_n = np.zeros(user_num)
        for idx in range(user_num):
            lambda_n[idx] = lambda_cycle[idx % 5]

        # Action index -> parameter triple; only element 0 is read below.
        actions_set = ([[0, 5 * pow(10, 6), 0.4] for _ in range(4)] +
                       [[1, 0, 0.4] for _ in range(4)])
        GPD1_array = [4 * pow(10, 6)] * user_num
        GPD2_array = [0.3] * user_num

        # One WoLF agent per user.
        wolf_agent_array = [
            WoLFAgent(alpha=0.1,
                      actions=actions,
                      high_delta=0.004,
                      low_delta=0.002) for _ in range(user_num)
        ]

        # One queue per user, seeded with the initial GPD parameters.
        queue_relay_array = [
            QueueRelay(lam, gpd1, gpd2)
            for lam, gpd1, gpd2 in zip(lambda_n, GPD1_array, GPD2_array)
        ]

        reward_history = []
        cost_local_history = []

        # Per-user history of Q, each starting with the value 10.
        Q_array_histroy = [[10] for _ in range(user_num)]

        for episode in range(nb_episode):
            print('episode for all :', episode)

            # Snapshot the current state of every queue.
            Q_array, Qx_array, Qy_array, Qz_array = [], [], [], []
            M1_array, M2_array = [], []
            for relay in queue_relay_array:
                Q_array.append(relay.Q)
                Qx_array.append(relay.Qx)
                Qy_array.append(relay.Qy)
                Qz_array.append(relay.Qz)
                M1_array.append(relay.M1)
                M2_array.append(relay.M2)

            for history, current_q in zip(Q_array_histroy, Q_array):
                history.append(current_q)

            # Periodic GPD refit. The snapshot lists above were taken before
            # this refit runs.
            if episode != 0 and episode % 50 == 0:
                for relay, history in zip(queue_relay_array, Q_array_histroy):
                    res = self.gpdaa.gpd(history, 3.96 * pow(10, 7))
                    if not res:
                        continue
                    relay.GPD1 = res[0][0]
                    relay.GPD2 = res[0][1]
                    relay.updateM1()
                    relay.updateM2()

            iteration_actions = [agent.act() for agent in wolf_agent_array]
            game = MatrixGame(actions=iteration_actions,
                              Q=Q_array,
                              Qx=Qx_array,
                              Qy=Qy_array,
                              Qz=Qz_array,
                              M1=M1_array,
                              M2=M2_array,
                              BW=bw)

            reward, cost_local, bn, lumbda, rff = game.step(
                actions=iteration_actions)

            # Feed the step results back into every queue.
            for idx, relay in enumerate(queue_relay_array):
                relay.lumbda = lumbda[idx]
                relay.updateQ(bn[idx],
                              actions_set[iteration_actions[idx]][0],
                              rff[idx])
                relay.updateQx()
                relay.updateQy()
                relay.updateQz()

            reward_history.append(sum(reward))
            cost_local_history.append(sum(cost_local))

            # Each agent learns from its own reward only.
            for idx, agent in enumerate(wolf_agent_array):
                agent.observe(reward=reward[idx])

        plt.plot(np.arange(len(reward_history)), reward_history, label="")
        plt.title('all mode ')
        plt.show()
        print('reward_history[-1]:', reward_history[-1])

        return cost_local_history[-1]
Exemple #3
0
from policy import EpsGreedyQPolicy
from matrix_game import MatrixGame

# Demo script: two MiniMaxQLearner agents repeatedly play a MatrixGame and
# the resulting policies are printed and plotted.
if __name__ == '__main__':
    # Number of rounds to play.
    nb_episode = 100

    # Both agents share the same settings and differ only in their id.
    agent1 = MiniMaxQLearner(aid=0,
                             alpha=0.1,
                             policy=EpsGreedyQPolicy(),
                             actions=np.arange(2))  # agent setup
    agent2 = MiniMaxQLearner(aid=1,
                             alpha=0.1,
                             policy=EpsGreedyQPolicy(),
                             actions=np.arange(2))  # agent setup

    game = MatrixGame()
    for episode in range(nb_episode):
        # Both agents choose before either sees the outcome.
        action1 = agent1.act()
        action2 = agent2.act()

        # First element of the step result is ignored here.
        _, r1, r2 = game.step(action1, action2)

        # Each agent is told its own reward and the other agent's
        # ``previous_action``.
        agent1.observe(reward=r1, opponent_action=agent2.previous_action)
        agent2.observe(reward=r2, opponent_action=agent1.previous_action)
    print(agent1.pi)
    print(agent2.pi)
    # ipdb.set_trace()  (debugger hook, left disabled)
    # Plot agent1's recorded ``pi_history`` against its index.
    plt.plot(np.arange(len(agent1.pi_history)),
             agent1.pi_history,
             label="agent1's pi(0)")
    plt.plot(np.arange(len(agent2.pi_history)),
                    probability = res[1]
                    PR[i].append(probability)
                    print(res)
                    queue_relay_array[i].GPD1 = res[0][0]
                    queue_relay_array[i].GPD2 = res[0][1]
                    queue_relay_array[i].updateM1()
                    queue_relay_array[i].updateM2()
        ##  TLIU

        iteration_actions = []
        for i in range(user_num):
            iteration_actions.append(wolf_agent_array[i].act())
        game = MatrixGame(actions=iteration_actions,
                          Q=Q_array,
                          Qx=Qx_array,
                          Qy=Qy_array,
                          Qz=Qz_array,
                          M1=M1_array,
                          M2=M2_array,
                          BW=10 * pow(10, 6))

        reward, _, bn, lumbda, rff = game.step(actions=iteration_actions)
        print("episode", episode, "reward", sum(reward))
        OUTPUT.append(sum(reward))

        for i in range(user_num):
            # wolf agent act
            # update_Queue_relay
            queue_relay_array[i].lumbda = lumbda[i]
            queue_relay_array[i].updateQ(bn[i],
                                         actions_set[iteration_actions[i]][0],
                                         rff[i])
Exemple #5
0
    def wolf_cal_reward(self, DL, DH):
        """Run a 10-user WoLF simulation and return the mean late reward.

        Ten WoLF agents play ``MatrixGame`` for 3000 episodes. Each episode
        snapshots every user's queue state, lets every agent act, plays one
        game step, feeds the results back into the queues and lets each agent
        observe its own reward. Every 50th episode (except episode 0) the GPD
        parameters of each queue are refit from that queue's ``Q`` history.

        Args:
            DL: Forwarded to every ``WoLFAgent`` as ``low_delta``.
            DH: Forwarded to every ``WoLFAgent`` as ``high_delta``.

        Returns:
            ``np.mean`` of the last 300 per-episode summed rewards.

        Side effects:
            Prints the summed reward of every episode, any accepted GPD fit,
            and each agent's ``pi_average``; shows a matplotlib plot
            (``plt.show()``).
        """
        nb_episode = 3000
        actions = np.arange(8)
        user_num = 10
        gpdtemp = GPD()

        # First argument of every QueueRelay, cycling through five presets.
        # (Original note, translated: "cycles required per bit, 70~800
        # cycles/bit" -- it does not obviously describe these values; TODO
        # confirm.)
        lambda_cycle = (0.001, 0.01, 0.1, 0.001, 0.01)
        lambda_n = np.zeros(user_num)
        for i in range(user_num):
            lambda_n[i] = lambda_cycle[i % 5]

        # Action index -> parameter triple; only element 0 is read below.
        actions_set = [[0, 5 * pow(10, 6), 0.4], [0, 5 * pow(10, 6), 0.4],
                       [0, 5 * pow(10, 6), 0.4], [0, 5 * pow(10, 6), 0.4],
                       [1, 0, 0.4], [1, 0, 0.4], [1, 0, 0.4], [1, 0, 0.4]]
        GPD1_array = [4 * pow(10, 6) for _ in range(user_num)]
        GPD2_array = [0.3 for _ in range(user_num)]

        # One WoLF agent per user.
        wolf_agent_array = [
            WoLFAgent(alpha=0.1,
                      actions=actions,
                      high_delta=DH,
                      low_delta=DL) for _ in range(user_num)
        ]

        # One queue per user, seeded with the initial GPD parameters.
        queue_relay_array = [
            QueueRelay(lambda_n[i], GPD1_array[i], GPD2_array[i])
            for i in range(user_num)
        ]

        reward_history = []

        # Per-user history of Q, each starting with the value 10.
        Q_array_histroy = [[10] for _ in range(user_num)]

        for episode in range(nb_episode):

            # Snapshot the current state of every queue.
            Q_array = []
            Qx_array = []
            Qy_array = []
            Qz_array = []
            M1_array = []
            M2_array = []
            for relay in queue_relay_array:
                Q_array.append(relay.Q)
                Qx_array.append(relay.Qx)
                Qy_array.append(relay.Qy)
                Qz_array.append(relay.Qz)
                M1_array.append(relay.M1)
                M2_array.append(relay.M2)

            for i in range(user_num):
                Q_array_histroy[i].append(Q_array[i])

            # Periodic GPD refit. The snapshot lists above were taken before
            # this refit runs.
            if episode % 50 == 0 and episode != 0:
                for i in range(user_num):
                    res = gpdtemp.gpd(Q_array_histroy[i], 3.96 * pow(10, 6))
                    # Only a non-empty result with a truthy first element is
                    # used; anything else leaves the queue's parameters as is.
                    if res and res[0]:
                        print(res)
                        queue_relay_array[i].GPD1 = res[0][0]
                        queue_relay_array[i].GPD2 = res[0][1]
                        queue_relay_array[i].updateM1()
                        queue_relay_array[i].updateM2()

            iteration_actions = [agent.act() for agent in wolf_agent_array]
            game = MatrixGame(actions=iteration_actions,
                              Q=Q_array,
                              Qx=Qx_array,
                              Qy=Qy_array,
                              Qz=Qz_array,
                              M1=M1_array,
                              M2=M2_array,
                              BW=10 * pow(10, 6))

            # This call used to unpack exactly four values, while other
            # callers of the same ``MatrixGame.step`` unpack five (an extra
            # cost term in second position). The starred target accepts
            # either shape and ignores whatever sits between the reward and
            # the last three values.
            reward, *_, bn, lumbda, rff = game.step(actions=iteration_actions)
            print("episode", episode, "reward", sum(reward))

            # Feed the step results back into every queue.
            for i in range(user_num):
                queue_relay_array[i].lumbda = lumbda[i]
                queue_relay_array[i].updateQ(
                    bn[i], actions_set[iteration_actions[i]][0], rff[i])
                queue_relay_array[i].updateQx()
                queue_relay_array[i].updateQy()
                queue_relay_array[i].updateQz()

            reward_history.append(sum(reward))

            # Each agent learns from its own reward only.
            for i in range(user_num):
                wolf_agent_array[i].observe(reward=reward[i])

        for i in range(user_num):
            print('pi_average', wolf_agent_array[i].pi_average)

        plt.plot(np.arange(len(reward_history)), reward_history, label="all")
        plt.title('wolf_dl' + str(DL) + '-dh' + str(DH))
        plt.show()

        return np.mean(reward_history[-300:])
    actions = np.arange(3)
    agent1 = WoLFAgent(alpha=0.1,
                       actions=actions,
                       high_delta=0.0004,
                       low_delta=0.0002)
    agent2 = WoLFAgent(alpha=0.1,
                       actions=actions,
                       high_delta=0.0004,
                       low_delta=0.0002)
    agent3 = WoLFAgent(alpha=0.1,
                       actions=actions,
                       high_delta=0.0004,
                       low_delta=0.0002)

    game = MatrixGame()
    for episode in range(nb_episode):
        actions = []
        action1 = agent1.act()
        action2 = agent2.act()
        action3 = agent3.act()
        actions.append(action1)
        actions.append(action2)
        actions.append(action3)
        _, reward = game.step(actions)

        agent1.observe(reward=reward[0])
        agent2.observe(reward=reward[1])
        agent3.observe(reward=reward[2])

    print(agent1.q_values)