def experiment(variant):
    from matrix_game import MatrixGame

    # NOTE: `copy`, `ptu`, and the rlkit-style components used below
    # (SoftmaxMlpPolicy, FlattenMlp, ArgmaxDiscretePolicy, EpsilonGreedy,
    # PolicyWrappedWithExplorationStrategy, MAMdpPathCollector,
    # MAEnvReplayBuffer, PRGDiscreteTrainer, TorchBatchRLAlgorithm,
    # get_generic_ma_path_information), as well as the parsed `args`,
    # are expected to be imported/defined at module level.
    expl_env = MatrixGame(game_name=args.exp_name)
    eval_env = MatrixGame(game_name=args.exp_name)
    num_agent = eval_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, target_policy_n = [], []
    qf1_n, target_qf1_n, qf2_n, target_qf2_n = [], [], [], []
    eval_policy_n, expl_policy_n = [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        target_policy = copy.deepcopy(policy)
        # Each centralized critic sees all agents' observations plus the
        # other agents' actions, and outputs one value per own action.
        qf_input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
        qf1 = FlattenMlp(input_size=qf_input_size,
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=qf_input_size,
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        target_policy_n.append(target_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env, num_agent=num_agent)
    trainer = PRGDiscreteTrainer(env=expl_env,
                                 qf1_n=qf1_n, target_qf1_n=target_qf1_n,
                                 qf2_n=qf2_n, target_qf2_n=target_qf2_n,
                                 policy_n=policy_n,
                                 target_policy_n=target_policy_n,
                                 **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
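# A minimal sketch of how experiment() might be launched. The variant keys
# mirror the lookups inside experiment(); every hyperparameter value below
# is an illustrative assumption, not a setting from this project, and
# `args` must already hold the parsed command-line arguments (including
# exp_name) before experiment() runs.
if __name__ == '__main__':
    variant = dict(
        policy_kwargs=dict(hidden_sizes=[64, 64]),   # assumed MLP widths
        qf_kwargs=dict(hidden_sizes=[64, 64]),       # assumed MLP widths
        trainer_kwargs=dict(),                       # trainer defaults
        algorithm_kwargs=dict(num_epochs=100,        # assumed epoch budget
                              batch_size=256),       # assumed batch size
        replay_buffer_size=int(1e6),                 # assumed capacity
    )
    experiment(variant)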
def run_for_all_mode(self, bw, un):
    nb_episode = 2000
    actions = np.arange(8)
    user_num = un

    # Arrival rate per user; cycles required per bit are in the
    # 70-800 cycles/bit range. (translated from the original Chinese comment)
    lambda_pattern = [0.001, 0.01, 0.1, 0.001, 0.01]
    lambda_n = np.array([lambda_pattern[i % 5] for i in range(user_num)])
    # Each action: [offload flag, CPU frequency (Hz), transmit power].
    actions_set = [[0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4]]
    GPD1_array = [4 * pow(10, 6) for _ in range(user_num)]
    GPD2_array = [0.3 for _ in range(user_num)]

    # Init WoLF agents.
    wolf_agent_array = [
        WoLFAgent(alpha=0.1, actions=actions,
                  high_delta=0.004, low_delta=0.002)
        for _ in range(user_num)
    ]
    # Init relay queues.
    queue_relay_array = [
        QueueRelay(lambda_n[i], GPD1_array[i], GPD2_array[i])
        for i in range(user_num)
    ]

    reward_history = []
    cost_local_history = []
    Q_array_history = [[10] for _ in range(user_num)]

    for episode in range(nb_episode):
        print('episode for all :', episode)
        Q_array = []
        Qx_array = []
        Qy_array = []
        Qz_array = []
        M1_array = []
        M2_array = []
        for i in range(user_num):
            Q_array.append(queue_relay_array[i].Q)
            Qx_array.append(queue_relay_array[i].Qx)
            Qy_array.append(queue_relay_array[i].Qy)
            Qz_array.append(queue_relay_array[i].Qz)
            M1_array.append(queue_relay_array[i].M1)
            M2_array.append(queue_relay_array[i].M2)
        for i in range(user_num):
            Q_array_history[i].append(Q_array[i])

        # Every 50 episodes, refit the GPD tail parameters from the
        # queue-length history and refresh M1/M2.
        if episode % 50 == 0 and episode != 0:
            for i in range(user_num):
                data = Q_array_history[i]
                res = self.gpdaa.gpd(data, 3.96 * pow(10, 7))
                if res:
                    queue_relay_array[i].GPD1 = res[0][0]
                    queue_relay_array[i].GPD2 = res[0][1]
                    queue_relay_array[i].updateM1()
                    queue_relay_array[i].updateM2()

        # Each agent picks an action, then all actions are played in one
        # matrix-game step.
        iteration_actions = [wolf_agent_array[i].act()
                             for i in range(user_num)]
        game = MatrixGame(actions=iteration_actions, Q=Q_array,
                          Qx=Qx_array, Qy=Qy_array, Qz=Qz_array,
                          M1=M1_array, M2=M2_array, BW=bw)
        reward, cost_local, bn, lumbda, rff = game.step(
            actions=iteration_actions)

        # Update each relay queue with the realized service and arrivals.
        for i in range(user_num):
            queue_relay_array[i].lumbda = lumbda[i]
            queue_relay_array[i].updateQ(
                bn[i], actions_set[iteration_actions[i]][0], rff[i])
            queue_relay_array[i].updateQx()
            queue_relay_array[i].updateQy()
            queue_relay_array[i].updateQz()

        reward_history.append(sum(reward))
        cost_local_history.append(sum(cost_local))
        for i in range(user_num):
            wolf_agent_array[i].observe(reward=reward[i])

    plt.plot(np.arange(len(reward_history)), reward_history,
             label="sum of rewards")
    plt.title('all mode')
    plt.show()
    print('reward_history[-1]:', reward_history[-1])
    return cost_local_history[-1]
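# A minimal usage sketch for run_for_all_mode. The enclosing class name
# `Runner` is an assumption; from the `self.gpdaa.gpd(...)` call above it
# must expose a GPD-fitter attribute named `gpdaa`. The 10 MHz bandwidth
# and 10-user count are illustrative values.
if __name__ == '__main__':
    runner = Runner()  # hypothetical class with run_for_all_mode and .gpdaa
    final_local_cost = runner.run_for_all_mode(bw=10 * pow(10, 6), un=10)
    print('final local cost:', final_local_cost)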
# Fragment: an excerpt of the per-episode loop from a variant of
# wolf_cal_reward that also records the fitted GPD tail probability per
# user (PR is assumed to be initialized as one list per user, e.g.
# PR = [[] for _ in range(user_num)]).

                # ... inside the periodic GPD refit: for each user i,
                # res is the fit of the queue history; on success:
                probability = res[1]
                PR[i].append(probability)
                print(res)
                queue_relay_array[i].GPD1 = res[0][0]
                queue_relay_array[i].GPD2 = res[0][1]
                queue_relay_array[i].updateM1()
                queue_relay_array[i].updateM2()

        # Each agent picks an action, then all actions are played in one
        # matrix-game step.
        iteration_actions = []
        for i in range(user_num):
            iteration_actions.append(wolf_agent_array[i].act())
        game = MatrixGame(actions=iteration_actions, Q=Q_array,
                          Qx=Qx_array, Qy=Qy_array, Qz=Qz_array,
                          M1=M1_array, M2=M2_array, BW=10 * pow(10, 6))
        reward, _, bn, lumbda, rff = game.step(actions=iteration_actions)
        print("episode", episode, "reward", sum(reward))
        OUTPUT.append(sum(reward))

        # Update each relay queue with the realized service and arrivals.
        for i in range(user_num):
            queue_relay_array[i].lumbda = lumbda[i]
            queue_relay_array[i].updateQ(
                bn[i], actions_set[iteration_actions[i]][0], rff[i])
import numpy as np
import matplotlib.pyplot as plt

from policy import EpsGreedyQPolicy
from matrix_game import MatrixGame
# MiniMaxQLearner is assumed to be importable from this project's agent
# module (its import statement was missing from the original snippet).

if __name__ == '__main__':
    nb_episode = 100

    # Configure the two agents. (translated from the original Japanese comment)
    agent1 = MiniMaxQLearner(aid=0, alpha=0.1,
                             policy=EpsGreedyQPolicy(),
                             actions=np.arange(2))
    agent2 = MiniMaxQLearner(aid=1, alpha=0.1,
                             policy=EpsGreedyQPolicy(),
                             actions=np.arange(2))
    game = MatrixGame()

    for episode in range(nb_episode):
        action1 = agent1.act()
        action2 = agent2.act()
        _, r1, r2 = game.step(action1, action2)
        # Each agent observes its own reward and the opponent's last action.
        agent1.observe(reward=r1, opponent_action=agent2.previous_action)
        agent2.observe(reward=r2, opponent_action=agent1.previous_action)
        print(agent1.pi)
        print(agent2.pi)

    plt.plot(np.arange(len(agent1.pi_history)),
             agent1.pi_history, label="agent1's pi(0)")
    plt.plot(np.arange(len(agent2.pi_history)),
             agent2.pi_history, label="agent2's pi(0)")
    plt.legend()
    plt.show()
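# For reference, a minimal sketch of the policy computation at the heart of
# minimax-Q (Littman, 1994) that an agent like MiniMaxQLearner typically
# performs for a single-state matrix game: solve
# max_pi min_o sum_a pi[a] * Q[a, o] as a linear program. This is an
# illustrative reimplementation under those assumptions, not this
# project's code.
import numpy as np
from scipy.optimize import linprog


def minimax_policy(Q):
    """Return (pi, v) maximizing the worst-case expected payoff of Q."""
    n_actions, n_opponent = Q.shape
    # Variables: [pi_0, ..., pi_{n-1}, v]; maximize v <=> minimize -v.
    c = np.zeros(n_actions + 1)
    c[-1] = -1.0
    # For every opponent action o: v - sum_a pi[a] * Q[a, o] <= 0.
    A_ub = np.hstack([-Q.T, np.ones((n_opponent, 1))])
    b_ub = np.zeros(n_opponent)
    # Probabilities sum to one; v is unbounded.
    A_eq = np.hstack([np.ones((1, n_actions)), np.zeros((1, 1))])
    b_eq = np.ones(1)
    bounds = [(0, None)] * n_actions + [(None, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq,
                  bounds=bounds)
    return res.x[:n_actions], res.x[-1]

# e.g. for matching pennies, minimax_policy(np.array([[1., -1.], [-1., 1.]]))
# returns pi ~ [0.5, 0.5] with game value v ~ 0.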
def wolf_cal_reward(self, DL, DH):
    nb_episode = 3000
    actions = np.arange(8)
    user_num = 10
    OUTPUT = []
    gpdtemp = GPD()

    # Arrival rate per user; cycles required per bit are in the
    # 70-800 cycles/bit range. (translated from the original Chinese comment)
    lambda_pattern = [0.001, 0.01, 0.1, 0.001, 0.01]
    lambda_n = np.array([lambda_pattern[i % 5] for i in range(user_num)])
    # Each action: [offload flag, CPU frequency (Hz), transmit power].
    actions_set = [[0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [0, 5 * pow(10, 6), 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4],
                   [1, 0, 0.4]]
    GPD1_array = [4 * pow(10, 6) for _ in range(user_num)]
    GPD2_array = [0.3 for _ in range(user_num)]

    # Init WoLF agents with the supplied losing/winning step sizes.
    wolf_agent_array = [
        WoLFAgent(alpha=0.1, actions=actions, high_delta=DH, low_delta=DL)
        for _ in range(user_num)
    ]
    # Init relay queues.
    queue_relay_array = [
        QueueRelay(lambda_n[i], GPD1_array[i], GPD2_array[i])
        for i in range(user_num)
    ]

    reward_history = []
    Q_array_history = [[10] for _ in range(user_num)]

    for episode in range(nb_episode):
        Q_array = []
        Qx_array = []
        Qy_array = []
        Qz_array = []
        M1_array = []
        M2_array = []
        for i in range(user_num):
            Q_array.append(queue_relay_array[i].Q)
            Qx_array.append(queue_relay_array[i].Qx)
            Qy_array.append(queue_relay_array[i].Qy)
            Qz_array.append(queue_relay_array[i].Qz)
            M1_array.append(queue_relay_array[i].M1)
            M2_array.append(queue_relay_array[i].M2)
        for i in range(user_num):
            Q_array_history[i].append(Q_array[i])

        # Every 50 episodes, refit the GPD tail parameters from the
        # queue-length history and refresh M1/M2. The fitted tail
        # probability res[1] is not used in this variant.
        if episode % 50 == 0 and episode != 0:
            for i in range(user_num):
                data = Q_array_history[i]
                res = gpdtemp.gpd(data, 3.96 * pow(10, 6))
                if res and res[0]:
                    print(res)
                    queue_relay_array[i].GPD1 = res[0][0]
                    queue_relay_array[i].GPD2 = res[0][1]
                    queue_relay_array[i].updateM1()
                    queue_relay_array[i].updateM2()

        # Each agent picks an action, then all actions are played in one
        # matrix-game step.
        iteration_actions = [wolf_agent_array[i].act()
                             for i in range(user_num)]
        game = MatrixGame(actions=iteration_actions, Q=Q_array,
                          Qx=Qx_array, Qy=Qy_array, Qz=Qz_array,
                          M1=M1_array, M2=M2_array, BW=10 * pow(10, 6))
        reward, bn, lumbda, rff = game.step(actions=iteration_actions)
        print("episode", episode, "reward", sum(reward))
        OUTPUT.append(sum(reward))

        # Update each relay queue with the realized service and arrivals.
        for i in range(user_num):
            queue_relay_array[i].lumbda = lumbda[i]
            queue_relay_array[i].updateQ(
                bn[i], actions_set[iteration_actions[i]][0], rff[i])
            queue_relay_array[i].updateQx()
            queue_relay_array[i].updateQy()
            queue_relay_array[i].updateQz()

        reward_history.append(sum(reward))
        for i in range(user_num):
            wolf_agent_array[i].observe(reward=reward[i])

    for i in range(user_num):
        print('pi_average', wolf_agent_array[i].pi_average)
    plt.plot(np.arange(len(reward_history)), reward_history, label="all")
    plt.title('wolf_dl' + str(DL) + '-dh' + str(DH))
    plt.show()
    return np.mean(reward_history[-300:])
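# For reference, a simplified sketch of the WoLF-PHC rule (Bowling & Veloso,
# 2002) that the high_delta/low_delta (DH/DL) arguments above correspond to:
# the policy moves toward the greedy action with the small step low_delta
# when the agent is "winning" (its current policy outperforms its average
# policy, cf. the pi_average attribute printed above) and with the large
# step high_delta when it is "losing". This is an illustrative variant
# under those assumptions, not this project's WoLFAgent implementation.
import numpy as np


def wolf_phc_policy_step(Q, pi, pi_avg, high_delta, low_delta):
    """One WoLF-PHC adjustment of pi toward argmax_a Q[a] (len(Q) >= 2)."""
    winning = np.dot(pi, Q) > np.dot(pi_avg, Q)
    delta = low_delta if winning else high_delta
    n = len(Q)
    best = np.argmax(Q)
    # Shift probability mass toward the greedy action, then clip at zero
    # and renormalize (a simplification of the per-action caps in PHC).
    step = np.full(n, -delta / (n - 1))
    step[best] = delta
    new_pi = np.clip(pi + step, 0.0, None)
    return new_pi / new_pi.sum()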