if __name__ == "__main__": n_nodes = 2 # number of nodes n_actions = 2 # number of actions M = 20 # state length E = 1000 # memory size F = 20 # target network update frequency B = 64 # mini-batch size gamma = 0.9 # discount factor alpha = 1 # fairness index max_iter = int(5e4) idx = 1 env = ENVIRONMENT(state_size=int(8 * M), tx_prob=0.2) agent = DQN(env.state_size, n_nodes, n_actions, memory_size=E, replace_target_iter=F, batch_size=B, gamma=gamma, epsilon=1, epsilon_min=0.005, epsilon_decay=0.995, alpha=alpha) main(env.tx_prob, M, E, F, B, gamma, alpha, idx, max_iter)
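
# The "fairness index" alpha above selects an alpha-fair objective. As a
# worked reference (this is the standard alpha-fairness utility, not code
# from this repo): f_alpha(x) = log(x) when alpha == 1, otherwise
# x**(1 - alpha) / (1 - alpha), so alpha = 0 maximizes raw throughput and
# large alpha pushes toward max-min fairness. A minimal sketch:
import math

def alpha_fairness(x, alpha):
    """Alpha-fair utility of a positive throughput value x."""
    if alpha == 1:
        return math.log(x)
    return x ** (1 - alpha) / (1 - alpha)

# e.g. alpha_fairness(0.5, 1) == log(0.5); with alpha = 50 (used in the
# scripts further below) small per-node throughputs dominate the objective.
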
def DLMA_RNN(D, D_, pb1, pt1, ps1, pb2, ps2, iteration=int(1e5)):
    aloha = ALOHA_AGENT(D=D, arrival_rate=pb1, trans_prob=pt1)
    aloha.initialize()
    DLMA = DQN(D=D_,
               arrival_rate=pb2,
               features=8,
               n_actions=2,
               n_nodes=2,
               state_length=4,
               memory_size=1000,
               replace_target_iter=20,
               batch_size=64,
               learning_rate=0.01,
               gamma=0.9,
               epsilon=1,
               epsilon_min=0.005,
               epsilon_decay=0.995,
               alpha=0)
    DLMA.initialize()
    env = ENVIRONMENT(aloha_channel=ps1,
                      agent_channel=ps2,
                      aloha=aloha,
                      agent=DLMA)

    channel_state = [0] * DLMA.features
    state = np.zeros((4, len(channel_state)))
    DLMA_RNN_reward = []

    begin = time.time()
    for i in tqdm(range(iteration)):
        state = np.vstack([state[1:], channel_state])
        aloha_reward, agent_reward, observation = env.step()
        env.aloha.update(observation)
        env.agent.update(observation, state)
        DLMA_RNN_reward.append(aloha_reward + agent_reward)

        next_channel_state = return_action(env.agent.action) + \
            return_observation(observation) + [agent_reward, agent_reward]
        experience = np.concatenate([
            channel_state,
            [env.agent.action, agent_reward, agent_reward],
            next_channel_state
        ])
        env.agent.add_experience(experience)

        if i > 100 and (i % 5 == 0):
            env.agent.learn()  # internally iterates default (prediction) model

        channel_state = next_channel_state

    DLMA_RNN_timely_throughput = np.mean(DLMA_RNN_reward)
    print('DLMA_RNN_timely_throughput:', DLMA_RNN_timely_throughput)
    end = time.time()
    print('time: ', (end - begin), 's')
    print('memory: %.4f MB' %
          (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024))
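
# DLMA_RNN above calls return_action and return_observation, which are
# defined elsewhere in the repo. The following is only a plausible sketch,
# inferred from DLMA.features == 8 (2 action bits + 4 observation bits +
# 2 reward entries); the exact encodings and the observation labels are
# assumptions, not the original implementations.
def return_action(action):
    # one-hot encode the binary action: 0 -> [1, 0] (wait), 1 -> [0, 1] (transmit)
    one_hot = [0, 0]
    one_hot[action] = 1
    return one_hot

def return_observation(observation):
    # one-hot encode the channel observation over four assumed outcomes,
    # e.g. 0 = idle, 1 = agent success, 2 = aloha success, 3 = collision
    one_hot = [0, 0, 0, 0]
    one_hot[observation] = 1
    return one_hot
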
    print('average agent reward: {}'.format(
        np.mean(agent_reward_list[-2000:])))
    print('average aloha reward: {}'.format(
        np.mean(aloha_reward_list[-2000:])))
    print('average total reward: {}'.format(
        np.mean(agent_reward_list[-2000:]) +
        np.mean(aloha_reward_list[-2000:])))
    print('Time elapsed:', time.time() - start)


if __name__ == "__main__":
    env = ENVIRONMENT(
        state_size=40,
        window_size=3,
    )
    dqn_agent = DQN(
        env.state_size,
        env.n_actions,
        env.n_nodes,
        memory_size=500,
        replace_target_iter=200,
        batch_size=32,
        learning_rate=0.01,
        gamma=0.9,
        epsilon=0.1,
        epsilon_min=0.005,
        epsilon_decay=0.995,
    )
    main(max_iter=500000)
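
# The epsilon / epsilon_min / epsilon_decay arguments passed to DQN
# throughout these scripts suggest a standard decayed epsilon-greedy
# policy. The agent's actual implementation is not shown; this is a
# minimal sketch of that pattern, with q_values as a stand-in:
import numpy as np

def epsilon_greedy(q_values, epsilon, epsilon_min, epsilon_decay):
    """Pick a random action with probability epsilon, else the greedy
    one, then decay epsilon toward its floor."""
    if np.random.rand() < epsilon:
        action = np.random.randint(len(q_values))  # explore
    else:
        action = int(np.argmax(q_values))          # exploit
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    return action, epsilon
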
    ### save training loss
    # dqn_agent.my_plot('len1e5_M20_W2_alpha50_g0.999_MM6_r10_1')


if __name__ == "__main__":
    RATIO = 10  # the packet length of WiFi
    NUM_ACTIONS = 11  # the number of actions 0-10
    env = ENVIRONMENT(features=NUM_ACTIONS + 4,
                      ratio=RATIO,
                      n_actions=NUM_ACTIONS,
                      init_wifi_window_size=2,
                      max_backoff=6,
                      penalty=0.5)
    dqn_agent = DQN(env.features,
                    env.ratio,
                    env.n_actions,
                    env.n_nodes,
                    history_len=20,
                    memory_size=1000,
                    replace_target_iter=20,
                    batch_size=32,
                    learning_rate=0.01,
                    gamma=0.999,
                    epsilon=1,
                    epsilon_min=0.005,
                    epsilon_decay=0.995,
                    alpha=50)
    main(dqn_agent.history_len, env.n_actions, RATIO, max_iter=100000)
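
# replace_target_iter=20 above implies the double-network DQN trick: a
# separate target network is kept frozen and synchronized from the
# prediction network every 20 learn() calls, stabilizing the bootstrap
# targets. A sketch of that schedule (the Keras-style set_weights /
# get_weights interface is an assumption, not the repo's API):
def maybe_replace_target(learn_step, replace_target_iter,
                         pred_model, target_model):
    """Copy prediction-network weights into the target network once
    every replace_target_iter learning steps."""
    if learn_step % replace_target_iter == 0:
        target_model.set_weights(pred_model.get_weights())
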
if __name__ == "__main__":
    RATIO1 = RATIO  # ALOHA packet length
    RATIO2 = RATIO  # TDMA packet length
    NUM_ACTIONS = RATIO1 + 1
    env = ENVIRONMENT(
        state_size=300,  # 15 * 20
        aloha_ratio=RATIO1,
        tdma_ratio=RATIO2,
        n_actions=NUM_ACTIONS,
        transmission_prob=0.5,
        penalty=0.5,
    )
    dqn_agent = DQN(env.state_size,
                    env.aloha_ratio,
                    env.tdma_ratio,
                    env.n_actions,
                    env.n_nodes,
                    memory_size=1000,
                    replace_target_iter=20,
                    batch_size=32,
                    learning_rate=0.01,
                    gamma=0.999,
                    epsilon=1,
                    epsilon_min=0.005,
                    epsilon_decay=0.995,
                    alpha=50)
    main(env.n_actions, RATIO1, max_iter=100000)
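
# memory_size=1000 and batch_size=32 above imply a bounded experience
# replay buffer sampled uniformly at random on each learn() call. A
# minimal self-contained sketch of that data structure (not the repo's
# DQN memory class, whose internals are not shown here):
import random
from collections import deque

class ReplayMemory:
    """Fixed-capacity buffer of transitions with uniform sampling."""

    def __init__(self, memory_size):
        self.buffer = deque(maxlen=memory_size)  # oldest entries drop off

    def add_experience(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # sample without replacement once enough transitions are stored
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))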