def ucb_train_test(max_arr_rate=10, n_ssb=16, oversampling_f=1, num_train_step=50,
                   test_every=1, num_test_steps=50, snr_percentile=100):
    n_antenna = 64
    codebook_size = oversampling_f * n_antenna
    n_test_steps = int(num_train_step / test_every)
    rewards = np.zeros((n_test_steps, num_test_steps))
    env = InitialAccessEnv(oversampling_factor=oversampling_f, num_beams_possible=n_ssb,
                           snr_thold_percentil=snr_percentile, bandit=True)
    ucb = CUCB(num_arms=codebook_size, scale_factor=max_arr_rate)
    all_beams = []
    for train_step_idx in tqdm(range(num_train_step)):
        # for train_step_idx in range(num_train_step):
        a_t = ucb.select_arms(n_ssb)
        s_t_1, r_t, done, info = env.step(a_t)
        ucb.update(a_t, s_t_1)
        if train_step_idx % test_every == 0:
            for i in range(num_test_steps):
                a_t = ucb.greedy_select_arms(n_ssb)
                s_t_1, r_t, done, info = env.step(a_t)
                rewards[int(train_step_idx / test_every), i] = r_t
                all_beams.extend(np.where(info["new_arrival"] > 0)[0])
    true_usable_beams, true_beam_count = np.unique(all_beams, return_counts=True)
    predicted_beams = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
    return ucb, rewards, true_usable_beams, true_beam_count, predicted_beams
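
# Illustrative usage sketch (not part of the original module; the helper name is
# hypothetical): run a short CUCB session and plot the mean greedy-test reward
# at each checkpoint. Assumes np, plt, InitialAccessEnv, CUCB, and tqdm are
# importable as elsewhere in this repo.
def _plot_ucb_learning_curve():
    ucb, rewards, usable, counts, predicted = ucb_train_test(
        n_ssb=16, num_train_step=50, test_every=1, num_test_steps=50)
    mean_test_reward = rewards.mean(axis=1)  # average over test rollouts per checkpoint
    plt.plot(np.arange(len(mean_test_reward)), mean_test_reward)
    plt.xlabel('training checkpoint')
    plt.ylabel('mean greedy test reward')
    plt.show()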
def cv_ucb_train_test(max_arr_rate=10, n_ssb=16, oversampling_f=1, num_train_step=50,
                      test_every=1, num_test_steps=50, snr_percentile=100):
    n_antenna = 64
    codebook_size = oversampling_f * n_antenna
    n_test_steps = int(num_train_step / test_every)
    rewards = np.zeros((n_test_steps, num_test_steps))
    env = InitialAccessEnv(oversampling_factor=oversampling_f, num_beams_possible=n_ssb,
                           snr_thold_percentil=snr_percentile, bandit=True)
    ucb = CUCB(num_arms=codebook_size, scale_factor=max_arr_rate)
    all_beams_old = []
    all_beams_new = []
    old_cluster_centers = env.gaussian_center.current_cluster_centers
    new_cluster_centers = []
    predicted_beams_old = []
    beam_weights_old = []
    for train_step_idx in tqdm(range(num_train_step)):
        # for train_step_idx in range(num_train_step):
        if train_step_idx == int(num_train_step / 2):
            # Snapshot the learned beams and weights, then shift the user clusters
            predicted_beams_old = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
            beam_weights_old = ucb.Qt
            env.gaussian_center.change_cluster()
        a_t = ucb.select_arms(n_ssb)
        s_t_1, r_t, done, info = env.step(a_t)
        ucb.update(a_t, s_t_1)
        if train_step_idx % test_every == 0:
            for i in range(num_test_steps):
                a_t = ucb.greedy_select_arms(n_ssb)
                s_t_1, r_t, done, info = env.step(a_t)
                rewards[int(train_step_idx / test_every), i] = r_t
                if train_step_idx < int(num_train_step / 2):
                    all_beams_old.extend(np.where(info["new_arrival"] > 0)[0])
                else:
                    all_beams_new.extend(np.where(info["new_arrival"] > 0)[0])
    new_cluster_centers = env.gaussian_center.current_cluster_centers
    true_usable_beams_old, true_beam_count_old = np.unique(all_beams_old, return_counts=True)
    true_usable_beams_new, true_beam_count_new = np.unique(all_beams_new, return_counts=True)
    predicted_beams_new = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
    beam_weights_new = ucb.Qt
    # return ucb, rewards, true_usable_beams, true_beam_count, predicted_beams, old_cluster_centers, new_cluster_centers
    return (rewards, beam_weights_old, beam_weights_new,
            true_usable_beams_old, true_usable_beams_new,
            true_beam_count_old, true_beam_count_new,
            predicted_beams_old, predicted_beams_new,
            old_cluster_centers, new_cluster_centers)
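
# Illustrative only (helper name is hypothetical): unpack cv_ucb_train_test and
# report how many of the greedily selected beams match the empirically usable
# beams before and after the mid-training cluster change.
def _beam_recovery_summary():
    (rewards, w_old, w_new, usable_old, usable_new, count_old, count_new,
     pred_old, pred_new, centers_old, centers_new) = cv_ucb_train_test()
    hit_old = len(np.intersect1d(pred_old, usable_old))
    hit_new = len(np.intersect1d(pred_new, usable_new))
    print('beams recovered before cluster change: %d / %d' % (hit_old, len(usable_old)))
    print('beams recovered after cluster change:  %d / %d' % (hit_new, len(usable_new)))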
def test_reinforce(with_baseline):
    env = InitialAccessEnv(bandit=False)
    gamma = 0.97
    alpha = 3e-4
    if 'tensorflow' in sys.modules:
        # Clear any stale TF graph left over from a previous run
        import tensorflow as tf
        tf.compat.v1.reset_default_graph()
    pi = PiApproximationWithNN(env.observation_space.shape[0], env.action_space.n, alpha)
    if with_baseline:
        B = VApproximationWithNN(env.observation_space.shape[0], alpha)
    else:
        B = Baseline(0.)
    return REINFORCE(env, gamma, 1000, pi, B)
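
# Hypothetical driver (not in the original source), assuming REINFORCE returns
# a sequence of per-episode returns: run the agent with and without a learned
# baseline and plot both traces for comparison.
def _compare_baselines():
    returns_with = test_reinforce(with_baseline=True)
    returns_without = test_reinforce(with_baseline=False)
    plt.plot(returns_with, label='with baseline')
    plt.plot(returns_without, label='without baseline')
    plt.xlabel('episode')
    plt.ylabel('return')
    plt.legend()
    plt.show()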
import numpy as np
import gym
import matplotlib.pyplot as plt

from InitialAccessEnv import InitialAccessEnv
from Agent_Utils import MaxBoltzmannQMultiBinaryPolicy
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
# from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Get the environment and extract the number of actions.
env = InitialAccessEnv()
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
window_length = 5

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(window_length,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
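
# Sketch of how this model would typically be wired into keras-rl. The rest of
# the original script is not shown above, so the hyperparameters below are
# illustrative and the MaxBoltzmannQMultiBinaryPolicy constructor arguments are
# assumed to take defaults.
memory = SequentialMemory(limit=50000, window_length=window_length)
policy = MaxBoltzmannQMultiBinaryPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)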
# s0 = env.reset()
# Mt = np.eye(32)
# while True:

if __name__ == "__main__":
    codebook_size = 256
    o_f = 4
    n_ssb = 32
    snr_pt = 95
    n_run = 10
    n_train = 500
    test_interval = 50
    all_rewards = np.zeros((n_run, n_train))
    for run_idx in range(n_run):
        env = InitialAccessEnv(oversampling_factor=o_f, num_beams_possible=n_ssb,
                               snr_thold_percentil=snr_pt, bandit=False)
        rewards = LSBGreedy(env, 1, n_train)
        all_rewards[run_idx, :] = rewards
    # Subsample at the test interval so x and the y arrays have matching lengths
    y = all_rewards.mean(axis=0)[0:n_train:test_interval]
    x = np.arange(0, n_train, test_interval)
    y_max = all_rewards.max(axis=0)[0:n_train:test_interval]
    y_min = all_rewards.min(axis=0)[0:n_train:test_interval]
    # errorbar expects deviations from y, not absolute min/max values
    y_err_asym = [y - y_min, y_max - y]
    y_err = all_rewards.max(axis=0) - all_rewards.min(axis=0)
    plt.figure(0)
    plt.errorbar(x, y, yerr=y_err_asym)
    plt.xlabel('number of training steps')
    plt.ylabel('reward')
    plt.savefig('LSBGreedy_training_progress_%dchoose%d_%dp.eps'
                % (codebook_size, n_ssb, snr_pt), format='eps')
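    # Optional addition (not in the original script; the filename is illustrative):
    # persist the raw per-run rewards so the curve can be re-plotted later without
    # re-running all n_run training runs.
    np.save('LSBGreedy_rewards_%dchoose%d_%dp.npy' % (codebook_size, n_ssb, snr_pt),
            all_rewards)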