def ucb_train_test(max_arr_rate=10,
                   n_ssb=16,
                   oversampling_f=1,
                   num_train_step=50,
                   test_every=1,
                   num_test_steps=50,
                   snr_percentile=100):
    n_antenna = 64
    codebook_size = oversampling_f * n_antenna
    n_test_steps = int(num_train_step / test_every)
    rewards = np.zeros((n_test_steps, num_test_steps))
    env = InitialAccessEnv(oversampling_factor=oversampling_f,
                           num_beams_possible=n_ssb,
                           snr_thold_percentil=snr_percentile,
                           bandit=True)
    ucb = CUCB(num_arms=codebook_size, scale_factor=max_arr_rate)
    all_beams = []
    for train_step_idx in tqdm(range(num_train_step)):
        # Exploration step: pull n_ssb arms with UCB and update the estimates.
        a_t = ucb.select_arms(n_ssb)
        s_t_1, r_t, done, info = env.step(a_t)
        ucb.update(a_t, s_t_1)
        if train_step_idx % test_every == 0:
            # Evaluation: act greedily, record rewards and the beams that
            # actually received new arrivals.
            for i in range(num_test_steps):
                a_t = ucb.greedy_select_arms(n_ssb)
                s_t_1, r_t, done, info = env.step(a_t)
                rewards[int(train_step_idx / test_every), i] = r_t
                all_beams.extend(np.where(info["new_arrival"] > 0)[0])
    true_usable_beams, true_beam_count = np.unique(all_beams,
                                                   return_counts=True)
    predicted_beams = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
    return ucb, rewards, true_usable_beams, true_beam_count, predicted_beams
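# Usage sketch (not from the source): one way ucb_train_test might be called and
# how the learned beam set could be checked against the beams that actually
# carried traffic. The parameter values below are illustrative assumptions.
def run_ucb_example():
    ucb, rewards, true_beams, true_counts, predicted_beams = ucb_train_test(
        max_arr_rate=10, n_ssb=16, oversampling_f=1,
        num_train_step=50, test_every=1, num_test_steps=50,
        snr_percentile=95)
    # Average test reward per evaluation round.
    print("mean test reward per round:", rewards.mean(axis=1))
    # Overlap between the beams the bandit ended up selecting and the beams
    # where users actually arrived during testing.
    hit = np.intersect1d(predicted_beams, true_beams)
    print("%d of %d predicted beams are truly usable" % (len(hit), len(predicted_beams)))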
def cv_ucb_train_test(max_arr_rate=10,
                      n_ssb=16,
                      oversampling_f=1,
                      num_train_step=50,
                      test_every=1,
                      num_test_steps=50,
                      snr_percentile=100):
    n_antenna = 64
    codebook_size = oversampling_f * n_antenna
    n_test_steps = int(num_train_step / test_every)
    rewards = np.zeros((n_test_steps, num_test_steps))
    env = InitialAccessEnv(oversampling_factor=oversampling_f,
                           num_beams_possible=n_ssb,
                           snr_thold_percentil=snr_percentile,
                           bandit=True)
    ucb = CUCB(num_arms=codebook_size, scale_factor=max_arr_rate)
    all_beams_old = []
    all_beams_new = []
    old_cluster_centers = env.gaussian_center.current_cluster_centers
    predicted_beams_old = []
    beam_weights_old = []
    for train_step_idx in tqdm(range(num_train_step)):
        # Halfway through training, snapshot the learned beam selection and
        # shift the user clusters to test how the bandit adapts.
        if train_step_idx == int(num_train_step / 2):
            predicted_beams_old = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
            beam_weights_old = ucb.Qt
            env.gaussian_center.change_cluster()
        a_t = ucb.select_arms(n_ssb)
        s_t_1, r_t, done, info = env.step(a_t)
        ucb.update(a_t, s_t_1)
        if train_step_idx % test_every == 0:
            for i in range(num_test_steps):
                a_t = ucb.greedy_select_arms(n_ssb)
                s_t_1, r_t, done, info = env.step(a_t)
                rewards[int(train_step_idx / test_every), i] = r_t
                if train_step_idx < int(num_train_step / 2):
                    all_beams_old.extend(np.where(info["new_arrival"] > 0)[0])
                else:
                    all_beams_new.extend(np.where(info["new_arrival"] > 0)[0])
    new_cluster_centers = env.gaussian_center.current_cluster_centers
    true_usable_beams_old, true_beam_count_old = np.unique(all_beams_old,
                                                           return_counts=True)
    true_usable_beams_new, true_beam_count_new = np.unique(all_beams_new,
                                                           return_counts=True)
    predicted_beams_new = np.where(ucb.greedy_select_arms(n_ssb) > 0)[0]
    beam_weights_new = ucb.Qt
    return (rewards, beam_weights_old, beam_weights_new,
            true_usable_beams_old, true_usable_beams_new,
            true_beam_count_old, true_beam_count_new,
            predicted_beams_old, predicted_beams_new,
            old_cluster_centers, new_cluster_centers)
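# Usage sketch (not from the source): comparing the bandit's beam estimates
# before and after the cluster change simulated in cv_ucb_train_test. Assumes
# matplotlib.pyplot is imported as plt (as in the other examples on this page);
# parameter values and plotting choices are illustrative assumptions.
def run_cv_ucb_example():
    (rewards, w_old, w_new,
     beams_old, beams_new, counts_old, counts_new,
     pred_old, pred_new, centers_old, centers_new) = cv_ucb_train_test(
        num_train_step=100, test_every=1, num_test_steps=50)
    print("beams used before change:", pred_old)
    print("beams used after change: ", pred_new)
    plt.figure()
    plt.plot(rewards.mean(axis=1))
    plt.axvline(x=rewards.shape[0] // 2, linestyle='--')  # cluster change point (test_every=1)
    plt.xlabel('evaluation round')
    plt.ylabel('mean test reward')
    plt.figure()
    plt.plot(w_old, label='beam weights before cluster change')
    plt.plot(w_new, label='beam weights after cluster change')
    plt.legend()
    plt.show()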
def test_reinforce(with_baseline):
    env = InitialAccessEnv(bandit=False)
    gamma = 0.97
    alpha = 3e-4

    if 'tensorflow' in sys.modules:
        import tensorflow as tf
        tf.compat.v1.reset_default_graph()

    pi = PiApproximationWithNN(env.observation_space.shape[0],
                               env.action_space.n, alpha)

    if with_baseline:
        B = VApproximationWithNN(env.observation_space.shape[0], alpha)
    else:
        B = Baseline(0.)

    return REINFORCE(env, gamma, 1000, pi, B)
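# Usage sketch (not from the source): running REINFORCE with and without a
# baseline and comparing learning curves. This assumes REINFORCE returns a 1-D
# sequence of per-episode returns, which is not shown in this snippet, and that
# matplotlib.pyplot is imported as plt.
def compare_reinforce_baselines():
    returns_without_b = test_reinforce(with_baseline=False)
    returns_with_b = test_reinforce(with_baseline=True)
    plt.figure()
    plt.plot(returns_without_b, label='REINFORCE')
    plt.plot(returns_with_b, label='REINFORCE with baseline')
    plt.xlabel('episode')
    plt.ylabel('return')
    plt.legend()
    plt.show()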
Example #4
import numpy as np
import gym
import matplotlib.pyplot as plt

from InitialAccessEnv import InitialAccessEnv
from Agent_Utils import MaxBoltzmannQMultiBinaryPolicy
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
#from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Get the environment and extract the number of actions.
env = InitialAccessEnv()
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
window_length = 5
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(window_length,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
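# Sketch (not from the source): the snippet above stops after building the
# Q-network. A typical keras-rl continuation would wire it into a DQNAgent with
# the imported SequentialMemory and MaxBoltzmannQMultiBinaryPolicy. The
# hyperparameters below are illustrative assumptions, and the constructor
# arguments of the custom policy (if any) are not shown in the source.
memory = SequentialMemory(limit=50000, window_length=window_length)
policy = MaxBoltzmannQMultiBinaryPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)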
Example #5
if __name__ == "__main__":
    codebook_size = 256
    o_f = 4
    n_ssb = 32
    snr_pt = 95
    n_run = 10
    n_train = 500
    test_interval = 50
    all_rewards = np.zeros((n_run, n_train))
    for run_idx in range(n_run):
        env = InitialAccessEnv(oversampling_factor=o_f,
                               num_beams_possible=n_ssb,
                               snr_thold_percentil=snr_pt,
                               bandit=False)
        rewards = LSBGreedy(env, 1, n_train)
        all_rewards[run_idx, :] = rewards
    # Sample the curves every test_interval steps so they line up with x.
    y = all_rewards.mean(axis=0)[0:n_train:test_interval]
    x = np.arange(0, n_train, test_interval)
    y_max = all_rewards.max(axis=0)[0:n_train:test_interval]
    y_min = all_rewards.min(axis=0)[0:n_train:test_interval]
    # errorbar expects offsets from y, not absolute min/max values.
    y_err_asym = [y - y_min, y_max - y]
    plt.figure(0)
    plt.errorbar(x, y, yerr=y_err_asym)
    plt.xlabel('number of training steps')
    plt.ylabel('reward')
    plt.savefig('LSBGreedy_training_progress_%dchoose%d_%dp.eps' %
                (codebook_size, n_ssb, snr_pt),