Example #1
    def __init__(self,
                 state_dim,
                 action_dim,
                 max_action,
                 sess,
                 tau=0.001,
                 actor_hs=[400, 300],
                 actor_lr=0.001,
                 critic_hs=[400, 300],
                 critic_lr=0.001,
                 dqda_clipping=None,
                 clip_norm=False,
                 vae_lr=0.001):

        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.latent_dim = action_dim * 2

        self.bcq_train = BCQNetwork("train_bcq",
                                    state_dim=state_dim,
                                    action_dim=action_dim,
                                    max_action=max_action,
                                    actor_hs_list=actor_hs,
                                    actor_lr=actor_lr,
                                    critic_hs_list=critic_hs,
                                    critic_lr=critic_lr,
                                    dqda_clipping=dqda_clipping,
                                    clip_norm=clip_norm)
        self.bcq_target = BCQNetwork("target_bcq",
                                     state_dim=state_dim,
                                     action_dim=action_dim,
                                     max_action=max_action,
                                     actor_hs_list=actor_hs,
                                     actor_lr=actor_lr,
                                     critic_hs_list=critic_hs,
                                     critic_lr=critic_lr,
                                     dqda_clipping=dqda_clipping,
                                     clip_norm=clip_norm)
        self.vae = VAE(state_dim, action_dim, self.latent_dim, max_action,
                       vae_lr)

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        # target network update operations
        self.target_network_update_op = trfl.update_target_variables(
            self.bcq_target.get_network_variables(),
            self.bcq_train.get_network_variables(),
            tau=tau)
        # initialize networks to start with the same variables:
        self.target_same_init = trfl.update_target_variables(
            self.bcq_target.get_network_variables(),
            self.bcq_train.get_network_variables(),
            tau=1.0)
        self.sess.run(self.target_same_init)
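Both ops above wrap the same trfl primitive, which assigns target = (1 - tau) * target + tau * source each time it is run. Below is a minimal, self-contained sketch of that behaviour; the variable names are illustrative and not taken from the example.

import tensorflow as tf
import trfl

# two toy variables standing in for the train/target network weights
source_w = tf.get_variable("source_w", initializer=tf.ones([2, 2]))
target_w = tf.get_variable("target_w", initializer=tf.zeros([2, 2]))

soft_update = trfl.update_target_variables([target_w], [source_w], tau=0.001)
hard_copy = trfl.update_target_variables([target_w], [source_w], tau=1.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(hard_copy)    # start the target identical to the source, as Example #1 does
    sess.run(soft_update)  # run once per training step to move the target by tau towards the source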
Example #2
        ActorCriticNetwork(name='ac_target_net_{:.0f}'.format(i),
                           obs_size=obs_size,
                           action_size=action_size,
                           actor_hidden_size=actor_hidden_size,
                           ac_learning_rate=ac_learning_rate,
                           entropy_cost=entropy_cost,
                           normalise_entropy=normalise_entropy,
                           lambda_=lambda_,
                           baseline_cost=baseline_cost))

    print('Instantiated Target Network {:.0f} of {:.0f}'.format(
        i + 1, ch.N_D2D))

    D2D_target_net_update_ops.append(
        trfl.update_target_variables(
            D2D_target_nets[i].get_network_variables(),
            D2D_nets[i].get_network_variables(),
            tau=0.001))

    print('Instantiated Target Net Update ops {:.0f} of {:.0f}'.format(
        i + 1, ch.N_D2D))
    print('\n')

stats_rewards_list = []
stats_every = 10

initial_actions = []
power_levels = []
RB_selections = []

g_iB, g_j, G_ij, g_jB, G_j_j = ch.reset()
#for i in range(0, ch.N_D2D):
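Example #2 collects one soft-update op (tau=0.001) per D2D agent into D2D_target_net_update_ops. A short usage sketch follows; the session sess and the surrounding training step are assumptions, not shown in the snippet.

# after each joint training step, run every agent's soft update in one call;
# sess.run accepts the whole list of update ops
sess.run(D2D_target_net_update_ops)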
Example #3
        # print("Restoring...")
        # saver.restore(sess, tf.train.latest_checkpoint('/mnt/hgfs/ryanprinster/lab/models/'))


tf.reset_default_graph()

mainQN = QNetwork(name='main_qn',
                  hidden_size=hidden_size,
                  learning_rate=learning_rate,
                  batch_size=batch_size * (minibatch_size - 1))
targetQN = QNetwork(name='target_qn',
                    hidden_size=hidden_size,
                    learning_rate=learning_rate,
                    batch_size=batch_size)
target_network_update_ops = trfl.update_target_variables(
    targetQN.get_qnetwork_variables(),
    mainQN.get_qnetwork_variables(),
    tau=1.0)


def run(length, width, height, fps, level, record, demo, demofiles, video):
    """Spins up an environment and runs the random agent."""
    config = {'fps': str(fps), 'width': str(width), 'height': str(height)}
    if record:
        config['record'] = record
    if demo:
        config['demo'] = demo
    if demofiles:
        config['demofiles'] = demofiles
    if video:
        config['video'] = video
    # env = deepmind_lab.Lab(level, ['RGB_INTERLEAVED', 'DEBUG.CAMERA.TOP_DOWN'], config=config)
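Unlike the other examples, Example #3 builds its update op with tau=1.0, i.e. a full hard copy rather than a Polyak average, so it is typically run once after initialization and then at a fixed interval. A sketch of that pattern; total_steps and update_target_every are assumed names, not part of the snippet.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(target_network_update_ops)          # initial hard sync of targetQN to mainQN
    for step in range(total_steps):
        # ... one gradient step on mainQN goes here ...
        if step % update_target_every == 0:
            sess.run(target_network_update_ops)  # periodically refresh targetQN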
Example #4

# instantiate COMA critic network

central_critic = CriticNetwork(name="Critic_Net",
                               obs_size=obs_size,
                               action_size=action_size,
                               critic_hidden_size=critic_hidden_size,
                               critic_learning_rate=critic_learning_rate)

target_critic_net = CriticNetwork(name="Target_Critic_Net",
                                  obs_size=obs_size,
                                  action_size=action_size,
                                  critic_hidden_size=critic_hidden_size,
                                  critic_learning_rate=critic_learning_rate)

target_critic_update_ops = trfl.update_target_variables(
    target_critic_net.get_network_variables(),
    central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Critic Network')

# instantiate actor networks

D2D_actor_nets = []

for i in range(0, ch.N_D2D):
    D2D_actor_nets.append(
        ActorNetwork(name='a_net_{:.0f}'.format(i),
                     obs_size=obs_size,
                     actor_hidden_size=actor_hidden_size,
                     actor_learning_rate=actor_learning_rate))
Example #5
obs_size = ch.N_CU

print('action_size: ', action_size)
print('obs_size: ', obs_size)

tf.reset_default_graph()

# instantiate social critic networks

social_central_critic = CriticNetwork(
    name="social_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)

social_target_critic_net = CriticNetwork(
    name="social_Target_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)

social_target_critic_update_ops = trfl.update_target_variables(
    social_target_critic_net.get_network_variables(),
    social_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Social Critic Network')

# instantiate individual critic

individual_central_critic = CriticNetwork(
    name="individual_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)

individual_target_critic_net = CriticNetwork(
    name="individual_Target_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)

individual_target_critic_update_ops = trfl.update_target_variables(
    individual_target_critic_net.get_network_variables(),
    individual_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Individual Critic Network')
# instantiate COMA critic network

central_critic = CriticNetwork(name="Critic_Net",
                               obs_size=obs_size,
                               action_size=action_size,
                               critic_hidden_size=critic_hidden_size,
                               critic_learning_rate=critic_learning_rate)

target_critic_net = CriticNetwork(name="Target_Critic_Net",
                                  obs_size=obs_size,
                                  action_size=action_size,
                                  critic_hidden_size=critic_hidden_size,
                                  critic_learning_rate=critic_learning_rate)

target_critic_update_ops = trfl.update_target_variables(
    target_critic_net.get_network_variables(),
    central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Critic Network')

# instantiate actor networks and individual critic networks

D2D_actor_nets = []
individual_central_critics = []
individual_target_critic_nets = []
individual_target_critic_update_ops = []

for i in range(0, ch.N_D2D):
    individual_central_critics.append(
        IndCriticNetwork(name='individual_Critic_Net_{:.0f}'.format(i),
                         critic_hidden_size=individual_critic_hidden_size,
Example #6

tf.reset_default_graph()

# instantiate social critic networks

social_central_critic = CriticNetwork(
    name="social_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)

social_target_critic_net = CriticNetwork(
    name="social_Target_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)

social_target_critic_update_ops = trfl.update_target_variables(
    social_target_critic_net.get_network_variables(),
    social_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Social Critic Network')

# instantiate individual critic

individual_central_critic = CriticNetwork(
    name="individual_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)

individual_target_critic_net = CriticNetwork(
    name="individual_Target_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)
Example #7

        ActorCriticNetwork(name='ac_target_net_{:.0f}'.format(i),
                           obs_size=obs_size,
                           action_size=action_size,
                           actor_hidden_size=actor_hidden_size,
                           ac_learning_rate=ac_learning_rate,
                           entropy_cost=entropy_cost,
                           normalise_entropy=normalise_entropy,
                           lambda_=lambda_,
                           baseline_cost=baseline_cost))

    print('Instantiated Target Network {:.0f} of {:.0f}'.format(
        i + 1, env.no_players))

    player_target_net_update_ops.append(
        trfl.update_target_variables(
            player_target_nets[i].get_network_variables(),
            player_nets[i].get_network_variables(),
            tau=0.001))

    print('Instantiated Target Net Update ops {:.0f} of {:.0f}'.format(
        i + 1, env.no_players))
    print('\n')

stats_rewards_list = []

RB_selections = []

state = env.initialize()


def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)