def __init__(self, state_dim, action_dim, max_action, sess, tau=0.001,
             actor_hs=[400, 300], actor_lr=0.001,
             critic_hs=[400, 300], critic_lr=0.001,
             dqda_clipping=None, clip_norm=False, vae_lr=0.001):
    self.sess = sess
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.latent_dim = action_dim * 2

    self.bcq_train = BCQNetwork("train_bcq", state_dim=state_dim, action_dim=action_dim,
                                max_action=max_action, actor_hs_list=actor_hs, actor_lr=actor_lr,
                                critic_hs_list=critic_hs, critic_lr=critic_lr,
                                dqda_clipping=dqda_clipping, clip_norm=clip_norm)
    self.bcq_target = BCQNetwork("target_bcq", state_dim=state_dim, action_dim=action_dim,
                                 max_action=max_action, actor_hs_list=actor_hs, actor_lr=actor_lr,
                                 critic_hs_list=critic_hs, critic_lr=critic_lr,
                                 dqda_clipping=dqda_clipping, clip_norm=clip_norm)
    self.vae = VAE(state_dim, action_dim, self.latent_dim, max_action, vae_lr)

    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()

    # target network update operation: soft update at rate tau
    self.target_network_update_op = trfl.update_target_variables(
        self.bcq_target.get_network_variables(),
        self.bcq_train.get_network_variables(), tau=tau)
    # initialize both networks to start with the same variables (hard copy, tau=1.0):
    self.target_same_init = trfl.update_target_variables(
        self.bcq_target.get_network_variables(),
        self.bcq_train.get_network_variables(), tau=1.0)
    self.sess.run(self.target_same_init)
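# A minimal, self-contained sketch (illustrative, not part of the original agent;
# TF1.x graph mode and trfl assumed) of the init-then-track pattern above: one
# hard copy (tau=1.0) at construction, then a small-tau soft update after every
# training step.
import tensorflow as tf
import trfl

train_w = [tf.Variable([1.0], name='demo_train_w')]
target_w = [tf.Variable([0.0], name='demo_target_w')]

hard_copy = trfl.update_target_variables(target_w, train_w, tau=1.0)
soft_update = trfl.update_target_variables(target_w, train_w, tau=0.001)

with tf.Session() as demo_sess:
    demo_sess.run(tf.global_variables_initializer())
    demo_sess.run(hard_copy)                     # target starts identical to train: [1.0]
    demo_sess.run(tf.assign(train_w[0], [2.0]))  # pretend a gradient step moved train_w
    demo_sess.run(soft_update)                   # target <- 0.001*train + 0.999*target
    print(demo_sess.run(target_w[0]))            # ~[1.001]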
        ActorCriticNetwork(name='ac_target_net_{:.0f}'.format(i),
                           obs_size=obs_size, action_size=action_size,
                           actor_hidden_size=actor_hidden_size,
                           ac_learning_rate=ac_learning_rate,
                           entropy_cost=entropy_cost,
                           normalise_entropy=normalise_entropy,
                           lambda_=lambda_,
                           baseline_cost=baseline_cost))
    print('Instantiated Target Network {:.0f} of {:.0f}'.format(i + 1, ch.N_D2D))

    D2D_target_net_update_ops.append(
        trfl.update_target_variables(
            D2D_target_nets[i].get_network_variables(),
            D2D_nets[i].get_network_variables(),
            tau=0.001))
    print('Instantiated Target Net Update ops {:.0f} of {:.0f}'.format(i + 1, ch.N_D2D))

print('\n')

stats_rewards_list = []
stats_every = 10

initial_actions = []
power_levels = []
RB_selections = []

g_iB, g_j, G_ij, g_jB, G_j_j = ch.reset()

# for i in range(0, ch.N_D2D):
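# Self-contained sketch (illustrative; TF1.x + trfl assumed, demo names are
# hypothetical) of the per-agent pattern above: keep one soft-update op per
# agent in a Python list, then pass the whole list to a single sess.run call
# to refresh every target network at once.
import tensorflow as tf
import trfl

n_agents = 3
agent_vars = [[tf.Variable([float(i + 1)], name='demo_net_{}'.format(i))] for i in range(n_agents)]
target_vars = [[tf.Variable([0.0], name='demo_target_{}'.format(i))] for i in range(n_agents)]
demo_update_ops = [trfl.update_target_variables(target_vars[i], agent_vars[i], tau=0.001)
                   for i in range(n_agents)]

with tf.Session() as demo_sess:
    demo_sess.run(tf.global_variables_initializer())
    demo_sess.run(demo_update_ops)  # one call soft-updates all three target networks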
# print("Resoring...") # saver.restore(sess, tf.train.latest_checkpoint('/mnt/hgfs/ryanprinster/lab/models/')) tf.reset_default_graph() mainQN = QNetwork(name='main_qn', hidden_size=hidden_size, learning_rate=learning_rate, batch_size=batch_size * (minibatch_size - 1)) targetQN = QNetwork(name='target_qn', hidden_size=hidden_size, learning_rate=learning_rate, batch_size=batch_size) target_network_update_ops = trfl.update_target_variables( targetQN.get_qnetwork_variables(), mainQN.get_qnetwork_variables(), tau=1.0) def run(length, width, height, fps, level, record, demo, demofiles, video): """Spins up an environment and runs the random agent.""" config = {'fps': str(fps), 'width': str(width), 'height': str(height)} if record: config['record'] = record if demo: config['demo'] = demo if demofiles: config['demofiles'] = demofiles if video: config['video'] = video # env = deepmind_lab.Lab(level, ['RGB_INTERLEAVED', 'DEBUG.CAMERA.TOP_DOWN'], config=config)
# instantiate COMA critic network
central_critic = CriticNetwork(name="Critic_Net", obs_size=obs_size, action_size=action_size,
                               critic_hidden_size=critic_hidden_size,
                               critic_learning_rate=critic_learning_rate)
target_critic_net = CriticNetwork(name="Target_Critic_Net", obs_size=obs_size, action_size=action_size,
                                  critic_hidden_size=critic_hidden_size,
                                  critic_learning_rate=critic_learning_rate)

target_critic_update_ops = trfl.update_target_variables(
    target_critic_net.get_network_variables(),
    central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Critic Network')

# instantiate actor networks
D2D_actor_nets = []

for i in range(0, ch.N_D2D):
    D2D_actor_nets.append(
        ActorNetwork(name='a_net_{:.0f}'.format(i),
                     obs_size=obs_size,
                     actor_hidden_size=actor_hidden_size,
                     actor_learning_rate=actor_learning_rate))
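# Worked check (illustrative; demo names are not from the original script) of
# the rule trfl applies in the update ops above:
#     target <- (1 - tau) * target + tau * source
import numpy as np
import tensorflow as tf
import trfl

src = [tf.Variable([4.0], name='demo_src')]
tgt = [tf.Variable([2.0], name='demo_tgt')]
polyak = trfl.update_target_variables(tgt, src, tau=0.001)

with tf.Session() as demo_sess:
    demo_sess.run(tf.global_variables_initializer())
    demo_sess.run(polyak)
    expected = (1 - 0.001) * 2.0 + 0.001 * 4.0  # = 2.002
    np.testing.assert_allclose(demo_sess.run(tgt[0]), [expected], rtol=1e-5)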
obs_size = ch.N_CU

print('action_size: ', action_size)
print('obs_size: ', obs_size)

tf.reset_default_graph()

# instantiate social critic networks
social_central_critic = CriticNetwork(name="social_Critic_Net",
                                      critic_hidden_size=social_critic_hidden_size,
                                      critic_learning_rate=social_critic_learning_rate)
social_target_critic_net = CriticNetwork(name="social_Target_Critic_Net",
                                         critic_hidden_size=social_critic_hidden_size,
                                         critic_learning_rate=social_critic_learning_rate)

social_target_critic_update_ops = trfl.update_target_variables(
    social_target_critic_net.get_network_variables(),
    social_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Social Critic Network')

# instantiate individual critic networks
individual_central_critic = CriticNetwork(name="individual_Critic_Net",
                                          critic_hidden_size=individual_critic_hidden_size,
                                          critic_learning_rate=individual_critic_learning_rate)
individual_target_critic_net = CriticNetwork(name="individual_Target_Critic_Net",
                                             critic_hidden_size=individual_critic_hidden_size,
                                             critic_learning_rate=individual_critic_learning_rate)

individual_target_critic_update_ops = trfl.update_target_variables(
    individual_target_critic_net.get_network_variables(),
    individual_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Individual Critic Network')
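# Optional follow-up (an assumption, not in the original script): the two
# target updates can be fused into a single op with tf.group, so one sess.run
# call after each critic training step keeps both target critics in sync.
all_target_critic_updates = tf.group(social_target_critic_update_ops,
                                     individual_target_critic_update_ops)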
# instantiate COMA critic network
central_critic = CriticNetwork(name="Critic_Net", obs_size=obs_size, action_size=action_size,
                               critic_hidden_size=critic_hidden_size,
                               critic_learning_rate=critic_learning_rate)
target_critic_net = CriticNetwork(name="Target_Critic_Net", obs_size=obs_size, action_size=action_size,
                                  critic_hidden_size=critic_hidden_size,
                                  critic_learning_rate=critic_learning_rate)

target_critic_update_ops = trfl.update_target_variables(
    target_critic_net.get_network_variables(),
    central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Critic Network')

# instantiate actor networks and individual critic networks
D2D_actor_nets = []
individual_central_critics = []
individual_target_critic_nets = []
individual_target_critic_update_ops = []

for i in range(0, ch.N_D2D):
    individual_central_critics.append(
        IndCriticNetwork(name='individual_Critic_Net_{:.0f}'.format(i),
                         critic_hidden_size=individual_critic_hidden_size,
                         critic_learning_rate=individual_critic_learning_rate))
tf.reset_default_graph()

# instantiate social critic networks
social_central_critic = CriticNetwork(
    name="social_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)
social_target_critic_net = CriticNetwork(
    name="social_Target_Critic_Net",
    critic_hidden_size=social_critic_hidden_size,
    critic_learning_rate=social_critic_learning_rate)

social_target_critic_update_ops = trfl.update_target_variables(
    social_target_critic_net.get_network_variables(),
    social_central_critic.get_network_variables(),
    tau=0.001)

print('Instantiated Social Critic Network')

# instantiate individual critic networks
individual_central_critic = CriticNetwork(
    name="individual_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)
individual_target_critic_net = CriticNetwork(
    name="individual_Target_Critic_Net",
    critic_hidden_size=individual_critic_hidden_size,
    critic_learning_rate=individual_critic_learning_rate)
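# The snippet is cut off here; mirroring the social-critic pattern above (and
# the fuller variant of this script earlier in the file), the matching
# soft-update op would be:
individual_target_critic_update_ops = trfl.update_target_variables(
    individual_target_critic_net.get_network_variables(),
    individual_central_critic.get_network_variables(),
    tau=0.001)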
        ActorCriticNetwork(name='ac_target_net_{:.0f}'.format(i),
                           obs_size=obs_size, action_size=action_size,
                           actor_hidden_size=actor_hidden_size,
                           ac_learning_rate=ac_learning_rate,
                           entropy_cost=entropy_cost,
                           normalise_entropy=normalise_entropy,
                           lambda_=lambda_,
                           baseline_cost=baseline_cost))
    print('Instantiated Target Network {:.0f} of {:.0f}'.format(i + 1, env.no_players))

    player_target_net_update_ops.append(
        trfl.update_target_variables(
            player_target_nets[i].get_network_variables(),
            player_nets[i].get_network_variables(),
            tau=0.001))
    print('Instantiated Target Net Update ops {:.0f} of {:.0f}'.format(i + 1, env.no_players))

print('\n')

stats_rewards_list = []
RB_selections = []

state = env.initialize()


def running_mean(x, N):
    # window-N moving average via cumulative sums
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)
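# Quick usage check (illustrative) of running_mean: a window-N moving average,
# handy for smoothing stats_rewards_list before plotting.
import numpy as np

print(running_mean(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), 3))  # -> [2. 3. 4.]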