def make_networks(n_workers, n_actions, weight_inits, value_loss_coef,
                  entropy_bonus, max_grad_norm, optimizer, debug):
    """Build the shared inference network and one Network per worker.

    The shared parameters are created first, inside the 'global' variable
    scope. After that, `n_workers` Network objects are constructed with
    scopes "worker_0", "worker_1", ...; only worker 0 is asked to create
    summary ops.

    Returns:
        A list with the per-worker Network objects, in worker order.
    """
    # https://www.tensorflow.org/api_docs/python/tf/Graph notes that graph
    # construction isn't thread-safe, so all graph construction is done
    # serially here, before the worker threads are started.

    # Shared parameters.
    with tf.variable_scope('global'):
        make_inference_network(n_actions=n_actions, weight_inits=weight_inits)

    def build_worker_network(index):
        # Per-worker copy of the shared parameters.
        return Network(
            scope="worker_{}".format(index),
            n_actions=n_actions,
            entropy_bonus=entropy_bonus,
            value_loss_coef=value_loss_coef,
            weight_inits=weight_inits,
            max_grad_norm=max_grad_norm,
            optimizer=optimizer,
            summaries=(index == 0),
            debug=debug,
        )

    return [build_worker_network(index) for index in range(n_workers)]
def make_networks(n_workers, obs_shape, n_actions, value_loss_coef,
                  entropy_bonus, max_grad_norm, optimizer, detailed_logs,
                  debug):
    """Build the shared inference network and one Network per worker.

    The shared parameters are created first, inside the 'global' variable
    scope. After that, `n_workers` Network objects are constructed with
    scopes "worker_0", "worker_1", ...; only worker 0 is asked to add
    summaries.

    Returns:
        A list with the per-worker Network objects, in worker order.
    """
    # https://www.tensorflow.org/api_docs/python/tf/Graph notes that graph
    # construction isn't thread-safe, so all graph construction is done
    # serially here, before the worker threads are started.

    # Create shared parameters. tf.variable_scope is a context manager for
    # defining ops that create variables (layers); see
    # https://www.tensorflow.org/api_docs/python/tf/variable_scope#class_variable_scope
    with tf.variable_scope('global'):
        # make_inference_network (from network.py) builds the network
        # structure.
        make_inference_network(obs_shape, n_actions)

    def build_worker_network(index):
        # Per-worker copy of the shared parameters; the scope name is derived
        # from the worker index, and summaries are added for the first
        # worker only.
        return Network(
            scope="worker_{}".format(index),
            n_actions=n_actions,
            entropy_bonus=entropy_bonus,
            value_loss_coef=value_loss_coef,
            max_grad_norm=max_grad_norm,
            optimizer=optimizer,
            add_summaries=(index == 0),
            detailed_logs=detailed_logs,
            debug=debug,
        )

    # One network per worker, returned as a list.
    return [build_worker_network(index) for index in range(n_workers)]
def test_policy_loss(self):
    """
    Does calculating policy loss based on the cross-entropy really give the
    right result?

    The network's policy_loss is compared against a value computed by hand
    from the network's own action probabilities and advantages.
    """
    rmsprop = tf.train.RMSPropOptimizer(learning_rate=1e-3)
    with tf.variable_scope('global'):
        make_inference_network(obs_shape=(84, 84, 4), n_actions=6)
    network = Network('foo_scope',
                      n_actions=6,
                      entropy_bonus=0.0,
                      value_loss_coef=0.5,
                      max_grad_norm=0.5,
                      optimizer=rmsprop,
                      add_summaries=False)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # A batch of three random observations.
    obs = np.random.rand(3, 84, 84, 4)
    action_probs = sess.run(network.action_probs,
                            feed_dict={network.states: obs})

    rewards = [4, 5, 6]
    actions = [1, 3, 2]
    feed = {
        network.states: obs,
        network.actions: actions,
        network.returns: rewards,
    }
    advantage, actual_loss = sess.run(
        [network.advantage, network.policy_loss], feed_dict=feed)

    # Per-step term: -log(probability of the action taken) * advantage,
    # averaged over the three steps.
    per_step_losses = [
        -np.log(action_probs[step][action]) * advantage[step]
        for step, action in enumerate(actions)
    ]
    expected_loss = sum(per_step_losses) / 3

    self.assertAlmostEqual(expected_loss, actual_loss, places=5)
def test_rmsprop_variables(self):
    """
    Test 1: let's look at the variables the optimizer creates to check
    there's no funny business.

    Two worker networks are built against the same optimizer, and the
    optimizer's variable list is sampled after each one is created.
    """
    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4, decay=0.99,
                                          epsilon=1e-5)
    n_actions = env.action_space.n

    with tf.variable_scope('global'):
        make_inference_network(n_actions=n_actions, weight_inits='glorot')

    # Settings shared by both worker networks; only the scope differs.
    network_kwargs = dict(
        n_actions=n_actions,
        entropy_bonus=0.01,
        value_loss_coef=0.5,
        weight_inits='glorot',
        max_grad_norm=0.5,
        optimizer=optimizer,
        summaries=False,
        debug=False,
    )

    network1 = Network(scope="worker_1", **network_kwargs)
    Worker(sess=sess, env=env, network=network1, log_dir='/tmp')
    vars1 = optimizer.variables()

    network2 = Network(scope="worker_2", **network_kwargs)
    Worker(sess=sess, env=env, network=network2, log_dir='/tmp')
    vars2 = optimizer.variables()

    # The two snapshots must be distinct list objects.
    self.assertNotEqual(id(vars1), id(vars2))

    # First, were any extra variables added when we created the second
    # optimizer, that might be indicative of a second set of statistics?
    # NOTE(review): `<=` on lists also passes when vars2 is vars1 plus extra
    # trailing entries - confirm this is the intended strictness.
    self.assertLessEqual(vars1, vars2)

    # Second, are all the variables definitely associated with the global
    # set of parameters rather than the thread-local parameters?
    for variable in vars1:
        self.assertIn('global', variable.name)
def run_weight_test(reset_rmsprop):
    """Run one update on each of two workers and track the shared variables.

    Two worker networks are built on the same RMSProp optimizer. Worker 1
    performs one update, then (if `reset_rmsprop` is true) the optimizer's
    variables are re-initialised, then worker 2 performs one update.

    Returns:
        A 3-tuple with the value of get_var_sum(shared_variables) evaluated
        after initialisation, after worker 1's update, and after worker 2's
        update.
    """
    tf.reset_default_graph()
    utils.set_random_seeds(0)
    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    env.seed(0)
    n_actions = env.action_space.n

    with tf.variable_scope('global'):
        make_inference_network(n_actions=n_actions, weight_inits='glorot')
    # Captured before the optimizer and worker networks exist, so this list
    # holds only the variables created so far.
    shared_variables = tf.global_variables()

    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4, decay=0.99,
                                          epsilon=1e-5)

    # Settings shared by both worker networks; only the scope differs.
    network_kwargs = dict(
        n_actions=n_actions,
        entropy_bonus=0.01,
        value_loss_coef=0.5,
        weight_inits='glorot',
        max_grad_norm=0.5,
        optimizer=optimizer,
        summaries=False,
        debug=False,
    )

    network1 = Network(scope="worker_1", **network_kwargs)
    w1 = Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

    network2 = Network(scope="worker_2", **network_kwargs)
    w2 = Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

    # Initialisers for the optimizer's own variables, used for the optional
    # reset between the two updates.
    rmsprop_init_ops = [var.initializer for var in optimizer.variables()]

    sess.run(tf.global_variables_initializer())
    vars_sum_init = sess.run(get_var_sum(shared_variables))

    w1.run_update(n_steps=1)
    vars_sum_post_w1_update = sess.run(get_var_sum(shared_variables))

    if reset_rmsprop:
        sess.run(rmsprop_init_ops)

    w2.run_update(n_steps=1)
    vars_sum_post_w2_update = sess.run(get_var_sum(shared_variables))

    return vars_sum_init, vars_sum_post_w1_update, vars_sum_post_w2_update
def vars_hash_after_training(seed, n_steps):
    """Train in a subprocess and return a checksum of the saved parameters.

    Runs train.py on PongNoFrameskip-v4 with the given seed and step count,
    logging to a temporary directory. The latest checkpoint found under
    '<log_dir>/checkpoints' is then restored into a freshly built 'global'
    inference network.

    Args:
        seed: Value passed to train.py as --seed.
        n_steps: Value passed to train.py as --n_steps.

    Returns:
        The sum of all elements of all restored trainable variables.
    """
    tf.reset_default_graph()

    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = "python train.py PongNoFrameskip-v4 --wake_interval 1 " \
              "--seed {} --n_steps {}".format(seed, n_steps)
        # temp_dir is appended after splitting so a path containing spaces
        # stays a single argument.
        cmd = cmd.split(' ') + ["--log_dir", temp_dir]
        subprocess.call(cmd)

        # The session is closed on exit from the block, rather than being
        # leaked on every call.
        with tf.Session() as sess:
            # The environment is only needed for the size of its action
            # space.
            dummy_env = gym.make('PongNoFrameskip-v4')
            with tf.variable_scope('global'):
                make_inference_network(n_actions=dummy_env.action_space.n,
                                       weight_inits='glorot')
            saver = tf.train.Saver()
            ckpt_dir = osp.join(temp_dir, 'checkpoints')
            ckpt_file = tf.train.latest_checkpoint(ckpt_dir)
            saver.restore(sess, ckpt_file)
            # Named to avoid shadowing the builtin `vars`.
            tf_vars = sess.run(tf.trainable_variables())

        vars_hash = np.sum([np.sum(v) for v in tf_vars])

    return vars_hash
def vars_hash_after_training(seed, n_steps):
    """Train in a subprocess and return a checksum of the saved parameters.

    Runs train.py on PongNoFrameskip-v4 with the given seed and step count,
    logging to a temporary directory. The latest checkpoint found under
    '<log_dir>/checkpoints' is then restored into a freshly built 'global'
    inference network.

    Args:
        seed: Value passed to train.py as --seed.
        n_steps: Value passed to train.py as --n_steps.

    Returns:
        The sum of all elements of all restored trainable variables.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = ("python train.py PongNoFrameskip-v4 "
               "--manager_wake_interval 1 --seed {} --n_steps {}".format(
                   seed, n_steps))
        # temp_dir is appended after splitting so a path containing spaces
        # stays a single argument.
        cmd = cmd.split(' ') + ["--log_dir", temp_dir]
        subprocess.call(cmd)

        tf.reset_default_graph()
        # The session is closed on exit from the block, rather than being
        # leaked on every call.
        with tf.Session() as sess:
            with tf.variable_scope('global'):
                # Observation shape and action count for PongNoFrameskip-v4.
                make_inference_network(obs_shape=(84, 84, 4), n_actions=6)
            saver = tf.train.Saver()
            ckpt_dir = osp.join(temp_dir, 'checkpoints')
            ckpt_file = tf.train.latest_checkpoint(ckpt_dir)
            saver.restore(sess, ckpt_file)
            tf_vars = sess.run(tf.trainable_variables())

        vars_hash = np.sum([np.sum(v) for v in tf_vars])

    return vars_hash
def get_network(ckpt_dir, obs_shape, n_actions):
    """Build the 'global' inference network and load its latest checkpoint.

    Args:
        ckpt_dir: Directory searched for the latest checkpoint.
        obs_shape: Passed through to make_inference_network.
        n_actions: Passed through to make_inference_network.

    Returns:
        A (session, observation placeholder, action-probabilities op) tuple.
        The session is left open for the caller to use.

    Raises:
        Exception: If no checkpoint is found in `ckpt_dir`.
    """
    sess = tf.Session()

    with tf.variable_scope('global'):
        # Only the first and third of the five returned values are needed.
        network_outputs = make_inference_network(obs_shape, n_actions,
                                                 debug=False)
        obs_placeholder, _, action_probs_op, _, _ = network_outputs

    checkpoint_path = tf.train.latest_checkpoint(ckpt_dir)
    if not checkpoint_path:
        message = "Couldn't find checkpoint in '{}'".format(ckpt_dir)
        raise Exception(message)
    print("Loading checkpoint from '{}'".format(checkpoint_path))

    tf.train.Saver().restore(sess, checkpoint_path)

    return sess, obs_placeholder, action_probs_op