Example #1
def make_networks(n_workers, n_actions, weight_inits, value_loss_coef,
                  entropy_bonus, max_grad_norm, optimizer, debug):
    # https://www.tensorflow.org/api_docs/python/tf/Graph notes that graph
    # construction isn't thread-safe. So we do all graph construction
    # serially before starting the worker threads.

    # Create shared parameters
    with tf.variable_scope('global'):
        make_inference_network(n_actions=n_actions, weight_inits=weight_inits)

    # Create per-worker copies of shared parameters
    worker_networks = []
    for worker_n in range(n_workers):
        create_summary_ops = (worker_n == 0)
        worker_name = "worker_{}".format(worker_n)
        network = Network(scope=worker_name,
                          n_actions=n_actions,
                          entropy_bonus=entropy_bonus,
                          value_loss_coef=value_loss_coef,
                          weight_inits=weight_inits,
                          max_grad_norm=max_grad_norm,
                          optimizer=optimizer,
                          summaries=create_summary_ops,
                          debug=debug)
        worker_networks.append(network)
    return worker_networks
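A minimal usage sketch (the Worker class, a make_env helper, and the run_update call are assumptions drawn from the test examples below, not part of this function): build every network first, then hand one network to each thread, so no graph construction happens concurrently.

import threading

import tensorflow as tf

networks = make_networks(n_workers=4, n_actions=6, weight_inits='glorot',
                         value_loss_coef=0.5, entropy_bonus=0.01, max_grad_norm=0.5,
                         optimizer=tf.train.RMSPropOptimizer(learning_rate=5e-4),
                         debug=False)
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Graph construction is finished; only now start the worker threads.
workers = [Worker(sess=sess, env=make_env(), network=network, log_dir='/tmp')
           for network in networks]
threads = [threading.Thread(target=worker.run_update, kwargs={'n_steps': 5})
           for worker in workers]
for t in threads:
    t.start()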
Example #2
def make_networks(n_workers, obs_shape, n_actions, value_loss_coef, entropy_bonus, max_grad_norm,
                  optimizer, detailed_logs, debug):
    # https://www.tensorflow.org/api_docs/python/tf/Graph notes that graph construction isn't
    # thread-safe. So we do all graph construction serially before starting the worker threads.

    # The with statement wraps the execution of a block in methods defined by a context
    # manager. tf.variable_scope is a context manager for defining ops that create
    # variables (layers): it validates that the (optional) values are from the same graph,
    # ensures that graph is the default graph, and pushes a name scope and a variable scope.
    # Create shared parameters
    with tf.variable_scope('global'):  # https://www.tensorflow.org/api_docs/python/tf/variable_scope#class_variable_scope
        make_inference_network(obs_shape, n_actions)  # Imported from network.py; builds the network structure


    # Create per-worker copies of shared parameters
    worker_networks = []
    for worker_n in range(n_workers):  # loop n_workers times
        create_summary_ops = (worker_n == 0)  # create summary ops only for the first worker
        worker_name = "worker_{}".format(worker_n)  # name the worker after its loop index

        # Network is defined in network.py
        network = Network(scope=worker_name, n_actions=n_actions, entropy_bonus=entropy_bonus,
                          value_loss_coef=value_loss_coef, max_grad_norm=max_grad_norm,
                          optimizer=optimizer, add_summaries=create_summary_ops,
                          detailed_logs=detailed_logs, debug=debug)
        worker_networks.append(network)  # append the network to the list

    return worker_networks  # return the list of networks
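To see what the scoping buys: variables created under 'global' can be fetched by scope name, which lets each worker build ops that copy the shared parameters into its own scope. A sketch of the usual A3C sync op (whether Network builds exactly this internally is an assumption):

global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='global')
local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='worker_0')
# Copy each shared parameter into the worker's local copy.
sync_op = tf.group(*[local.assign(glob)
                     for local, glob in zip(local_vars, global_vars)])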
Example #3
    def test_policy_loss(self):
        """
        Does calculating policy loss based on the cross-entropy really give the right result?
        """
        optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-3)
        with tf.variable_scope('global'):
            make_inference_network(obs_shape=(84, 84, 4), n_actions=6)
        network = Network('foo_scope',
                          n_actions=6,
                          entropy_bonus=0.0,
                          value_loss_coef=0.5,
                          max_grad_norm=0.5,
                          optimizer=optimizer,
                          add_summaries=False)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        obs = np.random.rand(3, 84, 84, 4)
        action_probs = sess.run(network.action_probs,
                                feed_dict={network.states: obs})

        rewards = [4, 5, 6]
        actions = [1, 3, 2]
        advantage, actual_loss = sess.run(
            [network.advantage, network.policy_loss],
            feed_dict={
                network.states: obs,
                network.actions: actions,
                network.returns: rewards
            })
        expected_loss = (-np.log(action_probs[0][1]) * advantage[0] +
                         -np.log(action_probs[1][3]) * advantage[1] +
                         -np.log(action_probs[2][2]) * advantage[2])
        expected_loss /= 3
        self.assertAlmostEqual(expected_loss, actual_loss, places=5)
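For reference, the value this test expects matches the standard TF1 policy-gradient formulation, where a per-example cross-entropy is weighted by a gradient-stopped advantage (the logits/actions/advantage names here are illustrative, not necessarily what Network uses internally):

# cross_entropy[i] = -log(softmax(logits[i])[actions[i]])
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions,
                                                               logits=logits)
policy_loss = tf.reduce_mean(cross_entropy * tf.stop_gradient(advantage))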
Example #4
    def test_rmsprop_variables(self):
        """
        Test 1: let's look at the variables the optimizer creates to check
        there's no funny business.
        """
        sess = tf.Session()
        env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)

        optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                              decay=0.99,
                                              epsilon=1e-5)

        with tf.variable_scope('global'):
            make_inference_network(n_actions=env.action_space.n,
                                   weight_inits='glorot')

        network1 = Network(scope="worker_1",
                           n_actions=env.action_space.n,
                           entropy_bonus=0.01,
                           value_loss_coef=0.5,
                           weight_inits='glorot',
                           max_grad_norm=0.5,
                           optimizer=optimizer,
                           summaries=False,
                           debug=False)
        Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

        vars1 = optimizer.variables()

        network2 = Network(scope="worker_2",
                           n_actions=env.action_space.n,
                           entropy_bonus=0.01,
                           value_loss_coef=0.5,
                           weight_inits='glorot',
                           max_grad_norm=0.5,
                           optimizer=optimizer,
                           summaries=False,
                           debug=False)
        Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

        vars2 = optimizer.variables()

        self.assertNotEqual(id(vars1), id(vars2))

        # First, were any extra variables added when we created the second
        # network? That might indicate a second set of statistics.
        self.assertEqual(len(vars1), len(vars2))
        # Second, are all the variables definitely associated with the global
        # set of parameters rather than the thread-local parameters?
        for v in vars1:
            self.assertIn('global', v.name)
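For context: RMSProp keeps its per-parameter statistics in 'slot' variables, one set per optimized variable, and optimizer.variables() returns exactly those slots. A standalone sketch, unrelated to the Network class:

import tensorflow as tf

v = tf.Variable([1.0, 2.0], name='global/v')
opt = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.99)
train_op = opt.minimize(tf.reduce_sum(v ** 2), var_list=[v])
print(opt.get_slot_names())               # ['rms', 'momentum']
print([x.name for x in opt.variables()])  # slot names contain 'global/v'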
Example #5
def run_weight_test(reset_rmsprop):
    tf.reset_default_graph()
    utils.set_random_seeds(0)
    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    env.seed(0)

    with tf.variable_scope('global'):
        make_inference_network(n_actions=env.action_space.n,
                               weight_inits='glorot')
    shared_variables = tf.global_variables()

    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4,
                                          decay=0.99,
                                          epsilon=1e-5)

    network1 = Network(scope="worker_1",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w1 = Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

    network2 = Network(scope="worker_2",
                       n_actions=env.action_space.n,
                       entropy_bonus=0.01,
                       value_loss_coef=0.5,
                       weight_inits='glorot',
                       max_grad_norm=0.5,
                       optimizer=optimizer,
                       summaries=False,
                       debug=False)
    w2 = Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

    rmsprop_init_ops = [v.initializer for v in optimizer.variables()]

    sess.run(tf.global_variables_initializer())

    vars_sum_init = sess.run(get_var_sum(shared_variables))
    w1.run_update(n_steps=1)
    vars_sum_post_w1_update = sess.run(get_var_sum(shared_variables))
    if reset_rmsprop:
        sess.run(rmsprop_init_ops)
    w2.run_update(n_steps=1)
    vars_sum_post_w2_update = sess.run(get_var_sum(shared_variables))

    return vars_sum_init, vars_sum_post_w1_update, vars_sum_post_w2_update
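get_var_sum isn't shown in this example; a plausible definition consistent with how it's used here (a single op summing every element of every given variable) would be:

def get_var_sum(variables):
    # One scalar op: the sum of all elements of all the given variables.
    return tf.add_n([tf.reduce_sum(v) for v in variables])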
Example #6
def vars_hash_after_training(seed, n_steps):
    tf.reset_default_graph()
    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = "python train.py PongNoFrameskip-v4 --wake_interval 1 " \
              "--seed {} --n_steps {}".format(seed, n_steps)
        cmd = cmd.split(' ') + ["--log_dir", temp_dir]
        subprocess.call(cmd)

        sess = tf.Session()
        dummy_env = gym.make('PongNoFrameskip-v4')
        with tf.variable_scope('global'):
            make_inference_network(n_actions=dummy_env.action_space.n,
                                   weight_inits='glorot')
        saver = tf.train.Saver()
        ckpt_dir = osp.join(temp_dir, 'checkpoints')
        ckpt_file = tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess, ckpt_file)
        tf_vars = sess.run(tf.trainable_variables())
        vars_hash = np.sum([np.sum(v) for v in tf_vars])

        return vars_hash
Example #7
def vars_hash_after_training(seed, n_steps):
    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = ("python train.py PongNoFrameskip-v4 "
               "--manager_wake_interval 1 --seed {} --n_steps {}".format(
                   seed, n_steps))
        cmd = cmd.split(' ') + ["--log_dir", temp_dir]
        subprocess.call(cmd)

        tf.reset_default_graph()
        sess = tf.Session()
        with tf.variable_scope('global'):
            make_inference_network(obs_shape=(84, 84, 4),
                                   n_actions=6)  # For PongNoFrameskip-v4

        saver = tf.train.Saver()
        ckpt_dir = osp.join(temp_dir, 'checkpoints')
        ckpt_file = tf.train.latest_checkpoint(ckpt_dir)
        saver.restore(sess, ckpt_file)
        tf_vars = sess.run(tf.trainable_variables())
        vars_hash = np.sum([np.sum(v) for v in tf_vars])

        return vars_hash
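A typical caller is a determinism test (sketch; the test class and step count are illustrative): two runs with the same seed should end with bit-identical weights, and therefore equal hashes.

import unittest

class TestDeterminism(unittest.TestCase):
    def test_same_seed_same_weights(self):
        hash1 = vars_hash_after_training(seed=0, n_steps=100)
        hash2 = vars_hash_after_training(seed=0, n_steps=100)
        self.assertEqual(hash1, hash2)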
Example #8
def get_network(ckpt_dir, obs_shape, n_actions):
    sess = tf.Session()

    with tf.variable_scope('global'):
        obs_placeholder, _, action_probs_op, _, _ = \
            make_inference_network(obs_shape, n_actions, debug=False)

    ckpt_file = tf.train.latest_checkpoint(ckpt_dir)
    if not ckpt_file:
        raise Exception("Couldn't find checkpoint in '{}'".format(ckpt_dir))
    print("Loading checkpoint from '{}'".format(ckpt_file))
    saver = tf.train.Saver()
    saver.restore(sess, ckpt_file)

    return sess, obs_placeholder, action_probs_op
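A usage sketch for get_network (the checkpoint directory is a placeholder): feed a batch containing one observation and sample an action from the returned policy op.

import numpy as np

sess, obs_placeholder, action_probs_op = \
    get_network('runs/checkpoints', obs_shape=(84, 84, 4), n_actions=6)
obs = np.random.rand(1, 84, 84, 4)
probs = sess.run(action_probs_op, feed_dict={obs_placeholder: obs})[0]
action = np.random.choice(len(probs), p=probs)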