def test_policy_theta_discrete_space_PARAM_FAIL(gym_and_tf_discrete_setup):

    _, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    obs_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 3))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p_wrong_shape, playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo)

    with pytest.raises(AssertionError):
        # Passing the wrongly shaped raw placeholder (instead of a theta network output)
        # is expected to trip an assertion inside policy_theta_discrete_space.
        bloc.policy_theta_discrete_space(obs_p_wrong_shape, playground)
def test_policy_theta_discrete_space_ENV_NOT_DISCRETE(
        gym_and_tf_continuous_setup):
    obs_p, act_p, exp_spec, continuous_playground = gym_and_tf_continuous_setup

    out_p_wrong_shape = tf_cv1.placeholder(tf.float32, shape=(None, 43))
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, continuous_playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo)

    with pytest.raises(AssertionError):
        # A continuous (Box) playground is expected to trip the discrete-space assertion.
        bloc.policy_theta_discrete_space(theta_mlp, continuous_playground)
def REINFORCE_policy(observation_placeholder: tf.Tensor, action_placeholder: tf.Tensor, Q_values_placeholder: tf.Tensor,
                     experiment_spec: ExperimentSpec, playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The learning agent: REINFORCE (aka: Basic Policy Gradient)
    Based on the paper by Williams, R. J.
         Simple statistical gradient-following algorithms for connectionist reinforcement learning. (1992)

    Policy gradient is an on-policy method that seeks to directly optimize the policy π_θ by using sampled
    trajectories τ as weights. Those weights indicate how well the policy performed.
    Based on that knowledge, the algorithm updates the policy parameters θ to make actions leading to similarly
    good trajectories more likely and actions leading to similarly bad trajectories less likely.
    In the case of Deep Reinforcement Learning, the policy parameter θ is a neural network.

    :type observation_placeholder: tf.Tensor
    :type action_placeholder: tf.Tensor
    :type Q_values_placeholder: tf.Tensor
    :type playground: GymPlayground
    :type experiment_spec: ExperimentSpec
    :return: (sampled_action, theta_mlp, pseudo_loss)
    :rtype: (tf.Tensor, tf.Tensor, tf.Tensor)
    """
    with tf.name_scope(vocab.REINFORCE) as scope:

        """ ---- Build parameter theta as a multilayer perceptron ---- """
        theta_mlp = build_MLP_computation_graph(observation_placeholder, playground.ACTION_CHOICES,
                                                experiment_spec.theta_nn_h_layer_topo,
                                                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                                                output_layers_activation=experiment_spec.theta_output_layers_activation,
                                                name=vocab.theta_NeuralNet)

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):

            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with the environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])

            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_p_all = policy_theta_discrete_space(theta_mlp, playground)

            """ ---- Build the pseudo loss function ---- """
            pseudo_loss = discrete_pseudo_loss(log_p_all, action_placeholder, Q_values_placeholder, playground,
                                               vocab.pseudo_loss)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError   # (Ice-Boxed) TODO: implement the policy for continuous action space

        # ::Other gym environment
        else:
            print("\n>>> The agent implementation does not support that environment space "
                  "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, theta_mlp, pseudo_loss
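
# ---- Hedged usage sketch (not part of the original module) -------------------------------------------------------
# A minimal sketch of how the three tensors returned by REINFORCE_policy could be wired into a training step.
# It assumes: the `tf_cv1` alias for `tf.compat.v1` (as used in the tests above), that `exp_spec` exposes a
# `learning_rate` attribute, and a discrete `playground`. The placeholder names and the AdamOptimizer choice
# are illustrative assumptions, not the repository's actual training code.
def _sketch_reinforce_training_ops(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, playground.OBSERVATION_SPACE.shape[0]), name='obs_ph')
    act_ph = tf_cv1.placeholder(tf.int32, shape=(None,), name='act_ph')
    Q_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='Q_ph')

    sampled_action, theta_mlp, pseudo_loss = REINFORCE_policy(obs_ph, act_ph, Q_ph, exp_spec, playground)

    # Minimizing the pseudo loss performs the policy-gradient update on theta.
    train_op = tf_cv1.train.AdamOptimizer(learning_rate=exp_spec.learning_rate).minimize(pseudo_loss)
    return sampled_action, train_op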
def build_actor_policy_graph(
        observation_placeholder: tf.Tensor, experiment_spec: ExperimentSpec,
        playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR graph (aka the policy network)

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log-probabilities of each action in the action space

    :return: sampled_action, log_pi_all, theta_mlp
    """
    with tf.name_scope(vocab.actor_network) as scope:

        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):
            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with the environment, "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])
            """ ---- Build parameter THETA as a multilayer perceptron ---- """
            theta_mlp = build_MLP_computation_graph(
                observation_placeholder,
                playground.ACTION_CHOICES,
                experiment_spec.theta_nn_h_layer_topo,
                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                output_layers_activation=experiment_spec.theta_output_layers_activation,
                name=vocab.theta_NeuralNet)
            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_pi_all = policy_theta_discrete_space(
                theta_mlp, playground)

        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError  # (Ice-Boxed) TODO: implement the policy for continuous action space

        # ::Other gym environment
        else:
            print(
                "\n>>> The agent implementation does not support that environment space "
                "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError

    return sampled_action, log_pi_all, theta_mlp
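
# ---- Hedged usage sketch (not part of the original module) -------------------------------------------------------
# Shows how the actor graph could be used to sample a single action from one observation. It assumes the
# `tf_cv1` alias for `tf.compat.v1` and the classic gym API where `env.reset()` returns only the observation;
# the placeholder name and the one-off Session are illustrative assumptions.
def _sketch_sample_one_action(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, playground.OBSERVATION_SPACE.shape[0]), name='obs_ph')
    sampled_action, log_pi_all, theta_mlp = build_actor_policy_graph(obs_ph, exp_spec, playground)

    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())
        obs = playground.env.reset()
        # Feed a batch of one observation and fetch the sampled action index.
        action = sess.run(sampled_action, feed_dict={obs_ph: [obs]})
    return action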
def test_policy_theta_discrete_space_PASS(gym_and_tf_discrete_setup):

    obs_p, act_p, exp_spec, playground = gym_and_tf_discrete_setup
    theta_mlp = bloc.build_MLP_computation_graph(
        obs_p, playground.ACTION_CHOICES, exp_spec.theta_nn_h_layer_topo)
    bloc.policy_theta_discrete_space(theta_mlp, playground)
def build_actor_critic_shared_graph(
        obs_ph: tf.Tensor, exp_spec: ExperimentSpec, playground: GymPlayground
) -> (tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor):
    """
    The ACTOR-CRITIC shared network variant architecture

        1. Actor network theta
            input: the observations collected
            output: the logits of each action in the action space

        2. Policy
            input: the actor network
            output: a selected action & the log-probabilities of each action in the action space

        3. Critic network phi
            input: the observations collected (passed through the shared network)
            output: the V-value estimate of the observed state

    :return: sampled_action, log_pi_all, theta_shared_MLP, critic
    """
    """ ---- Assess the input shape compatibility ---- """
    are_compatible = obs_ph.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
    assert are_compatible, ("the observation_placeholder is incompatible with the environment, "
                            "{} != {}").format(obs_ph.shape.as_list()[-1],
                                               playground.OBSERVATION_SPACE.shape[0])

    # ::Discrete case
    if isinstance(playground.env.action_space, gym.spaces.Discrete):
        """ ---- Build parameter THETA as a multilayer perceptron ---- """
        theta_shared_MLP = build_MLP_computation_graph(
            obs_ph,
            playground.ACTION_CHOICES,
            exp_spec.theta_nn_h_layer_topo,
            hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
            output_layers_activation=exp_spec.theta_output_layers_activation,
            reuse=None,  # <-- (!)
            name=vocab.shared_network)
        """ ---- Build the policy for discrete space ---- """
        sampled_action, log_pi_all = policy_theta_discrete_space(
            theta_shared_MLP, playground)

    # ::Continuous case
    elif isinstance(playground.env.action_space, gym.spaces.Box):
        raise NotImplementedError  # (Ice-Boxed) TODO: implement the policy for continuous action space

    # ::Other gym environment
    else:
        print(
            "\n>>> The agent implementation does not support that environment space "
            "{} yet.\n\n".format(playground.env.action_space))
        raise NotImplementedError
    """ ---- Build the Critic ---- """
    phi_shared_MLP = build_MLP_computation_graph(
        obs_ph,
        playground.ACTION_CHOICES,
        exp_spec.theta_nn_h_layer_topo,
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        reuse=True,  # <-- (!)
        name=vocab.shared_network)

    critic = build_MLP_computation_graph(
        phi_shared_MLP,
        1, (),
        hidden_layers_activation=exp_spec.theta_hidden_layers_activation,
        output_layers_activation=exp_spec.theta_output_layers_activation,
        name=vocab.V_estimate)

    return sampled_action, log_pi_all, theta_shared_MLP, critic
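
# ---- Hedged usage sketch (not part of the original module) -------------------------------------------------------
# Illustrates one way the shared actor-critic graph could be trained: the critic with a mean-squared error
# against sampled returns, the actor with an advantage-weighted log-probability objective. It assumes the
# `tf_cv1` alias for `tf.compat.v1` and that `log_pi_all` has shape (batch, ACTION_CHOICES); the placeholder
# names and the advantage computation are illustrative assumptions, not the repository's training procedure.
def _sketch_actor_critic_losses(exp_spec: ExperimentSpec, playground: GymPlayground):
    obs_ph = tf_cv1.placeholder(tf.float32, shape=(None, playground.OBSERVATION_SPACE.shape[0]), name='obs_ph')
    act_ph = tf_cv1.placeholder(tf.int32, shape=(None,), name='act_ph')
    return_ph = tf_cv1.placeholder(tf.float32, shape=(None,), name='return_ph')

    sampled_action, log_pi_all, theta_shared_MLP, critic = build_actor_critic_shared_graph(
        obs_ph, exp_spec, playground)
    V_estimate = tf.squeeze(critic, axis=-1)

    # Critic target: regress V(s) toward the sampled return.
    critic_loss = tf.reduce_mean((return_ph - V_estimate) ** 2)

    # Actor objective: log pi(a|s) weighted by the (stop-gradient) advantage.
    advantage = tf.stop_gradient(return_ph - V_estimate)
    log_pi_taken = tf.reduce_sum(tf.one_hot(act_ph, playground.ACTION_CHOICES) * log_pi_all, axis=-1)
    actor_loss = -tf.reduce_mean(log_pi_taken * advantage)

    return actor_loss, critic_loss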