Code Example #1
def build_act(make_obs_ph, make_feature_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    make_feature_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder for the extra feature input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        features_ph = U.ensure_tf_input(make_feature_ph("feature"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), features_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        act = U.function(inputs=[observations_ph, features_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
        return act
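
A minimal usage sketch for this variant, assuming a BatchInput-style placeholder helper (from baselines' tf_util / deepq utilities, depending on the version) and a hypothetical my_q_func model that consumes both the observation and the extra feature tensor:

# Hypothetical wiring; my_q_func and the input shapes are illustrative only.
def my_q_func(obs, feat, num_actions, scope, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        h = tf.concat([tf.layers.flatten(obs), feat], axis=1)
        h = tf.layers.dense(h, 64, activation=tf.nn.relu)
        return tf.layers.dense(h, num_actions)

act = build_act(
    make_obs_ph=lambda name: U.BatchInput((84, 84, 4), name=name),
    make_feature_ph=lambda name: U.BatchInput((16,), name=name),
    q_func=my_q_func,
    num_actions=6,
)
# After U.initialize(), act(obs_batch, feature_batch) returns epsilon-greedy actions;
# the stochastic and update_eps inputs fall back to the givens (True and -1.0).
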
Code Example #2
def svgd_adv_build_act(
    make_obs_ph,
    adv_func,
    num_actions,
    en=1,
    scope="svgd_advantage_learning",
    reuse=None,
):

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        is_training = tf.placeholder(tf.bool, (), name='is_training')
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))

        # build an ensemble of `en` advantage networks and sum their outputs
        a_values_list = []

        for count in range(en):
            adv_tem = adv_func(
                observations_ph.get(),
                num_actions,
                is_training=is_training,
                scope="adv_func" + str(count) + '_',
            )
            a_values_list.append(adv_tem)

        a_values = sum(a_values_list)
        deterministic_actions = tf.argmax(a_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph, is_training
        ],
                         outputs=[tf.squeeze(output_actions), a_values] +
                         a_values_list,
                         givens={
                             update_eps_ph: -1.0,
                             stochastic_ph: True,
                             is_training: False
                         },
                         updates=[update_eps_expr])

        return act, is_training
Code Example #3
def get_q_values(make_obs_ph,
                 q_func,
                 num_actions,
                 scope="q-values",
                 reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        return q_values
Code Example #4
def build_act_modelbased_general(make_obs_ph, net_func, num_actions, scope="deepq", secondary_scope="model_func", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        z, v = net_func(observations_ph.get(),
                        num_actions,
                        scope=secondary_scope,
                        reuse=tf.AUTO_REUSE)

        act = U.function(inputs=[observations_ph],
                         outputs=[z])

        return act
Code Example #5
def build_act_mfvae(make_obs_ph, q_func, z_noise, num_actions, scope="deepq", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        q, q_deterministic, v_mean, v_logvar, z_mean, z_logvar, recon_obs = q_func(observations_ph.get(), z_noise,
                                                                                   num_actions,
                                                                                   scope="q_func",
                                                                                   reuse=tf.AUTO_REUSE)

        act = U.function(inputs=[observations_ph, z_noise],
                         outputs=[z_mean, z_logvar])

        return act
Code Example #6
File: build_graph.py  Project: IcarusTan/baselines
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
        return act
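
At run time the returned act is called once per environment step. A hedged sketch of the usual epsilon-greedy rollout loop (env, exploration, and total_timesteps are placeholders, not part of the snippet above; observations are assumed to be NumPy arrays):

# Hypothetical rollout loop; exploration.value(t) stands in for whatever
# epsilon schedule the caller maintains.
obs = env.reset()
for t in range(total_timesteps):
    # Passing update_eps >= 0 overwrites the stored epsilon; the default given (-1.0) keeps it.
    action = act(obs[None], update_eps=exploration.value(t))[0]
    obs, rew, done, _ = env.step(action)
    if done:
        obs = env.reset()
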
Code Example #7
def imit_build_act(
    make_obs_ph,
    bnn_func,
    num_actions,
    en,
    bnn_explore=0.01,
    scope="Imitation",
    reuse=None,
    use_sign=False,
):

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        is_training = tf.placeholder(tf.bool, (), name='is_training')

        # construct BNNs
        BNN_output_list = []
        for count in range(en):
            bnn_output_tem = bnn_func(
                observations_ph.get(),
                num_actions,
                is_training=is_training,
                scope="bnn_func" + str(count) + '_',
                use_sign=use_sign,
            )
            BNN_output_list.append(bnn_output_tem)

        BNN_output = sum(BNN_output_list)

        eps = tf.constant(0.01)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random_bnn = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps

        bnn_deterministic_actions = tf.argmax(BNN_output, axis=1)
        bnn_output_actions = tf.where(chose_random_bnn, random_actions,
                                      bnn_deterministic_actions)

        bnn_act = U.function(
            inputs=[observations_ph, is_training],
            outputs=[tf.squeeze(bnn_output_actions), BNN_output],
            givens={is_training: False},
        )

        return bnn_act, is_training
Code Example #8
File: graph_util.py  Project: MouseHu/emdqn
def build_random_input(input_type, obs_shape):
    obs_input_augment = U.ensure_tf_input(input_type(obs_shape, None, name="obs_augment"))

    # add uniform integer pixel noise (scaled by 1/255) to the observation, assumed to be in [0, 1], and clip back to [0, 1]
    rand_noise = tf.random.uniform(shape=(1,) + obs_shape, minval=-30, maxval=30, dtype=tf.int32, name="randuniform")
    rand_noise = tf.cast(rand_noise, tf.float32) / 255.
    rand_img = tf.minimum(tf.maximum(rand_noise + obs_input_augment.get(), 0.), 1.)
    init_rand_op = tf.variables_initializer([v for v in tf.global_variables() if 'randuniform' in v.name])

    augment_input_func = U.function(
        inputs=[obs_input_augment],
        outputs=[rand_img],
        updates=[]
    )
    rand_init_func = U.function([], [], updates=[init_rand_op])
    return augment_input_func, rand_init_func
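
A hedged usage sketch for this noise-based augmentation; input_type and the observation shape are whatever the surrounding project already feeds to U.ensure_tf_input:

# Hypothetical usage: observations are assumed to be float images scaled to [0, 1].
augment, rand_init = build_random_input(input_type, (84, 84, 4))
rand_init()                          # initializes any random variables created above
noisy_batch = augment(obs_batch)[0]  # same shape as obs_batch, clipped back to [0, 1]
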
Code Example #9
File: graph_util.py  Project: MouseHu/emdqn
def build_random_input_cnn(input_type, obs_shape):
    obs_input_augment = U.ensure_tf_input(input_type(obs_shape, None, name="obs_augment"))

    # augment observations with a fixed, untrained random 3x3 convolution; its kernel is (re)sampled via rand_init_func
    rand_img = tf.layers.conv2d(obs_input_augment.get(), obs_shape[2], 3, padding='same',
                                kernel_initializer=tf.initializers.glorot_normal(), trainable=False, name='randcnn')

    init_rand_op = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])

    augment_input_func = U.function(
        inputs=[obs_input_augment],
        outputs=[rand_img],
        updates=[]
    )
    rand_init_func = U.function([], [], updates=[init_rand_op])
    return augment_input_func, rand_init_func
Code Example #10
def build_value_function(make_obs_ph,
                         q_func,
                         num_actions,
                         scope="deepq",
                         reuse=None):
    """Creates the value function:
        My own version of baselines.build_act which returns the q_values instead of just the action

        Parameters
        ----------
        make_obs_ph: str -> tf.placeholder or TfInput
            a function that takes a name and creates a placeholder of input with that name
        q_func: (tf.Variable, int, str, bool) -> tf.Variable
            the model that takes the following inputs:
                observation_in: object
                    the output of observation placeholder
                num_actions: int
                    number of actions
                scope: str
                reuse: bool
                    should be passed to outer variable scope
            and returns a tensor of shape (batch_size, num_actions) with values of every action.
        num_actions: int
            number of actions.
        scope: str or VariableScope
            optional scope for variable_scope.
        reuse: bool or None
            whether or not the variables should be reused. To be able to reuse the scope must be given.

        Returns
        -------
        value: (tf.Variable) -> tf.Variable
            function that returns the Q-values for every action given an observation.
            See the top of the file for details.
        """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        value = U.function(inputs=[observations_ph], outputs=q_values)
        return value
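
A minimal usage sketch, assuming the q_func weights in this scope are already trained or restored; make_obs_ph and my_q_func are the usual observation-placeholder maker and Q-network builder (hypothetical names):

# Hypothetical call pattern for the returned value function.
value = build_value_function(make_obs_ph, my_q_func, num_actions=6)
q = value(obs_batch)               # ndarray of shape (batch_size, num_actions)
greedy_actions = q.argmax(axis=1)
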
Code Example #11
def adv_build_train(make_obs_ph, 
                v_func, 
                adv_func,
                num_actions,
                learning_rate,
                en,
                grad_norm_clipping=None,
                gamma=0.99,
                scope="advantage_learning", 
                reuse=None,
                ):

    act_f, is_training = adv_build_act(make_obs_ph, adv_func, num_actions,
                                       en=en, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        
        adv_func_vars_list = []
        target_adv_func_vars_list = []        
        error_list = []

        # construct placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        
        obs_t_input_list = tf.split(obs_t_input.get(), en, axis=0)
        act_t_ph_list = tf.split(act_t_ph, en, axis=0)
        rew_t_ph_list = tf.split(rew_t_ph, en, axis=0)
        obs_tp1_input_list = tf.split(obs_tp1_input.get(), en, axis=0)
        done_mask_ph_list = tf.split(done_mask_ph, en, axis=0)

        # build v function
        v_t = tf.squeeze(v_func(obs_t_input.get(), scope="v_func", reuse=False))
        v_t_list = tf.split(v_t, en, axis=0)
        v_func_vars = U.scope_vars(U.absolute_scope_name("v_func"))

        # build v target
        v_tp1 = tf.squeeze(v_func(obs_tp1_input.get(), scope="target_v_func", reuse=False))
        v_tp1_list = tf.split(v_tp1, en, axis=0)
        target_v_func_vars = U.scope_vars(U.absolute_scope_name("target_v_func"))

        
        for count in range(en):
            # build BNN
            adv_t = adv_func(obs_t_input_list[count], num_actions, is_training=is_training, 
                        scope="adv_func" + str(count) + '_', reuse=True,
                        )

            adv_func_vars = U.scope_vars(U.absolute_scope_name("adv_func" + str(count) + '_'))
            adv_func_vars_list += adv_func_vars
            
            # build BNN target
            adv_tp1 = adv_func(obs_tp1_input_list[count], num_actions, is_training=False,
                        scope="target_adv_func" + str(count) + '_',
                        )
            target_adv_func_vars_list += U.scope_vars(U.absolute_scope_name("target_adv_func" + str(count) + '_'))

            adv_t_selected = tf.reduce_sum(adv_t * tf.one_hot(act_t_ph_list[count], num_actions), 1)

            adv_tp1_best = tf.reduce_max(adv_tp1, 1)

            q_t_selected = v_t_list[count] + adv_t_selected
            q_tp1_best = v_tp1_list[count] + adv_tp1_best
            q_tp1_best_masked = (1.0 - done_mask_ph_list[count]) * q_tp1_best
            q_t_selected_target = rew_t_ph_list[count] + gamma * q_tp1_best_masked

            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            
            errors = tf.reduce_mean(tf.square(td_error))
            error_list.append(errors)

        all_vars_list = v_func_vars + adv_func_vars_list
        all_target_vars_list = target_v_func_vars + target_adv_func_vars_list

        total_loss = sum(error_list)
        
        assert grad_norm_clipping is not None
        optimize_expr = U.minimize_and_clip(
                                            tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4),
                                            total_loss,
                                            var_list=all_vars_list,
                                            clip_val=grad_norm_clipping
                                        )
        update_target_expr = []

        for var, var_target in zip(sorted(all_vars_list, key=lambda v: v.name),
                                sorted(all_target_vars_list, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                is_training,
            ],
            outputs=error_list,
            updates=[optimize_expr],
            givens={is_training:True}
        )
        update_target = U.function([], [], updates=[update_target_expr])

    return act_f, train, update_target
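
Because the graph splits every batch with tf.split(..., en, axis=0), the batch size fed to train must be divisible by en. A hedged sketch of a training loop around the returned callables (the replay buffer and update frequency are placeholders):

# Hypothetical training loop; batch_size % en == 0 is required by the tf.split calls above.
act_f, train, update_target = adv_build_train(
    make_obs_ph, v_func, adv_func, num_actions=6,
    learning_rate=1e-4, en=4, grad_norm_clipping=10)
update_target()  # sync the target v/advantage networks once at the start
for t in range(total_timesteps):
    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
    losses = train(obses_t, actions, rewards, obses_tp1, dones, True)  # per-member TD losses
    if t % target_update_freq == 0:
        update_target()
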
Code Example #12
def build_train_mf(make_obs_ph,
                   q_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   scope="mfec",
                   alpha=1.0,
                   beta=1.0,
                   theta=1.0,
                   latent_dim=32,
                   ib=True,
                   reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_noise = tf.placeholder(tf.float32, [None, latent_dim],
                               name="act_noise")
    act_f = build_act_mf(make_obs_ph,
                         q_func,
                         act_noise,
                         num_actions,
                         scope=scope,
                         reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN

        obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae"))
        z_noise_vae = tf.placeholder(tf.float32, [None, latent_dim],
                                     name="z_noise_vae")
        inputs = [obs_vae_input, z_noise_vae]
        if ib:
            qec_input = tf.placeholder(tf.float32, [None], name='qec')
            inputs.append(qec_input)
        outputs = []

        q_vae, q_deterministic_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = q_func(
            obs_vae_input.get(),
            z_noise_vae,
            num_actions,
            scope="q_func",
            reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        encoder_loss = -1 + z_mean_vae**2 + tf.exp(z_logvar_vae) - z_logvar_vae

        total_loss = tf.reduce_mean(beta * encoder_loss)
        decoder_loss = tf.keras.losses.binary_crossentropy(
            tf.reshape(recon_obs, [-1]),
            tf.reshape(tf.dtypes.cast(obs_vae_input._placeholder, tf.float32),
                       [-1]))
        print("here", z_mean_vae.shape, z_logvar_vae.shape, encoder_loss.shape,
              decoder_loss.shape)
        vae_loss = beta * encoder_loss + theta * decoder_loss
        outputs.append(encoder_loss)
        outputs.append(decoder_loss)
        outputs.append(vae_loss)
        total_loss += tf.reduce_mean(theta * decoder_loss)
        if ib:
            ib_loss = (v_mean_vae -
                       tf.stop_gradient(tf.expand_dims(qec_input, 1))
                       )**2 / tf.exp(v_logvar_vae) + v_logvar_vae
            print("here2", v_mean_vae.shape,
                  tf.expand_dims(qec_input, 1).shape, v_logvar_vae.shape,
                  ib_loss.shape)
            total_ib_loss = alpha * ib_loss + beta * encoder_loss
            outputs.append(total_ib_loss)
            total_loss += tf.reduce_mean(alpha * ib_loss)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=q_func_vars)
        # Create callable functions
        # EMDQN
        total_loss_summary = tf.summary.scalar("total loss", total_loss)
        z_var_summary = tf.summary.scalar("z_var",
                                          tf.reduce_mean(tf.exp(z_logvar_vae)))
        encoder_loss_summary = tf.summary.scalar("encoder loss",
                                                 tf.reduce_mean(encoder_loss))
        decoder_loss_summary = tf.summary.scalar("decoder loss",
                                                 tf.reduce_mean(decoder_loss))
        summaries = [
            total_loss_summary, z_var_summary, encoder_loss_summary,
            decoder_loss_summary
        ]
        if ib:
            ib_loss_summary = tf.summary.scalar("ib loss",
                                                tf.reduce_mean(ib_loss))
            total_ib_loss_summary = tf.summary.scalar(
                "total ib loss", tf.reduce_mean(total_ib_loss))
            summaries.append(ib_loss_summary)
            summaries.append(total_ib_loss_summary)

        summary = tf.summary.merge(summaries)
        outputs.append(summary)

        train = U.function(inputs=inputs,
                           outputs=[total_loss, summary],
                           updates=[optimize_expr])

        return act_f, train
Code Example #13
File: build_graph.py  Project: zerolocker/baselines
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                train_gaze,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="DeepqWithGaze",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        initial_freeze_phase_ph = tf.placeholder(tf.bool, (),
                                                 name="initial_freeze_phase")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = gflag.qfunc_models.get(
            "q_func").weights  # already includes gaze_models weights
        q_func_trainable_vars = [ w for w in gflag.qfunc_models.get("q_func").trainable_weights \
            if (train_gaze or w not in gflag.gaze_models.get("q_func").trainable_weights) ] # train_gaze=False excludes gaze model's weight

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = gflag.qfunc_models.get(
            "target_q_func").weights  # already includes gaze_models weights

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        initial_freeze_weights = gflag.qfunc_models.get_weight_names_for_initial_freeze(
            model_name="q_func")
        q_func_trainable_vars_for_initial_freeze = list(
            filter(lambda w: w.name not in initial_freeze_weights,
                   q_func_trainable_vars))
        if grad_norm_clipping is not None:
            optimize_expr_for_initial_freeze = lambda: U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_trainable_vars_for_initial_freeze,
                                                clip_val=grad_norm_clipping) \
                                            if q_func_trainable_vars_for_initial_freeze else tf.no_op()
            optimize_expr_after_freeze = lambda: U.minimize_and_clip(
                optimizer,
                weighted_error,
                var_list=q_func_trainable_vars,
                clip_val=grad_norm_clipping)
        else:
            # must put the operation under lambda, if you fully read tf.cond()'s documentation
            optimize_expr_for_initial_freeze = lambda: optimizer.minimize(
                weighted_error,
                var_list=q_func_trainable_vars_for_initial_freeze)
            optimize_expr_after_freeze = lambda: optimizer.minimize(
                weighted_error, var_list=q_func_trainable_vars)
        optimize_expr = tf.cond(initial_freeze_phase_ph,
                                optimize_expr_for_initial_freeze,
                                optimize_expr_after_freeze)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        assert len(q_func_vars) == len(target_q_func_vars)
        for var, var_target in zip(q_func_vars, target_q_func_vars):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            importance_weights_ph,
            initial_freeze_phase_ph,
        ],
                           outputs=td_error,
                           updates=[optimize_expr],
                           givens={K.backend.learning_phase(): 1})
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        # For tensorboard
        merged = tf.summary.merge([
            tf.summary.image('img_curframe', obs_t_input.get()),
            tf.summary.image(
                'gaze_curframe',
                q_func(obs_t_input.get(),
                       num_actions,
                       scope="q_func",
                       return_gaze=True,
                       reuse=True))
        ])
        tensorboard_summary = U.function(
            inputs=[obs_t_input],
            outputs=merged,
            givens={K.backend.learning_phase(): 0})

        return act_f, train, update_target, {
            'q_values': q_values
        }, tensorboard_summary
Code Example #14
def build_train_modelbased(make_obs_ph,
                           net_func,
                           model_func,
                           num_actions,
                           optimizer,
                           grad_norm_clipping=None,
                           gamma=1.0,
                           scope="mfec",
                           latent_dim=32,
                           input_dim=84 * 84 * 4,
                           hash_dim=32,
                           K=10,
                           beta=0.1,
                           predict=True,
                           reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    z_func = build_act_modelbased(make_obs_ph,
                                  net_func,
                                  num_actions,
                                  scope=scope,
                                  secondary_scope="net_func",
                                  reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        obs_mc_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg"))
        obs_mc_input_model_t = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_mc_input_model_tp1 = U.ensure_tf_input(make_obs_ph("obs_query"))
        reward_input_model = tf.placeholder(tf.float32, [None], name='reward')
        action_input_model = tf.placeholder(tf.int32, [None], name='action')
        latent_input_out = tf.placeholder(tf.float32, [None, latent_dim],
                                          name='latent')
        action_input_out = tf.placeholder(tf.int32, [None],
                                          name='action_input_out')
        # inputs = [obs_mc_input]
        # inputs = [tau, obs_mc_input_query, obs_mc_input_positive, obs_mc_input_negative]
        inputs = [
            tau, obs_mc_input_query, obs_mc_input_positive,
            obs_mc_input_negative, obs_mc_input_model_t,
            obs_mc_input_model_tp1, reward_input_model, action_input_model
        ]
        z_mc_model_t, _ = net_func(obs_mc_input_model_t.get(),
                                   num_actions,
                                   scope="net_func",
                                   reuse=True)
        z_mc_model_tp1, _ = net_func(obs_mc_input_model_tp1.get(),
                                     num_actions,
                                     scope="net_func",
                                     reuse=True)
        z_mc_out, reward_out = model_func(latent_input_out,
                                          action_input_out,
                                          num_actions,
                                          scope="model_func",
                                          reuse=reuse)
        z_mc_model_tp1_predict, reward_predict = model_func(z_mc_model_t,
                                                            action_input_model,
                                                            num_actions,
                                                            scope="model_func",
                                                            reuse=True)
        z_mc, _ = net_func(obs_mc_input_query.get(),
                           num_actions,
                           scope="net_func",
                           reuse=True)

        # _, v_mc = net_func(
        #     obs_mc_input_query.get(), num_actions,
        #     scope="net_func",
        #     reuse=True)
        z_mc_pos, v_mc_pos = net_func(obs_mc_input_positive.get(),
                                      num_actions,
                                      scope="net_func",
                                      reuse=True)

        z_mc_neg, v_mc_neg = net_func(obs_mc_input_negative.get(),
                                      num_actions,
                                      scope="net_func",
                                      reuse=True)

        z_mc_pos = tf.reshape(z_mc_pos, [-1, 1, latent_dim])
        z_mc = tf.reshape(z_mc, [-1, latent_dim, 1])
        z_mc_neg = tf.reshape(z_mc_neg, [-1, K, latent_dim])

        negative = tf.matmul(z_mc_neg, z_mc) / tau
        sum_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1))
        positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc) / tau)
        print("shape:", z_mc.shape, z_mc_pos.shape, z_mc_neg.shape,
              sum_negative.shape, negative.shape, positive.shape)
        contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive)
        # # print("shape2:", z_mc.shape, negative.shape, positive.shape)
        # # prediction_loss = tf.losses.mean_squared_error(value_input, v_mc)
        # total_loss = contrast_loss
        # if predict:
        #     total_loss += beta * prediction_loss

        model_func_vars = U.scope_vars(
            U.absolute_scope_name("model_func")) + U.scope_vars(
                U.absolute_scope_name("net_func"))
        # encoder_net_func_vars = U.scope_vars(U.absolute_scope_name("encoder_net_func"))

        transition_loss = tf.reduce_sum(
            tf.square(z_mc_model_tp1 - z_mc_model_tp1_predict))
        reward_loss = tf.reduce_sum(
            tf.square(reward_predict - reward_input_model))
        total_loss = contrast_loss + transition_loss + reward_loss
        if grad_norm_clipping is not None:
            optimize_expr_contrast_with_prediction = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr_contrast_with_prediction = optimizer.minimize(
                total_loss, var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc_model_t, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative", tf.reduce_mean(tf.reduce_mean(negative)))
        positive_summary = tf.summary.scalar(
            "positive", tf.reduce_mean(tf.reduce_mean(positive)))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        transition_loss_summary = tf.summary.scalar(
            "transition loss", tf.reduce_mean(transition_loss))
        trivial_loss_summary = tf.summary.scalar(
            "trivial loss",
            tf.reduce_mean(tf.square(z_mc_model_t - z_mc_model_tp1)))
        reward_loss_summary = tf.summary.scalar("reward loss",
                                                tf.reduce_mean(reward_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [
            z_var_summary, negative_summary, positive_summary,
            contrast_loss_summary, trivial_loss_summary,
            transition_loss_summary, reward_loss_summary, total_loss_summary
        ]
        summary = tf.summary.merge(summaries)

        train = U.function(inputs=inputs,
                           outputs=[total_loss, summary],
                           updates=[optimize_expr_contrast_with_prediction])
        prediction = U.function(inputs=[latent_input_out, action_input_out],
                                outputs=[z_mc_out, reward_out])
        return z_func, prediction, train
Code Example #15
def build_act(make_obs_ph,
              q_func,
              num_actions,
              scope="deepq",
              reuse=tf.AUTO_REUSE):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    training_flag: 0 when the defender is training, 1 when the attacker is training
    mask: additive mask for illegal actions used during the attacker's training, shape [batch_size, num_actions]

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        mask_ph = tf.placeholder(
            tf.float32, [None, num_actions],
            name="mask")  # TODO: mask cannot be None. should be zeros.
        training_flag_ph = tf.placeholder(tf.bool, (), name="training_flag")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # TODO: check the type and shape for q_values and random_actions.
        # TODO: check q_values and mask shape match.
        # if training_flag == 1:
        #     q_values = q_values + mask_ph
        # deterministic_actions = tf.argmax(q_values, axis=1)
        #
        # batch_size = tf.shape(observations_ph.get())[0]
        # # When attacker is training, mask illegal actions, even for random action
        # if training_flag == 0:
        #     random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        # elif training_flag == 1:
        #     q_vals = tf.random.uniform(shape=[batch_size,num_actions],dtype=tf.float32) + mask_ph
        #     random_actions = tf.argmax(q_vals, axis=1)
        # else:
        #     raise ValueError('Training flag is abnormal within the build_graph.')

        q_values_masked = q_values + mask_ph
        q_values_selected = tf.cond(training_flag_ph, lambda: q_values_masked,
                                    lambda: q_values)
        deterministic_actions = tf.argmax(q_values_selected, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]

        random_actions_0 = tf.random_uniform(tf.stack([batch_size]),
                                             minval=0,
                                             maxval=num_actions,
                                             dtype=tf.int64)

        q_vals = tf.random_uniform(shape=[batch_size, num_actions],
                                   dtype=tf.float32) + mask_ph
        random_actions_1 = tf.argmax(q_vals, axis=1)

        random_actions = tf.cond(training_flag_ph, lambda: random_actions_1,
                                 lambda: random_actions_0)

        #TODO: modification done

        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        _act = U.function(inputs=[
            observations_ph, mask_ph, training_flag_ph, stochastic_ph,
            update_eps_ph
        ],
                          outputs=output_actions,
                          givens={
                              update_eps_ph: -1.0,
                              stochastic_ph: True
                          },
                          updates=[update_eps_expr])

        def act(ob,
                mask,
                training_flag,
                stochastic=True,
                update_eps=-1):  #TODO:check this is correct
            return _act(ob, mask, training_flag, stochastic, update_eps)

        return act
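
A hedged sketch of how the mask is typically built and fed; the large negative constant and legal_actions are illustrative assumptions, not taken from the snippet:

import numpy as np

# Illegal actions receive a large negative mask value so that argmax over
# (q_values + mask) can never pick them when training_flag is True.
mask = np.full((1, num_actions), -1e9, dtype=np.float32)
mask[0, list(legal_actions)] = 0.0
action = act(ob[None], mask, training_flag=True)[0]
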
Code Example #16
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=tf.AUTO_REUSE,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    # TODO: mask illegal actions.
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        mask_tp1_ph = tf.placeholder(tf.float32, [None, num_actions],
                                     name="mask_tp1")
        training_flag_ph = tf.placeholder(tf.bool, (),
                                          name="training_flag_buildgraph")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(act_t_ph, num_actions),
            1)  #TODO: check what value is fed into act_t_ph

        # compute estimate of best possible value starting from state at t + 1
        # TODO: mask in double q, mask should be for s'.
        # if double_q:
        #     if training_flag == 0:
        #         q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
        #         q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net,1)  # TODO: make sure this is right.
        #         q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        #     elif training_flag == 1:
        #         q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
        #         q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net + mask_tp1_ph, 1) #TODO: make sure this is right.
        #         q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        #     else:
        #         raise ValueError("Training flag error!")
        # else:
        #     if training_flag == 0:
        #         q_tp1_best = tf.reduce_max(q_tp1, 1)
        #     else:
        #         q_tp1_best = tf.reduce_max(q_tp1+mask_tp1_ph, 1)

        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best_using_online_net_masked = tf.argmax(
                q_tp1_using_online_net + mask_tp1_ph, 1)
            q_tp1_best_using_online_net_selected = tf.cond(
                training_flag_ph, lambda: q_tp1_best_using_online_net_masked,
                lambda: q_tp1_best_using_online_net)
            q_tp1_best = tf.reduce_sum(
                q_tp1 *
                tf.one_hot(q_tp1_best_using_online_net_selected, num_actions),
                1)

        else:
            q_tp1_best_0 = tf.reduce_max(q_tp1, 1)
            q_tp1_best_1 = tf.reduce_max(q_tp1 + mask_tp1_ph, 1)
            q_tp1_best = tf.cond(training_flag_ph, lambda: q_tp1_best_1,
                                 lambda: q_tp1_best_0)

        #TODO: Modification done
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, mask_tp1_ph, training_flag_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        # These q-values have not been masked.
        q_values = U.function([obs_t_input],
                              q_t)  # TODO: check this is correct.

        return act_f, train, update_target, {'q_values': q_values}
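For reference, a minimal NumPy sketch of the masked double-Q Bellman target assembled above (the names, shapes, and the convention that the mask holds large negative values for invalid actions at s' are my assumptions for illustration, not part of this example):

import numpy as np

def masked_double_q_target(q_tp1_online, q_tp1_target, mask_tp1, rew, done,
                           gamma=0.99, use_mask=True):
    """Illustrative NumPy mirror of the target above.

    q_tp1_online, q_tp1_target : [batch, num_actions]
    mask_tp1                   : [batch, num_actions], ~0 for valid actions, very negative otherwise
    rew, done                  : [batch]
    """
    scores = q_tp1_online + mask_tp1 if use_mask else q_tp1_online
    best_actions = np.argmax(scores, axis=1)                       # select with the online net (double Q)
    q_tp1_best = q_tp1_target[np.arange(len(rew)), best_actions]   # evaluate with the target net
    return rew + gamma * (1.0 - done) * q_tp1_best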
Code example #17
def build_train_dbc(input_type,
                    obs_shape,
                    repr_func,
                    model_func,
                    num_actions,
                    optimizer,
                    grad_norm_clipping=None,
                    gamma=1.0,
                    scope="mfec",
                    num_neg=10,
                    latent_dim=32,
                    alpha=1,
                    beta=1e2,
                    theta=10,
                    loss_type=["contrast"],
                    knn=4,
                    c_loss_type="margin",
                    b=100,
                    batch_size=32,
                    reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if c_loss_type != "infonce":
        assert num_neg == 1
    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name),

        magic_num = tf.get_variable(name='magic', shape=[1])
        obs_input_u = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_u"))
        obs_input_u_tp1 = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_u_tp1"))
        obs_input_v = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_v"))

        action_input = tf.placeholder(tf.int32, [batch_size], name="action")
        reward_input = tf.placeholder(tf.float32, [batch_size], name="reward")

        inputs = [
            obs_input_u, obs_input_u_tp1, obs_input_v, action_input,
            reward_input
        ]
        z_old = repr_func(obs_input_u.get(),
                          num_actions,
                          scope="target_repr_func",
                          reuse=False)

        z_u = repr_func(obs_input_u.get(),
                        num_actions,
                        scope="repr_func",
                        reuse=tf.AUTO_REUSE)

        z_u_tp1 = repr_func(obs_input_u_tp1.get(),
                            num_actions,
                            scope="repr_func",
                            reuse=tf.AUTO_REUSE)

        z_v = repr_func(obs_input_v.get(),
                        num_actions,
                        scope="repr_func",
                        reuse=tf.AUTO_REUSE)

        z_u_tp1_predict, r_u_predict = model_func(z_u,
                                                  num_actions,
                                                  scope="model_func",
                                                  reuse=tf.AUTO_REUSE)

        z_v_tp1_predict, r_v_predict = model_func(z_v,
                                                  num_actions,
                                                  scope="model_func",
                                                  reuse=tf.AUTO_REUSE)

        # total_loss = 0
        # representation loss
        dist_bisimulation = tf.reduce_max(
            tf.abs(r_u_predict - r_v_predict) + gamma * tf.reduce_sum(
                tf.square(z_u_tp1_predict - z_v_tp1_predict), axis=2),
            axis=1)
        dist_bisimulation = tf.stop_gradient(dist_bisimulation)
        repr_loss = tf.losses.mean_squared_error(
            tf.norm(z_u - z_v, ord=1, axis=1), dist_bisimulation)

        # model loss
        z_u_tp1_selected = tf.gather(z_u_tp1_predict,
                                     action_input,
                                     axis=1,
                                     batch_dims=1)  # per-sample action selection
        r_u_selected = tf.gather(r_u_predict,
                                 action_input,
                                 axis=1,
                                 batch_dims=1)  # per-sample action selection
        transition_loss = tf.losses.mean_squared_error(
            z_u_tp1, tf.stop_gradient(z_u_tp1_selected))
        reward_loss = tf.losses.mean_squared_error(
            reward_input, tf.stop_gradient(r_u_selected))
        model_loss = transition_loss + reward_loss

        total_loss = repr_loss + alpha * model_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("repr_func"))
        model_func_vars_update = copy.copy(model_func_vars) + U.scope_vars(
            U.absolute_scope_name("model_func"))

        target_model_func_vars = U.scope_vars(
            U.absolute_scope_name("repr_model_func"))

        update_target_expr = []
        for var in model_func_vars:
            print(var.name, var.shape)
        for var_target in target_model_func_vars:
            print(var_target.name, var_target.shape)

        for var, var_target in zip(
                sorted(model_func_vars, key=lambda v: v.name),
                sorted(target_model_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars_update,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars_update)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_u, axis=1)))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))
        transition_loss_summary = tf.summary.scalar(
            "transition loss", tf.reduce_mean(transition_loss))
        reward_loss_summary = tf.summary.scalar("reward loss",
                                                tf.reduce_mean(reward_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        repr_loss_summary = tf.summary.scalar("repr loss",
                                              tf.reduce_mean(repr_loss))

        summaries = [
            z_var_summary, total_loss_summary, transition_loss_summary,
            reward_loss_summary, model_loss_summary, repr_loss_summary
        ]

        summary = tf.summary.merge(summaries)
        outputs = [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_u],
            outputs=[z_old],
        )
        update_target_func = U.function([], [], updates=[update_target_expr])
        return z_func, train, eval, update_target_func
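A small NumPy sketch of the stop-gradient bisimulation target used in the representation loss above, assuming r_*_predict has shape [batch, num_actions] and z_*_tp1_predict has shape [batch, num_actions, latent_dim] (illustrative only, not code from this file):

import numpy as np

def bisimulation_target(r_u, r_v, z_u_tp1, z_v_tp1, gamma=0.99):
    """Per batch element: max over actions of |r_u - r_v| + gamma * ||z_u' - z_v'||^2."""
    reward_diff = np.abs(r_u - r_v)                          # [batch, num_actions]
    latent_diff = np.sum((z_u_tp1 - z_v_tp1) ** 2, axis=2)   # [batch, num_actions]
    return np.max(reward_diff + gamma * latent_diff, axis=1) # [batch]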
Code example #18
def build_train(make_obs_ph,
                var_func,
                cvar_func,
                num_actions,
                nb_atoms,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                scope="cvar_dqn",
                reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    var_func: (tf.Variable, int, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            nb_atoms: int
                number of atoms
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    cvar_func: (tf.Variable, int, str, bool) -> tf.Variable
        see var_func
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      cvar_func,
                      var_func,
                      num_actions,
                      nb_atoms,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        # atoms
        y = tf.range(1, nb_atoms + 1, dtype=tf.float32,
                     name='y') * 1. / nb_atoms

        # ------------------------------- Core networks ---------------------------------
        # var network
        var_t = var_func(obs_t_input.get(),
                         num_actions,
                         nb_atoms,
                         scope="out_func",
                         reuse_main=True,
                         reuse_last=True)  # reuse from act

        # vars for actions which we know were selected in the given state.
        var_t_selected = gather_along_second_axis(var_t, act_t_ph)
        var_t_selected.set_shape([None, nb_atoms])

        # cvar network
        cvar_t = cvar_func(obs_t_input.get(),
                           num_actions,
                           nb_atoms,
                           scope="out_func",
                           reuse_main=True,
                           reuse_last=True)  # reuse from act

        # cvars for actions which we know were selected in the given state.
        cvar_t_selected = gather_along_second_axis(cvar_t, act_t_ph)
        cvar_t_selected.set_shape([None, nb_atoms])

        # target cvar network
        cvar_tp1 = cvar_func(obs_tp1_input.get(),
                             num_actions,
                             nb_atoms,
                             scope="target_cvar_func")

        # extract variables
        joint_variables = U.scope_vars(U.absolute_scope_name("out_func/net"))
        var_variables = U.scope_vars(U.absolute_scope_name("out_func/var"))
        cvar_variables = U.scope_vars(U.absolute_scope_name("out_func/cvar"))
        target_cvar_func_variables = U.scope_vars(
            U.absolute_scope_name("target_cvar_func"))

        # -------------------------------------------------------------------------------

        # ----------------------------- Extract distribution ----------------------------
        # construct a new cvar with different actions for each atom
        cvar_tp1_star = tf.reduce_max(cvar_tp1, axis=1)
        cvar_tp1_star.set_shape([None, nb_atoms])
        # construct a distribution from the new cvar
        ycvar_tp1_star = cvar_tp1_star * y
        dist_tp1_star_ = extract_distribution(ycvar_tp1_star, nb_atoms)

        # apply done mask
        dist_tp1_star = tf.einsum('ij,i->ij', dist_tp1_star_,
                                  1. - done_mask_ph)

        # Td = r + gamma * dist
        dist_target = tf.identity(rew_t_ph[:, None] + gamma * dist_tp1_star,
                                  name='dist_target')
        # dist is always non-differentiable
        dist_target = tf.stop_gradient(dist_target)

        # -------------------------------------------------------------------------------

        # ---------------------------------- VaR loss -----------------------------------

        td_error = dist_target[:, :, None] - var_t_selected[:, None, :]
        # td_error[0]=
        #  [[Td1-v1 Td1-v2 ... Td1-vn]
        #   [Td2-v1 Td2-v2 ... Td2-vn]
        #   [...                     ]
        #   [Tdn-v1 Tdn-v2 ... Tdn-vn]]

        negative_indicator = tf.cast(td_error < 0, tf.float32)

        var_weights = tf.stop_gradient(
            y - negative_indicator)  # XXX: stop gradient?
        quantile_loss = var_weights * td_error

        var_error = tf.reduce_mean(quantile_loss)
        # -------------------------------------------------------------------------------

        # ---------------------------------- CVaR loss ----------------------------------
        # Minimizing the MSE of:
        # V_i + 1/y_i(Td_j - V_i)^- - C_i

        min_target_diff = negative_indicator / y * tf.stop_gradient(td_error)
        cvar_loss = tf.stop_gradient(
            var_t_selected
        )[:, None, :] + min_target_diff - cvar_t_selected[:, None, :]

        cvar_error = tf.reduce_mean(tf.square(cvar_loss))

        # -------------------------------------------------------------------------------

        # ------------------------------- Finalizing ------------------------------------

        error = var_error + cvar_error
        # compute optimization op (potentially with gradient clipping)
        var_list = [joint_variables, var_variables, cvar_variables]
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                error,
                                                var_list,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(error, var_list=var_list)

        # update_target_fn will be called periodically to copy cvar network to target cvar network
        # Note: var has no target
        update_target_expr = []
        for cvar_variable, target_cvar_variable in zip(
                sorted(joint_variables + cvar_variables, key=lambda v: v.name),
                sorted(target_cvar_func_variables, key=lambda v: v.name)):
            update_target_expr.append(
                target_cvar_variable.assign(cvar_variable))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        # -------------------------------------------------------------------------------

        # --------------------------------- Debug ---------------------------------------
        # a = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t_selected)
        # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t_selected)
        # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_dist_target*y)
        # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t)
        # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], negative_indicator)
        # d = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_yc_target)
        # e = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t)
        # f = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_loss)
        # atoms = U.function([obs_tp1_input], atoms)
        # -------------------------------------------------------------------------------

        return act_f, train, update_target, []
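The VaR head above is trained with an asymmetric (pinball-style) quantile loss and the CVaR head with a squared error against a shifted VaR target. A NumPy sketch of the quantile part, using the same shapes as in the code (illustrative only):

import numpy as np

def quantile_var_loss(dist_target, var_t_selected, nb_atoms):
    """dist_target, var_t_selected: [batch, nb_atoms]; mirrors the VaR loss above."""
    y = np.arange(1, nb_atoms + 1, dtype=np.float32) / nb_atoms       # atom levels y_i
    td_error = dist_target[:, :, None] - var_t_selected[:, None, :]   # Td_j - v_i
    negative_indicator = (td_error < 0).astype(np.float32)
    # weight y_i when the target lies above the quantile, y_i - 1 when it lies below
    return np.mean((y - negative_indicator) * td_error)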
Code example #19
File: build_graph.py Project: mauxam/kaithy
def build_act_with_param_noise(make_obs_ph,
                               q_func,
                               num_actions,
                               scope="deepq",
                               reuse=None,
                               param_noise_filter_func=None,
                               random_filter=False,
                               deterministic_filter=False):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(U.data_type, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(
            U.data_type, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(
            tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        if deterministic_filter or random_filter:
            invalid_masks = build_invalid_masks(observations_ph.get())

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable(
            "param_noise_scale", (),
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        param_noise_threshold = tf.get_variable(
            "param_noise_threshold", (),
            initializer=tf.constant_initializer(0.05),
            trainable=False)

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(),
                                    num_actions,
                                    scope="perturbed_q_func")

        if deterministic_filter:
            q_values_perturbed = build_q_filter(q_values_perturbed,
                                                invalid_masks)
        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.

        def perturb_vars(original_scope, perturbed_scope):
            all_vars = U.scope_vars(U.absolute_scope_name("q_func"))
            all_perturbed_vars = U.scope_vars(
                U.absolute_scope_name("perturbed_q_func"))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(
                        perturbed_var,
                        var + tf.random_normal(shape=tf.shape(var),
                                               mean=0.,
                                               stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(),
                                   num_actions,
                                   scope="adaptive_q_func")

        perturb_for_adaption = perturb_vars(original_scope="q_func",
                                            perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) *
                           (tf.log(tf.nn.softmax(q_values)) -
                            tf.log(tf.nn.softmax(q_values_adaptive))),
                           axis=-1)
        mean_kl = tf.reduce_mean(kl)

        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(
                    mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(
            tf.cond(update_param_noise_threshold_ph >= 0,
                    lambda: update_param_noise_threshold_ph,
                    lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed,
                                          axis=1,
                                          output_type=U.index_type)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=U.index_type)

        if random_filter:
            random_actions = build_ramdom_filter(deterministic_actions,
                                                 random_actions, invalid_masks)

        chose_random = tf.random_uniform(
            tf.stack([batch_size
                      ]), minval=0, maxval=1, dtype=U.data_type) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        updates = [
            update_eps_expr,
            tf.cond(
                reset_ph,
                lambda: perturb_vars(original_scope="q_func",
                                     perturbed_scope="perturbed_q_func"),
                lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(),
                    lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        act = U.function(inputs=[
            observations_ph, stochastic_ph, update_eps_ph, reset_ph,
            update_param_noise_threshold_ph, update_param_noise_scale_ph
        ],
                         outputs=output_actions,
                         givens={
                             update_eps_ph: -1.0,
                             stochastic_ph: True,
                             reset_ph: False,
                             update_param_noise_threshold_ph: False,
                             update_param_noise_scale_ph: False
                         },
                         updates=updates)
        return act
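For clarity, the scale adaptation wrapped in update_scale() above can be stated outside the graph as a one-line rule (a sketch of the rule from the parameter-space-noise paper, not code from this file):

def adapt_param_noise_scale(scale, mean_kl, threshold, factor=1.01):
    # Grow the perturbation while the perturbed policy stays close to the
    # unperturbed one (small KL); shrink it once the KL exceeds the threshold.
    return scale * factor if mean_kl < threshold else scale / factor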
Code example #20
def build_train_ib(make_obs_ph,
                   model_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   beta=1.0,
                   theta=1,
                   double_q=True,
                   emdqn=True,
                   vae=True,
                   ib=True,
                   scope="deepq_ib",
                   reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    beta: float
        coefficient of beta-ib.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    act_noise = tf.placeholder(tf.float32, [None, 512], name="act_noise")
    act_f = build_act_ib(make_obs_ph,
                         model_func,
                         act_noise,
                         num_actions,
                         scope=scope,
                         reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))

        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        z_noise_t = tf.placeholder(tf.float32, [None, 512], name="z_noise")

        z_noise_tp1 = tf.placeholder(tf.float32, [None, 512],
                                     name="z_noise_tp1")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        inputs = [
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, act_noise, z_noise_t, z_noise_tp1
        ]
        # EMDQN
        if emdqn or ib:
            qec_input = tf.placeholder(tf.float32, [None], name='qec')
            inputs.append(qec_input)
        if ib or vae:
            obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae"))
            z_noise_vae = tf.placeholder(tf.float32, [None, 512],
                                         name="z_noise_vae")
            inputs.append(obs_vae_input)
            inputs.append(z_noise_vae)
        # q network evaluation
        q_t, v_mean_t, v_logvar_t, z_mean_t, z_logvar_t, recon_obs_t = model_func(
            obs_t_input.get(),
            z_noise_t,
            num_actions,
            scope="q_func",
            reuse=True)
        if vae or ib:
            q_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = model_func(
                obs_vae_input.get(),
                z_noise_vae,
                num_actions,
                scope="q_func",
                reuse=True)

        # q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation

        q_tp1, q_d_tp1, v_mean_tp1, v_logvar_tp1, z_mean_tp1, z_logvar_tp1, recon_obs_tp1 = model_func(
            obs_tp1_input.get(),
            z_noise_tp1,
            num_actions,
            scope="target_q_func")

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:

            q_tp1_using_online_net, _, _, _, _, _, _ = model_func(
                obs_tp1_input.get(),
                z_noise_tp1,
                num_actions,
                scope="q_func",
                reuse=True)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)

        td_loss = tf.reduce_mean(importance_weights_ph *
                                 U.huber_loss(td_error))
        outputs = [td_loss]
        total_loss = td_loss
        if vae or ib:
            encoder_loss = -1 + z_mean_vae**2 + tf.exp(
                z_logvar_vae) - z_logvar_vae
            outputs.append(encoder_loss)
            total_loss += 0.1 * tf.reduce_mean(beta * encoder_loss)
        if vae:
            decoder_loss = tf.keras.losses.binary_crossentropy(
                tf.reshape(recon_obs, [-1]),
                tf.reshape(
                    tf.dtypes.cast(obs_vae_input._placeholder, tf.float32),
                    [-1]))
            print("here", z_mean_t.shape, z_logvar_t.shape, encoder_loss.shape,
                  decoder_loss.shape)
            vae_loss = beta * encoder_loss + theta * decoder_loss
            outputs.append(decoder_loss)
            outputs.append(vae_loss)
            total_loss += 0.1 * tf.reduce_mean(theta * decoder_loss)
        if ib:
            ib_loss = (v_mean_t - tf.stop_gradient(tf.expand_dims(
                qec_input, 1)))**2 / tf.exp(v_logvar_t) + v_logvar_t
            print("here2", v_mean_t.shape,
                  tf.expand_dims(qec_input, 1).shape, v_logvar_t.shape,
                  ib_loss.shape)
            total_ib_loss = ib_loss + beta * encoder_loss
            outputs.append(total_ib_loss)
            total_loss += 0.1 * tf.reduce_mean(ib_loss)
        # EMDQN
        if emdqn:
            qec_error = q_t_selected - tf.stop_gradient(qec_input)
            total_loss += 0.1 * tf.reduce_mean(
                importance_weights_ph * U.huber_loss(qec_error))
            outputs.append(qec_error)

        td_loss_summary = tf.summary.scalar("td loss", td_loss)
        total_loss_summary = tf.summary.scalar("total loss", total_loss)
        z_var_summary = tf.summary.scalar("z_var",
                                          tf.reduce_mean(tf.exp(z_logvar_t)))
        summaries = [td_loss_summary, total_loss_summary, z_var_summary]
        if vae or ib:
            encoder_loss_summary = tf.summary.scalar(
                "encoder loss", tf.reduce_mean(encoder_loss))
            summaries.append(encoder_loss_summary)
        if vae:
            decoder_loss_summary = tf.summary.scalar(
                "decoder loss", tf.reduce_mean(decoder_loss))
            summaries.append(decoder_loss_summary)
        if ib:
            ib_loss_summary = tf.summary.scalar("ib loss",
                                                tf.reduce_mean(ib_loss))
            total_ib_loss_summary = tf.summary.scalar(
                "total ib loss", tf.reduce_mean(total_ib_loss))
            summaries.append(ib_loss_summary)
            summaries.append(total_ib_loss_summary)
        if emdqn:
            qec_loss_summary = tf.summary.scalar(
                "qec loss", tf.reduce_mean(importance_weights_ph * qec_error))
            summaries.append(qec_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs.append(summary)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions

        train = U.function(inputs=inputs,
                           outputs=[td_error, summary],
                           updates=[optimize_expr])

        get_q_t_selected = U.function(
            inputs=[obs_t_input, act_t_ph, z_noise_t], outputs=q_t_selected)
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input, z_noise_t], q_t)

        return act_f, train, update_target, {
            'q_values': q_values
        }, get_q_t_selected
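Two of the losses above have simple closed forms worth spelling out: encoder_loss is (twice) the per-dimension KL divergence between N(z_mean, exp(z_logvar)) and N(0, 1), and ib_loss is a Gaussian negative log-likelihood (up to a constant) of the episodic-memory target. A NumPy sketch of the latter, with shapes assumed from the surrounding code (illustrative only):

import numpy as np

def ib_gaussian_nll(v_mean, v_logvar, qec_target):
    """v_mean, v_logvar: [batch, 1]; qec_target: [batch]; mirrors ib_loss above."""
    target = qec_target[:, None]
    return (v_mean - target) ** 2 / np.exp(v_logvar) + v_logvar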
Code example #21
def build_train_mer(input_type,
                    obs_shape,
                    model_func,
                    num_actions,
                    optimizer,
                    grad_norm_clipping=None,
                    gamma=1.0,
                    scope="mfec",
                    num_neg=10,
                    latent_dim=32,
                    alpha=0.1,
                    beta=1e2,
                    theta=10,
                    loss_type=["contrast"],
                    knn=4,
                    c_loss_type="margin",
                    b=100,
                    batch_size=32,
                    reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if c_loss_type != "infonce":
        assert num_neg == 1
    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name),
        magic_num = tf.get_variable(name='magic', shape=[1])
        obs_input_query = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_query"))
        obs_input_positive = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_pos"))
        obs_input_negative = U.ensure_tf_input(
            input_type(obs_shape, batch_size * num_neg, name="enc_obs_neg"))
        obs_input_neighbour = U.ensure_tf_input(
            input_type(obs_shape, batch_size * knn, name="enc_obs_neighbour"))

        obs_input_uniformity_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_u"))
        obs_input_uniformity_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_v"))

        obs_input_weighted_product_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_u"))
        obs_input_weighted_product_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_v"))

        value_input_weighted_product_u = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_u")
        value_input_weighted_product_v = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_v")

        value_input_query = tf.placeholder(tf.float32, [batch_size],
                                           name="value")
        value_input_neighbour = tf.placeholder(tf.float32, [batch_size, knn],
                                               name="neighbour_value")
        action_embedding = tf.Variable(tf.random_normal(
            [num_actions, latent_dim], stddev=1),
                                       name="action_embedding")
        action_input = tf.placeholder(tf.int32, [batch_size], name="action")
        action_input_causal = tf.placeholder(tf.int32, [batch_size],
                                             name="action_causal")
        reward_input_causal = tf.placeholder(tf.float32, [batch_size],
                                             name="reward_causal")

        inputs = [obs_input_query]
        if "contrast" in loss_type:
            inputs += [obs_input_positive, obs_input_negative]
        if "regression" in loss_type:
            inputs += [value_input_query]
        if "linear_model" in loss_type:
            inputs += [action_input]
            if "contrast" not in loss_type:
                inputs += [obs_input_positive]
        if "fit" in loss_type:
            # if "contrast" not in loss_type:
            #     inputs+=[]
            inputs += [obs_input_neighbour, value_input_neighbour]
            if "regression" not in loss_type:
                inputs += [value_input_query]
        if "weight_product" in loss_type:
            inputs += [
                obs_input_uniformity_u, obs_input_uniformity_v,
                obs_input_weighted_product_u, obs_input_weighted_product_v,
                value_input_weighted_product_u, value_input_weighted_product_v
            ]
        if "causality" in loss_type:
            inputs += [reward_input_causal, action_input_causal]
        z_old = model_func(obs_input_query.get(),
                           num_actions,
                           scope="target_model_func",
                           reuse=False)

        z = model_func(obs_input_query.get(),
                       num_actions,
                       scope="model_func",
                       reuse=tf.AUTO_REUSE)

        z_pos = model_func(obs_input_positive.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_neg = model_func(obs_input_negative.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_uni_u = model_func(obs_input_uniformity_u.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_uni_v = model_func(obs_input_uniformity_v.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_wp_u = model_func(obs_input_weighted_product_u.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)
        z_wp_v = model_func(obs_input_weighted_product_v.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)

        z_pos = tf.reshape(z_pos, [-1, latent_dim])
        z_tar = tf.reshape(z, [-1, latent_dim])
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [-1, latent_dim])
            contrast_loss, contrast_summary = contrastive_loss_fc(
                z_tar,
                z_pos,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            symmetry_loss, symmetry_summary = contrastive_loss_fc(
                z_pos,
                z_tar,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            contrast_loss += symmetry_loss
        z_neighbour = model_func(obs_input_neighbour.get(),
                                 num_actions,
                                 scope="model_func",
                                 reuse=True)

        # fit loss
        z_neighbour = tf.reshape(z_neighbour, [-1, knn, latent_dim])
        square_dist = tf.square(
            tf.tile(tf.expand_dims(z_tar, 1), [1, knn, 1]) - z_neighbour)
        neighbour_dist = tf.reduce_sum(square_dist, axis=2)
        neighbour_coeff = tf.math.softmax(-neighbour_dist / b, axis=1)
        coeff_sum = tf.reduce_mean(tf.reduce_sum(neighbour_coeff, axis=1))
        value_input_neighbour_mean = tf.reduce_mean(value_input_neighbour)
        fit_value = tf.reduce_sum(tf.multiply(neighbour_coeff,
                                              value_input_neighbour),
                                  axis=1)
        fit_loss = tf.reduce_mean(tf.abs(fit_value - value_input_query))

        # causality loss
        reward_input_causal = tf.reshape(reward_input_causal, [1, -1])
        reward_tile = tf.tile(reward_input_causal, [batch_size, 1])
        # reward_mask = (reward_tile - tf.transpose(reward_tile)) ** 2
        reward_mask = 1 - tf.cast(
            tf.equal((reward_tile - tf.transpose(reward_tile)),
                     tf.constant(0.)), tf.float32)
        action_input_causal = tf.reshape(action_input_causal, [1, -1])
        action_tile = tf.tile(action_input_causal, [batch_size, 1])
        action_mask = tf.cast(
            tf.equal((action_tile - tf.transpose(action_tile)),
                     tf.constant(0)), tf.float32)
        total_mask = tf.multiply(reward_mask, action_mask)
        z_tile = tf.tile(tf.expand_dims(z_tar, 1), [1, batch_size, 1])
        z_diff = z_tile - tf.transpose(z_tile, perm=[1, 0, 2])
        distance = tf.reduce_sum(z_diff**2, axis=2)
        exp_distance = tf.exp(-distance)
        causal_find_rate = (tf.reduce_sum(total_mask)) / (batch_size**2 -
                                                          batch_size)
        causal_loss = tf.reduce_sum(tf.multiply(exp_distance, total_mask))

        # regularization loss
        regularization_loss = -tf.maximum(
            1., tf.reduce_mean(U.huber_loss(z_tar, 0.01)))
        regression_loss = tf.reduce_mean(
            tf.squared_difference(tf.norm(z_tar, axis=1), alpha *
                                  value_input_query)) + regularization_loss

        # linear model loss
        action_embeded = tf.matmul(tf.one_hot(action_input, num_actions),
                                   action_embedding)
        model_loss = tf.reduce_mean(
            tf.squared_difference(action_embeded + z_tar,
                                  z_pos)) + 0.01 * regularization_loss

        # weighted product loss
        uniformity_loss = tf.reduce_sum(
            tf.exp(2 * tf.reduce_sum(tf.multiply(z_uni_u, z_uni_v), axis=1) -
                   2))
        value_weight = (value_input_weighted_product_u -
                        value_input_weighted_product_v)**2
        # angle = acos_safe(tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1))
        angle = tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1)
        weighted_product = tf.multiply(value_weight, angle)
        wp_loss = tf.reduce_sum(weighted_product)

        total_loss = 0
        if "contrast" in loss_type:
            total_loss += contrast_loss
        if "regression" in loss_type:
            total_loss += beta * regression_loss
        if "linear_model" in loss_type:
            total_loss += theta * model_loss
        if "fit" in loss_type:
            total_loss += beta * fit_loss
        if "causality" in loss_type:
            total_loss += theta * causal_loss
        if "weight_product" in loss_type:
            total_loss += 0.1 * uniformity_loss
            total_loss += wp_loss
        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        model_func_vars_update = copy.copy(model_func_vars)
        if "linear_model" in loss_type:
            model_func_vars_update.append(action_embedding)

        target_model_func_vars = U.scope_vars(
            U.absolute_scope_name("target_model_func"))

        update_target_expr = []
        for var in model_func_vars:
            print(var.name, var.shape)
        for var_target in target_model_func_vars:
            print(var_target.name, var_target.shape)

        for var, var_target in zip(
                sorted(model_func_vars, key=lambda v: v.name),
                sorted(target_model_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars_update,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars_update)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1)))
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [batch_size, num_neg, latent_dim])
            negative_summary = tf.summary.scalar(
                "negative_dist",
                tf.reduce_mean(emb_dist(z_tar, z_neg[:, 0, :])))
        positive_summary = tf.summary.scalar(
            "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos)))
        if "contrast" in loss_type:
            contrast_loss_summary = tf.summary.scalar(
                "contrast loss", tf.reduce_mean(contrast_loss))
        regularization_loss_summary = tf.summary.scalar(
            "regularization loss", tf.reduce_mean(regularization_loss))
        regression_loss_summary = tf.summary.scalar(
            "regression loss", tf.reduce_mean(regression_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        fit_loss_summary = tf.summary.scalar("fit loss",
                                             tf.reduce_mean(fit_loss))
        fit_value_summary = tf.summary.scalar("fit value",
                                              tf.reduce_mean(fit_value))
        neighbour_value_summary = tf.summary.scalar(
            "neighbour value", value_input_neighbour_mean)
        coeff_summary = tf.summary.scalar("coeff sum", coeff_sum)
        square_dist_summary = tf.summary.scalar("square_dist",
                                                tf.reduce_mean(square_dist))
        z_neighbour_summary = tf.summary.scalar("z_neighbour_mean",
                                                tf.reduce_mean(z_neighbour))
        # fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        causal_efficiency_summary = tf.summary.scalar("causal efficiency",
                                                      causal_find_rate)
        causal_loss_summary = tf.summary.scalar("causal loss", causal_loss)
        # reward_mask_summary = tf.summary.scalar("reward mask summary", debug_reward_mask)
        # action_mask_summary = tf.summary.scalar("action mask summary", debug_action_mask)
        uniformity_loss_summary = tf.summary.scalar("uniform loss",
                                                    uniformity_loss)
        wp_loss_summary = tf.summary.scalar("weighted product loss", wp_loss)
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [
            z_var_summary, total_loss_summary, regularization_loss_summary
        ]

        if "contrast" in loss_type:
            summaries += [
                negative_summary, positive_summary, contrast_loss_summary
            ]
            summaries += contrast_summary
        if "regression" in loss_type:
            summaries.append(regression_loss_summary)
        if "linear_model" in loss_type:
            summaries.append(model_loss_summary)
            if "contrast" not in loss_type:
                summaries.append(positive_summary)
        if "fit" in loss_type:
            summaries.append(fit_loss_summary)
            summaries.append(fit_value_summary)
            summaries.append(neighbour_value_summary)
            summaries.append(coeff_summary)
            summaries.append(square_dist_summary)
            summaries.append(z_neighbour_summary)
        if "causality" in loss_type:
            summaries.append(causal_efficiency_summary)
            summaries.append(causal_loss_summary)
            # summaries.append(reward_mask_summary)
            # summaries.append(action_mask_summary)
        if "weight_product" in loss_type:
            summaries.append(uniformity_loss_summary)
            summaries.append(wp_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs = [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_query],
            outputs=[z_old],
        )
        norm_func = U.function(inputs=[obs_input_query],
                               outputs=[tf.norm(z_tar, axis=1)])
        update_target_func = U.function([], [], updates=[update_target_expr])
        return z_func, train, eval, norm_func, update_target_func
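The second output of the train and eval functions above is a merged tf.Summary protobuf, so it can be written straight to TensorBoard. A minimal logging sketch, assuming a log directory, a step counter and the batch inputs are provided elsewhere:

# Hedged usage sketch -- "./logs", step and batch_inputs are assumptions, not part of this example.
writer = tf.summary.FileWriter("./logs", tf.get_default_graph())
total_loss_value, summary_proto = train(*batch_inputs)   # returns [total_loss, merged summary]
writer.add_summary(summary_proto, global_step=step)
writer.flush()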
Code example #22
0
def build_train_mfmc(make_obs_ph,
                     model_func,
                     num_actions,
                     optimizer,
                     grad_norm_clipping=None,
                     gamma=1.0,
                     batch_size=5,
                     scope="mfec",
                     latent_dim=32,
                     input_dim=84 * 84 * 4,
                     hash_dim=32,
                     K=10,
                     beta=0.1,
                     predict=True,
                     use_rp=False,
                     reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    model_func: (tf.Variable, int, str, bool) -> (tf.Variable, tf.Variable)
        encoder that maps an observation batch to a latent representation and a value estimate.
    latent_dim: int
        dimensionality of the latent representation.
    input_dim: int
        flattened observation size, used when hashing with a random projection.
    hash_dim: int
        dimensionality of the observation hash.
    K: int
        number of negative keys per sample in the contrastive loss.
    beta: float
        weight of the value-prediction loss (only used when predict is True).
    predict: bool
        if True, additionally regress the predicted value towards value_input.
    use_rp: bool
        if True, hash observations with a Gaussian random projection instead of the hash_func network.

    Returns
    -------
    hash_func: (object) -> np.ndarray
        maps observations to hash codes used for episodic-memory lookup.
    z_func: (object) -> np.ndarray
        encodes observations with model_func (built by build_act_mfmc).
    train: (...) -> (float, tf.Summary, np.ndarray, np.ndarray, np.ndarray)
        optimizes the contrastive loss (plus the value-prediction loss when predict is True)
        and returns the total loss, a merged summary, the latent codes and the gradients
        with respect to the positive and negative keys.
    """
    z_func = build_act_mfmc(make_obs_ph,
                            model_func,
                            num_actions,
                            scope=scope,
                            secondary_scope="model_func",
                            reuse=reuse)
    # encoder_z_func = build_act_mfmc(make_obs_ph, model_func, num_actions, scope=scope,
    #                                 secondary_scope="encoder_model_func", reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_hash_input = U.ensure_tf_input(make_obs_ph("obs_hash"))
        obs_mc_input = U.ensure_tf_input(make_obs_ph("obs"))
        obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        # obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        keys_mc_input_negative = tf.placeholder(tf.float32,
                                                [None, K, latent_dim],
                                                name='enc_keys_neg')
        keys_mc_input_positive = tf.placeholder(tf.float32, [None, latent_dim],
                                                name='enc_keys_pos')
        keys_mc_input_anchor = tf.placeholder(tf.float32, [None, latent_dim],
                                              name='enc_keys_anchor')
        # keys_mc_input_anchor = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)),
        #                                    shape=[batch_size, latent_dim],
        #                                    name='enc_keys_anchor',
        #                                    dtype=tf.float32)
        #
        # keys_mc_input_positive = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)),
        #                                      shape=[batch_size, latent_dim],
        #                                      name='enc_keys_pos',
        #                                      dtype=tf.float32)
        # keys_mc_input_negative = tf.Variable(initial_value=np.zeros((batch_size, K, latent_dim)),
        #                                      shape=[batch_size, K, latent_dim],
        #                                      name='enc_keys_neg',
        #                                      dtype=tf.float32)

        # inputs = [obs_mc_input]
        value_input = tf.placeholder(tf.float32, [None, 1], name='value')
        if predict:
            inputs = [
                tau, obs_mc_input_query, keys_mc_input_positive,
                keys_mc_input_negative, keys_mc_input_anchor, obs_mc_input,
                value_input
            ]
        else:
            inputs = [
                tau, obs_mc_input_query, keys_mc_input_positive,
                keys_mc_input_negative, keys_mc_input_anchor
            ]
        z_mc, _ = model_func(obs_mc_input_query.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)

        _, v_mc = model_func(obs_mc_input.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        # encoder_z_mc_pos, encoder_v_mc_pos = model_func(
        #     obs_mc_input_positive.get(), num_actions,
        #     scope="encoder_model_func", reuse=True)

        # z_mc_pos = tf.stop_gradient(encoder_z_mc_pos)
        # z_mc_pos = tf.reshape(keys_mc_input_positive, [-1, 1, latent_dim])
        # z_mc_anchor = tf.reshape(z_mc, [-1, latent_dim, 1])
        # z_mc_neg = tf.reshape(keys_mc_input_negative, [-1, K, latent_dim])

        z_mc_pos = keys_mc_input_positive
        z_mc = tf.reshape(z_mc, [-1, latent_dim])
        z_mc_expand = tf.reshape(z_mc, [-1, 1, latent_dim])
        z_mc_tile = tf.tile(z_mc_expand, [1, K, 1])
        z_mc_neg = keys_mc_input_negative
        z_mc_anchor = keys_mc_input_anchor

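        # Contrastive objective: Euclidean distances from the query embedding z_mc
        # to the memory anchor, the positive key and the K negative keys. The loss
        # below is a margin-1 triplet term plus extra terms that pull the query
        # towards both the positive key and the anchor.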
        anchor_dist = tf.sqrt(
            tf.reduce_sum(tf.square(z_mc - z_mc_anchor), axis=1))
        pos_dist = tf.sqrt(tf.reduce_sum(tf.square(z_mc - z_mc_pos), axis=1))
        neg_dist = tf.reduce_mean(tf.sqrt(
            tf.reduce_sum(tf.square(z_mc_tile - z_mc_neg), axis=2)),
                                  axis=1)
        # contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0))
        contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0)) \
                        + 0.5 * tf.reduce_mean(pos_dist) + 0.5 * tf.reduce_mean(anchor_dist)

        pos_grad = tf.gradients([contrast_loss], [z_mc_pos])
        neg_grad = tf.gradients([contrast_loss], [z_mc_neg])
        # neg_grad = tf.gradients([contrast_loss],[z_mc_neg])

        # negative = tf.matmul(z_mc_neg, z_mc_anchor) / tau
        # exp_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1))
        # positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc_anchor) / tau)
        # print("shape:", z_mc.shape, z_mc_anchor.shape, z_mc_pos.shape, negative.shape, exp_negative.shape,
        #       positive.shape)
        # contrast_loss = tf.reduce_mean(tf.log(exp_negative) - positive)
        # print("shape2:", z_mc.shape, negative.shape, positive.shape)
        prediction_loss = tf.losses.mean_squared_error(value_input, v_mc)
        total_loss = contrast_loss
        if predict:
            total_loss += beta * prediction_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        # encoder_model_func_vars = U.scope_vars(U.absolute_scope_name("encoder_model_func"))
        if grad_norm_clipping is not None:
            optimize_expr_contrast_with_prediction = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr_contrast_with_prediction = optimizer.minimize(
                total_loss, var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        # update_target_expr = []
        # for var, var_target in zip(sorted(model_func_vars, key=lambda v: v.name),
        #                            sorted(encoder_model_func_vars, key=lambda v: v.name)):
        #     update_target_expr.append(var_target.assign((1 - momentum) * var + momentum * var_target))
        # update_target_expr = tf.group(*update_target_expr)
        # update_target = U.function([momentum], [], updates=[update_target_expr])

        if use_rp:
            latten_obs = tf.reshape(obs_hash_input.get(), [-1, input_dim])
            rp = tf.random.normal([input_dim, hash_dim], 0,
                                  1 / np.sqrt(hash_dim))
            obs_hash_output = tf.matmul(latten_obs, rp)
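            # Note: tf.random.normal is re-sampled on every session run, so this
            # projection matrix is not fixed across calls; a seeded random op or a
            # variable would be needed for a stable hash.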

        else:
            obs_hash_output, _ = model_func(obs_hash_input.get(),
                                            num_actions,
                                            scope="hash_func",
                                            reuse=False)
        hash_func = U.function(inputs=[obs_hash_input],
                               outputs=[obs_hash_output])
        # EMDQN
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc, axis=1)))
        z_mean_summary = tf.summary.scalar(
            "z_mean", tf.reduce_mean(tf.math.reduce_mean(z_mc, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative", tf.reduce_mean(tf.reduce_mean(neg_dist)))
        negative_mean_summary = tf.summary.scalar(
            "negative mean", tf.reduce_mean(tf.reduce_mean(z_mc_neg)))
        negative_grad_summary = tf.summary.scalar(
            "negative grad", tf.reduce_mean(tf.abs(neg_grad)))
        negative_var_summary = tf.summary.scalar(
            "negative std", tf.reduce_mean(tf.math.reduce_std(z_mc_neg,
                                                              axis=2)))
        # negative_summary = tf.summary.scalar("negative", tf.reduce_mean(tf.reduce_mean(negative)))
        positive_summary = tf.summary.scalar(
            "positive", tf.reduce_mean(tf.reduce_mean(pos_dist)))
        positive_mean_summary = tf.summary.scalar(
            "positive mean", tf.reduce_mean(tf.reduce_mean(z_mc_pos)))
        positive_grad_summary = tf.summary.scalar(
            "positive grad", tf.reduce_mean(tf.abs(pos_grad)))
        positive_std_summary = tf.summary.scalar(
            "positive std", tf.reduce_mean(tf.math.reduce_std(z_mc_pos,
                                                              axis=1)))
        anchor_summary = tf.summary.scalar(
            "anchor", tf.reduce_mean(tf.reduce_mean(anchor_dist)))
        # positive_summary = tf.summary.scalar("positive", tf.reduce_mean(tf.reduce_mean(positive)))
        # z_norm_summary = tf.summary.scalar("z_norm", tf.reduce_mean(tf.norm(z_mc, axis=1)))
        # encoder_z_norm_summary = tf.summary.scalar("encoder_z_norm", tf.reduce_mean(tf.norm(encoder_z_mc_pos, axis=1)))
        # neg_norm_summary = tf.summary.scalar("neg_z_norm", tf.reduce_mean(tf.norm(keys_mc_input_negative, axis=[1, 2])))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        prediction_loss_summary = tf.summary.scalar(
            "prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        if predict:
            summaries = [
                z_var_summary, z_mean_summary, positive_summary,
                negative_summary, contrast_loss_summary,
                prediction_loss_summary, total_loss_summary
            ]
        else:
            summaries = [
                z_var_summary, z_mean_summary, negative_var_summary,
                negative_grad_summary, negative_mean_summary, positive_summary,
                positive_mean_summary, positive_grad_summary,
                positive_std_summary, negative_summary, contrast_loss_summary,
                anchor_summary, total_loss_summary
            ]
        summary = tf.summary.merge(summaries)

        train = U.function(
            inputs=inputs,
            outputs=[total_loss, summary, z_mc, pos_grad, neg_grad],
            updates=[optimize_expr_contrast_with_prediction])

        return hash_func, z_func, train
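A minimal usage sketch of the three returned functions, assuming make_obs_ph, model_func and the episodic-memory batches (obs_query, keys_pos, keys_neg, keys_anchor, obs, values) are defined elsewhere; every name on the right-hand side of the build call is an assumption rather than part of this example:

# Hedged usage sketch under the assumptions stated above.
hash_func, z_func, train = build_train_mfmc(
    make_obs_ph=make_obs_ph,
    model_func=model_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(1e-4),
    latent_dim=32, K=10, beta=0.1, predict=True)
U.initialize()
# obs_query, keys_pos, keys_neg, keys_anchor, obs, values come from an
# episodic-memory buffer (not shown here).
loss, summary, z, pos_grad, neg_grad = train(
    np.array([0.1], dtype=np.float32),  # tau placeholder has shape [1]
    obs_query, keys_pos, keys_neg, keys_anchor, obs, values)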
Code example #23
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                distributed=False,
                v_min=-10.0,
                v_max=10.0,
                atoms=51):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.
    distributed: bool
        whether or not the distributional (categorical, C51-style) value head is enabled.
    v_min: float
        lower boundary of the value support, only used when distributed is True.
    v_max: float
        upper boundary of the value support, only used when distributed is True.
    atoms: int
        number of atoms in the value support, only used when distributed is True.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    print("build train use distributed? ", distributed)
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            distributed=distributed,
            v_min=v_min,
            v_max=v_max,
            atoms=atoms)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          distributed=distributed,
                          v_min=v_min,
                          v_max=v_max,
                          atoms=atoms)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        distributed_target_ph = tf.placeholder(tf.float32, [None, atoms],
                                               name="dis_target")

        # q network evaluation (the same call in both the scalar and distributional cases)
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(),
                       num_actions,
                       scope="target_q_func")

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        if not distributed:
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)
        else:
            probability_qt = tf.nn.softmax(q_t)
            q_t_selected = tf.reduce_sum(
                q_t *
                tf.tile(tf.expand_dims(tf.one_hot(act_t_ph, num_actions), 2),
                        [1, 1, atoms]), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            print("use double")
            if not distributed:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.argmax(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best = get_distibute_q(q_tp1_using_online_net, v_min,
                                             v_max, atoms, obs_tp1_input)
                a_tp1_best = tf.argmax(q_tp1_best, 1)
                probability_qt1 = tf.nn.softmax(q_tp1_using_online_net)
                q_tp1_best = tf.reduce_sum(
                    probability_qt1 * tf.tile(
                        tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2),
                        [1, 1, atoms]), 1)
        else:
            print("not use double")
            if not distributed:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            else:
                q_tp1_best = get_distibute_q(q_tp1, v_min, v_max, atoms,
                                             obs_tp1_input)
                a_tp1_best = tf.argmax(q_tp1_best, 1)
                probability_qt1 = tf.nn.softmax(q_tp1)
                q_tp1_best = tf.reduce_sum(
                    probability_qt1 * tf.tile(
                        tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2),
                        [1, 1, atoms]), 1)

        mask = 1.0 - done_mask_ph
        if not distributed:
            q_tp1_best_masked = mask * q_tp1_best
        else:
            q_tp1_best_masked = q_tp1_best

        # compute RHS of bellman equation
        if not distributed:
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
        else:
            clip_target = tf.clip_by_value(distributed_target_ph, 1e-8, 1.0)
            clip_select = tf.clip_by_value(tf.nn.softmax(q_t_selected), 1e-8,
                                           1.0)
            # use kl divergence
            td_error = tf.reduce_sum(
                clip_target * (tf.log(clip_target) - tf.log(clip_select)),
                axis=-1)
            errors = tf.nn.softmax_cross_entropy_with_logits(
                labels=distributed_target_ph, logits=q_t_selected)
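            # In the categorical case the target placeholder already holds a
            # projected distribution over the `atoms` support points; td_error is
            # the KL divergence KL(target || softmax(q_t_selected)), and `errors`
            # uses the equivalent cross-entropy form for the gradient.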

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        if distributed:
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph, distributed_target_ph
            ],
                               outputs=td_error,
                               updates=[optimize_expr])
        else:
            train = U.function(inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
            ],
                               outputs=td_error,
                               updates=[optimize_expr])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        q_tp1_best_final = U.function([obs_tp1_input], q_tp1_best)

        return act_f, train, update_target, {
            'q_values': q_values,
            'q_t1_best': q_tp1_best_final
        }
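Note that the signature of the returned train function depends on the distributed flag. A hedged sketch of the two call forms, assuming the replay-buffer arrays exist elsewhere:

# Hypothetical call sites -- obses_t, actions, rewards, obses_tp1, dones,
# weights and dist_targets are assumed to come from a replay buffer.
if distributed:
    td = train(obses_t, actions, rewards, obses_tp1, dones, weights, dist_targets)
else:
    td = train(obses_t, actions, rewards, obses_tp1, dones, weights)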
Code example #24
0
def build_train(make_obs_ph,
                p_dist_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="distdeepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                dist_params=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    if param_noise:
        raise ValueError('parameter noise not supported')
    else:
        act_f = build_act(make_obs_ph,
                          p_dist_func,
                          num_actions,
                          dist_params,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # =====================================================================================
        # q network evaluation
        p_t = p_dist_func(obs_t_input.get(),
                          num_actions,
                          dist_params['nb_atoms'],
                          scope="q_func",
                          reuse=True)  # reuse parameters from act
        q_t = p_to_q(p_t, dist_params)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        p_tp1 = p_dist_func(obs_tp1_input.get(),
                            num_actions,
                            dist_params['nb_atoms'],
                            scope="target_q_func")
        q_tp1 = p_to_q(p_tp1, dist_params)
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # TODO: use double

        a_next = tf.argmax(q_tp1, 1, output_type=tf.int32)
        batch_dim = tf.shape(rew_t_ph)[0]
        ThTz, debug = build_categorical_alg(p_tp1, rew_t_ph, a_next, gamma,
                                            batch_dim, done_mask_ph,
                                            dist_params)

        # compute the error (potentially clipped)
        cat_idx = tf.transpose(
            tf.reshape(tf.concat([tf.range(batch_dim), act_t_ph], axis=0),
                       [2, batch_dim]))
        p_t_next = tf.gather_nd(p_t, cat_idx)

        cross_entropy = -1 * ThTz * tf.log(p_t_next)
        errors = tf.reduce_sum(cross_entropy, axis=-1)
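        # cat_idx pairs each batch index with its chosen action so gather_nd
        # selects the predicted atom distribution p(s_t, a_t); the loss is the
        # cross-entropy between the projected target ThTz and that distribution.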

        mean_error = tf.reduce_mean(errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                mean_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(mean_error,
                                               var_list=q_func_vars)

        # =====================================================================================

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=errors,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {
            **debug, 'q_values': q_values,
            'p': p_tp1,
            'cross_entropy': cross_entropy,
            'ThTz': ThTz
        }
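For context, a sketch of what a C51-style p_to_q typically computes; the actual helper and the dist_params key names used by this project may differ:

# Hedged sketch: reduce per-action atom probabilities to expected Q-values.
def p_to_q_sketch(p, dist_params):
    z = tf.linspace(float(dist_params['v_min']), float(dist_params['v_max']),
                    dist_params['nb_atoms'])        # fixed support of atoms
    return tf.tensordot(p, z, axes=[[-1], [0]])     # E[Z] per (batch, action)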
Code example #25
0
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                        scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    model_func: (tf.Variable, int, str, bool) -> tf.Variable
        encoder passed to build_act_dueling together with the hashing options.
    input_dim: int
        flattened observation size, used when hashing with a random projection.
    hash_dim: int
        dimensionality of the observation hash.
    use_rp: bool
        if True, hash observations with a Gaussian random projection.
    imitate: bool
        if True, add a sigmoid cross-entropy imitation loss against the provided action labels.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, np.array, np.array, np.array, np.array) -> (np.array, tf.Summary)
        optimize the error in Bellman's equation and return the TD error together with a merged summary.
        See the top of the file for details.
    """
    act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope,
                              reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        if imitate:
            imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action")
        # EMDQN
        value_t_ph = tf.placeholder(tf.float32, [None], name='value_t')
        value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1')
        value_tp1_masked = (1.0 - done_mask_ph) * value_tp1_ph
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        # q_t_normalized = q_t - tf.max(q_t,)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute RHS of bellman equation
        q_target = rew_t_ph + gamma * value_tp1_masked

        # compute the error (potentially clipped)
        td_error = q_target - (q_t_selected + value_t_ph)
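        # q_func is trained as an advantage-style correction here: the externally
        # supplied value_t (e.g. from episodic memory) is added to the selected Q
        # before comparing against the bootstrapped target rew + gamma * value_tp1.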
        td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error))
        # EMDQN
        print(q_t.shape)
        if imitate:
            imitation_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=imitate_act_t_ph, logits=q_t),
                                       axis=1)
            print(imitation_loss.shape)
            errors = U.huber_loss(td_error) + imitation_loss
        else:
            errors = U.huber_loss(td_error)
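        # When imitating, a sigmoid cross-entropy term between the provided action
        # labels and the Q logits is added on top of the Huber TD loss.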
        total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors))

        value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph))
        value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph))
        q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected))
        summaries = [td_summary, total_summary, value_summary, value_tp1_summary, q_summary]
        if imitate:
            imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss))
            summaries.append(imitate_summary)
        summary = tf.summary.merge(summaries)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        inputs = [
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            done_mask_ph,
            importance_weights_ph,
            value_t_ph,
            value_tp1_ph
        ]
        if imitate:
            inputs.append(imitate_act_t_ph)
        # Create callable functions
        # EMDQN
        train = U.function(
            inputs=inputs,
            outputs=[td_error, summary],
            updates=[optimize_expr]
        )

        return act_f, train
Code example #26
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                chief=False,
                server=None,
                workers=1,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    chief: bool
        whether or not the worker should assume chief duties.
        these include: initializing global parameters, tensorboarding, saving, etc.
    server: tf.train.Server
        the distributed TensorFlow server this worker is attached to; its task index selects the local device.
    workers: int
        total number of workers participating in the optimization.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation on the local weights.
        See the top of the file for details.
    global_opt: (float, float, float, float, bool) -> (float, float, float)
        apply this worker's accumulated parameter delta to the global weights.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    update_weights: () -> float
        copy the global weights into the local and "old" weights and return the global step.
    sync: {str: function}
        helper functions for coordinating communication rounds between workers.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    task = server.server_def.task_index
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse,
                      task=task)

    with tf.variable_scope(scope, reuse=reuse):
        with tf.device("/job:worker/task:{}".format(task)):
            # set up placeholders
            obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
            act_t_ph = tf.placeholder(tf.int32, [None], name="action")
            rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
            obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
            done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
            importance_weights_ph = tf.placeholder(tf.float32, [None],
                                                   name="weight")

            # Local timestep counters
            t = tf.placeholder(tf.float32, [1], name="t")
            t_global_old = tf.placeholder(tf.float32, [1], name="t_global_old")
            score_input = tf.placeholder(tf.float32, [1], name="score_input")
            grad_prio = tf.placeholder(tf.bool, [1], name="grad_prio")
            converged_ph = tf.placeholder(tf.bool, [1], name="converged")
            factor_input = tf.placeholder(tf.float32, [1], name="factor_input")

            # Global timestep counter
            # TODO Does TF have built-in global step counters?
            with tf.device("/job:ps/task:0"):
                t_global = tf.Variable(dtype=tf.float32,
                                       initial_value=[0],
                                       name="t_global")
                run_code_global = tf.Variable(initial_value="",
                                              name="run_code_global")
                comm_rounds_global = tf.Variable(dtype=tf.float32,
                                                 initial_value=[0],
                                                 name="comm_rounds_global")
                max_workers_global = tf.constant(workers,
                                                 dtype=tf.float32,
                                                 name="max_workers_global")
                worker_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[0],
                                                  name="worker_count_global")
                score_max_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_max_global")
                score_min_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_min_global")
                submit_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[-1],
                                                  name="submit_count_global")
                converged_global = tf.Variable(dtype=tf.bool,
                                               initial_value=[False],
                                               name="converged_global")

            # q network evaluation
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))

            # global weights
            print("chief:", chief, "reuse:", True if not chief else None)
            global_q_func_vars = []
            # with tf.device(tf.train.replica_device_setter(cluster=cluster)):
            with tf.device(
                    "/job:ps/task:0"):  # TODO needs RDS if using multiple PS
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights", reuse=None if chief else True)#reuse=(not chief))
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights")
                with tf.variable_scope("global_weights"):
                    for var in q_func_vars:
                        name = var.name.split(":")[0].split("q_func/")[-1]
                        global_q_func_vars.append(
                            tf.get_variable(name=name,
                                            shape=var.shape,
                                            dtype=var.dtype,
                                            initializer=tf.contrib.layers.
                                            xavier_initializer(
                                                seed=1, dtype=var.dtype)))
            # global_q_func_vars = U.scope_vars(U.absolute_scope_name("global_weights"))
            # print("Global:", global_q_func_vars)

            # old weights (used to implicitly calculate gradient sum: q_func_vars - q_func_vars_old)
            q_func_vars_old = []
            with tf.variable_scope("old_weights"):
                for var in q_func_vars:
                    name = var.name.split(":")[0].split("q_func/")[-1]
                    q_func_vars_old.append(
                        tf.get_variable(
                            name=name,
                            shape=var.shape,
                            dtype=var.dtype,
                            initializer=tf.contrib.layers.xavier_initializer(
                                seed=1, dtype=var.dtype)))
            # q_old = q_func(obs_t_input.get(), num_actions, scope="old_weights")
            # q_func_vars_old = U.scope_vars(U.absolute_scope_name("old_weights"))
            # print("Old vars:", q_func_vars_old)

            # q scores for actions which we know were selected in the given state.
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)

            # compute estimate of best possible value starting from state at t + 1
            if double_q:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.argmax(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

            # compute RHS of bellman equation
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights_ph * errors)

            # compute optimization op (potentially with gradient clipping)
            if grad_norm_clipping is not None:
                optimize_expr = U.minimize_and_clip(
                    optimizer,
                    weighted_error,
                    var_list=q_func_vars,
                    clip_val=grad_norm_clipping)
            else:
                optimize_expr = optimizer.minimize(weighted_error,
                                                   var_list=q_func_vars)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_expr = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_expr.append(var_target.assign(var))
            update_target_expr = tf.group(*update_target_expr)

            # update_global_fn will be called periodically to copy global Q network to q network
            update_global_expr = []
            for var_global, var, var_old in zip(
                    sorted(global_q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name)):
                update_global_expr.append(var.assign(var_global))
                # TODO Can async cause var <- var_global, var_global <- new value, var_old <- var_global in that order?
                # TODO Should this copy from var instead? (concurrency issues?)
                # TODO Can concurrency cause var_old <- var, var <- var_global in that order (resulting in wrong values)?
                # TODO Safest method is to force sequential execution of var <- var_global, var_old <- var! How though?
                update_global_expr.append(var_old.assign(var_global))
            update_global_expr = tf.group(*update_global_expr)
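            # After this group runs, both the local and the "old" copies equal the
            # global weights, so the later delta (var - var_old) measures only the
            # progress made locally since this synchronization.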

            # update the global time step counter by adding the local
            update_t_global = t_global.assign_add(t)

            optimize_global_expr = []
            # Factor to multiply every gradient with
            # f = t / (t_global - t_global_old)
            dt = tf.subtract(update_t_global, t_global_old)
            factor = tf.where(
                tf.greater_equal(factor_input, 0), factor_input,
                tf.where(
                    grad_prio,
                    tf.divide(tf.subtract(score_input, score_min_global),
                              tf.subtract(score_max_global, score_min_global)),
                    tf.divide(t, dt)))
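            # factor selects how much of this worker's parameter delta is applied to
            # the global weights: an explicit factor_input if non-negative, otherwise
            # the score normalized between the global min/max when grad_prio is set,
            # otherwise the time fraction t / (t_global - t_global_old).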
            for var, var_old, var_global in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name),
                    sorted(global_q_func_vars, key=lambda v: v.name)):
                # Multiply the difference between the old parameters and the locally optimized parameters
                # g = (var - var_old) * f
                grad = tf.multiply(tf.subtract(var, var_old), factor)
                optimize_global_expr.append(var_global.assign_add(grad))
            optimize_global_expr = tf.group(*optimize_global_expr)

            # if cr == cr_g and wc < wc_max:
            #   wc += 1
            #   score_global += score
            # if cr == cr_g and wc == wc_max:
            #   vc += 1
            #   score_global += score
            #   cr_g += 0.5
            # return cr_g
            """
            if cr == cr_g:
                if wc <= wc_max:
                    wc += 1
                    score_global += score
                    if wc == wc_max:
                        cr_g += 0.5
            return cr_g
            """
            # submit_score_expr = \
            #     tf.cond(tf.equal(comm_rounds, comm_rounds_global),
            #             lambda: tf.cond(tf.less_equal(worker_count_global, max_workers_global),
            #                             lambda: tf.group(worker_count_global.assign_add([1]),
            #                                              score_global.assign_add(score_input),
            #                                              tf.cond(tf.equal(worker_count_global, max_workers_global),
            #                                                      lambda: comm_rounds_global.assign_add([0.5]),
            #                                                      lambda: None)),
            #                             lambda: tf.group(None, None, None)),
            #             lambda: None)
            # submit_score_expr = \
            #     tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                            tf.less(worker_count_global, max_workers_global)),
            #             tf.group(worker_count_global.assign_add(1),
            #                      score_global.assign_add(score_input)),
            #             tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                                    tf.equal(worker_count_global, max_workers_global)),
            #                     tf.group(worker_count_global.assign_add(1),
            #                              score_global.assign_add(score_input),
            #                              comm_rounds_global.assign_add(0.5))))

            # This makes a sum of all scores (
            # submit_score_expr = score_global.assign_add(score_input)

            # This only saves the maximum score (for normalized score weighting)
            submit_score_max = score_max_global.assign(tf.maximum(
                score_input, score_max_global),
                                                       use_locking=True)
            submit_score_min = score_min_global.assign(tf.minimum(
                score_input, score_min_global),
                                                       use_locking=True)

            set_submit_count = submit_count_global.assign(score_input,
                                                          use_locking=True)
            inc_submit_count = submit_count_global.assign_add([1],
                                                              use_locking=True)

            # check_round_op = tf.equal(comm_rounds, comm_rounds_global) # Not used anymore
            inc_wc = worker_count_global.assign_add([1], use_locking=True)
            zero_wc = worker_count_global.assign([0], use_locking=True)

            inc_cr = comm_rounds_global.assign_add([1], use_locking=True)

            score_reset = score_max_global.assign([0], use_locking=True)

            converged_set = converged_global.assign(converged_ph,
                                                    use_locking=True)

            # Create callable functions
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
                               outputs=[td_error],
                               updates=[optimize_expr])
            global_opt = U.function(
                inputs=[t, t_global_old, score_input, factor_input, grad_prio],
                outputs=[dt, comm_rounds_global, factor],
                updates=[optimize_global_expr])
            # global_sync_opt = U.function(inputs=[comm_rounds], outputs=[comm_rounds_global], updates=[optimize_global_sync_expr])
            update_weights = U.function(inputs=[],
                                        outputs=[t_global],
                                        updates=[update_global_expr])
            update_target = U.function([], [], updates=[update_target_expr])
            submit_score = U.function(
                inputs=[score_input],
                outputs=[comm_rounds_global],
                updates=[submit_score_max, submit_score_min])
            check_round = U.function(inputs=[],
                                     outputs=[comm_rounds_global],
                                     updates=[])
            request_submit = U.function(inputs=[],
                                        outputs=[comm_rounds_global, inc_wc],
                                        updates=[])
            set_submit = U.function(inputs=[score_input],
                                    outputs=[set_submit_count],
                                    updates=[])
            check_submit = U.function(inputs=[],
                                      outputs=[submit_count_global],
                                      updates=[])
            inc_submit = U.function(inputs=[],
                                    outputs=[inc_submit_count],
                                    updates=[])
            inc_comm_round = U.function(inputs=[],
                                        outputs=[inc_cr],
                                        updates=[])
            reset_wc = U.function(inputs=[], outputs=[zero_wc], updates=[])
            check_wc = U.function(inputs=[],
                                  outputs=[worker_count_global],
                                  updates=[])
            reset_score = U.function(inputs=[],
                                     outputs=[],
                                     updates=[score_reset])
            set_converged = U.function(inputs=[converged_ph],
                                       outputs=[],
                                       updates=[converged_set])
            check_converged = U.function(inputs=[],
                                         outputs=[converged_global],
                                         updates=[])

            # Debugging functions
            q_values = U.function([obs_t_input], q_t)
            weights = U.function(
                inputs=[],
                outputs=[q_func_vars, global_q_func_vars, q_func_vars_old],
                updates=[])
            t_global_func = U.function([], t_global)
            comm_rounds_func = U.function([], comm_rounds_global)

            return act_f, train, global_opt, update_target, update_weights, \
                {'request_submit': request_submit, 'submit_score': submit_score,
                 'check_round': check_round, 'check_submit': check_submit, 'set_submit': set_submit,
                 'inc_submit': inc_submit, 'inc_comm_round': inc_comm_round, 'reset_wc': reset_wc,
                 'check_wc': check_wc, 'reset_score': reset_score,
                 'set_converged': set_converged, 'check_converged': check_converged}, \
                {'q_values': q_values, 'weights': weights, 't_global': t_global_func,
                 'run_code': run_code_global, 'comm_rounds': comm_rounds_func, 'factor': factor}
Code example #27
0
def build_act(make_obs_ph,
              cvar_func,
              var_func,
              num_actions,
              nb_atoms,
              scope="cvar_dqn",
              reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    cvar_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    nb_atoms: int
        number of linearly-spaced atoms
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        # risk level alpha, as in CVaR_alpha
        alpha_ph = U.ensure_tf_input(
            tf.placeholder(tf.float32, (), name="alpha"))

        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        # eps in epsilon-greedy
        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))

        cvar_values = cvar_func(observations_ph.get(),
                                num_actions,
                                nb_atoms,
                                scope="out_func")
        # keep here for plotting
        var_values = var_func(observations_ph.get(),
                              num_actions,
                              nb_atoms,
                              scope="out_func",
                              reuse_main=True,
                              reuse_last=False)

        deterministic_actions = pick_action(cvar_values, alpha_ph.get(),
                                            nb_atoms)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int32)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        act = U.function(
            inputs=[observations_ph, alpha_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True
            },
            updates=[update_eps_expr])

        return act
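A hedged usage sketch for the act callable returned above: the environment, the exploration schedule, and the CVaR risk level alpha are placeholders, not defined in this file.

import numpy as np

# Hypothetical driver loop; assumes a Gym-style env and an exploration schedule.
act = build_act(make_obs_ph, cvar_func, var_func, num_actions, nb_atoms)
U.initialize()
obs = env.reset()
for t in range(total_timesteps):
    # add a batch dimension; update_eps drives the epsilon-greedy branch in the graph
    action = act(np.array(obs)[None], alpha, update_eps=exploration.value(t))[0]
    obs, rew, done, _ = env.step(action)
    if done:
        obs = env.reset()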
Code example #28
0
def build_train_contrast(make_obs_ph,
                         model_func,
                         num_actions,
                         optimizer,
                         grad_norm_clipping=None,
                         gamma=1.0,
                         scope="mfec",
                         latent_dim=32,
                         alpha=0.05,
                         beta=0.1,
                         theta=0.1,
                         loss_type=["contrast"],
                         c_loss_type="sqmargin",
                         reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select and action given observation.
`       See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
`       See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
`       See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        obs_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg"))

        value_input_query = tf.placeholder(tf.float32, [None], name="value")
        action_embedding = tf.Variable(tf.random_normal(
            [num_actions, latent_dim], stddev=1),
                                       name="action_embedding")
        action_input = tf.placeholder(tf.int32, [None], name="action")
        inputs = [obs_input_query]
        if "contrast" in loss_type:
            inputs += [obs_input_positive, obs_input_negative]
        if "regression" in loss_type:
            inputs += [value_input_query]
        if "linear_model" in loss_type:
            inputs += [action_input]
            if "contrast" not in loss_type:
                inputs += [obs_input_positive]
        z = model_func(obs_input_query.get(),
                       num_actions,
                       scope="model_func",
                       reuse=tf.AUTO_REUSE)

        h = model_func(obs_input_query.get(),
                       num_actions,
                       scope="hash_func",
                       reuse=False)

        # _, v = model_func(
        #     obs_input_query.get(), num_actions,
        #     scope="model_func",
        #     reuse=True)
        z_pos = model_func(obs_input_positive.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_neg = model_func(obs_input_negative.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_pos = tf.reshape(z_pos, [-1, latent_dim])
        z_tar = tf.reshape(z, [-1, latent_dim])
        z_neg = tf.reshape(z_neg, [-1, latent_dim])

        contrast_loss = contrastive_loss_fc(z_tar,
                                            z_pos,
                                            z_neg,
                                            c_type=c_loss_type)

        regression_loss = tf.reduce_mean(
            tf.squared_difference(tf.norm(z_tar, axis=1),
                                  alpha * value_input_query))

        action_embeded = tf.matmul(tf.one_hot(action_input, num_actions),
                                   action_embedding)
        model_loss = tf.reduce_mean(
            tf.squared_difference(action_embeded + z_tar, z_pos))
        print("shape:", z_tar.shape, z_pos.shape, z_neg.shape,
              action_embeded.shape)
        # contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive)
        # print("shape2:", z.shape, negative.shape, positive.shape)
        # prediction_loss = tf.losses.mean_squared_error(value_input, v)
        total_loss = 0
        if "contrast" in loss_type:
            total_loss += contrast_loss
        if "regression" in loss_type:
            total_loss += beta * regression_loss
        elif "linear_model" in loss_type:
            total_loss += theta * model_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        if "linear_model" in loss_type:
            model_func_vars.append(action_embedding)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=model_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg)))
        positive_summary = tf.summary.scalar(
            "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos)))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        regression_loss_summary = tf.summary.scalar(
            "regression loss", tf.reduce_mean(regression_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [z_var_summary, total_loss_summary]

        if "contrast" in loss_type:
            summaries += [
                negative_summary, positive_summary, contrast_loss_summary
            ]
        if "regression" in loss_type:
            summaries.append(regression_loss_summary)
        if "linear_model" in loss_type:
            summaries.append(model_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs = [z_tar]
        if "contrast" in loss_type:
            outputs += [z_pos, z_neg]
        elif "linear_model" in loss_type:
            outputs += [z_pos]
        outputs += [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_query],
            outputs=[z, h],
        )
        norm_func = U.function(inputs=[obs_input_query],
                               outputs=[tf.norm(z_tar, axis=1)])
        return z_func, train, eval, norm_func
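A minimal call sketch for the callables returned above, assuming loss_type=["contrast"] so that train expects query, positive, and negative observation batches; the batches themselves are placeholders sampled elsewhere.

# Hypothetical usage; obs_query, obs_pos, obs_neg are numpy batches sampled elsewhere.
z_func, train, eval_fn, norm_func = build_train_contrast(
    make_obs_ph, model_func, num_actions,
    optimizer=tf.train.AdamOptimizer(1e-4),
    loss_type=["contrast"], c_loss_type="sqmargin")
U.initialize()
z_tar, z_pos, z_neg, total_loss, summary = train(obs_query, obs_pos, obs_neg)
embeddings, hashes = z_func(obs_query)   # latent codes and hash codes for a batch
norms = norm_func(obs_query)[0]          # L2 norms of the embeddings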
Code example #29
0
def svgd_adv_build_train(
    make_obs_ph,
    v_func,
    adv_func,
    num_actions,
    learning_rate,
    en=1,
    grad_norm_clipping=None,
    gamma=1.0,
    scope="svgd_advantage_learning",
    reuse=None,
):

    act_f, is_training = svgd_adv_build_act(make_obs_ph,
                                            adv_func,
                                            num_actions,
                                            en=en,
                                            scope=scope,
                                            reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # for advantage
        adv_values_list = []
        target_adv_func_vars_list = []

        weighted_error_list = []

        # construct placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")

        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        rew_t_ph_list = tf.split(rew_t_ph, en, axis=0)

        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        obs_tp1_list = tf.split(obs_tp1_input.get(), en, axis=0)

        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        done_mask_ph_list = tf.split(done_mask_ph, en, axis=0)

        obs_t_input_list = tf.split(obs_t_input.get(), en, axis=0)
        act_t_ph_list = tf.split(act_t_ph, en, axis=0)

        # construct v function
        v_t = tf.squeeze(v_func(obs_t_input.get(), scope="v_func",
                                reuse=False))
        v_t_list = tf.split(v_t, en, axis=0)
        v_func_vars = U.scope_vars(U.absolute_scope_name("v_func"))

        v_tp1 = tf.squeeze(
            v_func(obs_tp1_input.get(), scope="target_v_func", reuse=False))
        target_v_func_vars = U.scope_vars(
            U.absolute_scope_name("target_v_func"))
        v_tp1_list = tf.split(v_tp1, en, axis=0)

        target_adv_values_list = []
        q_t_selected_list = []
        q_t_selected_target_list = []

        bnn_func_trainable_vars_list = []
        bnn_func_trainable_vars_one_list = []
        bnn_func_all_vars_list = []
        log_p_list = []

        for count in range(en):
            adv_t = adv_func(
                obs_t_input_list[count],
                num_actions,
                is_training=is_training,
                scope="adv_func" + str(count) + '_',
                reuse=True,
            )  # reuse parameters from act

            # collect all variables and divide moving variables
            bnn_func_vars = U.scope_vars(
                U.absolute_scope_name("adv_func" + str(count) + '_'))
            bnn_func_trainable_vars = []
            for bv_t in bnn_func_vars:
                if 'moving' not in bv_t.name:
                    bnn_func_trainable_vars.append(bv_t)

            bnn_func_trainable_vars_list.append(bnn_func_trainable_vars)
            bnn_func_trainable_vars_one_list += bnn_func_trainable_vars

            bnn_func_all_vars_list += bnn_func_vars

            # target network
            adv_tp1 = adv_func(
                obs_tp1_list[count],
                num_actions,
                is_training=False,
                scope="target_adv_func" + str(count) + '_',
            )
            target_adv_func_vars_list += U.scope_vars(
                U.absolute_scope_name("target_adv_func" + str(count) + '_'))

            target_adv_values_list.append(adv_tp1)

            adv_tp1_best = tf.reduce_max(adv_tp1, 1)

            q_tp1_best = v_tp1_list[count] + adv_tp1_best
            q_tp1_best_masked = (1.0 - done_mask_ph_list[count]) * q_tp1_best
            q_t_selected_target = rew_t_ph_list[
                count] + gamma * q_tp1_best_masked

            q_t_selected_target_list.append(q_t_selected_target)

            # q scores for actions which we know were selected in the given state.
            adv_t_selected = tf.reduce_sum(
                adv_t * tf.one_hot(act_t_ph_list[count], num_actions), 1)

            # compute estimate of best possible value starting from state at t + 1
            q_t_selected = v_t_list[count] + adv_t_selected

            # compute the error (potentially clipped)
            weighted_error_list.append(
                tf.reduce_mean(
                    tf.square(v_t_list[count] + tf.stop_gradient(
                        adv_t_selected - q_t_selected_target_list[count]))))
            log_p_list.append(-tf.reduce_mean(
                tf.square(adv_t_selected +
                          tf.stop_gradient(v_t_list[count] -
                                           q_t_selected_target_list[count]))))
            adv_values_list.append(adv_t)
            q_t_selected_list.append(q_t_selected)

        all_vars_list = v_func_vars + bnn_func_all_vars_list
        all_target_vars_list = target_v_func_vars + target_adv_func_vars_list

        update_target_expr = []

        for var, var_target in zip(
                sorted(all_vars_list, key=lambda v: v.name),
                sorted(all_target_vars_list, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)
        """
        Use SVGD
        """
        # use svgd
        # step 1: flatten all variables
        print('step 1: flatten all variables')
        bnn_func_trainable_vars_flatten_list = []
        for index in range(len(bnn_func_trainable_vars_list)):
            bnn_func_trainable_vars = bnn_func_trainable_vars_list[index]
            bnn_func_trainable_vars_flatten = tf.concat([
                tf.reshape(var, shape=[-1]) for var in bnn_func_trainable_vars
            ],
                                                        axis=0)
            # here the shape should be (variables_num)
            bnn_func_trainable_vars_flatten_list.append(
                tf.expand_dims(bnn_func_trainable_vars_flatten, axis=0))
        variables_num = bnn_func_trainable_vars_flatten.get_shape().as_list(
        )[0]
        print('We have total {} variables'.format(variables_num))

        # step 2: pairwise distance
        print('step 2: pairwise distance')
        # here the shape should be (en, variables_num)
        theta = tf.concat(bnn_func_trainable_vars_flatten_list, axis=0)

        assert theta.get_shape().as_list() == [en, variables_num]

        na = tf.reduce_sum(tf.square(theta), 1)
        nb = tf.reduce_sum(tf.square(theta), 1)
        assert na.get_shape().as_list() == [en]
        assert nb.get_shape().as_list() == [en]

        na = tf.reshape(na, [-1, 1])
        nb = tf.reshape(nb, [1, -1])
        assert na.get_shape().as_list() == [en, 1]
        assert nb.get_shape().as_list() == [1, en]

        D = tf.maximum(0.0, na - 2 * tf.matmul(theta, theta, False, True) + nb)
        assert D.get_shape().as_list() == [en, en]

        # step 3 kernel
        print('step 3 kernel')
        D_mid = tf.contrib.distributions.percentile(D, 50)
        h_tau = tf.placeholder(shape=(), dtype=tf.float32, name='h_tau')
        h = tf.sqrt(0.5 * D_mid / tf.log(en + 1.0)) * h_tau
        kernel = tf.exp(-D / h**2 / 2)

        assert kernel.get_shape().as_list() == [
            en, en
        ], 'kernel shape should be (en, en)'

        # step 4 kernel gradients
        print('step 4 kernel gradients')
        dxkxy = 0 - tf.matmul(kernel, theta)
        sumkxy = tf.expand_dims(tf.reduce_sum(kernel, axis=1), axis=1)

        assert dxkxy.get_shape().as_list() == [en, variables_num]
        assert sumkxy.get_shape().as_list() == [en, 1]

        dxkxy += sumkxy * theta
        dxkxy /= (h**2)

        assert dxkxy.get_shape().as_list() == [en, variables_num]

        # step 5: log_p gradients
        print('step 5: log_p gradients')
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           epsilon=1e-4)

        # V function parameters' loss function
        v_loss = sum(weighted_error_list)
        # BNN function parameters' loss function
        log_p_sum = sum(log_p_list)
        total_loss = v_loss + log_p_sum

        total_grads = optimizer.compute_gradients(
            total_loss,
            var_list=v_func_vars + bnn_func_trainable_vars_one_list,
        )
        grad_v_loss = total_grads[0:len(v_func_vars)]
        grad_log_p = total_grads[len(v_func_vars):]
        assert len(grad_v_loss) == len(v_func_vars)
        assert len(grad_log_p) == len(bnn_func_trainable_vars_one_list)

        grad_list = []
        vars_shape_list = []
        vars_num_list = []

        for i, (grad, var) in enumerate(grad_log_p):
            assert grad is not None
            vars_shape = grad.get_shape().as_list()
            vars_shape_list.append(vars_shape)
            vars_num_list.append(np.prod(vars_shape))
            grad_list.append(grad)

        grad_flatten_concat = tf.concat(
            [tf.reshape(gv, shape=[-1]) for gv in grad_list], axis=0)
        grad_matrix = tf.reshape(grad_flatten_concat, [en, -1])
        assert grad_matrix.get_shape().as_list() == [en, variables_num]

        # temperature
        tau = tf.placeholder(shape=(), dtype=tf.float32, name='decay_tau')
        grad_1 = -dxkxy * tau
        grad_2 = -tf.matmul(kernel, grad_matrix)
        gradients = grad_1 + grad_2

        gradients_flatten = tf.reshape(gradients, shape=[-1])

        # step 6 apply gradients
        print('step 6 apply gradients')
        start = 0
        for i, (grad, var) in enumerate(grad_log_p):
            assert grad is not None
            grad_flatten = gradients_flatten[start:start + vars_num_list[i]]
            clipped_grad = tf.clip_by_norm(
                tf.reshape(grad_flatten, vars_shape_list[i]),
                grad_norm_clipping)
            grad_log_p[i] = (clipped_grad, var)
            start += vars_num_list[i]

        for i, (grad, var) in enumerate(grad_v_loss):
            assert grad is not None
            clipped_grad = tf.clip_by_norm(grad, grad_norm_clipping)
            grad_v_loss[i] = (clipped_grad, var)

        assert start == variables_num * en, 'expect start to be {}, but it is {}'.format(
            variables_num * en, start)

        optimize_expr = optimizer.apply_gradients(grad_v_loss + grad_log_p)

        other_output = [D, D_mid, h, kernel]
        train = U.function(inputs=[
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            tau,
            h_tau,
            is_training,
        ],
                           outputs=weighted_error_list + other_output,
                           updates=[optimize_expr],
                           givens={is_training: True})
        update_target = U.function([], [], updates=[update_target_expr])

        predict_values_outputs = [v_t, v_tp1] + adv_values_list +\
                                 target_adv_values_list + q_t_selected_list + q_t_selected_target_list

        predict_values = U.function(inputs=[
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            is_training,
        ],
                                    outputs=predict_values_outputs,
                                    givens={is_training: False})

    return act_f, train, update_target, {'predict_values': predict_values}
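Steps 2-4 above build an RBF kernel over the flattened particle parameters using a median-heuristic bandwidth, together with its gradient term. The following standalone NumPy sketch mirrors that computation; it is an illustration of the math, not part of the graph above.

import numpy as np

def rbf_kernel_median(theta, h_tau=1.0):
    """theta: (en, variables_num) array of flattened particle parameters."""
    sq_norms = np.sum(theta ** 2, axis=1)
    # pairwise squared distances D[i, j] = ||theta_i - theta_j||^2
    D = np.maximum(0.0, sq_norms[:, None] - 2.0 * theta @ theta.T + sq_norms[None, :])
    en = theta.shape[0]
    # median heuristic for the bandwidth, scaled by h_tau as in the graph above
    h = np.sqrt(0.5 * np.median(D) / np.log(en + 1.0)) * h_tau
    kernel = np.exp(-D / (2.0 * h ** 2))
    # kernel gradient term: (sum_j k(x_j, x_i)) * x_i - sum_j k(x_j, x_i) * x_j, scaled by 1/h^2
    dxkxy = (np.sum(kernel, axis=1, keepdims=True) * theta - kernel @ theta) / h ** 2
    return kernel, dxkxy

kernel, dxkxy = rbf_kernel_median(np.random.randn(4, 10))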
Code example #30
0
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
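The four objects returned above compose the usual DQN training loop. A minimal sketch, assuming a Gym-style env, a replay_buffer, an exploration schedule, and the usual hyperparameters (all of them placeholders here):

import numpy as np

# Hypothetical driver loop for the returned callables.
act, train, update_target, debug = build_train(
    make_obs_ph, q_func, num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    gamma=0.99, grad_norm_clipping=10)
U.initialize()
update_target()  # start with the target network equal to the online network

obs = env.reset()
for t in range(total_timesteps):
    action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
    new_obs, rew, done, _ = env.step(action)
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = env.reset() if done else new_obs

    if t > learning_starts and t % train_freq == 0:
        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
        # uniform importance weights when prioritized replay is not used
        train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
    if t % target_network_update_freq == 0:
        update_target()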
Code example #31
0
File: build_graph.py  Project: IcarusTan/baselines
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False)

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")
        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = U.scope_vars(U.absolute_scope_name(original_scope))
            all_perturbed_vars = U.scope_vars(U.absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
        mean_kl = tf.reduce_mean(kl)
        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0,
            lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        updates = [
            update_eps_expr,
            tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
                         updates=updates)
        return act
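Relative to the plain epsilon-greedy act function, this variant accepts extra runtime controls for re-sampling the parameter noise and adapting its scale. A hedged call sketch, where kl_threshold stands in for whatever threshold schedule the caller maintains:

import numpy as np

# Hypothetical call at the start of an episode: re-perturb the weights and adapt the noise scale.
action = act(np.array(obs)[None],
             reset=True,
             update_param_noise_threshold=kl_threshold,
             update_param_noise_scale=True)[0]

# Mid-episode only the observation is needed; the givens above keep the
# noise-related placeholders at their no-op defaults.
action = act(np.array(obs)[None])[0]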
Code example #32
0
File: build_graph.py  Project: mauxam/kaithy
def build_train(make_obs_ph,
                q_func,
                num_actions,
                grad_norm_clipping=None,
                gamma=1.0,
                deterministic_filter=False,
                random_filter=False,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            deterministic_filter=deterministic_filter,
            random_filter=random_filter)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          deterministic_filter=deterministic_filter,
                          random_filter=random_filter)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        lr_ph = tf.placeholder(tf.float32, name="lr")
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(U.data_type, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(U.data_type, [None], name="done")
        importance_weights_ph = tf.placeholder(U.data_type, [None],
                                               name="weight")

        board_size = obs_t_input.get().get_shape().as_list()[1]

        obs_t = transform_obses(obs_t_input.get())
        obs_tp1 = transform_obses(obs_tp1_input.get())
        act_t = transform_actions(act_t_ph, board_size)

        if deterministic_filter:
            invalid_masks_tp1 = build_invalid_masks(obs_tp1)

        # q network evaluation
        q_t = q_func(obs_t, num_actions, scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1, num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(act_t, num_actions, dtype=U.data_type), axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1,
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)

            if deterministic_filter:
                q_tp1_using_online_net = build_q_filter(
                    q_tp1_using_online_net, invalid_masks_tp1)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net,
                                                    1,
                                                    output_type=U.index_type)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
                                   num_actions,
                                   dtype=U.data_type), 1)
        else:
            if deterministic_filter:
                q_tp1 = build_q_filter(q_tp1, invalid_masks_tp1)

            q_tp1_best = tf.reduce_max(q_tp1, axis=1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        weighted_error = tf.reduce_mean(importance_weights_ph *
                                        U.huber_loss(td_error))
        regularizer = tf.add_n([tf.nn.l2_loss(var)
                                for var in q_func_vars]) * 0.0001
        total_error = weighted_error + regularizer

        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=lr_ph, momentum=0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            lr_ph, obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
            done_mask_ph, importance_weights_ph
        ],
                           outputs=[td_error, weighted_error, total_error],
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
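Unlike the preceding variants, this train function takes the learning rate as its first runtime input and also returns the weighted and L2-regularized errors. A minimal call sketch; lr and the batch arrays are placeholders:

# Hypothetical single optimization step with a runtime learning rate.
td_errors, weighted_err, total_err = train(
    lr, obses_t, actions, rewards, obses_tp1, dones, weights)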
Code example #33
0
File: build_graph.py  Project: IcarusTan/baselines
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}