Example 1
def adv_build_train(make_obs_ph, 
                v_func, 
                adv_func,
                num_actions,
                learning_rate,
                en,
                grad_norm_clipping=None,
                gamma=0.99,
                scope="advantage_learning", 
                reuse=None,
                ):
    act_f, is_training = adv_build_act(make_obs_ph, adv_func, num_actions, 
                                   en=en, scope=scope, reuse=reuse,)

    with tf.variable_scope(scope, reuse=reuse):
        
        adv_func_vars_list = []
        target_adv_func_vars_list = []        
        error_list = []

        # construct placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        
        obs_t_input_list = tf.split(obs_t_input.get(), en, axis=0)
        act_t_ph_list = tf.split(act_t_ph, en, axis=0)
        rew_t_ph_list = tf.split(rew_t_ph, en, axis=0)
        obs_tp1_input_list = tf.split(obs_tp1_input.get(), en, axis=0)
        done_mask_ph_list = tf.split(done_mask_ph, en, axis=0)

        # build v function
        v_t = tf.squeeze(v_func(obs_t_input.get(), scope="v_func", reuse=False))
        v_t_list = tf.split(v_t, en, axis=0)
        v_func_vars = U.scope_vars(U.absolute_scope_name("v_func"))

        # build v target
        v_tp1 = tf.squeeze(v_func(obs_tp1_input.get(), scope="target_v_func", reuse=False))
        v_tp1_list = tf.split(v_tp1, en, axis=0)
        target_v_func_vars = U.scope_vars(U.absolute_scope_name("target_v_func"))

        
        for count in range(en):
            # build BNN
            adv_t = adv_func(obs_t_input_list[count], num_actions, is_training=is_training, 
                        scope="adv_func" + str(count) + '_', reuse=True,
                        )

            adv_func_vars = U.scope_vars(U.absolute_scope_name("adv_func" + str(count) + '_'))
            adv_func_vars_list += adv_func_vars
            
            # build BNN target
            adv_tp1 = adv_func(obs_tp1_input_list[count], num_actions, is_training=False,
                        scope="target_adv_func" + str(count) + '_',
                        )
            target_adv_func_vars_list += U.scope_vars(U.absolute_scope_name("target_adv_func" + str(count) + '_'))

            adv_t_selected = tf.reduce_sum(adv_t * tf.one_hot(act_t_ph_list[count], num_actions), 1)

            adv_tp1_best = tf.reduce_max(adv_tp1, 1)

            q_t_selected = v_t_list[count] + adv_t_selected
            q_tp1_best = v_tp1_list[count] + adv_tp1_best
            q_tp1_best_masked = (1.0 - done_mask_ph_list[count]) * q_tp1_best
            q_t_selected_target = rew_t_ph_list[count] + gamma * q_tp1_best_masked

            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            
            errors = tf.reduce_mean(tf.square(td_error))
            error_list.append(errors)

        all_vars_list = v_func_vars + adv_func_vars_list
        all_target_vars_list = target_v_func_vars + target_adv_func_vars_list

        total_loss = sum(error_list)
        
        assert grad_norm_clipping is not None
        optimize_expr = U.minimize_and_clip(
                                            tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4),
                                            total_loss,
                                            var_list=all_vars_list,
                                            clip_val=grad_norm_clipping
                                        )
        update_target_expr = []

        for var, var_target in zip(sorted(all_vars_list, key=lambda v: v.name),
                                sorted(all_target_vars_list, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                is_training,
            ],
            outputs=error_list,
            updates=[optimize_expr],
            givens={is_training: True}
        )
        update_target = U.function([], [], updates=[update_target_expr])

    return act_f, train, update_target
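The callables returned above follow the usual act/train/update_target pattern. A minimal usage sketch (everything other than adv_build_train is an assumed placeholder; the batch fed to train must be divisible by en because the graph tf.splits its inputs into en ensemble heads):

act_f, train, update_target = adv_build_train(
    make_obs_ph=make_obs_ph,        # assumed observation-placeholder factory
    v_func=v_model,                 # assumed shared state-value network
    adv_func=adv_model,             # assumed per-head advantage (BNN) network
    num_actions=num_actions,
    learning_rate=1e-4,
    en=4,                           # ensemble size; inputs are tf.split into 4 heads
    grad_norm_clipping=10)

# obs_t, actions, rewards, obs_tp1, dones: a sampled batch whose size is a multiple of en
per_head_errors = train(obs_t, actions, rewards, obs_tp1, dones, True)
update_target()                     # copy the online v/adv weights into the target networks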
Example 2
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                train_gaze,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="DeepqWithGaze",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        initial_freeze_phase_ph = tf.placeholder(tf.bool, (),
                                                 name="initial_freeze_phase")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = gflag.qfunc_models.get(
            "q_func").weights  # already includes gaze_models weights
        q_func_trainable_vars = [ w for w in gflag.qfunc_models.get("q_func").trainable_weights \
            if (train_gaze or w not in gflag.gaze_models.get("q_func").trainable_weights) ] # train_gaze=False excludes gaze model's weight

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = gflag.qfunc_models.get(
            "target_q_func").weights  # already includes gaze_models weights

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        initial_freeze_weights = gflag.qfunc_models.get_weight_names_for_initial_freeze(
            model_name="q_func")
        q_func_trainable_vars_for_initial_freeze = list(
            filter(lambda w: w.name not in initial_freeze_weights,
                   q_func_trainable_vars))
        if grad_norm_clipping is not None:
            optimize_expr_for_initial_freeze = lambda: U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_trainable_vars_for_initial_freeze,
                                                clip_val=grad_norm_clipping) \
                                            if q_func_trainable_vars_for_initial_freeze else tf.no_op()
            optimize_expr_after_freeze = lambda: U.minimize_and_clip(
                optimizer,
                weighted_error,
                var_list=q_func_trainable_vars,
                clip_val=grad_norm_clipping)
        else:
            # tf.cond() requires its branch arguments to be callables, so each optimize op is wrapped in a lambda
            optimize_expr_for_initial_freeze = lambda: optimizer.minimize(
                weighted_error,
                var_list=q_func_trainable_vars_for_initial_freeze)
            optimize_expr_after_freeze = lambda: optimizer.minimize(
                weighted_error, var_list=q_func_trainable_vars)
        optimize_expr = tf.cond(initial_freeze_phase_ph,
                                optimize_expr_for_initial_freeze,
                                optimize_expr_after_freeze)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        assert len(q_func_vars) == len(target_q_func_vars)
        for var, var_target in zip(q_func_vars, target_q_func_vars):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            importance_weights_ph,
            initial_freeze_phase_ph,
        ],
                           outputs=td_error,
                           updates=[optimize_expr],
                           givens={K.backend.learning_phase(): 1})
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        # For tensorboard
        merged = tf.summary.merge([
            tf.summary.image('img_curframe', obs_t_input.get()),
            tf.summary.image(
                'gaze_curframe',
                q_func(obs_t_input.get(),
                       num_actions,
                       scope="q_func",
                       return_gaze=True,
                       reuse=True))
        ])
        tensorboard_summary = U.function(
            inputs=[obs_t_input],
            outputs=merged,
            givens={K.backend.learning_phase(): 0})

        return act_f, train, update_target, {
            'q_values': q_values
        }, tensorboard_summary
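A minimal usage sketch of the gaze-augmented build_train above (all names other than build_train are assumed placeholders; the gflag model registry is configured elsewhere):

import numpy as np
import tensorflow as tf

act, train, update_target, debug, tensorboard_summary = build_train(
    make_obs_ph=make_obs_ph,            # assumed observation-placeholder factory
    q_func=q_func_with_gaze,            # assumed gaze-augmented Q model registered in gflag.qfunc_models
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(1e-4),
    train_gaze=False,                   # keep the gaze sub-network's weights out of the update
    grad_norm_clipping=10,
    gamma=0.99)

# obs_t, actions, ... are a sampled batch; t and freeze_steps come from the training loop
td_errors = train(obs_t, actions, rewards, obs_tp1, dones,
                  np.ones_like(rewards),        # uniform importance weights
                  t < freeze_steps)             # feeds initial_freeze_phase_ph
summary = tensorboard_summary(obs_t)            # merged image summaries for TensorBoard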
def build_train_modelbased(make_obs_ph,
                           net_func,
                           model_func,
                           num_actions,
                           optimizer,
                           grad_norm_clipping=None,
                           gamma=1.0,
                           scope="mfec",
                           latent_dim=32,
                           input_dim=84 * 84 * 4,
                           hash_dim=32,
                           K=10,
                           beta=0.1,
                           predict=True,
                           reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    z_func = build_act_modelbased(make_obs_ph,
                                  net_func,
                                  num_actions,
                                  scope=scope,
                                  secondary_scope="net_func",
                                  reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        obs_mc_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg"))
        obs_mc_input_model_t = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_mc_input_model_tp1 = U.ensure_tf_input(make_obs_ph("obs_query"))
        reward_input_model = tf.placeholder(tf.float32, [None], name='reward')
        action_input_model = tf.placeholder(tf.int32, [None], name='action')
        latent_input_out = tf.placeholder(tf.float32, [None, latent_dim],
                                          name='latent')
        action_input_out = tf.placeholder(tf.int32, [None],
                                          name='action_input_out')
        # inputs = [obs_mc_input]
        # inputs = [tau, obs_mc_input_query, obs_mc_input_positive, obs_mc_input_negative]
        inputs = [
            tau, obs_mc_input_query, obs_mc_input_positive,
            obs_mc_input_negative, obs_mc_input_model_t,
            obs_mc_input_model_tp1, reward_input_model, action_input_model
        ]
        z_mc_model_t, _ = net_func(obs_mc_input_model_t.get(),
                                   num_actions,
                                   scope="net_func",
                                   reuse=True)
        z_mc_model_tp1, _ = net_func(obs_mc_input_model_tp1.get(),
                                     num_actions,
                                     scope="net_func",
                                     reuse=True)
        z_mc_out, reward_out = model_func(latent_input_out,
                                          action_input_out,
                                          num_actions,
                                          scope="model_func",
                                          reuse=reuse)
        z_mc_model_tp1_predict, reward_predict = model_func(z_mc_model_t,
                                                            action_input_model,
                                                            num_actions,
                                                            scope="model_func",
                                                            reuse=True)
        z_mc, _ = net_func(obs_mc_input_query.get(),
                           num_actions,
                           scope="net_func",
                           reuse=True)

        # _, v_mc = net_func(
        #     obs_mc_input_query.get(), num_actions,
        #     scope="net_func",
        #     reuse=True)
        z_mc_pos, v_mc_pos = net_func(obs_mc_input_positive.get(),
                                      num_actions,
                                      scope="net_func",
                                      reuse=True)

        z_mc_neg, v_mc_neg = net_func(obs_mc_input_negative.get(),
                                      num_actions,
                                      scope="net_func",
                                      reuse=True)

        z_mc_pos = tf.reshape(z_mc_pos, [-1, 1, latent_dim])
        z_mc = tf.reshape(z_mc, [-1, latent_dim, 1])
        z_mc_neg = tf.reshape(z_mc_neg, [-1, K, latent_dim])

        negative = tf.matmul(z_mc_neg, z_mc) / tau
        sum_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1))
        positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc) / tau)
        print("shape:", z_mc.shape, z_mc_pos.shape, z_mc_neg.shape,
              sum_negative.shape, negative.shape, positive.shape)
        contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive)
        # # print("shape2:", z_mc.shape, negative.shape, positive.shape)
        # # prediction_loss = tf.losses.mean_squared_error(value_input, v_mc)
        # total_loss = contrast_loss
        # if predict:
        #     total_loss += beta * prediction_loss

        model_func_vars = U.scope_vars(
            U.absolute_scope_name("model_func")) + U.scope_vars(
                U.absolute_scope_name("net_func"))
        # encoder_net_func_vars = U.scope_vars(U.absolute_scope_name("encoder_net_func"))

        transition_loss = tf.reduce_sum(
            tf.square(z_mc_model_tp1 - z_mc_model_tp1_predict))
        reward_loss = tf.reduce_sum(
            tf.square(reward_predict - reward_input_model))
        total_loss = contrast_loss + transition_loss + reward_loss
        if grad_norm_clipping is not None:
            optimize_expr_contrast_with_prediction = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr_contrast_with_prediction = optimizer.minimize(
                total_loss, var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc_model_t, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative", tf.reduce_mean(tf.reduce_mean(negative)))
        positive_summary = tf.summary.scalar(
            "positive", tf.reduce_mean(tf.reduce_mean(positive)))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        transition_loss_summary = tf.summary.scalar(
            "transition loss", tf.reduce_mean(transition_loss))
        trivial_loss_summary = tf.summary.scalar(
            "trivial loss",
            tf.reduce_mean(tf.square(z_mc_model_t - z_mc_model_tp1)))
        reward_loss_summary = tf.summary.scalar("reward loss",
                                                tf.reduce_mean(reward_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [
            z_var_summary, negative_summary, positive_summary,
            contrast_loss_summary, trivial_loss_summary,
            transition_loss_summary, reward_loss_summary, total_loss_summary
        ]
        summary = tf.summary.merge(summaries)

        train = U.function(inputs=inputs,
                           outputs=[total_loss, summary],
                           updates=[optimize_expr_contrast_with_prediction])
        prediction = U.function(inputs=[latent_input_out, action_input_out],
                                outputs=[z_mc_out, reward_out])
        return z_func, prediction, train
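The contrastive term above is an InfoNCE-style objective whose denominator contains only the K negatives. A small numpy sketch of the same shape algebra, with dummy latents and an assumed temperature tau=0.1:

import numpy as np

B, D, K, tau = 4, 32, 10, 0.1
z_query = np.random.randn(B, D, 1)                      # z_mc after reshape
z_pos = np.random.randn(B, 1, D)                        # z_mc_pos after reshape
z_neg = np.random.randn(B, K, D)                        # z_mc_neg after reshape

negative = np.matmul(z_neg, z_query) / tau              # (B, K, 1): similarities to the K negatives
sum_negative = np.exp(negative).sum(axis=1).squeeze()   # (B,)
positive = (np.matmul(z_pos, z_query) / tau).squeeze()  # (B,): similarity to the positive
contrast_loss = np.mean(np.log(sum_negative) - positive)
print(contrast_loss)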
Example 4
# NOTE: this snippet starts mid-loop in the source; the loop header below is a
# plausible reconstruction (the two sorted variable lists are assumed names).
assign_ops = []
for main_var, target_var in zip(sorted(train_variables, key=lambda x: x.name),
                                sorted(target_variables,
                                       key=lambda x: x.name)):
    if (main_var.name.replace("train_base_net", "") == target_var.name.replace(
            "target_base_net", "")):
        assign_ops.append(tf.assign(target_var, main_var))

print("Copying Ops.:", len(assign_ops))

copy_operation = tf.group(*assign_ops)

from collections import deque
replay_buffer = deque(maxlen=50000)

optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
from baselines.common import tf_util
train_step = tf_util.minimize_and_clip(optimizer,
                                       iqn.loss,
                                       var_list=train_variables)

optimizer_sampling = tf.train.AdamOptimizer(learning_rate=1e-2)
train_step_sampling = tf_util.minimize_and_clip(optimizer_sampling,
                                                iqn.sampling_loss,
                                                var_list=sampling_variables)


def train(x, a, r=None, x_p=None, t=None, true_return=None):
    if true_return is not None:
        return sess.run(
            [iqn.sampling_loss, train_step_sampling],
            feed_dict={
                iqn.train_net.state: x,
                iqn.action_placeholder: a,
Example 5
# NOTE: this snippet starts mid-loop in the source; the loop header below is a
# plausible reconstruction (the two sorted variable lists are assumed names).
assign_ops = []
for main_var, target_var in zip(sorted(train_variables, key=lambda x: x.name),
                                sorted(target_variables,
                                       key=lambda x: x.name)):
    if (main_var.name.replace("train_base_net", "") == target_var.name.replace(
            "target_base_net", "")):
        assign_ops.append(tf.assign(target_var, main_var))

print("Copying Ops.:", len(assign_ops))

copy_operation = tf.group(*assign_ops)

from collections import deque
replay_buffer = deque(maxlen=50000)

optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)
from baselines.common import tf_util
train_step = tf_util.minimize_and_clip(optimizer,
                                       iqn.loss,
                                       var_list=train_variables)


def train(x, a, r, x_p, t):
    return sess.run(
        [iqn.loss, train_step],
        feed_dict={
            iqn.train_net.state: x,
            iqn.action_placeholder: a,
            iqn.r: r,
            iqn.t: t,
            iqn.target_net.state: x_p
        })
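How these pieces are typically wired together (a sketch: sess, iqn and the environment loop are assumed to exist elsewhere in the script):

import random
import numpy as np

sess.run(tf.global_variables_initializer())
sess.run(copy_operation)                    # initial sync: train_base_net -> target_base_net

def train_on_minibatch(batch_size=32):
    # the surrounding loop appends (x, a, r, x_p, t) tuples to replay_buffer
    if len(replay_buffer) < batch_size:
        return None
    batch = random.sample(list(replay_buffer), batch_size)
    x, a, r, x_p, t = map(np.array, zip(*batch))
    loss, _ = train(x, a, r, x_p, t)        # one gradient step on iqn.loss
    return loss

# ...and every few hundred environment steps, refresh the target network:
sess.run(copy_operation)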

def imit_build_train(
    make_obs_ph,
    bnn_func,
    learning_rate,
    num_actions,
    en,

    # raw_input_ph is the placeholder that accepts the raw image
    raw_input_ph,
    target_output,
    gamma,
    grad_norm_clipping=None,
    alpha=20,
    bnn_explore=0.01,
    scope="Imitation",
    reuse=None,
    use_sign=False,
):

    bnn_act_f, is_training = imit_build_act(
        make_obs_ph,
        bnn_func,
        num_actions,
        bnn_explore=bnn_explore,
        en=en,
        scope=scope,
        reuse=reuse,
        use_sign=use_sign,
    )

    with tf.variable_scope(scope, reuse=reuse):
        loss_list = []
        bnn_func_vars_list = []

        obs_t = tf.cast(raw_input_ph, tf.float32) / 255.0
        obs_t_list = tf.split(obs_t, en, axis=0)
        target_output_list = tf.split(target_output, en, axis=0)

        # TODO
        accu_list = []
        target_label_list = tf.split(tf.argmax(target_output, axis=1),
                                     en,
                                     axis=0)

        for count in range(en):
            bnn_output = bnn_func(obs_t_list[count],
                                  num_actions,
                                  scope="bnn_func" + str(count) + '_',
                                  is_training=is_training,
                                  reuse=True)

            bnn_func_vars = U.scope_vars(
                U.absolute_scope_name("bnn_func" + str(count) + '_'))
            bnn_func_vars_list += bnn_func_vars

            loss_list.append(
                tf.reduce_mean(
                    tf.square(bnn_output - alpha * target_output_list[count])))

            #TODO
            predict = tf.argmax(bnn_output, axis=1)
            accu = tf.reduce_mean(
                tf.cast(tf.equal(predict, target_label_list[count]), "float"))
            accu_list.append(accu)

        total_loss = sum(loss_list)

        assert grad_norm_clipping is not None
        optimize_expr = U.minimize_and_clip(tf.train.AdamOptimizer(
            learning_rate=learning_rate, epsilon=1e-4),
                                            total_loss,
                                            var_list=bnn_func_vars_list,
                                            clip_val=grad_norm_clipping)

        train = U.function(
            inputs=[raw_input_ph, is_training],
            # TODO
            outputs=accu_list + loss_list,
            updates=[optimize_expr],
            givens={is_training: True},
        )

    return bnn_act_f, train
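A minimal usage sketch (every name other than imit_build_train is an assumed placeholder; target_output is expected to be a tensor in the same graph, e.g. a teacher network's output on raw_input_ph):

bnn_act_f, train = imit_build_train(
    make_obs_ph=make_obs_ph,          # assumed observation-placeholder factory
    bnn_func=bnn_model,               # assumed Bayesian-NN head, one copy per ensemble member
    learning_rate=1e-4,
    num_actions=num_actions,
    en=4,                             # ensemble size; the image batch is tf.split into 4 parts
    raw_input_ph=raw_images_ph,       # uint8 image placeholder, scaled to [0, 1] inside the graph
    target_output=teacher_q_values,   # tensor the BNN heads imitate (scaled by alpha in the loss)
    gamma=0.99,
    grad_norm_clipping=10)

# one imitation step; the batch behind raw_input_ph must be divisible by en
results = train(raw_images_batch, True)
accuracies, losses = results[:4], results[4:]   # accu_list followed by loss_list (en=4 here)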
Example 7
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
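This is the standard baselines pattern; a minimal sketch of the surrounding loop, assuming env behaves like a gym environment, model is any compatible q_func network, and replay_buffer behaves like baselines' ReplayBuffer:

import numpy as np
import tensorflow as tf
from baselines.common import tf_util as U

act, train, update_target, debug = build_train(
    make_obs_ph=lambda name: tf.placeholder(tf.float32, [None, 84, 84, 4], name=name),  # assumed Atari-style obs
    q_func=model,
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    grad_norm_clipping=10,
    gamma=0.99)

U.initialize()
update_target()                       # sync the target network once before training

obs = env.reset()
for t in range(100000):
    action = act(obs[None], update_eps=0.1)[0]   # update_eps as in the baselines act function (assumed)
    new_obs, rew, done, _ = env.step(action)
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = env.reset() if done else new_obs

    if t > 1000:
        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
        # uniform replay: importance weights are all ones
        train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
    if t % 1000 == 0:
        update_target()               # periodically copy the online weights into the target net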
Example 8
def build_train(make_obs_ph,
                var_func,
                cvar_func,
                num_actions,
                nb_atoms,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                scope="cvar_dqn",
                reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    var_func: (tf.Variable, int, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            nb_atoms: int
                number of atoms
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    cvar_func: (tf.Variable, int, str, bool) -> tf.Variable
        see var_func
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      cvar_func,
                      var_func,
                      num_actions,
                      nb_atoms,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        # atoms
        y = tf.range(1, nb_atoms + 1, dtype=tf.float32,
                     name='y') * 1. / nb_atoms

        # ------------------------------- Core networks ---------------------------------
        # var network
        var_t = var_func(obs_t_input.get(),
                         num_actions,
                         nb_atoms,
                         scope="out_func",
                         reuse_main=True,
                         reuse_last=True)  # reuse from act

        # vars for actions which we know were selected in the given state.
        var_t_selected = gather_along_second_axis(var_t, act_t_ph)
        var_t_selected.set_shape([None, nb_atoms])

        # cvar network
        cvar_t = cvar_func(obs_t_input.get(),
                           num_actions,
                           nb_atoms,
                           scope="out_func",
                           reuse_main=True,
                           reuse_last=True)  # reuse from act

        # cvars for actions which we know were selected in the given state.
        cvar_t_selected = gather_along_second_axis(cvar_t, act_t_ph)
        cvar_t_selected.set_shape([None, nb_atoms])

        # target cvar network
        cvar_tp1 = cvar_func(obs_tp1_input.get(),
                             num_actions,
                             nb_atoms,
                             scope="target_cvar_func")

        # extract variables
        joint_variables = U.scope_vars(U.absolute_scope_name("out_func/net"))
        var_variables = U.scope_vars(U.absolute_scope_name("out_func/var"))
        cvar_variables = U.scope_vars(U.absolute_scope_name("out_func/cvar"))
        target_cvar_func_variables = U.scope_vars(
            U.absolute_scope_name("target_cvar_func"))

        # -------------------------------------------------------------------------------

        # ----------------------------- Extract distribution ----------------------------
        # construct a new cvar with different actions for each atom
        cvar_tp1_star = tf.reduce_max(cvar_tp1, axis=1)
        cvar_tp1_star.set_shape([None, nb_atoms])
        # construct a distribution from the new cvar
        ycvar_tp1_star = cvar_tp1_star * y
        dist_tp1_star_ = extract_distribution(ycvar_tp1_star, nb_atoms)

        # apply done mask
        dist_tp1_star = tf.einsum('ij,i->ij', dist_tp1_star_,
                                  1. - done_mask_ph)

        # Td = r + gamma * dist
        dist_target = tf.identity(rew_t_ph[:, None] + gamma * dist_tp1_star,
                                  name='dist_target')
        # dist is always non-differentiable
        dist_target = tf.stop_gradient(dist_target)

        # -------------------------------------------------------------------------------

        # ---------------------------------- VaR loss -----------------------------------

        td_error = dist_target[:, :, None] - var_t_selected[:, None, :]
        # td_error[0]=
        #  [[Td1-v1 Td1-v2 ... Td1-vn]
        #   [Td2-v1 Td2-v2 ... Td2-vn]
        #   [...                     ]
        #   [Tdn-v1 Tdn-v2 ... Tdn-vn]]

        negative_indicator = tf.cast(td_error < 0, tf.float32)

        var_weights = tf.stop_gradient(
            y - negative_indicator)  # XXX: stop gradient?
        quantile_loss = var_weights * td_error

        var_error = tf.reduce_mean(quantile_loss)
        # -------------------------------------------------------------------------------

        # ---------------------------------- CVaR loss ----------------------------------
        # Minimizing the MSE of:
        # V_i + 1/y_i(Td_j - V_i)^- - C_i

        min_target_diff = negative_indicator / y * tf.stop_gradient(td_error)
        cvar_loss = tf.stop_gradient(
            var_t_selected
        )[:, None, :] + min_target_diff - cvar_t_selected[:, None, :]

        cvar_error = tf.reduce_mean(tf.square(cvar_loss))

        # -------------------------------------------------------------------------------

        # ------------------------------- Finalizing ------------------------------------

        error = var_error + cvar_error
        # compute optimization op (potentially with gradient clipping)
        var_list = [joint_variables, var_variables, cvar_variables]
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                error,
                                                var_list,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(error, var_list=var_list)

        # update_target_fn will be called periodically to copy cvar network to target cvar network
        # Note: var has no target
        update_target_expr = []
        for cvar_variable, target_cvar_variable in zip(
                sorted(joint_variables + cvar_variables, key=lambda v: v.name),
                sorted(target_cvar_func_variables, key=lambda v: v.name)):
            update_target_expr.append(
                target_cvar_variable.assign(cvar_variable))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        # -------------------------------------------------------------------------------

        # --------------------------------- Debug ---------------------------------------
        # a = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t_selected)
        # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t_selected)
        # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_dist_target*y)
        # b = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], var_t)
        # c = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], negative_indicator)
        # d = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], big_yc_target)
        # e = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_t)
        # f = U.function([obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph], cvar_loss)
        # atoms = U.function([obs_tp1_input], atoms)
        # -------------------------------------------------------------------------------

        return act_f, train, update_target, []
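The VaR part of the loss above is the standard quantile-regression loss: for quantile level y_j and residual u = Td - V_j it charges u * (y_j - 1{u < 0}). A tiny numpy sketch with dummy numbers:

import numpy as np

nb_atoms = 4
y = np.arange(1, nb_atoms + 1) / nb_atoms          # quantile levels 0.25 ... 1.0, matching `y` above
dist_target = np.array([[0.1, 0.4, 0.9, 1.5]])     # Td_j for a single transition
var_selected = np.array([[0.2, 0.5, 0.8, 1.2]])    # V_i for the selected action

td_error = dist_target[:, :, None] - var_selected[:, None, :]   # shape (1, nb_atoms, nb_atoms)
negative_indicator = (td_error < 0).astype(np.float64)
quantile_loss = (y - negative_indicator) * td_error             # u * (y - 1{u < 0}), always >= 0
var_error = quantile_loss.mean()
print(var_error)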
Example 9
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                distributed=False,
                v_min=-10.0,
                v_max=10.0,
                atoms=51):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.
    distributed: bool
        whether or not distributed version is enabled.
    v_min: float
        lower boundary for value, only works when distributed version is enabled.
    v_max: float
        upper boundary for value, only works when distributed version is enabled.
    atoms: int
        number of atoms, only works when distributed version is enabled.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    print("build train use distributed? ", distributed)
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            distributed=distributed,
            v_min=v_min,
            v_max=v_max,
            atoms=atoms)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          distributed=distributed,
                          v_min=v_min,
                          v_max=v_max,
                          atoms=atoms)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        distributed_target_ph = tf.placeholder(tf.float32, [None, atoms],
                                               name="dis_target")

        # q network evaluation (identical in the distributed and non-distributed cases)
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(),
                       num_actions,
                       scope="target_q_func")

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        if not distributed:
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)
        else:
            probability_qt = tf.nn.softmax(q_t)
            q_t_selected = tf.reduce_sum(
                q_t *
                tf.tile(tf.expand_dims(tf.one_hot(act_t_ph, num_actions), 2),
                        [1, 1, atoms]), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            print("use double")
            if not distributed:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.arg_max(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best = get_distibute_q(q_tp1_using_online_net, v_min,
                                             v_max, atoms, obs_tp1_input)
                a_tp1_best = tf.argmax(q_tp1_best, 1)
                probability_qt1 = tf.nn.softmax(q_tp1_using_online_net)
                q_tp1_best = tf.reduce_sum(
                    probability_qt1 * tf.tile(
                        tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2),
                        [1, 1, atoms]), 1)
        else:
            print("not use double")
            if not distributed:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            else:
                q_tp1_best = get_distibute_q(q_tp1, v_min, v_max, atoms,
                                             obs_tp1_input)
                a_tp1_best = tf.argmax(q_tp1_best, 1)
                probability_qt1 = tf.nn.softmax(q_tp1)
                q_tp1_best = tf.reduce_sum(
                    probability_qt1 * tf.tile(
                        tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2),
                        [1, 1, atoms]), 1)

        mask = 1.0 - done_mask_ph
        if not distributed:
            q_tp1_best_masked = mask * q_tp1_best
        else:
            q_tp1_best_masked = q_tp1_best

        # compute RHS of bellman equation
        if not distributed:
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
        else:
            clip_target = tf.clip_by_value(distributed_target_ph, 1e-8, 1.0)
            clip_select = tf.clip_by_value(tf.nn.softmax(q_t_selected), 1e-8,
                                           1.0)
            # use kl divergence
            td_error = tf.reduce_sum(
                clip_target * (tf.log(clip_target) - tf.log(clip_select)),
                axis=-1)
            errors = tf.nn.softmax_cross_entropy_with_logits(
                labels=distributed_target_ph, logits=q_t_selected)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        if distributed:
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph, distributed_target_ph
            ],
                               outputs=td_error,
                               updates=[optimize_expr])
        else:
            train = U.function(inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
            ],
                               outputs=td_error,
                               updates=[optimize_expr])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        q_tp1_best_final = U.function([obs_tp1_input], q_tp1_best)

        return act_f, train, update_target, {
            'q_values': q_values,
            'q_t1_best': q_tp1_best_final
        }
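In the distributed branch, train expects the caller to feed an already-projected categorical target through distributed_target_ph. Assuming the standard C51 projection is what fills it, a minimal numpy sketch:

import numpy as np

def project_distribution(rewards, dones, p_next, gamma, v_min=-10.0, v_max=10.0, atoms=51):
    # Project r + gamma * z onto the fixed support z = linspace(v_min, v_max, atoms).
    delta_z = (v_max - v_min) / (atoms - 1)
    z = np.linspace(v_min, v_max, atoms)
    tz = np.clip(rewards[:, None] + gamma * (1.0 - dones[:, None]) * z, v_min, v_max)
    b = (tz - v_min) / delta_z                     # fractional atom indices
    lo, hi = np.floor(b).astype(int), np.ceil(b).astype(int)
    m = np.zeros_like(p_next)
    for i in range(p_next.shape[0]):
        for j in range(atoms):
            if lo[i, j] == hi[i, j]:               # target lands exactly on an atom
                m[i, lo[i, j]] += p_next[i, j]
            else:                                  # split the probability mass between neighbours
                m[i, lo[i, j]] += p_next[i, j] * (hi[i, j] - b[i, j])
                m[i, hi[i, j]] += p_next[i, j] * (b[i, j] - lo[i, j])
    return m

# p_next would be the softmax distribution of the chosen next action (cf. q_t1_best above);
# the result is passed as the last argument of train(...) in the distributed case.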
Example 10
def build_train_ib(make_obs_ph,
                   model_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   beta=1.0,
                   theta=1,
                   double_q=True,
                   emdqn=True,
                   vae=True,
                   ib=True,
                   scope="deepq_ib",
                   reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    beta: float
        coefficient of beta-ib.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    act_noise = tf.placeholder(tf.float32, [None, 512], name="act_noise")
    act_f = build_act_ib(make_obs_ph,
                         model_func,
                         act_noise,
                         num_actions,
                         scope=scope,
                         reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))

        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        z_noise_t = tf.placeholder(tf.float32, [None, 512], name="z_noise")

        z_noise_tp1 = tf.placeholder(tf.float32, [None, 512],
                                     name="z_noise_tp1")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        inputs = [
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, act_noise, z_noise_t, z_noise_tp1
        ]
        # EMDQN
        if emdqn or ib:
            qec_input = tf.placeholder(tf.float32, [None], name='qec')
            inputs.append(qec_input)
        if ib or vae:
            obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae"))
            z_noise_vae = tf.placeholder(tf.float32, [None, 512],
                                         name="z_noise_vae")
            inputs.append(obs_vae_input)
            inputs.append(z_noise_vae)
        # q network evaluation
        q_t, v_mean_t, v_logvar_t, z_mean_t, z_logvar_t, recon_obs_t = model_func(
            obs_t_input.get(),
            z_noise_t,
            num_actions,
            scope="q_func",
            reuse=True)
        if vae or ib:
            q_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = model_func(
                obs_vae_input.get(),
                z_noise_vae,
                num_actions,
                scope="q_func",
                reuse=True)

        # q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation

        q_tp1, q_d_tp1, v_mean_tp1, v_logvar_tp1, z_mean_tp1, z_logvar_tp1, recon_obs_tp1 = model_func(
            obs_tp1_input.get(),
            z_noise_tp1,
            num_actions,
            scope="target_q_func")

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:

            q_tp1_using_online_net, _, _, _, _, _, _ = model_func(
                obs_tp1_input.get(),
                z_noise_tp1,
                num_actions,
                scope="q_func",
                reuse=True)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)

        td_loss = tf.reduce_mean(importance_weights_ph *
                                 U.huber_loss(td_error))
        outputs = [td_loss]
        total_loss = td_loss
        if vae or ib:
            encoder_loss = -1 + z_mean_vae**2 + tf.exp(
                z_logvar_vae) - z_logvar_vae
            outputs.append(encoder_loss)
            total_loss += 0.1 * tf.reduce_mean(beta * encoder_loss)
        if vae:
            decoder_loss = tf.keras.losses.binary_crossentropy(
                tf.reshape(recon_obs, [-1]),
                tf.reshape(
                    tf.dtypes.cast(obs_vae_input._placeholder, tf.float32),
                    [-1]))
            print("here", z_mean_t.shape, z_logvar_t.shape, encoder_loss.shape,
                  decoder_loss.shape)
            vae_loss = beta * encoder_loss + theta * decoder_loss
            outputs.append(decoder_loss)
            outputs.append(vae_loss)
            total_loss += 0.1 * tf.reduce_mean(theta * decoder_loss)
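        # The IB term below is the negative log-likelihood of a Gaussian with
        # mean v_mean_t and log-variance v_logvar_t, evaluated at the
        # stop-gradient episodic value qec_input:
        #   (v_mean - qec)^2 / exp(v_logvar) + v_logvar   (up to constants)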
        if ib:
            ib_loss = (v_mean_t - tf.stop_gradient(tf.expand_dims(
                qec_input, 1)))**2 / tf.exp(v_logvar_t) + v_logvar_t
            print("here2", v_mean_t.shape,
                  tf.expand_dims(qec_input, 1).shape, v_logvar_t.shape,
                  ib_loss.shape)
            total_ib_loss = ib_loss + beta * encoder_loss
            outputs.append(total_ib_loss)
            total_loss += 0.1 * tf.reduce_mean(ib_loss)
        # EMDQN
        if emdqn:
            qec_error = q_t_selected - tf.stop_gradient(qec_input)
            total_loss += 0.1 * tf.reduce_mean(
                importance_weights_ph * U.huber_loss(qec_error))
            outputs.append(qec_error)

        td_loss_summary = tf.summary.scalar("td loss", td_loss)
        total_loss_summary = tf.summary.scalar("total loss", total_loss)
        z_var_summary = tf.summary.scalar("z_var",
                                          tf.reduce_mean(tf.exp(z_logvar_t)))
        summaries = [td_loss_summary, total_loss_summary, z_var_summary]
        if vae or ib:
            encoder_loss_summary = tf.summary.scalar(
                "encoder loss", tf.reduce_mean(encoder_loss))
            summaries.append(encoder_loss_summary)
        if vae:
            decoder_loss_summary = tf.summary.scalar(
                "decoder loss", tf.reduce_mean(decoder_loss))
            summaries.append(decoder_loss_summary)
        if ib:
            ib_loss_summary = tf.summary.scalar("ib loss",
                                                tf.reduce_mean(ib_loss))
            total_ib_loss_summary = tf.summary.scalar(
                "total ib loss", tf.reduce_mean(total_ib_loss))
            summaries.append(ib_loss_summary)
            summaries.append(total_ib_loss_summary)
        if emdqn:
            qec_loss_summary = tf.summary.scalar(
                "qec loss", tf.reduce_mean(importance_weights_ph * qec_error))
            summaries.append(qec_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs.append(summary)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions

        train = U.function(inputs=inputs,
                           outputs=[td_error, summary],
                           updates=[optimize_expr])

        get_q_t_selected = U.function(
            inputs=[obs_t_input, act_t_ph, z_noise_t], outputs=q_t_selected)
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input, z_noise_t], q_t)

        return act_f, train, update_target, {
            'q_values': q_values
        }, get_q_t_selected
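
# Usage sketch: a minimal, hypothetical single training step for the function
# above with its default flags (emdqn=True, vae=True, ib=True). The replay
# batch and the episodic value estimates `qec` are assumed to come from the
# surrounding codebase; obs_t is reused as the VAE batch purely for
# illustration.
import numpy as np

def ib_train_step_sketch(train, batch, qec, noise_dim=512):
    obs_t, act_t, rew_t, obs_tp1, done, weights = batch
    n = len(act_t)

    def noise():
        return np.random.randn(n, noise_dim).astype(np.float32)

    # argument order follows the `inputs` list assembled in build_train_ib
    td_error, summary = train(obs_t, act_t, rew_t, obs_tp1, done, weights,
                              noise(), noise(), noise(), qec, obs_t, noise())
    return td_error, summary
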
Esempio n. 11
0
def build_train_mfmc(make_obs_ph,
                     model_func,
                     num_actions,
                     optimizer,
                     grad_norm_clipping=None,
                     gamma=1.0,
                     batch_size=5,
                     scope="mfec",
                     latent_dim=32,
                     input_dim=84 * 84 * 4,
                     hash_dim=32,
                     K=10,
                     beta=0.1,
                     predict=True,
                     use_rp=False,
                     reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    z_func = build_act_mfmc(make_obs_ph,
                            model_func,
                            num_actions,
                            scope=scope,
                            secondary_scope="model_func",
                            reuse=reuse)
    # encoder_z_func = build_act_mfmc(make_obs_ph, model_func, num_actions, scope=scope,
    #                                 secondary_scope="encoder_model_func", reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_hash_input = U.ensure_tf_input(make_obs_ph("obs_hash"))
        obs_mc_input = U.ensure_tf_input(make_obs_ph("obs"))
        obs_mc_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        # obs_mc_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        keys_mc_input_negative = tf.placeholder(tf.float32,
                                                [None, K, latent_dim],
                                                name='enc_keys_neg')
        keys_mc_input_positive = tf.placeholder(tf.float32, [None, latent_dim],
                                                name='enc_keys_pos')
        keys_mc_input_anchor = tf.placeholder(tf.float32, [None, latent_dim],
                                              name='enc_keys_anchor')
        # keys_mc_input_anchor = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)),
        #                                    shape=[batch_size, latent_dim],
        #                                    name='enc_keys_anchor',
        #                                    dtype=tf.float32)
        #
        # keys_mc_input_positive = tf.Variable(initial_value=np.zeros((batch_size, latent_dim)),
        #                                      shape=[batch_size, latent_dim],
        #                                      name='enc_keys_pos',
        #                                      dtype=tf.float32)
        # keys_mc_input_negative = tf.Variable(initial_value=np.zeros((batch_size, K, latent_dim)),
        #                                      shape=[batch_size, K, latent_dim],
        #                                      name='enc_keys_neg',
        #                                      dtype=tf.float32)

        # inputs = [obs_mc_input]
        value_input = tf.placeholder(tf.float32, [None, 1], name='value')
        if predict:
            inputs = [
                tau, obs_mc_input_query, keys_mc_input_positive,
                keys_mc_input_negative, keys_mc_input_anchor, obs_mc_input,
                value_input
            ]
        else:
            inputs = [
                tau, obs_mc_input_query, keys_mc_input_positive,
                keys_mc_input_negative, keys_mc_input_anchor
            ]
        z_mc, _ = model_func(obs_mc_input_query.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)

        _, v_mc = model_func(obs_mc_input.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        # encoder_z_mc_pos, encoder_v_mc_pos = model_func(
        #     obs_mc_input_positive.get(), num_actions,
        #     scope="encoder_model_func", reuse=True)

        # z_mc_pos = tf.stop_gradient(encoder_z_mc_pos)
        # z_mc_pos = tf.reshape(keys_mc_input_positive, [-1, 1, latent_dim])
        # z_mc_anchor = tf.reshape(z_mc, [-1, latent_dim, 1])
        # z_mc_neg = tf.reshape(keys_mc_input_negative, [-1, K, latent_dim])

        z_mc_pos = keys_mc_input_positive
        z_mc = tf.reshape(z_mc, [-1, latent_dim])
        z_mc_expand = tf.reshape(z_mc, [-1, 1, latent_dim])
        z_mc_tile = tf.tile(z_mc_expand, [1, K, 1])
        z_mc_neg = keys_mc_input_negative
        z_mc_anchor = keys_mc_input_anchor

        anchor_dist = tf.sqrt(
            tf.reduce_sum(tf.square(z_mc - z_mc_anchor), axis=1))
        pos_dist = tf.sqrt(tf.reduce_sum(tf.square(z_mc - z_mc_pos), axis=1))
        neg_dist = tf.reduce_mean(tf.sqrt(
            tf.reduce_sum(tf.square(z_mc_tile - z_mc_neg), axis=2)),
                                  axis=1)
        # contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0))
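        # The contrastive loss below is a margin-based triplet objective: the
        # query embedding z_mc should be at least 1 unit closer to its positive
        # key than the mean distance to its K negative keys, with extra terms
        # pulling it toward the positive and anchor keys.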
        contrast_loss = tf.reduce_mean(tf.maximum(pos_dist - neg_dist + 1, 0)) \
                        + 0.5 * tf.reduce_mean(pos_dist) + 0.5 * tf.reduce_mean(anchor_dist)

        pos_grad = tf.gradients([contrast_loss], [z_mc_pos])
        neg_grad = tf.gradients([contrast_loss], [z_mc_neg])
        # neg_grad = tf.gradients([contrast_loss],[z_mc_neg])

        # negative = tf.matmul(z_mc_neg, z_mc_anchor) / tau
        # exp_negative = tf.squeeze(tf.reduce_sum(tf.exp(negative), axis=1))
        # positive = tf.squeeze(tf.matmul(z_mc_pos, z_mc_anchor) / tau)
        # print("shape:", z_mc.shape, z_mc_anchor.shape, z_mc_pos.shape, negative.shape, exp_negative.shape,
        #       positive.shape)
        # contrast_loss = tf.reduce_mean(tf.log(exp_negative) - positive)
        # print("shape2:", z_mc.shape, negative.shape, positive.shape)
        prediction_loss = tf.losses.mean_squared_error(value_input, v_mc)
        total_loss = contrast_loss
        if predict:
            total_loss += beta * prediction_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        # encoder_model_func_vars = U.scope_vars(U.absolute_scope_name("encoder_model_func"))
        if grad_norm_clipping is not None:
            optimize_expr_contrast_with_prediction = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr_contrast_with_prediction = optimizer.minimize(
                total_loss, var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        # update_target_expr = []
        # for var, var_target in zip(sorted(model_func_vars, key=lambda v: v.name),
        #                            sorted(encoder_model_func_vars, key=lambda v: v.name)):
        #     update_target_expr.append(var_target.assign((1 - momentum) * var + momentum * var_target))
        # update_target_expr = tf.group(*update_target_expr)
        # update_target = U.function([momentum], [], updates=[update_target_expr])

        if use_rp:
            flat_obs = tf.reshape(obs_hash_input.get(), [-1, input_dim])
            rp = tf.random.normal([input_dim, hash_dim], 0,
                                  1 / np.sqrt(hash_dim))
            obs_hash_output = tf.matmul(flat_obs, rp)

        else:
            obs_hash_output, _ = model_func(obs_hash_input.get(),
                                            num_actions,
                                            scope="hash_func",
                                            reuse=False)
        hash_func = U.function(inputs=[obs_hash_input],
                               outputs=[obs_hash_output])
        # EMDQN
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_mc, axis=1)))
        z_mean_summary = tf.summary.scalar(
            "z_mean", tf.reduce_mean(tf.math.reduce_mean(z_mc, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative", tf.reduce_mean(tf.reduce_mean(neg_dist)))
        negative_mean_summary = tf.summary.scalar(
            "negative mean", tf.reduce_mean(tf.reduce_mean(z_mc_neg)))
        negative_grad_summary = tf.summary.scalar(
            "negative grad", tf.reduce_mean(tf.abs(neg_grad)))
        negative_var_summary = tf.summary.scalar(
            "negative std", tf.reduce_mean(tf.math.reduce_std(z_mc_neg,
                                                              axis=2)))
        # negative_summary = tf.summary.scalar("negative", tf.reduce_mean(tf.reduce_mean(negative)))
        positive_summary = tf.summary.scalar(
            "positive", tf.reduce_mean(tf.reduce_mean(pos_dist)))
        positive_mean_summary = tf.summary.scalar(
            "positive mean", tf.reduce_mean(tf.reduce_mean(z_mc_pos)))
        positive_grad_summary = tf.summary.scalar(
            "positive grad", tf.reduce_mean(tf.abs(pos_grad)))
        positive_std_summary = tf.summary.scalar(
            "positive std", tf.reduce_mean(tf.math.reduce_std(z_mc_pos,
                                                              axis=1)))
        anchor_summary = tf.summary.scalar(
            "anchor", tf.reduce_mean(tf.reduce_mean(anchor_dist)))
        # positive_summary = tf.summary.scalar("positive", tf.reduce_mean(tf.reduce_mean(positive)))
        # z_norm_summary = tf.summary.scalar("z_norm", tf.reduce_mean(tf.norm(z_mc, axis=1)))
        # encoder_z_norm_summary = tf.summary.scalar("encoder_z_norm", tf.reduce_mean(tf.norm(encoder_z_mc_pos, axis=1)))
        # neg_norm_summary = tf.summary.scalar("neg_z_norm", tf.reduce_mean(tf.norm(keys_mc_input_negative, axis=[1, 2])))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        prediction_loss_summary = tf.summary.scalar(
            "prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        if predict:
            summaries = [
                z_var_summary, z_mean_summary, positive_summary,
                negative_summary, contrast_loss_summary,
                prediction_loss_summary, total_loss_summary
            ]
        else:
            summaries = [
                z_var_summary, z_mean_summary, negative_var_summary,
                negative_grad_summary, negative_mean_summary, positive_summary,
                positive_mean_summary, positive_grad_summary,
                positive_std_summary, negative_summary, contrast_loss_summary,
                anchor_summary, total_loss_summary
            ]
        summary = tf.summary.merge(summaries)

        train = U.function(
            inputs=inputs,
            outputs=[total_loss, summary, z_mc, pos_grad, neg_grad],
            updates=[optimize_expr_contrast_with_prediction])

        return hash_func, z_func, train
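
# Usage sketch: a minimal, hypothetical contrastive update for the function
# above with predict=True. The observation batches, value targets and the
# positive/negative/anchor key arrays are assumed to be produced by the
# surrounding episodic-memory code and shaped to match the placeholders
# (keys_pos/keys_anchor: [B, latent_dim], keys_neg: [B, K, latent_dim]).
import numpy as np

def mfmc_train_step_sketch(train, obs_query, obs, values,
                           keys_pos, keys_neg, keys_anchor, tau=0.1):
    # argument order follows the `inputs` list assembled for predict=True
    total_loss, summary, z, pos_grad, neg_grad = train(
        np.array([tau], dtype=np.float32), obs_query,
        keys_pos, keys_neg, keys_anchor, obs, values)
    return total_loss, z
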
Esempio n. 12
0
def build_train_dbc(input_type,
                    obs_shape,
                    repr_func,
                    model_func,
                    num_actions,
                    optimizer,
                    grad_norm_clipping=None,
                    gamma=1.0,
                    scope="mfec",
                    num_neg=10,
                    latent_dim=32,
                    alpha=1,
                    beta=1e2,
                    theta=10,
                    loss_type=["contrast"],
                    knn=4,
                    c_loss_type="margin",
                    b=100,
                    batch_size=32,
                    reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if c_loss_type != "infonce":
        assert num_neg == 1
    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name),

        magic_num = tf.get_variable(name='magic', shape=[1])
        obs_input_u = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_u"))
        obs_input_u_tp1 = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_u_tp1"))
        obs_input_v = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_v"))

        action_input = tf.placeholder(tf.int32, [batch_size], name="action")
        reward_input = tf.placeholder(tf.float32, [batch_size], name="reward")

        inputs = [
            obs_input_u, obs_input_u_tp1, obs_input_v, action_input,
            reward_input
        ]
        z_old = repr_func(obs_input_u.get(),
                          num_actions,
                          scope="target_repr_func",
                          reuse=False)

        z_u = repr_func(obs_input_u.get(),
                        num_actions,
                        scope="repr_func",
                        reuse=tf.AUTO_REUSE)

        z_u_tp1 = repr_func(obs_input_u_tp1.get(),
                            num_actions,
                            scope="repr_func",
                            reuse=tf.AUTO_REUSE)

        z_v = repr_func(obs_input_v.get(),
                        num_actions,
                        scope="repr_func",
                        reuse=tf.AUTO_REUSE)

        z_u_tp1_predict, r_u_predict = model_func(z_u,
                                                  num_actions,
                                                  scope="model_func",
                                                  reuse=tf.AUTO_REUSE)

        z_v_tp1_predict, r_v_predict = model_func(z_v,
                                                  num_actions,
                                                  scope="model_func",
                                                  reuse=tf.AUTO_REUSE)

        # total_loss = 0
        # representation loss
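        # The stop-gradient target below approximates a one-step bisimulation
        # distance between observations u and v:
        #   d(u, v) = max_a ( |r_u(a) - r_v(a)| + gamma * ||z'_u(a) - z'_v(a)||^2 )
        # and the representation loss regresses ||z_u - z_v||_1 onto it.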
        dist_bisimulation = tf.reduce_max(
            tf.abs(r_u_predict - r_v_predict) + gamma * tf.reduce_sum(
                tf.square(z_u_tp1_predict - z_v_tp1_predict), axis=2),
            axis=1)
        dist_bisimulation = tf.stop_gradient(dist_bisimulation)
        repr_loss = tf.losses.mean_squared_error(
            tf.norm(z_u - z_v, ord=1, axis=1), dist_bisimulation)

        # model loss
        z_u_tp1_selected = tf.gather(z_u_tp1_predict,
                                     action_input,
                                     axis=1,
                                     batch_dims=0)
        r_u_selected = tf.gather(r_u_predict,
                                 action_input,
                                 axis=1,
                                 batch_dims=0)
        transition_loss = tf.losses.mean_squared_error(
            z_u_tp1, tf.stop_gradient(z_u_tp1_selected))
        reward_loss = tf.losses.mean_squared_error(
            reward_input, tf.stop_gradient(r_u_selected))
        model_loss = transition_loss + reward_loss

        total_loss = repr_loss + alpha * model_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("repr_func"))
        model_func_vars_update = copy.copy(model_func_vars) + U.scope_vars(
            U.absolute_scope_name("model_func"))

        target_model_func_vars = U.scope_vars(
            U.absolute_scope_name("target_repr_func"))

        update_target_expr = []
        for var in model_func_vars:
            print(var.name, var.shape)
        for var_target in target_model_func_vars:
            print(var_target.name, var_target.shape)

        for var, var_target in zip(
                sorted(model_func_vars, key=lambda v: v.name),
                sorted(target_model_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars_update,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars_update)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z_u, axis=1)))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))
        transition_loss_summary = tf.summary.scalar(
            "transition loss", tf.reduce_mean(transition_loss))
        reward_loss_summary = tf.summary.scalar("reward loss",
                                                tf.reduce_mean(reward_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        repr_loss_summary = tf.summary.scalar("repr loss",
                                              tf.reduce_mean(repr_loss))

        summaries = [
            z_var_summary, total_loss_summary, transition_loss_summary,
            reward_loss_summary, model_loss_summary, repr_loss_summary
        ]

        summary = tf.summary.merge(summaries)
        outputs = [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_u],
            outputs=[z_old],
        )
        update_target_func = U.function([], [], updates=[update_target_expr])
        return z_func, train, eval, update_target_func
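
# Usage sketch: a minimal, hypothetical DBC-style update for the function
# above. `batch_u`, `batch_u_tp1` and `batch_v` are observation batches of
# size `batch_size` sampled elsewhere; `actions` and `rewards` match the
# action_input/reward_input placeholders.
def dbc_train_step_sketch(train, update_target_func, step,
                          batch_u, batch_u_tp1, batch_v, actions, rewards,
                          target_update_freq=1000):
    total_loss, summary = train(batch_u, batch_u_tp1, batch_v, actions, rewards)
    if step % target_update_freq == 0:
        update_target_func()   # refresh the target encoder that produces z_old
    return total_loss
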
Esempio n. 13
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                chief=False,
                server=None,
                workers=1,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    chief: bool
        whether or not the worker should assume chief duties,
        which include initializing global parameters, tensorboarding, saving, etc.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    task = server.server_def.task_index
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse,
                      task=task)

    with tf.variable_scope(scope, reuse=reuse):
        with tf.device("/job:worker/task:{}".format(task)):
            # set up placeholders
            obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
            act_t_ph = tf.placeholder(tf.int32, [None], name="action")
            rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
            obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
            done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
            importance_weights_ph = tf.placeholder(tf.float32, [None],
                                                   name="weight")

            # Local timestep counters
            t = tf.placeholder(tf.float32, [1], name="t")
            t_global_old = tf.placeholder(tf.float32, [1], name="t_global_old")
            score_input = tf.placeholder(tf.float32, [1], name="score_input")
            grad_prio = tf.placeholder(tf.bool, [1], name="grad_prio")
            converged_ph = tf.placeholder(tf.bool, [1], name="converged")
            factor_input = tf.placeholder(tf.float32, [1], name="factor_input")

            # Global timestep counter
            # TODO Does TF have built-in global step counters?
            with tf.device("/job:ps/task:0"):
                t_global = tf.Variable(dtype=tf.float32,
                                       initial_value=[0],
                                       name="t_global")
                run_code_global = tf.Variable(initial_value="",
                                              name="run_code_global")
                comm_rounds_global = tf.Variable(dtype=tf.float32,
                                                 initial_value=[0],
                                                 name="comm_rounds_global")
                max_workers_global = tf.constant(workers,
                                                 dtype=tf.float32,
                                                 name="max_workers_global")
                worker_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[0],
                                                  name="worker_count_global")
                score_max_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_max_global")
                score_min_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_min_global")
                submit_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[-1],
                                                  name="submit_count_global")
                converged_global = tf.Variable(dtype=tf.bool,
                                               initial_value=[False],
                                               name="converged_global")

            # q network evaluation
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))

            # global weights
            print("chief:", chief, "reuse:", True if not chief else None)
            global_q_func_vars = []
            # with tf.device(tf.train.replica_device_setter(cluster=cluster)):
            with tf.device(
                    "/job:ps/task:0"):  # TODO needs RDS if using multiple PS
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights", reuse=None if chief else True)#reuse=(not chief))
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights")
                with tf.variable_scope("global_weights"):
                    for var in q_func_vars:
                        name = var.name.split(":")[0].split("q_func/")[-1]
                        global_q_func_vars.append(
                            tf.get_variable(name=name,
                                            shape=var.shape,
                                            dtype=var.dtype,
                                            initializer=tf.contrib.layers.
                                            xavier_initializer(
                                                seed=1, dtype=var.dtype)))
            # global_q_func_vars = U.scope_vars(U.absolute_scope_name("global_weights"))
            # print("Global:", global_q_func_vars)

            # old weights (used to implicitly calculate gradient sum: q_func_vars - q_func_vars_old)
            q_func_vars_old = []
            with tf.variable_scope("old_weights"):
                for var in q_func_vars:
                    name = var.name.split(":")[0].split("q_func/")[-1]
                    q_func_vars_old.append(
                        tf.get_variable(
                            name=name,
                            shape=var.shape,
                            dtype=var.dtype,
                            initializer=tf.contrib.layers.xavier_initializer(
                                seed=1, dtype=var.dtype)))
            # q_old = q_func(obs_t_input.get(), num_actions, scope="old_weights")
            # q_func_vars_old = U.scope_vars(U.absolute_scope_name("old_weights"))
            # print("Old vars:", q_func_vars_old)

            # q scores for actions which we know were selected in the given state.
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)

            # compute estimate of best possible value starting from state at t + 1
            if double_q:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.argmax(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

            # compute RHS of bellman equation
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights_ph * errors)

            # compute optimization op (potentially with gradient clipping)
            if grad_norm_clipping is not None:
                optimize_expr = U.minimize_and_clip(
                    optimizer,
                    weighted_error,
                    var_list=q_func_vars,
                    clip_val=grad_norm_clipping)
            else:
                optimize_expr = optimizer.minimize(weighted_error,
                                                   var_list=q_func_vars)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_expr = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_expr.append(var_target.assign(var))
            update_target_expr = tf.group(*update_target_expr)

            # update_global_fn will be called periodically to copy global Q network to q network
            update_global_expr = []
            for var_global, var, var_old in zip(
                    sorted(global_q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name)):
                update_global_expr.append(var.assign(var_global))
                # TODO Can async cause var <- var_global, var_global <- new value, var_old <- var_global in that order?
                # TODO Should this copy from var instead? (concurrency issues?)
                # TODO Can concurrency cause var_old <- var, var <- var_global in that order (resulting in wrong values)?
                # TODO Safest method is to force sequential execution of var <- var_global, var_old <- var! How though?
                update_global_expr.append(var_old.assign(var_global))
            update_global_expr = tf.group(*update_global_expr)

            # update the global time step counter by adding the local
            update_t_global = t_global.assign_add(t)

            optimize_global_expr = []
            # Factor to multiply every gradient with
            # f = t / (t_global - t_global_old)
            dt = tf.subtract(update_t_global, t_global_old)
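            # The gradient weighting factor is chosen in three tiers: an
            # explicit override when factor_input >= 0, a score-normalized
            # weight when grad_prio is set, otherwise the time ratio t / dt.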
            factor = tf.where(
                tf.greater_equal(factor_input, 0), factor_input,
                tf.where(
                    grad_prio,
                    tf.divide(tf.subtract(score_input, score_min_global),
                              tf.subtract(score_max_global, score_min_global)),
                    tf.divide(t, dt)))
            for var, var_old, var_global in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name),
                    sorted(global_q_func_vars, key=lambda v: v.name)):
                # Multiply the difference between the old parameters and the locally optimized parameters
                # g = (var - var_old) * f
                grad = tf.multiply(tf.subtract(var, var_old), factor)
                optimize_global_expr.append(var_global.assign_add(grad))
            optimize_global_expr = tf.group(*optimize_global_expr)

            # if cr == cr_g and wc < wc_max:
            #   wc += 1
            #   score_global += score
            # if cr == cr_g and wc == wc_max:
            #   vc += 1
            #   score_global += score
            #   cr_g += 0.5
            # return cr_g
            """
            if cr == cr_g:
                if wc <= wc_max:
                    wc += 1
                    score_global += score
                    if wc == wc_max:
                        cr_g += 0.5
            return cr_g
            """
            # submit_score_expr = \
            #     tf.cond(tf.equal(comm_rounds, comm_rounds_global),
            #             lambda: tf.cond(tf.less_equal(worker_count_global, max_workers_global),
            #                             lambda: tf.group(worker_count_global.assign_add([1]),
            #                                              score_global.assign_add(score_input),
            #                                              tf.cond(tf.equal(worker_count_global, max_workers_global),
            #                                                      lambda: comm_rounds_global.assign_add([0.5]),
            #                                                      lambda: None)),
            #                             lambda: tf.group(None, None, None)),
            #             lambda: None)
            # submit_score_expr = \
            #     tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                            tf.less(worker_count_global, max_workers_global)),
            #             tf.group(worker_count_global.assign_add(1),
            #                      score_global.assign_add(score_input)),
            #             tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                                    tf.equal(worker_count_global, max_workers_global)),
            #                     tf.group(worker_count_global.assign_add(1),
            #                              score_global.assign_add(score_input),
            #                              comm_rounds_global.assign_add(0.5))))

            # This makes a sum of all scores (
            # submit_score_expr = score_global.assign_add(score_input)

            # This only saves the maximum score (for normalized score weighting)
            submit_score_max = score_max_global.assign(tf.maximum(
                score_input, score_max_global),
                                                       use_locking=True)
            submit_score_min = score_min_global.assign(tf.minimum(
                score_input, score_min_global),
                                                       use_locking=True)

            set_submit_count = submit_count_global.assign(score_input,
                                                          use_locking=True)
            inc_submit_count = submit_count_global.assign_add([1],
                                                              use_locking=True)

            # check_round_op = tf.equal(comm_rounds, comm_rounds_global) # Not used anymore
            inc_wc = worker_count_global.assign_add([1], use_locking=True)
            zero_wc = worker_count_global.assign([0], use_locking=True)

            inc_cr = comm_rounds_global.assign_add([1], use_locking=True)

            score_reset = score_max_global.assign([0], use_locking=True)

            converged_set = converged_global.assign(converged_ph,
                                                    use_locking=True)

            # Create callable functions
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
                               outputs=[td_error],
                               updates=[optimize_expr])
            global_opt = U.function(
                inputs=[t, t_global_old, score_input, factor_input, grad_prio],
                outputs=[dt, comm_rounds_global, factor],
                updates=[optimize_global_expr])
            # global_sync_opt = U.function(inputs=[comm_rounds], outputs=[comm_rounds_global], updates=[optimize_global_sync_expr])
            update_weights = U.function(inputs=[],
                                        outputs=[t_global],
                                        updates=[update_global_expr])
            update_target = U.function([], [], updates=[update_target_expr])
            submit_score = U.function(
                inputs=[score_input],
                outputs=[comm_rounds_global],
                updates=[submit_score_max, submit_score_min])
            check_round = U.function(inputs=[],
                                     outputs=[comm_rounds_global],
                                     updates=[])
            request_submit = U.function(inputs=[],
                                        outputs=[comm_rounds_global, inc_wc],
                                        updates=[])
            set_submit = U.function(inputs=[score_input],
                                    outputs=[set_submit_count],
                                    updates=[])
            check_submit = U.function(inputs=[],
                                      outputs=[submit_count_global],
                                      updates=[])
            inc_submit = U.function(inputs=[],
                                    outputs=[inc_submit_count],
                                    updates=[])
            inc_comm_round = U.function(inputs=[],
                                        outputs=[inc_cr],
                                        updates=[])
            reset_wc = U.function(inputs=[], outputs=[zero_wc], updates=[])
            check_wc = U.function(inputs=[],
                                  outputs=[worker_count_global],
                                  updates=[])
            reset_score = U.function(inputs=[],
                                     outputs=[],
                                     updates=[score_reset])
            set_converged = U.function(inputs=[converged_ph],
                                       outputs=[],
                                       updates=[converged_set])
            check_converged = U.function(inputs=[],
                                         outputs=[converged_global],
                                         updates=[])

            # Debugging functions
            q_values = U.function([obs_t_input], q_t)
            weights = U.function(
                inputs=[],
                outputs=[q_func_vars, global_q_func_vars, q_func_vars_old],
                updates=[])
            t_global_func = U.function([], t_global)
            comm_rounds_func = U.function([], comm_rounds_global)

            return act_f, train, global_opt, update_target, update_weights, \
                {'request_submit': request_submit, 'submit_score': submit_score,
                 'check_round': check_round, 'check_submit': check_submit, 'set_submit': set_submit,
                 'inc_submit': inc_submit, 'inc_comm_round': inc_comm_round, 'reset_wc': reset_wc,
                 'check_wc': check_wc, 'reset_score': reset_score,
                 'set_converged': set_converged, 'check_converged': check_converged}, \
                {'q_values': q_values, 'weights': weights, 't_global': t_global_func,
                 'run_code': run_code_global, 'comm_rounds': comm_rounds_func, 'factor': factor}
Esempio n. 14
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                bootstrap=False,
                swarm=False,
                voting=False,
                heads=1,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                device="/cpu:0"):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      q_func,
                      bootstrap=bootstrap,
                      swarm=swarm,
                      voting=voting,
                      heads=heads,
                      num_actions=num_actions,
                      scope=scope,
                      reuse=reuse,
                      device=device)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        update_lr_ph = tf.placeholder(tf.float32, (), name="learning_rate")

        lr = tf.get_variable("lr", (), initializer=tf.constant_initializer(0))

        with tf.device(device):
            # q network evaluation
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True,
                         heads=heads)  # reuse parameters from act
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func",
                           reuse=True,
                           heads=heads)  # reuse parameters from act
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))

            # q scores for actions which we know were selected in the given state.
            q_t_selected = []
            for i in range(heads):
                q_t_selected.append(
                    tf.reduce_sum(q_t[i] * tf.one_hot(act_t_ph, num_actions),
                                  1))

            # compute estimate of best possible value starting from state at t + 1
            q_tp1_best = []
            q_tp1_best_using_online_net = []
            if swarm:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True,
                                                heads=heads)

                action_subsets = []
                for i in range(heads):
                    target_greedy_action = tf.argmax(q_tp1[i], axis=1)
                    online_q_value_threshold = tf.reduce_sum(
                        q_tp1_using_online_net[i] *
                        tf.one_hot(target_greedy_action, num_actions), 1)
                    online_q_value_threshold = tf.tile(
                        tf.expand_dims(online_q_value_threshold, 1),
                        tf.constant([1, num_actions]))

                    action_subset = tf.where(
                        (q_tp1_using_online_net[i] - online_q_value_threshold)
                        >= 0,
                        tf.ones([tf.shape(obs_t_input.get())[0], num_actions]),
                        tf.zeros([tf.shape(obs_t_input.get())[0],
                                  num_actions]))
                    action_subsets.append(action_subset)

                action_subsets = tf.stack(action_subsets, axis=1)
                actions_cover = set_cover(action_subsets)
                # preferred_actions = tf.transpose(action_subsets, [1, 0, 2])

                for i in range(heads):
                    q_tp1_best_using_online_net.append(
                        tf.argmax(tf.multiply(actions_cover, q_tp1[i]),
                                  axis=1))
                    q_tp1_best.append(
                        tf.reduce_sum(
                            q_tp1[i] * tf.one_hot(
                                q_tp1_best_using_online_net[i], num_actions),
                            1))
            elif double_q:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True,
                                                heads=heads)
                for i in range(heads):
                    q_tp1_best_using_online_net.append(
                        tf.argmax(q_tp1_using_online_net[i], axis=1))
                    q_tp1_best.append(
                        tf.reduce_sum(
                            q_tp1[i] * tf.one_hot(
                                q_tp1_best_using_online_net[i], num_actions),
                            1))
            else:
                for i in range(heads):
                    q_tp1_best.append(tf.reduce_max(q_tp1[i], 1))

        q_tp1_best_masked = []
        q_t_selected_target = []
        td_error = []
        errors = []
        weighted_error = []
        optimize_expr = []
        optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                           beta1=0.9,
                                           beta2=0.99,
                                           epsilon=1e-4)
        q_func_heads = U.scope_vars(U.absolute_scope_name("q_func/heads"))
        q_func_convnets = U.scope_vars(U.absolute_scope_name("q_func/convnet"))
        for i in range(heads):
            q_tp1_best_masked.append((1.0 - done_mask_ph) * q_tp1_best[i])

            # compute RHS of bellman equation
            q_t_selected_target.append(rew_t_ph + gamma * q_tp1_best_masked[i])

            # compute the error (potentially clipped)
            td_error.append(q_t_selected[i] -
                            tf.stop_gradient(q_t_selected_target[i]))
            with tf.device(device):
                errors.append(U.huber_loss(td_error[i]))
            weighted_error.append(
                tf.reduce_mean(importance_weights_ph * errors[i]))
            # compute optimization op (potentially with gradient clipping)
            if grad_norm_clipping is not None:
                optimize_expr.append(
                    U.minimize_and_clip(optimizer,
                                        weighted_error[i],
                                        var_list=q_func_heads,
                                        clip_val=grad_norm_clipping))
                optimize_expr.append(
                    U.minimize_and_clip(optimizer,
                                        0.1 * weighted_error[i],
                                        var_list=q_func_convnets,
                                        clip_val=grad_norm_clipping))
            else:
                optimize_expr.append(
                    optimizer.minimize(weighted_error[i],
                                       var_list=q_func_vars))

        update_lr_expr = lr.assign(
            tf.cond(update_lr_ph >= 0, lambda: update_lr_ph, lambda: lr))
        optimize_expr.append(update_lr_expr)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph, update_lr_ph
            ],
            outputs=td_error[0],
            updates=optimize_expr,
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
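
The per-head update above forms a done-masked Bellman target, takes the Huber loss of the TD error, and weights it by importance-sampling weights. A minimal NumPy sketch of that target-and-loss computation (illustrative only; the array names are hypothetical batch quantities):

import numpy as np

def per_head_weighted_loss(q_t_selected, q_tp1_best, rewards, dones,
                           importance_weights, gamma=0.99, delta=1.0):
    # Done-masked Bellman target, as in q_t_selected_target above.
    target = rewards + gamma * (1.0 - dones) * q_tp1_best
    td_error = q_t_selected - target
    # Huber loss of the TD error, weighted per sample.
    quad = np.minimum(np.abs(td_error), delta)
    lin = np.abs(td_error) - quad
    huber = 0.5 * quad ** 2 + delta * lin
    return np.mean(importance_weights * huber)
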
Example 15
0
def build_train_mf(make_obs_ph,
                   q_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   scope="mfec",
                   alpha=1.0,
                   beta=1.0,
                   theta=1.0,
                   latent_dim=32,
                   ib=True,
                   reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_noise = tf.placeholder(tf.float32, [None, latent_dim],
                               name="act_noise")
    act_f = build_act_mf(make_obs_ph,
                         q_func,
                         act_noise,
                         num_actions,
                         scope=scope,
                         reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN

        obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae"))
        z_noise_vae = tf.placeholder(tf.float32, [None, latent_dim],
                                     name="z_noise_vae")
        inputs = [obs_vae_input, z_noise_vae]
        if ib:
            qec_input = tf.placeholder(tf.float32, [None], name='qec')
            inputs.append(qec_input)
        outputs = []

        q_vae, q_deterministic_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = q_func(
            obs_vae_input.get(),
            z_noise_vae,
            num_actions,
            scope="q_func",
            reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        encoder_loss = -1 + z_mean_vae**2 + tf.exp(z_logvar_vae) - z_logvar_vae

        total_loss = tf.reduce_mean(beta * encoder_loss)
        # binary_crossentropy expects (y_true, y_pred): the original
        # observation is the target, the reconstruction is the prediction.
        decoder_loss = tf.keras.losses.binary_crossentropy(
            tf.reshape(tf.dtypes.cast(obs_vae_input._placeholder, tf.float32),
                       [-1]),
            tf.reshape(recon_obs, [-1]))
        print("here", z_mean_vae.shape, z_logvar_vae.shape, encoder_loss.shape,
              decoder_loss.shape)
        vae_loss = beta * encoder_loss + theta * decoder_loss
        outputs.append(encoder_loss)
        outputs.append(decoder_loss)
        outputs.append(vae_loss)
        total_loss += tf.reduce_mean(theta * decoder_loss)
        if ib:
            ib_loss = (v_mean_vae -
                       tf.stop_gradient(tf.expand_dims(qec_input, 1))
                       )**2 / tf.exp(v_logvar_vae) + v_logvar_vae
            print("here2", v_mean_vae.shape,
                  tf.expand_dims(qec_input, 1).shape, v_logvar_vae.shape,
                  ib_loss.shape)
            total_ib_loss = alpha * ib_loss + beta * encoder_loss
            outputs.append(total_ib_loss)
            total_loss += tf.reduce_mean(alpha * ib_loss)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=q_func_vars)
        # Create callable functions
        # EMDQN
        total_loss_summary = tf.summary.scalar("total loss", total_loss)
        z_var_summary = tf.summary.scalar("z_var",
                                          tf.reduce_mean(tf.exp(z_logvar_vae)))
        encoder_loss_summary = tf.summary.scalar("encoder loss",
                                                 tf.reduce_mean(encoder_loss))
        decoder_loss_summary = tf.summary.scalar("decoder loss",
                                                 tf.reduce_mean(decoder_loss))
        summaries = [
            total_loss_summary, z_var_summary, encoder_loss_summary,
            decoder_loss_summary
        ]
        if ib:
            ib_loss_summary = tf.summary.scalar("ib loss",
                                                tf.reduce_mean(ib_loss))
            total_ib_loss_summary = tf.summary.scalar(
                "total ib loss", tf.reduce_mean(total_ib_loss))
            summaries.append(ib_loss_summary)
            summaries.append(total_ib_loss_summary)

        summary = tf.summary.merge(summaries)
        outputs.append(summary)

        train = U.function(inputs=inputs,
                           outputs=[total_loss, summary],
                           updates=[optimize_expr])

        return act_f, train
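
The encoder term in build_train_mf is the analytic KL divergence between the diagonal Gaussian encoder and a standard normal prior, written element-wise as -1 + mu^2 + exp(logvar) - logvar (twice the usual KL, with the constant factor absorbed into beta). A small NumPy sketch of the same quantity (illustrative; the variable names are hypothetical):

import numpy as np

def encoder_kl_term(z_mean, z_logvar):
    # Matches the "-1 + z_mean**2 + exp(z_logvar) - z_logvar" term above;
    # per dimension this equals 2 * KL(N(z_mean, exp(z_logvar)) || N(0, 1)).
    return -1.0 + z_mean ** 2 + np.exp(z_logvar) - z_logvar
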
Example 16
0
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, axis=1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
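
When double_q is enabled above, the greedy action at t+1 is chosen by the online network but evaluated by the target network. A small NumPy sketch of that select/evaluate split (illustrative; the two arrays stand for hypothetical batch outputs of the online and target networks):

import numpy as np

def double_q_bootstrap(q_tp1_online, q_tp1_target):
    # Select the action with the online net, evaluate it with the target net,
    # as in the double_q branch above.
    greedy = np.argmax(q_tp1_online, axis=1)
    return q_tp1_target[np.arange(q_tp1_target.shape[0]), greedy]
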
Example 17
0
def build_train(make_obs_ph,
                p_dist_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="distdeepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                dist_params=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    if param_noise:
        raise ValueError('parameter noise not supported')
    else:
        act_f = build_act(make_obs_ph,
                          p_dist_func,
                          num_actions,
                          dist_params,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # =====================================================================================
        # q network evaluation
        p_t = p_dist_func(obs_t_input.get(),
                          num_actions,
                          dist_params['nb_atoms'],
                          scope="q_func",
                          reuse=True)  # reuse parameters from act
        q_t = p_to_q(p_t, dist_params)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        p_tp1 = p_dist_func(obs_tp1_input.get(),
                            num_actions,
                            dist_params['nb_atoms'],
                            scope="target_q_func")
        q_tp1 = p_to_q(p_tp1, dist_params)
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # TODO: use double

        a_next = tf.argmax(q_tp1, 1, output_type=tf.int32)
        batch_dim = tf.shape(rew_t_ph)[0]
        ThTz, debug = build_categorical_alg(p_tp1, rew_t_ph, a_next, gamma,
                                            batch_dim, done_mask_ph,
                                            dist_params)

        # compute the error (potentially clipped)
        cat_idx = tf.transpose(
            tf.reshape(tf.concat([tf.range(batch_dim), act_t_ph], axis=0),
                       [2, batch_dim]))
        p_t_next = tf.gather_nd(p_t, cat_idx)

        cross_entropy = -1 * ThTz * tf.log(p_t_next)
        errors = tf.reduce_sum(cross_entropy, axis=-1)

        mean_error = tf.reduce_mean(errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                mean_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(mean_error,
                                               var_list=q_func_vars)

        # =====================================================================================

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=errors,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {
            **debug, 'q_values': q_values,
            'p': p_tp1,
            'cross_entropy': cross_entropy,
            'ThTz': ThTz
        }
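
p_to_q is not shown in this excerpt; in the usual categorical (C51) formulation it is simply the expectation of a fixed atom support under each action's distribution. A minimal sketch under that assumption (the support bounds and atom count would come from dist_params; the real p_to_q may differ in details):

import numpy as np

def p_to_q_sketch(p, v_min, v_max, nb_atoms):
    # p is assumed to have shape (batch, num_actions, nb_atoms).
    support = np.linspace(v_min, v_max, nb_atoms)
    return np.tensordot(p, support, axes=([2], [0]))
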
Example 18
0
def build_train_contrast(make_obs_ph,
                         model_func,
                         num_actions,
                         optimizer,
                         grad_norm_clipping=None,
                         gamma=1.0,
                         scope="mfec",
                         latent_dim=32,
                         alpha=0.05,
                         beta=0.1,
                         theta=0.1,
                         loss_type=["contrast"],
                         c_loss_type="sqmargin",
                         reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        obs_input_query = U.ensure_tf_input(make_obs_ph("obs_query"))
        obs_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos"))
        obs_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg"))

        value_input_query = tf.placeholder(tf.float32, [None], name="value")
        action_embedding = tf.Variable(tf.random_normal(
            [num_actions, latent_dim], stddev=1),
                                       name="action_embedding")
        action_input = tf.placeholder(tf.int32, [None], name="action")
        inputs = [obs_input_query]
        if "contrast" in loss_type:
            inputs += [obs_input_positive, obs_input_negative]
        if "regression" in loss_type:
            inputs += [value_input_query]
        if "linear_model" in loss_type:
            inputs += [action_input]
            if "contrast" not in loss_type:
                inputs += [obs_input_positive]
        z = model_func(obs_input_query.get(),
                       num_actions,
                       scope="model_func",
                       reuse=tf.AUTO_REUSE)

        h = model_func(obs_input_query.get(),
                       num_actions,
                       scope="hash_func",
                       reuse=False)

        # _, v = model_func(
        #     obs_input_query.get(), num_actions,
        #     scope="model_func",
        #     reuse=True)
        z_pos = model_func(obs_input_positive.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_neg = model_func(obs_input_negative.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_pos = tf.reshape(z_pos, [-1, latent_dim])
        z_tar = tf.reshape(z, [-1, latent_dim])
        z_neg = tf.reshape(z_neg, [-1, latent_dim])

        contrast_loss = contrastive_loss_fc(z_tar,
                                            z_pos,
                                            z_neg,
                                            c_type=c_loss_type)

        regression_loss = tf.reduce_mean(
            tf.squared_difference(tf.norm(z_tar, axis=1),
                                  alpha * value_input_query))

        action_embeded = tf.matmul(tf.one_hot(action_input, num_actions),
                                   action_embedding)
        model_loss = tf.reduce_mean(
            tf.squared_difference(action_embeded + z_tar, z_pos))
        print("shape:", z_tar.shape, z_pos.shape, z_neg.shape,
              action_embeded.shape)
        # contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive)
        # print("shape2:", z.shape, negative.shape, positive.shape)
        # prediction_loss = tf.losses.mean_squared_error(value_input, v)
        total_loss = 0
        if "contrast" in loss_type:
            total_loss += contrast_loss
        if "regression" in loss_type:
            total_loss += beta * regression_loss
        elif "linear_model" in loss_type:
            total_loss += theta * model_loss

        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        if "linear_model" in loss_type:
            model_func_vars.append(action_embedding)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=model_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1)))
        negative_summary = tf.summary.scalar(
            "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg)))
        positive_summary = tf.summary.scalar(
            "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos)))
        contrast_loss_summary = tf.summary.scalar(
            "contrast loss", tf.reduce_mean(contrast_loss))
        regression_loss_summary = tf.summary.scalar(
            "regression loss", tf.reduce_mean(regression_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [z_var_summary, total_loss_summary]

        if "contrast" in loss_type:
            summaries += [
                negative_summary, positive_summary, contrast_loss_summary
            ]
        if "regression" in loss_type:
            summaries.append(regression_loss_summary)
        if "linear_model" in loss_type:
            summaries.append(model_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs = [z_tar]
        if "contrast" in loss_type:
            outputs += [z_pos, z_neg]
        elif "linear_model" in loss_type:
            outputs += [z_pos]
        outputs += [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_query],
            outputs=[z, h],
        )
        norm_func = U.function(inputs=[obs_input_query],
                               outputs=[tf.norm(z_tar, axis=1)])
        return z_func, train, eval, norm_func
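
contrastive_loss_fc is defined elsewhere; for the default c_loss_type="sqmargin", a common reading is a squared-distance pull on the positive pair plus a margin-based push on the negative pair. A hedged NumPy sketch of that variant (illustrative only, not necessarily the exact loss used above):

import numpy as np

def sqmargin_contrastive_sketch(z, z_pos, z_neg, margin=1.0):
    # Pull the positive embedding toward z; push the negative embedding
    # until it is at least `margin` away in squared distance.
    pos = np.sum((z - z_pos) ** 2, axis=1)
    neg = np.maximum(0.0, margin - np.sum((z - z_neg) ** 2, axis=1))
    return np.mean(pos + neg)
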
Example 19
0
def build_train_mer(input_type,
                    obs_shape,
                    model_func,
                    num_actions,
                    optimizer,
                    grad_norm_clipping=None,
                    gamma=1.0,
                    scope="mfec",
                    num_neg=10,
                    latent_dim=32,
                    alpha=0.1,
                    beta=1e2,
                    theta=10,
                    loss_type=["contrast"],
                    knn=4,
                    c_loss_type="margin",
                    b=100,
                    batch_size=32,
                    reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if c_loss_type != "infonce":
        assert num_neg == 1
    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name),
        magic_num = tf.get_variable(name='magic', shape=[1])
        obs_input_query = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_query"))
        obs_input_positive = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_pos"))
        obs_input_negative = U.ensure_tf_input(
            input_type(obs_shape, batch_size * num_neg, name="enc_obs_neg"))
        obs_input_neighbour = U.ensure_tf_input(
            input_type(obs_shape, batch_size * knn, name="enc_obs_neighbour"))

        obs_input_uniformity_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_u"))
        obs_input_uniformity_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_v"))

        obs_input_weighted_product_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_u"))
        obs_input_weighted_product_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_v"))

        value_input_weighted_product_u = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_u")
        value_input_weighted_product_v = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_v")

        value_input_query = tf.placeholder(tf.float32, [batch_size],
                                           name="value")
        value_input_neighbour = tf.placeholder(tf.float32, [batch_size, knn],
                                               name="neighbour_value")
        action_embedding = tf.Variable(tf.random_normal(
            [num_actions, latent_dim], stddev=1),
                                       name="action_embedding")
        action_input = tf.placeholder(tf.int32, [batch_size], name="action")
        action_input_causal = tf.placeholder(tf.int32, [batch_size],
                                             name="action")
        reward_input_causal = tf.placeholder(tf.float32, [batch_size],
                                             name="reward")

        inputs = [obs_input_query]
        if "contrast" in loss_type:
            inputs += [obs_input_positive, obs_input_negative]
        if "regression" in loss_type:
            inputs += [value_input_query]
        if "linear_model" in loss_type:
            inputs += [action_input]
            if "contrast" not in loss_type:
                inputs += [obs_input_positive]
        if "fit" in loss_type:
            # if "contrast" not in loss_type:
            #     inputs+=[]
            inputs += [obs_input_neighbour, value_input_neighbour]
            if "regression" not in loss_type:
                inputs += [value_input_query]
        if "weight_product" in loss_type:
            inputs += [
                obs_input_uniformity_u, obs_input_uniformity_v,
                obs_input_weighted_product_u, obs_input_weighted_product_v,
                value_input_weighted_product_u, value_input_weighted_product_v
            ]
        if "causality" in loss_type:
            inputs += [reward_input_causal, action_input_causal]
        z_old = model_func(obs_input_query.get(),
                           num_actions,
                           scope="target_model_func",
                           reuse=False)

        z = model_func(obs_input_query.get(),
                       num_actions,
                       scope="model_func",
                       reuse=tf.AUTO_REUSE)

        z_pos = model_func(obs_input_positive.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_neg = model_func(obs_input_negative.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_uni_u = model_func(obs_input_uniformity_u.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_uni_v = model_func(obs_input_uniformity_v.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_wp_u = model_func(obs_input_weighted_product_u.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)
        z_wp_v = model_func(obs_input_weighted_product_v.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)

        z_pos = tf.reshape(z_pos, [-1, latent_dim])
        z_tar = tf.reshape(z, [-1, latent_dim])
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [-1, latent_dim])
            contrast_loss, contrast_summary = contrastive_loss_fc(
                z_tar,
                z_pos,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            symmetry_loss, symmetry_summary = contrastive_loss_fc(
                z_pos,
                z_tar,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            contrast_loss += symmetry_loss
        z_neighbour = model_func(obs_input_neighbour.get(),
                                 num_actions,
                                 scope="model_func",
                                 reuse=True)

        # fit loss
        z_neighbour = tf.reshape(z_neighbour, [-1, knn, latent_dim])
        square_dist = tf.square(
            tf.tile(tf.expand_dims(z_tar, 1), [1, knn, 1]) - z_neighbour)
        neighbour_dist = tf.reduce_sum(square_dist, axis=2)
        neighbour_coeff = tf.math.softmax(-neighbour_dist / b, axis=1)
        coeff_sum = tf.reduce_mean(tf.reduce_sum(neighbour_coeff, axis=1))
        value_input_neighbour_mean = tf.reduce_mean(value_input_neighbour)
        fit_value = tf.reduce_sum(tf.multiply(neighbour_coeff,
                                              value_input_neighbour),
                                  axis=1)
        fit_loss = tf.reduce_mean(tf.abs(fit_value - value_input_query))

        # causality loss
        reward_input_causal = tf.reshape(reward_input_causal, [1, -1])
        reward_tile = tf.tile(reward_input_causal, [batch_size, 1])
        # reward_mask = (reward_tile - tf.transpose(reward_tile)) ** 2
        reward_mask = 1 - tf.cast(
            tf.equal((reward_tile - tf.transpose(reward_tile)),
                     tf.constant(0.)), tf.float32)
        action_input_causal = tf.reshape(action_input_causal, [1, -1])
        action_tile = tf.tile(action_input_causal, [batch_size, 1])
        action_mask = tf.cast(
            tf.equal((action_tile - tf.transpose(action_tile)),
                     tf.constant(0)), tf.float32)
        total_mask = tf.multiply(reward_mask, action_mask)
        z_tile = tf.tile(tf.expand_dims(z_tar, 1), [1, batch_size, 1])
        z_diff = z_tile - tf.transpose(z_tile, perm=[1, 0, 2])
        distance = tf.reduce_sum(z_diff**2, axis=2)
        exp_distance = tf.exp(-distance)
        causal_find_rate = (tf.reduce_sum(total_mask)) / (batch_size**2 -
                                                          batch_size)
        causal_loss = tf.reduce_sum(tf.multiply(exp_distance, total_mask))

        # regularization loss
        regularization_loss = -tf.maximum(
            1., tf.reduce_mean(U.huber_loss(z_tar, 0.01)))
        regression_loss = tf.reduce_mean(
            tf.squared_difference(tf.norm(z_tar, axis=1), alpha *
                                  value_input_query)) + regularization_loss

        # linear model loss
        action_embeded = tf.matmul(tf.one_hot(action_input, num_actions),
                                   action_embedding)
        model_loss = tf.reduce_mean(
            tf.squared_difference(action_embeded + z_tar,
                                  z_pos)) + 0.01 * regularization_loss

        # weighted product loss
        uniformity_loss = tf.reduce_sum(
            tf.exp(2 * tf.reduce_sum(tf.multiply(z_uni_u, z_uni_v), axis=1) -
                   2))
        value_weight = (value_input_weighted_product_u -
                        value_input_weighted_product_v)**2
        # angle = acos_safe(tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1))
        angle = tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1)
        weighted_product = tf.multiply(value_weight, angle)
        wp_loss = tf.reduce_sum(weighted_product)

        total_loss = 0
        if "contrast" in loss_type:
            total_loss += contrast_loss
        if "regression" in loss_type:
            total_loss += beta * regression_loss
        if "linear_model" in loss_type:
            total_loss += theta * model_loss
        if "fit" in loss_type:
            total_loss += beta * fit_loss
        if "causality" in loss_type:
            total_loss += theta * causal_loss
        if "weight_product" in loss_type:
            total_loss += 0.1 * uniformity_loss
            total_loss += wp_loss
        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        model_func_vars_update = copy.copy(model_func_vars)
        if "linear_model" in loss_type:
            model_func_vars_update.append(action_embedding)

        target_model_func_vars = U.scope_vars(
            U.absolute_scope_name("target_model_func"))

        update_target_expr = []
        for var in model_func_vars:
            print(var.name, var.shape)
        for var_target in target_model_func_vars:
            print(var_target.name, var_target.shape)

        for var, var_target in zip(
                sorted(model_func_vars, key=lambda v: v.name),
                sorted(target_model_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars_update,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars_update)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1)))
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [batch_size, num_neg, latent_dim])
            negative_summary = tf.summary.scalar(
                "negative_dist",
                tf.reduce_mean(emb_dist(z_tar, z_neg[:, 0, :])))
        positive_summary = tf.summary.scalar(
            "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos)))
        if "contrast" in loss_type:
            contrast_loss_summary = tf.summary.scalar(
                "contrast loss", tf.reduce_mean(contrast_loss))
        regularization_loss_summary = tf.summary.scalar(
            "regularization loss", tf.reduce_mean(regularization_loss))
        regression_loss_summary = tf.summary.scalar(
            "regression loss", tf.reduce_mean(regression_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        fit_loss_summary = tf.summary.scalar("fit loss",
                                             tf.reduce_mean(fit_loss))
        fit_value_summary = tf.summary.scalar("fit value",
                                              tf.reduce_mean(fit_value))
        neighbour_value_summary = tf.summary.scalar(
            "neighbour value", value_input_neighbour_mean)
        coeff_summary = tf.summary.scalar("coeff sum", coeff_sum)
        square_dist_summary = tf.summary.scalar("square_dist",
                                                tf.reduce_mean(square_dist))
        z_neighbour_summary = tf.summary.scalar("z_neighbour_mean",
                                                tf.reduce_mean(z_neighbour))
        # fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        causal_efficiency_summary = tf.summary.scalar("causal efficiency",
                                                      causal_find_rate)
        causal_loss_summary = tf.summary.scalar("causal loss", causal_loss)
        # reward_mask_summary = tf.summary.scalar("reward mask summary", debug_reward_mask)
        # action_mask_summary = tf.summary.scalar("action mask summary", debug_action_mask)
        uniformity_loss_summary = tf.summary.scalar("uniform loss",
                                                    uniformity_loss)
        wp_loss_summary = tf.summary.scalar("weighted product loss", wp_loss)
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [
            z_var_summary, total_loss_summary, regularization_loss_summary
        ]

        if "contrast" in loss_type:
            summaries += [
                negative_summary, positive_summary, contrast_loss_summary
            ]
            summaries += contrast_summary
        if "regression" in loss_type:
            summaries.append(regression_loss_summary)
        if "linear_model" in loss_type:
            summaries.append(model_loss_summary)
            if "contrast" not in loss_type:
                summaries.append(positive_summary)
        if "fit" in loss_type:
            summaries.append(fit_loss_summary)
            summaries.append(fit_value_summary)
            summaries.append(neighbour_value_summary)
            summaries.append(coeff_summary)
            summaries.append(square_dist_summary)
            summaries.append(z_neighbour_summary)
        if "causality" in loss_type:
            summaries.append(causal_efficiency_summary)
            summaries.append(causal_loss_summary)
            # summaries.append(reward_mask_summary)
            # summaries.append(action_mask_summary)
        if "weight_product" in loss_type:
            summaries.append(uniformity_loss_summary)
            summaries.append(wp_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs = [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_query],
            outputs=[z_old],
        )
        norm_func = U.function(inputs=[obs_input_query],
                               outputs=[tf.norm(z_tar, axis=1)])
        update_target_func = U.function([], [], updates=[update_target_expr])
        return z_func, train, eval, norm_func, update_target_func
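
The "fit" loss above estimates a state's value as a softmax-weighted average of its k nearest neighbours' values, with temperature b applied to the negative squared distances. A NumPy sketch of that estimate (illustrative; the array names are hypothetical):

import numpy as np

def neighbour_fit_value(z, z_neighbours, neighbour_values, b=100.0):
    # z: (batch, dim); z_neighbours: (batch, knn, dim);
    # neighbour_values: (batch, knn). Mirrors the fit_value computation above.
    dist = np.sum((z[:, None, :] - z_neighbours) ** 2, axis=2)
    logits = -dist / b
    logits -= logits.max(axis=1, keepdims=True)   # numerical stability
    coeff = np.exp(logits)
    coeff /= coeff.sum(axis=1, keepdims=True)     # softmax over the knn axis
    return np.sum(coeff * neighbour_values, axis=1)
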
Example 20
0
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                        scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope,
                              reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        if imitate:
            imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action")
        # EMDQN
        value_t_ph = tf.placeholder(tf.float32, [None], name='value_t')
        value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1')
        value_tp1_masked = (1.0 - done_mask_ph) * value_tp1_ph
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        # q_t_normalized = q_t - tf.max(q_t,)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute RHS of bellman equation
        q_target = rew_t_ph + gamma * value_tp1_masked

        # compute the error (potentially clipped)
        td_error = q_target - (q_t_selected + value_t_ph)
        td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error))
        # EMDQN
        print(q_t.shape)
        if imitate:
            imitation_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=imitate_act_t_ph, logits=q_t),
                                       axis=1)
            print(imitation_loss.shape)
            errors = U.huber_loss(td_error) + imitation_loss
        else:
            errors = U.huber_loss(td_error)
        total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors))

        value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph))
        value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph))
        q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected))
        summaries=[td_summary, total_summary, value_summary, value_tp1_summary, q_summary]
        if imitate:
            imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss))
            summaries.append(imitate_summary)
        summary = tf.summary.merge(summaries)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        inputs = [
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            done_mask_ph,
            importance_weights_ph,
            value_t_ph,
            value_tp1_ph
        ]
        if imitate:
            inputs.append(imitate_act_t_ph)
        # Create callable functions
        # EMDQN
        train = U.function(
            inputs=inputs,
            outputs=[td_error, summary],
            updates=[optimize_expr]
        )

        return act_f, train
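
In build_train_dueling the bootstrap value at t+1 is supplied externally (e.g. from episodic memory) rather than by a target network, and the Q head only has to learn the residual on top of value_t. A NumPy sketch of that TD error (illustrative; the arrays are hypothetical batch quantities):

import numpy as np

def dueling_episodic_td_error(q_t_selected, value_t, value_tp1,
                              rewards, dones, gamma=1.0):
    # Mirrors td_error above: the target uses the externally supplied
    # value_tp1, and the prediction is q_t_selected plus value_t.
    value_tp1_masked = (1.0 - dones) * value_tp1
    q_target = rewards + gamma * value_tp1_masked
    return q_target - (q_t_selected + value_t)
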
Example 21
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                grad_norm_clipping=None,
                gamma=1.0,
                deterministic_filter=False,
                random_filter=False,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            deterministic_filter=deterministic_filter,
            random_filter=random_filter)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          deterministic_filter=deterministic_filter,
                          random_filter=random_filter)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        lr_ph = tf.placeholder(tf.float32, name="lr")
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(U.data_type, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(U.data_type, [None], name="done")
        importance_weights_ph = tf.placeholder(U.data_type, [None],
                                               name="weight")

        board_size = obs_t_input.get().get_shape().as_list()[1]

        obs_t = transform_obses(obs_t_input.get())
        obs_tp1 = transform_obses(obs_tp1_input.get())
        act_t = transform_actions(act_t_ph, board_size)

        if deterministic_filter:
            invalid_masks_tp1 = build_invalid_masks(obs_tp1)

        # q network evaluation
        q_t = q_func(obs_t, num_actions, scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1, num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(act_t, num_actions, dtype=U.data_type), axis=1)

        # compute estimate of best possible value starting from state at t + 1
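        # Double Q-learning decouples action selection from evaluation: the
        # online network chooses argmax_a Q(s', a) while the target network
        # supplies that action's value, reducing the max-operator's
        # overestimation bias.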
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1,
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)

            if deterministic_filter:
                q_tp1_using_online_net = build_q_filter(
                    q_tp1_using_online_net, invalid_masks_tp1)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net,
                                                    1,
                                                    output_type=U.index_type)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
                                   num_actions,
                                   dtype=U.data_type), 1)
        else:
            if deterministic_filter:
                q_tp1 = build_q_filter(q_tp1, invalid_masks_tp1)

            q_tp1_best = tf.reduce_max(q_tp1, axis=1)
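        # terminal transitions contribute no bootstrap value: (1 - done)
        # zeroes q_tp1_best so only the immediate reward is backed up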
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
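        #   y = r + gamma * (1 - done) * Q_target(s', a*)
        # where a* is argmax_a Q_online(s', a) under double_q, or the plain
        # max over the target network otherwise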
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
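        # U.huber_loss presumably follows the standard Huber form (quadratic
        # for small TD errors, linear beyond the threshold), keeping gradients
        # bounded on outlier transitions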
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        weighted_error = tf.reduce_mean(importance_weights_ph *
                                        U.huber_loss(td_error))
        regularizer = tf.add_n([tf.nn.l2_loss(var)
                                for var in q_func_vars]) * 0.0001
        total_error = weighted_error + regularizer

        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=lr_ph, momentum=0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            lr_ph, obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
            done_mask_ph, importance_weights_ph
        ],
                           outputs=[td_error, weighted_error, total_error],
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
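# (Not part of the original source.) Hypothetical call pattern for the builder
# above; `model` and the batch arrays are placeholder names:
#
#   act_f, train, update_target, debug = build_train(
#       make_obs_ph, q_func=model, num_actions=num_actions,
#       grad_norm_clipping=10, gamma=0.99, double_q=True)
#   td, weighted_err, total_err = train(lr, obs_t_batch, act_batch, rew_batch,
#                                       obs_tp1_batch, done_batch, weight_batch)
#   update_target()  # periodically copy the online weights into the target net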
    def __init__(self, inputs: TrainInputs, action_space, observation_space):
        act_size = action_space.n
        optimizer = tf.train.AdamOptimizer(learning_rate=inputs.lr)

        with tf.variable_scope(
                'q_func'
        ):  # child scopes of reusable parent scope are reusable
            self.runner = q_policy(obs=inputs.s0,
                                   epsilon=inputs.eps,
                                   action_space=action_space)

        with tf.variable_scope(
                'q_func', reuse=True
        ):  # child scopes of reusable parent scope are reusable
            q_net = q_policy(obs=inputs.s0,
                             epsilon=inputs.eps,
                             action_space=action_space)

        with tf.variable_scope('target_q_func'):
            target_q_net = q_policy(obs=inputs.s1,
                                    epsilon=inputs.eps,
                                    action_space=action_space)

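        # hard target update: assign every online-network variable to the
        # matching target-network variable (exposed as update_target below)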
        update_target_op = tf.group(*[
            tf.assign(a, b)
            for a, b in zip(target_q_net.trainables, q_net.trainables)
        ])

        if G.double_q:
            with tf.variable_scope(
                    'q_func', reuse=True
            ):  # child scopes of reusable parent scope are reusable
                inner_q_net = q_policy(obs=inputs.s1,
                                       epsilon=inputs.eps,
                                       action_space=action_space)

        with tf.variable_scope('Q_training'):
            q_sampled = tf.reduce_sum(q_net.q_values *
                                      tf.one_hot(inputs.act, act_size),
                                      axis=1)

            if G.double_q:
                q_asterisk = tf.reduce_sum(
                    target_q_net.q_values *
                    tf.one_hot(inner_q_net.act_argmax, act_size),
                    axis=1)
            else:
                q_asterisk = tf.reduce_max(target_q_net.q_values, axis=1)

            # compute RHS of bellman equation
            T_q = inputs.rew + (1.0 -
                                inputs.done_mask_ph) * G.gamma * q_asterisk

            # compute the error (potentially clipped)
            td_error = q_sampled - tf.stop_gradient(T_q)
            huber = U.huber_loss(td_error)
            if G.prioritized_replay:
                loss = tf.reduce_mean(inputs.sample_weights * huber)
            else:
                loss = tf.reduce_mean(huber)

            # compute optimization op (potentially with gradient clipping)
            if G.grad_norm_clipping:
                optimize_op = U.minimize_and_clip(
                    optimizer,
                    loss,
                    var_list=q_net.trainables,
                    clip_val=G.grad_norm_clipping)
            else:
                optimize_op = optimizer.minimize(loss,
                                                 var_list=q_net.trainables)

        def train(*,
                  s0s,
                  actions,
                  rewards,
                  s1s,
                  dones,
                  sample_weights=None):  # transition batch: (s0, a, r, s1, done)
            feed_dict = {
                inputs.lr: G.learning_rate,
                inputs.s0: s0s,
                inputs.act: actions,
                inputs.rew: rewards,
                inputs.s1: s1s,
                inputs.done_mask_ph: dones
            }
            if G.prioritized_replay:
                assert sample_weights is not None, "sample_weights is required when prioritized_replay is ON."
                feed_dict[inputs.sample_weights] = sample_weights
            td_error_val, loss_val, _ = U.get_session().run(
                [td_error, loss, optimize_op], feed_dict)
            return td_error_val, loss_val

        def update_target():
            U.get_session().run(update_target_op)

        self.train = train
        self.update_target = update_target
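# (Not part of the original source.) A rough training-loop sketch for the class
# above; the class name `Trainer`, the replay buffer, and the update interval
# are assumptions for illustration only:
#
#   trainer = Trainer(inputs, env.action_space, env.observation_space)
#   trainer.update_target()                      # sync the target net once
#   for step in range(total_steps):
#       s0s, acts, rews, s1s, dones = buffer.sample(batch_size)
#       td, loss = trainer.train(s0s=s0s, actions=acts, rewards=rews,
#                                s1s=s1s, dones=dones)
#       if step % 1000 == 0:                     # assumed update interval
#           trainer.update_target()
#
# When G.prioritized_replay is enabled, sample_weights=... must be passed to
# trainer.train as well.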