Example #1
def training_loss(op_rollout,
                  s_history,
                  a_history,
                  v_history,
                  r_history,
                  nenvs,
                  nstep,
                  training_depth=1):
    state_shape = s_history.shape.as_list()[1:]
    # for each env, carry out the same action for nstep steps
    r_vi, v_vi, s_vi = [], [], []

    # value iteration: expand for all possible actions
    l = tf.expand_dims(tf.range(0, nenvs), 1)
    l = tf.concat([l, tf.tile([[0]], [nenvs, 1])], axis=1)
    s = tf.gather_nd(tf.reshape(s_history, [nenvs, nstep, -1]), l)
    a = tf.gather_nd(tf.reshape(a_history, [nenvs, nstep]), l)
    for i in range(training_depth):
        s_vi.append(s)
        r, v, s = op_rollout(s, a)
        r_vi.append(r)
        v_vi.append(v)
    r_vi = tf.stack(r_vi, axis=1)
    v_vi = tf.stack(v_vi, axis=1)
    s_vi = tf.stack(s_vi, axis=1)

    s_history = tf.reshape(s_history, [nenvs, 1, nstep, -1])
    v_history = tf.reshape(v_history, [nenvs, 1, nstep])
    r_history = tf.reshape(r_history, [nenvs, 1, nstep])
    s_vi = tf.reshape(s_vi, [nenvs, training_depth, 1, -1])
    v_vi = tf.reshape(v_vi, [nenvs, training_depth, 1])
    r_vi = tf.reshape(r_vi, [nenvs, training_depth, 1])
    # use the upper triangular part
    idx = np.flip(np.triu(np.ones([training_depth, nstep])), 1)
    idx = np.where(idx.reshape([-1]))[0]
    l = np.repeat(np.arange(nenvs), idx.size)
    l = np.stack([l, np.tile(idx, nenvs)], axis=1)

    s_mat = tf.gather_nd(
        tf.reshape(s_history - s_vi,
                   [nenvs, training_depth * nstep, state_shape[-1]]), l)
    r_mat = tf.gather_nd(tf.reshape(r_history - r_vi, [nenvs, -1]), l)
    v_mat = tf.gather_nd(tf.reshape(v_history - v_vi, [nenvs, -1]), l)
    # # bn before loss
    # r_mat = r_bn(r_mat)
    # v_mat = v_bn(v_mat)
    # compute loss
    s_loss = tf.math.reduce_mean(huber_loss(s_mat))
    r_loss = tf.math.reduce_mean(huber_loss(r_mat))
    v_loss = tf.math.reduce_mean(huber_loss(v_mat))
    return r_loss + v_loss  # + s_loss
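To make the indexing in the loss above easier to follow, here is a small NumPy sketch (with made-up training_depth, nstep, and nenvs) of the flipped upper-triangular mask that decides which (rollout depth, history step) pairs contribute to the loss.

import numpy as np

# Hypothetical sizes, just to visualise the mask used in training_loss above.
training_depth, nstep, nenvs = 3, 5, 2

# Flipped upper-triangular mask: row i (rollout depth) keeps columns j <= nstep - 1 - i.
mask = np.flip(np.triu(np.ones([training_depth, nstep])), 1)
idx = np.where(mask.reshape([-1]))[0]           # flat (depth, step) positions that are kept
pairs = [(k // nstep, k % nstep) for k in idx]  # back to (depth, step) for inspection

# Per-environment gather indices, exactly as built in the loss above.
l = np.stack([np.repeat(np.arange(nenvs), idx.size), np.tile(idx, nenvs)], axis=1)

print(mask)
print(pairs)    # depth 0 pairs with steps 0..4, depth 2 only with steps 0..2
print(l.shape)  # (nenvs * idx.size, 2)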
Example #2
    def _build_train_reward_func(self, reward_func, observation_input_ph,
                                 action_input_ph, optimizer):
        with tf.variable_scope("reward_func_optimizer"):
            true_rewards_ph = tf.placeholder(tf.float32, [None],
                                             name="true_rewards")
            #loss = tf.metrics.mean_squared_error(reward_func, true_rewards_ph)
            true_rewards = tf.expand_dims(true_rewards_ph, axis=1)

            #loss = tf.reduce_mean(tf.losses.huber_loss(reward_func, true_rewards), name = "loss") # Maybe a bit more robust.
            errors = reward_func - true_rewards
            loss = tf.reduce_mean(tf_utils.huber_loss(errors), name="loss")
            gradients = optimizer.compute_gradients(loss)
            for i, (grad, var) in enumerate(gradients):
                gradients[i] = (tf.clip_by_norm(grad,
                                                self.grad_norm_clipping), var)
            train_reward_func = optimizer.apply_gradients(gradients)

            return errors, train_reward_func, true_rewards_ph
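Both loss functions above rely on a Huber penalty on the raw errors. As a point of reference, here is a minimal NumPy sketch of the usual form, assuming the conventional delta=1.0 used in the DQN baselines; tf_utils.huber_loss / U.huber_loss are assumed to behave like this elementwise.

import numpy as np

def huber_loss(x, delta=1.0):
    """Elementwise Huber loss: quadratic for |x| <= delta, linear beyond (assumed delta=1.0)."""
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

# Errors near zero are squared, large outliers only grow linearly.
errors = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(huber_loss(errors))  # [2.5, 0.125, 0.0, 0.125, 2.5]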
Example #3
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)
    
    scope = "ampi"
    reuse = None
    grad_norm_clipping = None
    num_actions = env.action_space.n
    optimizer_q = tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi = tf.train.AdamOptimizer(learning_rate=lr)
    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)
    
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        
        # add
        ob_space = env.observation_space
        ac_space = env.action_space
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func") # train pi
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/pi_func")
        
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func") # target pi
        target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_pi_func")
 
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
        
        # Q_{train}(a,s)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) 
        
        # y_j
        act_best = tf.argmax(pi, axis=1) # argmax \pi(s_{j+1})
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1) # Q_{target}(s_{j+1}, argmax(\pi(s_{j+1}))
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        
        # Regression loss
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        
        # argmax_a Q_{target}(s_j, a)
        z_j = tf.argmax(q_tp1, axis=1) # max Q(s',a')

        # classification loss
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(
                      logits=pi, labels=z_j)
        
        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients_q):
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])

        q_values = U.function([obs_t_input], q_t)

        debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            
            action = env.action_space.sample() # not used, just so we have the datatype
            stochastic=True
            ac1, vpred1 =  act(stochastic, np.array(obs)[None])
            action = ac1[0]
            #action, _ = pi.act(stochastic, obs)
            
            #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            

            # Log train and res
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
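The exploration schedule used in learn() above interpolates epsilon linearly from its initial value down to exploration_final_eps and then holds it constant. A small self-contained sketch of that behaviour (function name and defaults here are illustrative, not the baselines API):

# A minimal sketch of the epsilon schedule behaviour assumed above: linear decay from
# initial_p to final_p over schedule_timesteps, then constant.
def linear_schedule(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# With exploration_fraction=0.1 and max_timesteps=100000, epsilon reaches 0.02 at t=10000.
for t in (0, 5000, 10000, 50000):
    print(t, round(linear_schedule(t, schedule_timesteps=10000), 3))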
Example #4
    def __init__(self, inputs: TrainInputs, action_space, observation_space):
        act_size = action_space.n
        optimizer = tf.train.AdamOptimizer(learning_rate=inputs.lr)

        with tf.variable_scope(
                'q_func'
        ):  # child scopes of reusable parent scope are reusable
            self.runner = q_policy(obs=inputs.s0,
                                   epsilon=inputs.eps,
                                   action_space=action_space)

        with tf.variable_scope(
                'q_func', reuse=True
        ):  # child scopes of reusable parent scope are reusable
            q_net = q_policy(obs=inputs.s0,
                             epsilon=inputs.eps,
                             action_space=action_space)

        with tf.variable_scope('target_q_func'):
            target_q_net = q_policy(obs=inputs.s1,
                                    epsilon=inputs.eps,
                                    action_space=action_space)

        update_target_op = tf.group(*[
            tf.assign(a, b)
            for a, b in zip(target_q_net.trainables, q_net.trainables)
        ])

        if G.double_q:
            with tf.variable_scope(
                    'q_func', reuse=True
            ):  # child scopes of reusable parent scope are reusable
                inner_q_net = q_policy(obs=inputs.s1,
                                       epsilon=inputs.eps,
                                       action_space=action_space)

        with tf.variable_scope('Q_training'):
            q_sampled = tf.reduce_sum(q_net.q_values *
                                      tf.one_hot(inputs.act, act_size),
                                      axis=1)

            if G.double_q:
                q_asterisk = tf.reduce_sum(
                    target_q_net.q_values *
                    tf.one_hot(inner_q_net.act_argmax, act_size),
                    axis=1)
            else:
                q_asterisk = tf.reduce_max(target_q_net.q_values, axis=1)

            # compute RHS of bellman equation
            T_q = inputs.rew + (1.0 -
                                inputs.done_mask_ph) * G.gamma * q_asterisk

            # compute the error (potentially clipped)
            td_error = q_sampled - tf.stop_gradient(T_q)
            _ = U.huber_loss(td_error)
            if G.prioritized_replay:
                loss = tf.reduce_mean(inputs.sample_weights * _)
            else:
                loss = tf.reduce_mean(_)

            # compute optimization op (potentially with gradient clipping)
            if G.grad_norm_clipping:
                optimize_op = U.minimize_and_clip(
                    optimizer,
                    loss,
                    var_list=q_net.trainables,
                    clip_val=G.grad_norm_clipping)
            else:
                optimize_op = optimizer.minimize(loss,
                                                 var_list=q_net.trainables)

        def train(*,
                  s0s,
                  actions,
                  rewards,
                  s1s,
                  dones,
                  sample_weights=None):  # read: SARSA
            feed_dict = {
                inputs.lr: G.learning_rate,
                inputs.s0: s0s,
                inputs.act: actions,
                inputs.rew: rewards,
                inputs.s1: s1s,
                inputs.done_mask_ph: dones
            }
            if G.prioritized_replay:
                assert sample_weights is not None, "sample_weights is required when prioritized_replay is ON."
                feed_dict[inputs.sample_weights] = sample_weights
            td_error_val, loss_val, _ = U.get_session().run(
                [td_error, loss, optimize_op], feed_dict)
            return td_error_val, loss_val

        def update_target():
            U.get_session().run(update_target_op)

        self.train = train
        self.update_target = update_target
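The Q_training scope above builds the double-Q target: the online network selects the argmax action at s1 and the target network evaluates it. A NumPy sketch of that computation with made-up values:

import numpy as np

# A NumPy sketch of the double-Q target computed in the Q_training scope above:
# the online network chooses the argmax action at s1, the target network evaluates it.
def double_q_target(rew, done, q_online_s1, q_target_s1, gamma=0.99):
    a_star = np.argmax(q_online_s1, axis=1)               # argmax from online net
    q_ast = q_target_s1[np.arange(len(a_star)), a_star]   # evaluated by target net
    return rew + (1.0 - done) * gamma * q_ast              # RHS of the Bellman equation

rew = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
q_online_s1 = np.array([[0.1, 0.9], [0.5, 0.4]])
q_target_s1 = np.array([[0.2, 0.7], [0.6, 0.3]])
print(double_q_target(rew, done, q_online_s1, q_target_s1))  # [1.693, 0.0]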
Example #5
def build_train(make_obs_ph,
                make_bel_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      make_bel_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        bel_t_input = make_bel_ph("bel_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        expert_qval_ph = tf.placeholder(tf.float32, [None, num_actions],
                                        name="expert_qval_t")

        obs_tp1_input = make_obs_ph("obs_tp1")
        bel_tp1_input = make_bel_ph("bel_tp1")
        expert_qval_tp1_ph = tf.placeholder(tf.float32, [None, num_actions],
                                            name="expert_qval_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     bel_t_input.get(),
                     expert_qval_ph,
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(),
                       bel_tp1_input.get(),
                       expert_qval_tp1_ph,
                       num_actions,
                       scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        one_hot_action = tf.one_hot(act_t_ph, num_actions)
        q_t_selected = tf.reduce_sum(q_t * one_hot_action, axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            bel_tp1_input.get(),
                                            expert_qval_tp1_ph,
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                axis=1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        #q_t_selected = tf.Print(q_t_selected, [q_t_selected], '>>>> QT :', summarize=3)
        #q_t_selected_target = tf.Print(q_t_selected_target, [q_t_selected_target], '>>>> QT_Target :', summarize=3)

        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(tf.reduce_mean(td_error, axis=0))
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    # grad = tf.Print(grad, [grad], '>>>> grad: ', summarize=10)
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input,
            bel_t_input,
            expert_qval_ph,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            bel_tp1_input,
            expert_qval_tp1_ph,
            done_mask_ph,
            importance_weights_ph,
        ],
                           outputs=td_error,
                           updates=[optimize_expr])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
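The importance_weights_ph fed into the weighted Huber error above is typically produced by a prioritized replay buffer. A rough NumPy sketch of how such weights are commonly derived (beta and the priorities are illustrative; this is not the buffer's actual code):

import numpy as np

# A sketch of how the importance weights fed into importance_weights_ph are commonly
# derived for prioritized replay.
def importance_weights(priorities, beta=0.4):
    probs = priorities / priorities.sum()   # sampling probabilities P(i)
    n = len(priorities)
    weights = (n * probs) ** (-beta)        # correct for the non-uniform sampling
    return weights / weights.max()          # normalise so the largest weight is 1

print(importance_weights(np.array([1.0, 2.0, 4.0, 8.0])))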
Example #6
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """
    Creates the train function:

    :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of
        input with that name
    :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor)
        the model that takes the following inputs:
            - observation_in: (Any) the output of observation placeholder
            - num_actions: int  number of actions
            - scope: (str)
            - reuse: (bool)

            should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions)
            with values of every action.
    :param num_actions: (int) number of actions
    :param reuse: (bool) whether or not to reuse the graph variables
    :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective.
    :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed.
    :param gamma: (float) discount rate.
    :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a
        good idea to keep it enabled.
    :param scope: (str or VariableScope) optional scope for variable_scope.
    :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given.
    :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a
        variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter
        is used by default.

    :return: (tuple)

        act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select an action given an
            observation. See the top of the file for details.
        train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float)
            optimize the error in Bellman's equation. See the top of the file for details.
        update_target: (function) copy the parameters from optimized Q function to the target Q function.
            See the top of the file for details.
        debug: ({str: function}) a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = tf_utils.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = tf_utils.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                                  outputs=td_error,
                                  updates=[optimize_expr])
        update_target = tf_utils.function([], [], updates=[update_target_expr])

        q_values = tf_utils.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
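Several of the examples above select Q(s, a) for the stored actions with a one-hot mask before forming the TD error. A NumPy sketch of that step with made-up numbers:

import numpy as np

# A NumPy sketch of the one-hot trick used above to pick Q(s, a) for the stored actions,
# plus the resulting TD error against a (stop-gradient) target (all values made up).
q_t = np.array([[1.0, 2.0, 3.0],
                [0.5, 0.1, 0.2]])
actions = np.array([2, 0])
num_actions = q_t.shape[1]

one_hot = np.eye(num_actions)[actions]
q_t_selected = np.sum(q_t * one_hot, axis=1)   # [3.0, 0.5]

q_t_selected_target = np.array([2.5, 1.0])     # rew + gamma * masked target Q
td_error = q_t_selected - q_t_selected_target
print(q_t_selected, td_error)                  # [3.0 0.5] [ 0.5 -0.5]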
Example #7
def build_train(make_obs_ph, q_func, num_actions, optimizer, avg_reward_learning_rate=0.0001, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepr", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        # Actions in output grid that are not valid are neginf
        unused_actions_mask = tf.placeholder(tf.float32, [None, num_actions], name="unused_actions_mask")
        rew_avg = tf.Variable(0., name='rew_avg')
        rew_avg_next = tf.Variable(0., name='rew_avg_next')

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
        ##row_ids = tf.range(tf.shape(q_t)[0])
        ##idx = tf.stack([row_ids, act_t_ph], axis=1)
        ##q_t_selected = tf.gather_nd(tf.reshape(q_t, [-1, num_actions]), idx)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)

            # Filter out unused actions
            #q_tp1_using_online_net = tf.boolean_mask(q_tp1_using_online_net, 1-unused_actions_mask, axis=1)
            #q_tp1 = tf.boolean_mask(q_tp1, 1-unused_actions_mask, axis=1)
            #q_t_filtered = tf.boolean_mask(q_t, 1-unused_actions_mask, axis=1)
            q_tp1_using_online_net = q_tp1_using_online_net + unused_actions_mask

            # Best q's -- useful for deciding whether to update R Learning
            # q_t_filtered = q_t + unused_actions_mask
            # q_t_best = tf.reduce_max(q_t_filtered, 1)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            ##q_tp1_best_using_online_net = tf.argmax(tf.reshape(q_tp1_using_online_net, [-1, num_actions]), 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, tf.shape(q_tp1)[1]), 1)
            ##idx = tf.stack([tf.cast(row_ids, tf.int64), q_tp1_best_using_online_net], axis=1)
            ##q_tp1_best = tf.gather_nd(tf.reshape(q_tp1, [-1, num_actions]), idx)
        else:
            q_tp1_best = tf.argmax(q_tp1 + unused_actions_mask, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best, tf.shape(q_tp1)[1]), 1)
            #q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        ##q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        #with tf.control_dependencies([rew_avg.assign(rew_avg_next), tf.print(rew_avg)]):
        with tf.control_dependencies([rew_avg.assign(rew_avg_next)]):
            q_t_selected_target = rew_t_ph - rew_avg + q_tp1_best_masked

        # compute the error (potentially clipped)
        ##td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        # For R Learning
        td_error = tf.stop_gradient(q_t_selected_target) - q_t_selected
        errors = U.huber_loss(td_error)
        #errors = tf.losses.mean_squared_error(labels=tf.stop_gradient(q_t_selected_target), predictions=q_t_selected)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # R Learning
        tf.summary.scalar('rew_avg', rew_avg)

        #use_for_reward = tf.cast(tf.abs(q_t_selected - q_t_best) < 0.10*tf.abs(q_t_best), tf.float32)
        #use_for_reward = tf.cast(tf.abs(q_t_selected - q_t_best) < 1e-6, tf.float32)
        #num_valid_rewards = tf.reduce_sum(use_for_reward)

        #with tf.control_dependencies([tf.print(num_valid_rewards)]):
        #rew_avg_next_op = rew_avg_next.assign_add(tf.cond(num_valid_rewards > 0,
        #                                                  lambda: avg_reward_learning_rate*(1/num_valid_rewards)*tf.reduce_sum(use_for_reward * td_error),
        #                                                  lambda: 0.0))
        rew_avg_next_op = rew_avg_next.assign_add(avg_reward_learning_rate*tf.reduce_mean(td_error))

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            with tf.control_dependencies([rew_avg_next_op]):
                optimize_expr = optimizer.apply_gradients(gradients)
        else:
            with tf.control_dependencies([rew_avg_next_op]):
                optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
                unused_actions_mask
            ],
            #outputs=td_error,
            outputs=[tf.summary.histogram("rewards", rew_t_ph, collections=[]), weighted_error, tf.reduce_mean(q_t_selected), tf.reduce_mean(q_t_selected_target)],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        #max_q_values = U.function([obs_t_input, unused_actions_mask], tf.reduce_max(tf.boolean_mask(q_t, 1-unused_actions_mask, axis=1), 1))
        max_q_values = U.function([obs_t_input, unused_actions_mask], tf.reduce_max(q_t + unused_actions_mask, 1))

        return act_f, train, update_target, {'q_values': q_values, 'max_q_values': max_q_values}
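The variant above replaces the discounted target with an R-learning style target that subtracts a running average reward and nudges that average by the mean TD error. A simplified NumPy sketch of one such update (values and learning rate are illustrative):

import numpy as np

# A sketch of the R-learning update implied above: the target uses the average reward
# instead of a discount factor, and the average is nudged by the mean TD error.
rew_avg = 0.0
avg_reward_learning_rate = 1e-4

def r_learning_step(rew, done, q_selected, q_tp1_best, rew_avg):
    target = rew - rew_avg + (1.0 - done) * q_tp1_best
    td_error = target - q_selected
    rew_avg += avg_reward_learning_rate * td_error.mean()
    return td_error, rew_avg

rew = np.array([1.0, 0.5])
done = np.array([0.0, 0.0])
q_selected = np.array([2.0, 1.5])
q_tp1_best = np.array([2.2, 1.4])
td_error, rew_avg = r_learning_step(rew, done, q_selected, q_tp1_best, rew_avg)
print(td_error, rew_avg)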
Example #8
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                train_gaze,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="DeepqWithGaze",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        initial_freeze_phase_ph = tf.placeholder(tf.bool, (),
                                                 name="initial_freeze_phase")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = gflag.qfunc_models.get(
            "q_func").weights  # already includes gaze_models weights
        q_func_trainable_vars = [ w for w in gflag.qfunc_models.get("q_func").trainable_weights \
            if (train_gaze or w not in gflag.gaze_models.get("q_func").trainable_weights) ] # train_gaze=False excludes gaze model's weight

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = gflag.qfunc_models.get(
            "target_q_func").weights  # already includes gaze_models weights

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        initial_freeze_weights = gflag.qfunc_models.get_weight_names_for_initial_freeze(
            model_name="q_func")
        q_func_trainable_vars_for_initial_freeze = list(
            filter(lambda w: w.name not in initial_freeze_weights,
                   q_func_trainable_vars))
        if grad_norm_clipping is not None:
            optimize_expr_for_initial_freeze = lambda: U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_trainable_vars_for_initial_freeze,
                                                clip_val=grad_norm_clipping) \
                                            if q_func_trainable_vars_for_initial_freeze else tf.no_op()
            optimize_expr_after_freeze = lambda: U.minimize_and_clip(
                optimizer,
                weighted_error,
                var_list=q_func_trainable_vars,
                clip_val=grad_norm_clipping)
        else:
            # the operation must be wrapped in a lambda, since tf.cond() expects callables
            optimize_expr_for_initial_freeze = lambda: optimizer.minimize(
                weighted_error,
                var_list=q_func_trainable_vars_for_initial_freeze)
            optimize_expr_after_freeze = lambda: optimizer.minimize(
                weighted_error, var_list=q_func_trainable_vars)
        optimize_expr = tf.cond(initial_freeze_phase_ph,
                                optimize_expr_for_initial_freeze,
                                optimize_expr_after_freeze)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        assert len(q_func_vars) == len(target_q_func_vars)
        for var, var_target in zip(q_func_vars, target_q_func_vars):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            obs_tp1_input,
            done_mask_ph,
            importance_weights_ph,
            initial_freeze_phase_ph,
        ],
                           outputs=td_error,
                           updates=[optimize_expr],
                           givens={K.backend.learning_phase(): 1})
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        # For tensorboard
        merged = tf.summary.merge([
            tf.summary.image('img_curframe', obs_t_input.get()),
            tf.summary.image(
                'gaze_curframe',
                q_func(obs_t_input.get(),
                       num_actions,
                       scope="q_func",
                       return_gaze=True,
                       reuse=True))
        ])
        tensorboard_summary = U.function(
            inputs=[obs_t_input],
            outputs=merged,
            givens={K.backend.learning_phase(): 0})

        return act_f, train, update_target, {
            'q_values': q_values
        }, tensorboard_summary
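The initial-freeze logic above boils down to dropping the gaze model's weights from the optimiser's var_list during the first phase. A plain-Python sketch of that filtering, with hypothetical weight names:

# A plain-Python sketch of the two-phase freeze used above: during the initial phase,
# variables whose names are on a freeze list are simply left out of the optimiser's
# var_list (names below are hypothetical stand-ins for the real weight names).
trainable_names = ["conv1/kernel", "conv1/bias", "gaze/kernel", "head/kernel"]
initial_freeze_names = {"gaze/kernel"}

def var_list_for_phase(initial_freeze_phase):
    if initial_freeze_phase:
        return [n for n in trainable_names if n not in initial_freeze_names]
    return list(trainable_names)

print(var_list_for_phase(True))   # gaze weights excluded while frozen
print(var_list_for_phase(False))  # everything trains after the freeze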
Example #9
def build_train_neural_linear(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, actor='target'):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    if actor == 'target':
        act_f = build_act_thompson(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, actor="target_q_func")
    else:
        act_f = build_act_thompson(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, actor="q_func")

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        if actor == 'target':
            print("actor is target")
            q_t, phi_xt = q_func(obs_t_input.get(), num_actions, scope="q_func")
            q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=True)
        else:
            print("actor is dqn")
            q_t, phi_xt = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")

        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evaluation
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")

        last_layer_weights = q_func_vars[-2]  # target_q_func_vars[-2]

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        # double dqn learning
        if double_q:
            print("building ddqn loss for neural linear")
            q_tp1_using_online_net, _ = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            print("building dqn loss for neural linear")
            q_tp1_best = tf.reduce_max(q_tp1, 1)


        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # Bayesian linear regression (BLR) additions on top of the last-layer features:
        # phiphiTop accumulates the feature outer product Phi^T Phi and phiYop accumulates
        # Phi^T y, where y is the TD target; these are the sufficient statistics of the posterior.
        phiphiTop = tf.matmul(tf.transpose(phi_xt), phi_xt)
        phiYop = tf.squeeze(tf.matmul(tf.expand_dims(q_t_selected_target, 0), phi_xt))
        feat_dim = phi_xt.shape[1].value
        # careful: the leading (batch) dimension of these placeholders is the number of actions
        precision_mat = tf.placeholder(tf.float64, [None] + [feat_dim, feat_dim], name="phiphiT")
        phiY = tf.placeholder(tf.float64, [None] + [feat_dim, 1], name="phiY")
        covariance_mat = tf.matrix_inverse(precision_mat)
        w_mu = tf.squeeze(tf.matmul(covariance_mat,phiY),axis=-1)
        w_ph = tf.placeholder(tf.float32, [None] + [num_actions, feat_dim], name="w")

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        target_q_values = U.function([obs_tp1_input], q_tp1)

        feat = U.function([obs_t_input], phi_xt)
        feat_target = U.function([obs_tp1_input], phi_target_xtp1)


        # Create callable functions
        blr_ops = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[phiphiTop,phiYop]
        )
        blr_helpers = U.function([precision_mat,phiY],[covariance_mat, w_mu])

        return act_f, train, update_target, feat_dim, feat, feat_target, target_q_values, last_layer_weights, blr_ops, blr_helpers
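
The Bayesian linear regression bookkeeping above is easier to see outside the graph. Below is a
minimal NumPy sketch (not part of the original example) of the per-action posterior update that
the precision_mat / phiY / covariance_mat / w_mu tensors implement; the prior precision, batch
size, and random arrays are illustrative assumptions.

import numpy as np

def blr_posterior(phi, y, precision, phi_y):
    """One Bayesian linear-regression update from last-layer features phi and TD targets y."""
    precision = precision + phi.T @ phi       # accumulate Phi^T Phi (cf. phiphiTop)
    phi_y = phi_y + phi.T @ y                 # accumulate Phi^T y   (cf. phiYop)
    covariance = np.linalg.inv(precision)     # cf. covariance_mat
    w_mu = covariance @ phi_y                 # posterior mean of the last-layer weights (cf. w_mu)
    return precision, phi_y, covariance, w_mu

feat_dim, prior_precision = 4, 1.0            # assumed feature size and prior precision
precision = prior_precision * np.eye(feat_dim)
phi_y = np.zeros(feat_dim)
phi = np.random.randn(32, feat_dim)           # a batch of features phi(x_t) for one action
y = np.random.randn(32)                       # TD targets for the same transitions
precision, phi_y, cov, w_mu = blr_posterior(phi, y, precision, phi_y)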
예제 #10
0
def build_train(make_obs_ph,
                mu_func,
                v_func,
                l_func,
                action_noise,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    mu_func, v_func, l_func: (tf.Variable, int, str, bool) -> tf.Variable
        the models that take the following inputs:
            observation_in: object
                the output of observation placeholder
            num_outputs: int
                number of outputs to produce
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and return, respectively, the action mean, the state value, and the flattened
        lower-triangular entries used to build the advantage matrix.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          mu_func,
                          action_noise,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.float32, [None, num_actions],
                                  name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        with tf.variable_scope("q_func"):
            l_t = l_func(obs_t_input.get(),
                         int((num_actions * (num_actions + 1)) / 2),
                         scope="l_func")
            mu_t = mu_func(obs_t_input.get(),
                           num_actions,
                           scope="mu_func",
                           reuse=True)
            v_t = v_func(obs_t_input.get(), 1, scope="v_func")
            diagonal = tf.exp(l_t[:, :num_actions])
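            # NAF-style advantage: rebuild a lower-triangular matrix from the flat l_func output
            # (the exponentiated diagonal keeps it positive), form P = l_t l_t^T, and use
            # A(s, a) = -0.5 * (a - mu)^T P (a - mu) so that Q(s, a) = A(s, a) + V(s).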

            rows, diag = [], []
            pivot = num_actions

            for i in range(num_actions):
                rows.append(
                    tf.pad(l_t[:, pivot:i + pivot],
                           [[0, 0], [0, num_actions - i]]))
                diag.append(
                    tf.pad(tf.expand_dims(diagonal[:, i], 1),
                           [[0, 0], [i, num_actions - 1 - i]]))
                pivot += i

            l_t = tf.transpose(tf.stack(rows), (1, 0, 2)) + tf.transpose(
                tf.stack(diag), (1, 0, 2))
            #print("shape L", tf.stack(rows).shape, diagonal.shape)
            L = tf.matmul(l_t, tf.transpose(l_t, (0, 2, 1)))
            u = tf.expand_dims(act_t_ph - mu_t, 1)
            print("L shape", L.shape, u.shape)
            a_t = -0.5 * tf.reduce_mean(
                tf.matmul(tf.matmul(u, L), tf.transpose(u, (0, 2, 1))), 2)

            q_t = a_t + v_t

        # q network evaluation
        #q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")
        v_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func/v_func")

        # target q network evaluation
        v_tp1 = v_func(obs_tp1_input.get(), 1, scope="target_v_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_v_func")

        # q scores for actions which we know were selected in the given state.
        #q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)
        q_t_selected = q_t
        print(
            "q_shape",
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope=tf.get_variable_scope().name +
                              "/q_func/v_func"))
        # compute estimate of best possible value starting from state at t + 1
        #if double_q:
        #    q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
        #    q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
        #    q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        #else:
        #    q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * v_tp1

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(v_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            print("var_target", var_target, var_target.shape, var, var.shape)
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
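
For reference, here is a small NumPy sketch (illustrative, not part of the original example) of the
same construction on a single state: the flat l_func output of length n(n+1)/2 is turned into a
lower-triangular factor and combined with mu and V into a NAF-style Q-value.

import numpy as np

def naf_q_value(l_flat, mu, v, action):
    """l_flat holds the n diagonal entries first, then the strictly lower-triangular rows."""
    n = mu.shape[0]
    L = np.diag(np.exp(l_flat[:n]))              # exponentiated diagonal, as in the graph above
    L[np.tril_indices(n, k=-1)] = l_flat[n:]     # strictly lower-triangular entries, row by row
    P = L @ L.T                                  # positive-definite matrix
    u = action - mu
    return -0.5 * float(u @ P @ u) + v           # Q(s, a) = A(s, a) + V(s)

num_actions = 3
l_flat = np.random.randn(num_actions * (num_actions + 1) // 2)
q = naf_q_value(l_flat, mu=np.zeros(num_actions), v=1.0, action=np.random.randn(num_actions))
print(q)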
예제 #11
0
def build_train_ib(make_obs_ph,
                   model_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   beta=1.0,
                   theta=1,
                   double_q=True,
                   emdqn=True,
                   vae=True,
                   ib=True,
                   scope="deepq_ib",
                   reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    model_func: (tf.Variable, tf.Variable, int, str, bool) -> tuple
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            z_noise: tf.Variable
                noise used for the latent reparameterization
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns the Q-values of shape (batch_size, num_actions) together with the latent
        mean/log-variance statistics and the reconstruction used by the VAE/IB losses.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    beta: float
        coefficient of the KL (encoder / information-bottleneck) term.
    theta: float
        coefficient of the reconstruction (decoder) term.
    emdqn, vae, ib: bool
        which auxiliary losses (episodic-memory, VAE, information-bottleneck) are enabled.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    act_noise = tf.placeholder(tf.float32, [None, 512], name="act_noise")
    act_f = build_act_ib(make_obs_ph,
                         model_func,
                         act_noise,
                         num_actions,
                         scope=scope,
                         reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))

        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        z_noise_t = tf.placeholder(tf.float32, [None, 512], name="z_noise")

        z_noise_tp1 = tf.placeholder(tf.float32, [None, 512],
                                     name="z_noise_tp1")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        inputs = [
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, act_noise, z_noise_t, z_noise_tp1
        ]
        # EMDQN
        if emdqn or ib:
            qec_input = tf.placeholder(tf.float32, [None], name='qec')
            inputs.append(qec_input)
        if ib or vae:
            obs_vae_input = U.ensure_tf_input(make_obs_ph("obs_vae"))
            z_noise_vae = tf.placeholder(tf.float32, [None, 512],
                                         name="z_noise_vae")
            inputs.append(obs_vae_input)
            inputs.append(z_noise_vae)
        # q network evaluation
        q_t, v_mean_t, v_logvar_t, z_mean_t, z_logvar_t, recon_obs_t = model_func(
            obs_t_input.get(),
            z_noise_t,
            num_actions,
            scope="q_func",
            reuse=True)
        if vae or ib:
            q_vae, v_mean_vae, v_logvar_vae, z_mean_vae, z_logvar_vae, recon_obs = model_func(
                obs_vae_input.get(),
                z_noise_vae,
                num_actions,
                scope="q_func",
                reuse=True)

        # q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation

        q_tp1, q_d_tp1, v_mean_tp1, v_logvar_tp1, z_mean_tp1, z_logvar_tp1, recon_obs_tp1 = model_func(
            obs_tp1_input.get(),
            z_noise_tp1,
            num_actions,
            scope="target_q_func")

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:

            q_tp1_using_online_net, _, _, _, _, _, _ = model_func(
                obs_tp1_input.get(),
                z_noise_tp1,
                num_actions,
                scope="q_func",
                reuse=True)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)

        td_loss = tf.reduce_mean(importance_weights_ph *
                                 U.huber_loss(td_error))
        outputs = [td_loss]
        total_loss = td_loss
        if vae or ib:
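            # -1 + mu^2 + exp(logvar) - logvar is twice the per-dimension KL divergence between
            # the diagonal Gaussian N(mu, exp(logvar)) and the standard normal prior.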
            encoder_loss = -1 + z_mean_vae**2 + tf.exp(
                z_logvar_vae) - z_logvar_vae
            outputs.append(encoder_loss)
            total_loss += 0.1 * tf.reduce_mean(beta * encoder_loss)
        if vae:
            decoder_loss = tf.keras.losses.binary_crossentropy(
                tf.reshape(recon_obs, [-1]),
                tf.reshape(
                    tf.dtypes.cast(obs_vae_input._placeholder, tf.float32),
                    [-1]))
            print("here", z_mean_t.shape, z_logvar_t.shape, encoder_loss.shape,
                  decoder_loss.shape)
            vae_loss = beta * encoder_loss + theta * decoder_loss
            outputs.append(decoder_loss)
            outputs.append(vae_loss)
            total_loss += 0.1 * tf.reduce_mean(theta * decoder_loss)
        if ib:
            ib_loss = (v_mean_t - tf.stop_gradient(tf.expand_dims(
                qec_input, 1)))**2 / tf.exp(v_logvar_t) + v_logvar_t
            print("here2", v_mean_t.shape,
                  tf.expand_dims(qec_input, 1).shape, v_logvar_t.shape,
                  ib_loss.shape)
            total_ib_loss = ib_loss + beta * encoder_loss
            outputs.append(total_ib_loss)
            total_loss += 0.1 * tf.reduce_mean(ib_loss)
        # EMDQN
        if emdqn:
            qec_error = q_t_selected - tf.stop_gradient(qec_input)
            total_loss += 0.1 * tf.reduce_mean(
                importance_weights_ph * U.huber_loss(qec_error))
            outputs.append(qec_error)

        td_loss_summary = tf.summary.scalar("td loss", td_loss)
        total_loss_summary = tf.summary.scalar("total loss", total_loss)
        z_var_summary = tf.summary.scalar("z_var",
                                          tf.reduce_mean(tf.exp(z_logvar_t)))
        summaries = [td_loss_summary, total_loss_summary, z_var_summary]
        if vae or ib:
            encoder_loss_summary = tf.summary.scalar(
                "encoder loss", tf.reduce_mean(encoder_loss))
            summaries.append(encoder_loss_summary)
        if vae:
            decoder_loss_summary = tf.summary.scalar(
                "decoder loss", tf.reduce_mean(decoder_loss))
            summaries.append(decoder_loss_summary)
        if ib:
            ib_loss_summary = tf.summary.scalar("ib loss",
                                                tf.reduce_mean(ib_loss))
            total_ib_loss_summary = tf.summary.scalar(
                "total ib loss", tf.reduce_mean(total_ib_loss))
            summaries.append(ib_loss_summary)
            summaries.append(total_ib_loss_summary)
        if emdqn:
            qec_loss_summary = tf.summary.scalar(
                "qec loss", tf.reduce_mean(importance_weights_ph * qec_error))
            summaries.append(qec_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs.append(summary)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_loss,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions

        train = U.function(inputs=inputs,
                           outputs=[td_error, summary],
                           updates=[optimize_expr])

        get_q_t_selected = U.function(
            inputs=[obs_t_input, act_t_ph, z_noise_t], outputs=q_t_selected)
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input, z_noise_t], q_t)

        return act_f, train, update_target, {
            'q_values': q_values
        }, get_q_t_selected
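
The auxiliary losses above are compact but easy to misread; the following NumPy sketch (toy shapes
and arrays are assumptions, not from the original code) spells out the encoder KL term and the
heteroscedastic ib term, which is a Gaussian negative log-likelihood up to an additive constant.

import numpy as np

def encoder_kl(z_mean, z_logvar):
    # matches encoder_loss = -1 + mu^2 + exp(logvar) - logvar (element-wise, 2 * KL per dimension)
    return -1.0 + z_mean ** 2 + np.exp(z_logvar) - z_logvar

def gaussian_nll(target, v_mean, v_logvar):
    # matches ib_loss = (v_mean - target)^2 / exp(logvar) + logvar
    return (v_mean - target) ** 2 / np.exp(v_logvar) + v_logvar

z_mean, z_logvar = np.zeros((2, 512)), np.zeros((2, 512))   # a standard-normal posterior ...
print(encoder_kl(z_mean, z_logvar).mean())                  # ... incurs zero KL penalty
print(gaussian_nll(target=np.ones(2), v_mean=np.ones(2), v_logvar=np.zeros(2)).mean())  # 0.0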
예제 #12
0
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        multi_step_num = 3  # multi step return 10, 5
        gamma = 0.7  # discount rate (overrides the gamma argument)
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")  # if multi-step returns are used, this should hold the accumulated return
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # 创建Q network 与 target Q network , 返回所有action 的q值(q_func()) , q_t是一个列表
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation (the TD target)
        # q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        q_t_selected_target = rew_t_ph + (gamma**multi_step_num) * q_tp1_best_masked  # multi step return

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # # start cpu
        # with tf.device('/cpu:0'):
        #     # compute optimization op (potentially with gradient clipping)
        #     if grad_norm_clipping is not None:
        #         gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
        #         for i, (grad, var) in enumerate(gradients):
        #             if grad is not None:
        #                 gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
        #         optimize_expr = optimizer.apply_gradients(gradients)
        #     else:
        #         optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)
        # # end cpu
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # sorted() does not modify the original iterable
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),  # sort by variable name
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            # print(var)  # var / var_target are plain variables (tensors) here
            # print(var_target)
            update_target_expr.append(var_target.assign(var))
        # print(update_target_expr)  # roughly a list of assign ops
        update_target_expr = tf.group(*update_target_expr)  # tf.group() bundles the assignments into one op

        # A single process writes while multiple processes read, so the two kinds of operations
        # act on separate memory regions to reduce lock contention; some parts are still not ideal.
        # Initialize the actor's Q network.
        def init_actor_qfunc(sess, net_list):
            # needs tf.variable_scope(scope, reuse=reuse), hence defined here
            # (alternatively use tf.get_default_session(), which cannot be used as a context manager)
            with sess.as_default():
                # net_list_lock.acquire()
                # empty the list
                i = len(net_list)
                while i > 0:
                    i -= 1
                    del net_list[i]
                for var_actor in q_func_vars:  # whether the overall ordering is correct needs further checking
                    net_list.append(var_actor.eval(session=sess))  # stored as a list
                # for var_actor in q_func_vars:  # net_list would then be twice the length of q_func_vars
                #     net_list.append(var_actor.eval(session=sess))
                gc.collect()  # free memory; probably unnecessary on Python 3.5+
                # net_list_lock.release()  # release the lock

        len_q_func = len(q_func_vars)

        # Update the actor's Q network.
        def update_actor_qfunc(sess, net_list, net_list_lock):
            with sess.as_default():
                net_list_lock.acquire()
                for i_tensor in range(len_q_func):
                    net_list[i_tensor] = q_func_vars[i_tensor].eval(session=sess)
                net_list_lock.release()  # release the lock

        # The three functions below wrap train, update_target, and q_values respectively.
        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        # the update_target op takes no inputs and produces no outputs
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, init_actor_qfunc, update_actor_qfunc, {'q_values': q_values}
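
Because the TD target above scales the bootstrap term by gamma**multi_step_num, rew_t_ph is
expected to already contain the discounted n-step reward. A minimal sketch of that accumulation
(the replay-buffer wiring and episode-boundary handling are assumptions left out here):

def n_step_reward(rewards, gamma=0.7, n=3):
    """Discounted sum of the next n rewards: r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1}."""
    return sum((gamma ** i) * r for i, r in enumerate(rewards[:n]))

assert abs(n_step_reward([1.0, 1.0, 1.0]) - (1.0 + 0.7 + 0.49)) < 1e-9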
예제 #13
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                chief=False,
                server=None,
                workers=1,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    chief: bool
        whether or not the worker should assume chief duties.
        these include: initializing global parameters, tensorboarding, saving, etc.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    task = server.server_def.task_index
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse,
                      task=task)

    with tf.variable_scope(scope, reuse=reuse):
        with tf.device("/job:worker/task:{}".format(task)):
            # set up placeholders
            obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
            act_t_ph = tf.placeholder(tf.int32, [None], name="action")
            rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
            obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
            done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
            importance_weights_ph = tf.placeholder(tf.float32, [None],
                                                   name="weight")

            # Local timestep counters
            t = tf.placeholder(tf.float32, [1], name="t")
            t_global_old = tf.placeholder(tf.float32, [1], name="t_global_old")
            score_input = tf.placeholder(tf.float32, [1], name="score_input")
            grad_prio = tf.placeholder(tf.bool, [1], name="grad_prio")
            converged_ph = tf.placeholder(tf.bool, [1], name="converged")
            factor_input = tf.placeholder(tf.float32, [1], name="factor_input")

            # Global timestep counter
            # TODO Does TF have built-in global step counters?
            with tf.device("/job:ps/task:0"):
                t_global = tf.Variable(dtype=tf.float32,
                                       initial_value=[0],
                                       name="t_global")
                run_code_global = tf.Variable(initial_value="",
                                              name="run_code_global")
                comm_rounds_global = tf.Variable(dtype=tf.float32,
                                                 initial_value=[0],
                                                 name="comm_rounds_global")
                max_workers_global = tf.constant(workers,
                                                 dtype=tf.float32,
                                                 name="max_workers_global")
                worker_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[0],
                                                  name="worker_count_global")
                score_max_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_max_global")
                score_min_global = tf.Variable(dtype=tf.float32,
                                               initial_value=[0],
                                               name="score_min_global")
                submit_count_global = tf.Variable(dtype=tf.float32,
                                                  initial_value=[-1],
                                                  name="submit_count_global")
                converged_global = tf.Variable(dtype=tf.bool,
                                               initial_value=[False],
                                               name="converged_global")

            # q network evaluation
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func")
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))

            # global weights
            print("chief:", chief, "reuse:", True if not chief else None)
            global_q_func_vars = []
            # with tf.device(tf.train.replica_device_setter(cluster=cluster)):
            with tf.device(
                    "/job:ps/task:0"):  # TODO needs RDS if using multiple PS
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights", reuse=None if chief else True)#reuse=(not chief))
                # q_global = q_func(obs_t_input.get(), num_actions, scope="global_weights")
                with tf.variable_scope("global_weights"):
                    for var in q_func_vars:
                        name = var.name.split(":")[0].split("q_func/")[-1]
                        global_q_func_vars.append(
                            tf.get_variable(name=name,
                                            shape=var.shape,
                                            dtype=var.dtype,
                                            initializer=tf.contrib.layers.
                                            xavier_initializer(
                                                seed=1, dtype=var.dtype)))
            # global_q_func_vars = U.scope_vars(U.absolute_scope_name("global_weights"))
            # print("Global:", global_q_func_vars)

            # old weights (used to implicitly calculate gradient sum: q_func_vars - q_func_vars_old)
            q_func_vars_old = []
            with tf.variable_scope("old_weights"):
                for var in q_func_vars:
                    name = var.name.split(":")[0].split("q_func/")[-1]
                    q_func_vars_old.append(
                        tf.get_variable(
                            name=name,
                            shape=var.shape,
                            dtype=var.dtype,
                            initializer=tf.contrib.layers.xavier_initializer(
                                seed=1, dtype=var.dtype)))
            # q_old = q_func(obs_t_input.get(), num_actions, scope="old_weights")
            # q_func_vars_old = U.scope_vars(U.absolute_scope_name("old_weights"))
            # print("Old vars:", q_func_vars_old)

            # q scores for actions which we know were selected in the given state.
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)

            # compute estimate of best possible value starting from state at t + 1
            if double_q:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.argmax(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

            # compute RHS of bellman equation
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights_ph * errors)

            # compute optimization op (potentially with gradient clipping)
            if grad_norm_clipping is not None:
                optimize_expr = U.minimize_and_clip(
                    optimizer,
                    weighted_error,
                    var_list=q_func_vars,
                    clip_val=grad_norm_clipping)
            else:
                optimize_expr = optimizer.minimize(weighted_error,
                                                   var_list=q_func_vars)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_expr = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_expr.append(var_target.assign(var))
            update_target_expr = tf.group(*update_target_expr)

            # update_global_fn will be called periodically to copy global Q network to q network
            update_global_expr = []
            for var_global, var, var_old in zip(
                    sorted(global_q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name)):
                update_global_expr.append(var.assign(var_global))
                # TODO Can async cause var <- var_global, var_global <- new value, var_old <- var_global in that order?
                # TODO Should this copy from var instead? (concurrency issues?)
                # TODO Can concurrency cause var_old <- var, var <- var_global in that order (resulting in wrong values)?
                # TODO Safest method is to force sequential execution of var <- var_global, var_old <- var! How though?
                update_global_expr.append(var_old.assign(var_global))
            update_global_expr = tf.group(*update_global_expr)

            # update the global time step counter by adding the local
            update_t_global = t_global.assign_add(t)

            optimize_global_expr = []
            # Factor to multiply every gradient with
            # f = t / (t_global - t_global_old)
            dt = tf.subtract(update_t_global, t_global_old)
            factor = tf.where(
                tf.greater_equal(factor_input, 0), factor_input,
                tf.where(
                    grad_prio,
                    tf.divide(tf.subtract(score_input, score_min_global),
                              tf.subtract(score_max_global, score_min_global)),
                    tf.div(t, dt)))
            for var, var_old, var_global in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(q_func_vars_old, key=lambda v: v.name),
                    sorted(global_q_func_vars, key=lambda v: v.name)):
                # Multiply the difference between the old parameters and the locally optimized parameters
                # g = (var - var_old) * f
                grad = tf.multiply(tf.subtract(var, var_old), factor)
                optimize_global_expr.append(var_global.assign_add(grad))
            optimize_global_expr = tf.group(*optimize_global_expr)

            # if cr == cr_g and wc < wc_max:
            #   wc += 1
            #   score_global += score
            # if cr == cr_g and wc == wc_max:
            #   vc += 1
            #   score_global += score
            #   cr_g += 0.5
            # return cr_g
            """
            if cr == cr_g:
                if wc <= wc_max:
                    wc += 1
                    score_global += score
                    if wc == wc_max:
                        cr_g += 0.5
            return cr_g
            """
            # submit_score_expr = \
            #     tf.cond(tf.equal(comm_rounds, comm_rounds_global),
            #             lambda: tf.cond(tf.less_equal(worker_count_global, max_workers_global),
            #                             lambda: tf.group(worker_count_global.assign_add([1]),
            #                                              score_global.assign_add(score_input),
            #                                              tf.cond(tf.equal(worker_count_global, max_workers_global),
            #                                                      lambda: comm_rounds_global.assign_add([0.5]),
            #                                                      lambda: None)),
            #                             lambda: tf.group(None, None, None)),
            #             lambda: None)
            # submit_score_expr = \
            #     tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                            tf.less(worker_count_global, max_workers_global)),
            #             tf.group(worker_count_global.assign_add(1),
            #                      score_global.assign_add(score_input)),
            #             tf.cond(tf.logical_and(tf.equal(comm_rounds, comm_rounds_global),
            #                                    tf.equal(worker_count_global, max_workers_global)),
            #                     tf.group(worker_count_global.assign_add(1),
            #                              score_global.assign_add(score_input),
            #                              comm_rounds_global.assign_add(0.5))))

            # This makes a sum of all scores (
            # submit_score_expr = score_global.assign_add(score_input)

            # This only saves the maximum score (for normalized score weighting)
            submit_score_max = score_max_global.assign(tf.maximum(
                score_input, score_max_global),
                                                       use_locking=True)
            submit_score_min = score_min_global.assign(tf.minimum(
                score_input, score_min_global),
                                                       use_locking=True)

            set_submit_count = submit_count_global.assign(score_input,
                                                          use_locking=True)
            inc_submit_count = submit_count_global.assign_add([1],
                                                              use_locking=True)

            # check_round_op = tf.equal(comm_rounds, comm_rounds_global) # Not used anymore
            inc_wc = worker_count_global.assign_add([1], use_locking=True)
            zero_wc = worker_count_global.assign([0], use_locking=True)

            inc_cr = comm_rounds_global.assign_add([1], use_locking=True)

            score_reset = score_max_global.assign([0], use_locking=True)

            converged_set = converged_global.assign(converged_ph,
                                                    use_locking=True)

            # Create callable functions
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
                               outputs=[td_error],
                               updates=[optimize_expr])
            global_opt = U.function(
                inputs=[t, t_global_old, score_input, factor_input, grad_prio],
                outputs=[dt, comm_rounds_global, factor],
                updates=[optimize_global_expr])
            # global_sync_opt = U.function(inputs=[comm_rounds], outputs=[comm_rounds_global], updates=[optimize_global_sync_expr])
            update_weights = U.function(inputs=[],
                                        outputs=[t_global],
                                        updates=[update_global_expr])
            update_target = U.function([], [], updates=[update_target_expr])
            submit_score = U.function(
                inputs=[score_input],
                outputs=[comm_rounds_global],
                updates=[submit_score_max, submit_score_min])
            check_round = U.function(inputs=[],
                                     outputs=[comm_rounds_global],
                                     updates=[])
            request_submit = U.function(inputs=[],
                                        outputs=[comm_rounds_global, inc_wc],
                                        updates=[])
            set_submit = U.function(inputs=[score_input],
                                    outputs=[set_submit_count],
                                    updates=[])
            check_submit = U.function(inputs=[],
                                      outputs=[submit_count_global],
                                      updates=[])
            inc_submit = U.function(inputs=[],
                                    outputs=[inc_submit_count],
                                    updates=[])
            inc_comm_round = U.function(inputs=[],
                                        outputs=[inc_cr],
                                        updates=[])
            reset_wc = U.function(inputs=[], outputs=[zero_wc], updates=[])
            check_wc = U.function(inputs=[],
                                  outputs=[worker_count_global],
                                  updates=[])
            reset_score = U.function(inputs=[],
                                     outputs=[],
                                     updates=[score_reset])
            set_converged = U.function(inputs=[converged_ph],
                                       outputs=[],
                                       updates=[converged_set])
            check_converged = U.function(inputs=[],
                                         outputs=[converged_global],
                                         updates=[])

            # Debugging functions
            q_values = U.function([obs_t_input], q_t)
            weights = U.function(
                inputs=[],
                outputs=[q_func_vars, global_q_func_vars, q_func_vars_old],
                updates=[])
            t_global_func = U.function([], t_global)
            comm_rounds_func = U.function([], comm_rounds_global)

            return act_f, train, global_opt, update_target, update_weights, \
                {'request_submit': request_submit, 'submit_score': submit_score,
                 'check_round': check_round, 'check_submit': check_submit, 'set_submit': set_submit,
                 'inc_submit': inc_submit, 'inc_comm_round': inc_comm_round, 'reset_wc': reset_wc,
                 'check_wc': check_wc, 'reset_score': reset_score,
                 'set_converged': set_converged, 'check_converged': check_converged}, \
                {'q_values': q_values, 'weights': weights, 't_global': t_global_func,
                 'run_code': run_code_global, 'comm_rounds': comm_rounds_func, 'factor': factor}
예제 #14
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                bootstrap=False,
                swarm=False,
                voting=False,
                heads=1,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                device="/cpu:0"):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      q_func,
                      bootstrap=bootstrap,
                      swarm=swarm,
                      voting=voting,
                      heads=heads,
                      num_actions=num_actions,
                      scope=scope,
                      reuse=reuse,
                      device=device)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        update_lr_ph = tf.placeholder(tf.float32, (), name="learning_rate")

        lr = tf.get_variable("lr", (), initializer=tf.constant_initializer(0))

        with tf.device(device):
            # q network evaluation
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True,
                         heads=heads)  # reuse parameters from act
            q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func",
                           reuse=True,
                           heads=heads)  # reuse parameters from act
            target_q_func_vars = U.scope_vars(
                U.absolute_scope_name("target_q_func"))

            # q scores for actions which we know were selected in the given state.
            q_t_selected = []
            for i in range(heads):
                q_t_selected.append(
                    tf.reduce_sum(q_t[i] * tf.one_hot(act_t_ph, num_actions),
                                  1))

            # compute estimate of best possible value starting from state at t + 1
            q_tp1_best = []
            q_tp1_best_using_online_net = []
            if swarm:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True,
                                                heads=heads)

                action_subsets = []
                for i in range(heads):
                    target_greedy_action = tf.argmax(q_tp1[i], axis=1)
                    online_q_value_threshold = tf.reduce_sum(
                        q_tp1_using_online_net[i] *
                        tf.one_hot(target_greedy_action, num_actions), 1)
                    online_q_value_threshold = tf.tile(
                        tf.expand_dims(online_q_value_threshold, 1),
                        tf.constant([1, num_actions]))

                    action_subset = tf.where(
                        (q_tp1_using_online_net[i] - online_q_value_threshold)
                        >= 0,
                        tf.ones([tf.shape(obs_t_input.get())[0], num_actions]),
                        tf.zeros([tf.shape(obs_t_input.get())[0],
                                  num_actions]))
                    action_subsets.append(action_subset)

                action_subsets = tf.stack(action_subsets, axis=1)
                actions_cover = set_cover(action_subsets)
                # preferred_actions = tf.transpose(action_subsets, [1, 0, 2])

                for i in range(heads):
                    q_tp1_best_using_online_net.append(
                        tf.argmax(tf.multiply(actions_cover, q_tp1[i]),
                                  axis=1))
                    q_tp1_best.append(
                        tf.reduce_sum(
                            q_tp1[i] * tf.one_hot(
                                q_tp1_best_using_online_net[i], num_actions),
                            1))
            elif double_q:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True,
                                                heads=heads)
                for i in range(heads):
                    q_tp1_best_using_online_net.append(
                        tf.argmax(q_tp1_using_online_net[i], 1))
                    q_tp1_best.append(
                        tf.reduce_sum(
                            q_tp1[i] * tf.one_hot(
                                q_tp1_best_using_online_net[i], num_actions),
                            1))
            else:
                for i in range(heads):
                    q_tp1_best.append(tf.reduce_max(q_tp1[i], 1))

        q_tp1_best_masked = []
        q_t_selected_target = []
        td_error = []
        errors = []
        weighted_error = []
        optimize_expr = []
        optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                           beta1=0.9,
                                           beta2=0.99,
                                           epsilon=1e-4)
        q_func_heads = U.scope_vars(U.absolute_scope_name("q_func/heads"))
        q_func_convnets = U.scope_vars(U.absolute_scope_name("q_func/convnet"))
        for i in range(heads):
            q_tp1_best_masked.append((1.0 - done_mask_ph) * q_tp1_best[i])

            # compute RHS of bellman equation
            q_t_selected_target.append(rew_t_ph + gamma * q_tp1_best_masked[i])

            # compute the error (potentially clipped)
            td_error.append(q_t_selected[i] -
                            tf.stop_gradient(q_t_selected_target[i]))
            with tf.device(device):
                errors.append(U.huber_loss(td_error[i]))
            weighted_error.append(
                tf.reduce_mean(importance_weights_ph * errors[i]))
            # compute optimization op (potentially with gradient clipping)
            if grad_norm_clipping is not None:
                optimize_expr.append(
                    U.minimize_and_clip(optimizer,
                                        weighted_error[i],
                                        var_list=q_func_heads,
                                        clip_val=grad_norm_clipping))
                optimize_expr.append(
                    U.minimize_and_clip(optimizer,
                                        0.1 * weighted_error[i],
                                        var_list=q_func_convnets,
                                        clip_val=grad_norm_clipping))
            else:
                optimize_expr.append(
                    optimizer.minimize(weighted_error[i],
                                       var_list=q_func_vars))

        update_lr_expr = lr.assign(
            tf.cond(update_lr_ph >= 0, lambda: update_lr_ph, lambda: lr))
        optimize_expr.append(update_lr_expr)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph, update_lr_ph
            ],
            outputs=td_error[0],
            updates=optimize_expr,
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
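The tuple returned above follows the usual baselines-style contract: act selects actions, train runs one optimization step on a sampled batch and returns the TD errors, and update_target hard-copies the online Q network into the target network. A minimal usage sketch follows; env, replay_buffer, epsilon_schedule and the loop constants are illustrative assumptions, and the exact act signature may differ for the bootstrap/voting variants built here.

import numpy as np

# Hedged sketch of a training loop around the returned callables; every name
# below (env, replay_buffer, schedules, constants) is assumed for illustration.
obs = env.reset()
for t in range(total_timesteps):
    # standard baselines-style act call; bootstrap/voting heads may take extra arguments
    action = act(np.array(obs)[None], stochastic=True, update_eps=epsilon_schedule(t))[0]
    new_obs, rew, done, _ = env.step(action)
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = env.reset() if done else new_obs

    if t > learning_starts and t % train_freq == 0:
        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
        weights = np.ones_like(rewards)  # uniform replay -> unit importance weights
        # the final argument feeds update_lr_ph; a negative value keeps the stored lr (see the tf.cond above)
        td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights, -1.0)

    if t % target_network_update_freq == 0:
        update_target()  # copy q_func parameters into target_q_func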
Example #15
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=False,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                thompson=True):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.
    thompson: bool
        whether to use Thompson sampling for action selection; also builds Bayesian
        linear regression ops on the last-layer features (returned as blr_additions).

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    blr_additions: dict or None
        extra ops used by the Thompson sampling / Bayesian linear regression path
        (None when thompson is False).
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    elif thompson:
        act_f = build_act_with_thompson_sampling(make_obs_ph,
                                                 q_func,
                                                 num_actions,
                                                 scope=scope,
                                                 reuse=reuse)
        eval_act_f = build_act_evaluate_thompson(make_obs_ph,
                                                 q_func,
                                                 num_actions,
                                                 scope=scope,
                                                 reuse=reuse)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t, phi_xt = q_func(obs_t_input.get(),
                             num_actions,
                             scope="q_func",
                             reuse=True)  # reuse parameters from eval act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1, phi_target_xtp1 = q_func(obs_tp1_input.get(),
                                        num_actions,
                                        scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        average_DQN = True
        double_q = False
        if double_q:
            print("building double")
            q_tp1_using_online_net, _ = q_func(obs_tp1_input.get(),
                                               num_actions,
                                               scope="q_func",
                                               reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        elif average_DQN:
            print("building average dqn")
            k = 2  # we use k-1 for the average dqn - first k-1 for agent and last k-1 for target
            prev_target_vars = q_func_vars  # we use k-1 for the average dqn - first k-1 for agent and last k-1 for target
            update_average_target_expr = []
            q_values_ensemble = []
            for j in range(k):
                q_tp1_net_j, _ = q_func(obs_tp1_input.get(),
                                        num_actions,
                                        scope="target_{}".format(j))
                this_target_vars = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope=tf.get_variable_scope().name +
                    "/target_{}".format(j))
                q_values_ensemble.append(q_tp1_net_j)
                update_target_expr_j = []
                for var, var_target in zip(
                        sorted(prev_target_vars, key=lambda v: v.name),
                        sorted(this_target_vars, key=lambda v: v.name)):
                    update_target_expr_j.append(var_target.assign(var))
                update_target_expr_j = tf.group(*update_target_expr_j)
                update_target_expr_j_func = U.function(
                    [], [], updates=[update_target_expr_j])
                update_average_target_expr.append(update_target_expr_j_func)
                prev_target_vars = this_target_vars

            q_tp1_average = tf.reduce_mean(
                tf.stack(q_values_ensemble[:(k - 1)], axis=-1), axis=-1)
            q_tp1_best = tf.reduce_max(q_tp1_average, 1)
        else:
            print("building not double")
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        if thompson:
            # Bayes Regression additions
            last_layer_weights = q_func_vars[-2]  #target_q_func_vars[-2]
            phiphiT_op = tf.matmul(tf.transpose(phi_xt), phi_xt)

            phiY_op = tf.squeeze(
                tf.matmul(tf.expand_dims(q_t_selected_target, 0), phi_xt))
            YY_op = tf.matmul(tf.expand_dims(q_t_selected_target, 0),
                              tf.expand_dims(q_t_selected_target, -1))

            feat_dim = phi_xt.shape[1].value

            feat = U.function([obs_t_input], phi_xt)
            feat_target = U.function([obs_tp1_input], phi_target_xtp1)

            # old q network evaluation
            ensemble = False
            old_networks = None
            old_pseudo_counts_f = None
            outer_product_op = tf.matmul(tf.expand_dims(phi_xt, axis=-1),
                                         tf.expand_dims(phi_xt, axis=1))
            if ensemble:
                old_networks = {i: None for i in range(5)}
                phiphiTs_inv = []
                for i in range(5):
                    q_t_old, phi_old = q_func(obs_t_input.get(),
                                              num_actions,
                                              scope="old_q_func_{}".format(i))
                    old_q_func_vars = tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope=tf.get_variable_scope().name +
                        "/old_q_func_{}".format(i))
                    update_old_expr = []
                    for var, var_old in zip(
                            sorted(q_func_vars, key=lambda v: v.name),
                            sorted(old_q_func_vars, key=lambda v: v.name)):
                        update_old_expr.append(var_old.assign(var))
                    update_old_expr = tf.group(*update_old_expr)
                    update_old = U.function([], [], updates=[update_old_expr])
                    feat_old = U.function([obs_t_input], phi_old)
                    phiphiT_inv = tf.placeholder(
                        tf.float32, [None] + [feat_dim, feat_dim],
                        name="phiphiT_inv_{}".format(i))
                    phiphiTs_inv.append(phiphiT_inv)
                    old_networks[i] = {
                        "phi_old": phi_old,
                        "phiphiT_inv": phiphiT_inv,
                        "features": feat_old,
                        "update": update_old
                    }

                old_pseudo_counts = []
                for i in range(5):
                    old_pseudo_counts.append(
                        tf.reduce_sum(tf.matmul(
                            tf.matmul(
                                tf.expand_dims(old_networks[i]['phi_old'],
                                               axis=1),
                                old_networks[i]['phiphiT_inv']),
                            tf.expand_dims(old_networks[i]['phi_old'],
                                           axis=-1)),
                                      axis=[1, 2]))
                # debug = tf.stack(old_pseudo_counts)
                old_pseudo_counts = tf.stack(old_pseudo_counts)

                old_pseudo_counts_f = U.function([obs_t_input, *phiphiTs_inv],
                                                 old_pseudo_counts)

            q_t_old, phi_old = q_func(obs_t_input.get(),
                                      num_actions,
                                      scope="old_q_func",
                                      reuse=True)
            phiphiT_inv = tf.placeholder(tf.float32,
                                         [None] + [feat_dim, feat_dim],
                                         name="phiphiT_inv")
            pseudo_count = tf.reduce_sum(tf.matmul(
                tf.matmul(tf.expand_dims(phi_old, axis=1), phiphiT_inv),
                tf.expand_dims(phi_old, axis=-1)),
                                         axis=[1, 2])

            phiphiTold_op = tf.matmul(tf.transpose(phi_old), phi_old)

            q_tp1_old, _ = q_func(obs_tp1_input.get(),
                                  num_actions,
                                  scope="old_target_q_func")
            if double_q:
                print("building double target for thompson")
                q_tp1_using_online_net_old, _ = q_func(obs_tp1_input.get(),
                                                       num_actions,
                                                       scope="old_q_func",
                                                       reuse=True)
                q_tp1_best_using_online_net_old = tf.argmax(
                    q_tp1_using_online_net_old, 1)
                q_tp1_best_old = tf.reduce_sum(
                    q_tp1_old *
                    tf.one_hot(q_tp1_best_using_online_net_old, num_actions),
                    1)
            elif average_DQN:
                print("building average target for thompson")
                q_tp1_average_old = tf.reduce_mean(tf.stack(
                    q_values_ensemble[-(k - 1):], axis=-1),
                                                   axis=-1)
                q_tp1_best_old = tf.reduce_max(q_tp1_average_old, 1)
            else:
                print("building not double")
                q_tp1_best_old = tf.reduce_max(q_tp1_old, 1)

            q_tp1_best_masked_old = (1.0 - done_mask_ph) * q_tp1_best_old
            q_t_selected_target_old = rew_t_ph + gamma * q_tp1_best_masked_old

            phiYold_op = tf.squeeze(
                tf.matmul(tf.expand_dims(q_t_selected_target_old, 0), phi_old))
            YYold_op = tf.matmul(tf.expand_dims(q_t_selected_target_old, 0),
                                 tf.expand_dims(q_t_selected_target_old, -1))

            sdp_ops = U.function(
                inputs=[obs_t_input, obs_tp1_input, phiphiT_inv],
                outputs=[pseudo_count, outer_product_op])

            # Create callable functions
            blr_ops = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
                                 outputs=[phiphiT_op, phiY_op, YY_op])
            blr_ops_old = U.function(
                inputs=[
                    obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
                    done_mask_ph, importance_weights_ph
                ],
                outputs=[phiphiTold_op, phiYold_op, YYold_op])

            old_q_func_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=tf.get_variable_scope().name + "/old_q_func")
            update_old_expr = []
            for var, var_old in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(old_q_func_vars, key=lambda v: v.name)):
                update_old_expr.append(var_old.assign(var))
            update_old_expr = tf.group(*update_old_expr)
            update_old = U.function([], [], updates=[update_old_expr])

            if not average_DQN:
                old_target_q_func_vars = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope=tf.get_variable_scope().name + "/old_target_q_func")
                update_old_target_expr = []
                for var, var_old in zip(
                        sorted(target_q_func_vars, key=lambda v: v.name),
                        sorted(old_target_q_func_vars, key=lambda v: v.name)):
                    update_old_target_expr.append(var_old.assign(var))
                update_old_target_expr = tf.group(*update_old_target_expr)

            feat_old = U.function([obs_t_input], phi_old)
            if average_DQN:
                update_old_target = update_average_target_expr
            else:
                update_old_target = U.function(
                    [], [], updates=[update_old_target_expr])
            blr_additions = {
                'feat_dim': feat_dim,
                'feature_extractor': feat,
                'target_feature_extractor': feat_target,
                'blr_ops': blr_ops,
                'blr_ops_old': blr_ops_old,
                'last_layer_weights': last_layer_weights,
                'update_old': update_old,
                'update_old_target': update_old_target,
                'old_feature_extractor': feat_old,
                'sdp_ops': sdp_ops,
                'old_networks': old_networks,
                'eval_act': eval_act_f,
                'old_pseudo_counts': old_pseudo_counts_f
            }
        else:
            blr_additions = None

        return act_f, train, update_target, {
            'q_values': q_values
        }, blr_additions
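When thompson is enabled, blr_ops and blr_ops_old accumulate the sufficient statistics phi^T phi, phi^T y and y^T y of a Bayesian linear regression on the last-layer features. One common way such statistics are turned into a weight posterior for Thompson sampling is sketched below; noise_var and prior_var are illustrative hyper-parameters, not values taken from this code.

import numpy as np

def blr_posterior(phiphiT, phiY, feat_dim, noise_var=1.0, prior_var=10.0):
    """Gaussian posterior over the last-layer weights given the accumulated
    statistics (a sketch; noise_var and prior_var are assumed constants)."""
    precision = phiphiT / noise_var + np.eye(feat_dim) / prior_var
    cov = np.linalg.inv(precision)
    mean = cov.dot(phiY) / noise_var
    return mean, cov

def thompson_sample(mean, cov):
    # one posterior draw of the last-layer weights
    return np.random.multivariate_normal(mean, cov)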
Example #16
0
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
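The double_q flag above only changes how the bootstrap target is formed: the online network selects the greedy action and the target network evaluates it, instead of taking the max over the target network alone. A small NumPy sketch of both targets (array names are illustrative):

import numpy as np

def bellman_targets(rew, done, q_tp1, q_tp1_online, gamma, double_q=True):
    # q_tp1: target-network Q values at s_{t+1}, shape [batch, num_actions]
    # q_tp1_online: online-network Q values at s_{t+1}, used only when double_q
    if double_q:
        # select with the online network, evaluate with the target network
        best_actions = np.argmax(q_tp1_online, axis=1)
        q_tp1_best = q_tp1[np.arange(q_tp1.shape[0]), best_actions]
    else:
        # both selection and evaluation use the target network
        q_tp1_best = np.max(q_tp1, axis=1)
    return rew + gamma * (1.0 - done) * q_tp1_best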
Example #17
0
def learn(
        env,
        test_env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        # CMAES
    max_fitness,  # has to be negative, as CMA-ES considers minimization
        popsize,
        gensize,
        bounds,
        sigma,
        eval_iters,
        max_v_train_iter,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        # time constraint
        callback=None,
        # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',
        # annealing for stepsize parameters (epsilon and adam)
        seed,
        env_id):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    backup_pi = policy_fn(
        "backup_pi", ob_space, ac_space
    )  # Construct a network for every individual to adapt during the es evolution
    pi_zero = policy_fn(
        "zero_pi", ob_space,
        ac_space)  # pi_0 will only be updated along with iterations

    reward = tf.placeholder(dtype=tf.float32, shape=[None])  # step rewards
    pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None])
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    bound_coeff = tf.placeholder(
        name='bound_coeff', dtype=tf.float32,
        shape=[])  # coefficient for the parameter bounds

    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    next_ob = U.get_placeholder_cached(
        name="next_ob")  # next step observation for updating q function
    ac = U.get_placeholder_cached(
        name="act")  # action placeholder for computing q function
    mean_ac = U.get_placeholder_cached(
        name="mean_act")  # action placeholder for computing q function

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    param_dist = tf.reduce_mean(tf.square(pi_params - old_pi_params))
    mean_action_loss = tf.cast(
        tf.reduce_mean(tf.square(1.0 - pi.pd.mode() / oldpi.pd.mode())),
        tf.float32)

    pi_adv = (pi.qpred - pi.vpred)
    adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0])
    normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var)

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    # qf_loss = tf.reduce_mean(tf.square(reward + gamma * pi.mean_qpred - pi.qpred))
    qf_loss = tf.reduce_mean(
        U.huber_loss(reward + gamma * pi.mean_qpred - pi.qpred))
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    qf_losses = [qf_loss]
    vf_losses = [vf_loss]
    pol_loss = pol_surr + pol_entpen
    # pol_loss = pol_surr + pol_entpen

    # Advantage function should be improved
    losses = [pol_loss, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr_2", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("qf")
    ]
    mean_qf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("meanqf")
    ]
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("pol")
    ]

    vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                vf_losses + [U.flatgrad(vf_loss, vf_var_list)])

    qf_lossandgrad = U.function(
        [ob, ac, next_ob, mean_ac, lrmult, reward, atarg],
        qf_losses + [U.flatgrad(qf_loss, qf_var_list)])

    qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon)

    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    assign_target_q_eq_eval_q = U.function(
        [], [],
        updates=[
            tf.assign(target_q, eval_q)
            for (target_q, eval_q) in zipsame(mean_qf_var_list, qf_var_list)
        ])

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assign_backup_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(backup_v, newv) for (
                backup_v,
                newv) in zipsame(backup_pi.get_variables(), pi.get_variables())
        ])
    assign_new_eq_backup = U.function(
        [], [],
        updates=[
            tf.assign(newv, backup_v)
            for (newv, backup_v
                 ) in zipsame(pi.get_variables(), backup_pi.get_variables())
        ])

    mean_pi_actions = U.function(
        [ob], [pi.pd.mode()])  # later for computing pol_loss
    # Compute all losses
    compute_pol_losses = U.function([ob, ob, ac, lrmult, atarg], [pol_loss])

    U.initialize()

    get_pi_flat_params = U.GetFlat(pol_var_list)
    set_pi_flat_params = U.SetFromFlat(pol_var_list)

    vf_adam.sync()
    qf_adam.sync()

    global timesteps_so_far, episodes_so_far, iters_so_far, \
        tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness

    episodes_so_far = 0
    timesteps_so_far = 0
    ppo_timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    best_fitness = np.inf

    eval_gen = traj_segment_generator_eval(pi,
                                           test_env,
                                           timesteps_per_actorbatch,
                                           stochastic=True)  # For evaluation
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True,
                                     eval_gen=eval_gen)  # For train V Func

    # Build generator for all solutions
    actors = []
    for i in range(popsize):
        newActor = traj_segment_generator(pi,
                                          env,
                                          timesteps_per_actorbatch,
                                          stochastic=True,
                                          eval_gen=eval_gen)
        actors.append(newActor)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            print("Max time steps")
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            print("Max episodes")
            break
        elif max_iters and iters_so_far >= max_iters:
            print("Max iterations")
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            print("Max time")
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        # Generate new samples
        # Train V func
        ob_segs = None
        for i in range(max_v_train_iter):
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train iter for V func:" + str(i))
            logger.log("Generate New Samples")
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = (
                seg["ob"], seg["ac"], seg["next_ob"], seg["adv"],
                seg["rew"], seg["tdlamret"], seg["traj_index"])
            # standardized advantage function estimate
            atarg = (atarg - atarg.mean()) / atarg.std()
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(
                    ob)  # update running mean/std for normalization

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            # Train V function
            logger.log("Training V Func and Evaluating V Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"],
                                                   batch["atarg"],
                                                   batch["vtarg"], cur_lrmult)
                    vf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(vf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            d_q = Dataset(dict(ob=ob,
                               ac=ac,
                               next_ob=next_ob,
                               reward=reward,
                               atarg=atarg,
                               vtarg=tdlamret),
                          shuffle=not pi.recurrent)

            # Re-train q function
            logger.log("Training Q Func Evaluating Q Func Losses")
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d_q.iterate_once(optim_batchsize):
                    *qf_losses, g = qf_lossandgrad(
                        batch["ob"], batch["ac"], batch["next_ob"],
                        mean_pi_actions(batch["ob"])[0], cur_lrmult,
                        batch["reward"], batch["atarg"])
                    qf_adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(qf_losses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            assign_target_q_eq_eval_q()

        pi0_fitness = compute_pol_losses(ob, ob,
                                         mean_pi_actions(ob)[0], cur_lrmult,
                                         atarg)
        logger.log("Best fitness for Pi0:" + str(np.mean(atarg)))
        logger.log("Best fitness for Pi0:" + str(pi0_fitness))

        # CMAES Train Policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_backup_eq_new()  # backup current policy
        flatten_weights = get_pi_flat_params()
        opt = cma.CMAOptions()
        opt['tolfun'] = max_fitness
        opt['popsize'] = popsize
        opt['maxiter'] = gensize
        opt['verb_disp'] = 0
        opt['verb_log'] = 0
        opt['seed'] = seed
        opt['AdaptSigma'] = True
        es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt)
        while True:
            if es.countiter >= gensize:
                logger.log("Max generations for current layer")
                break
            logger.log("Iteration:" + str(iters_so_far) +
                       " - sub-train Generation for Policy:" +
                       str(es.countiter))
            logger.log("Sigma=" + str(es.sigma))
            solutions = es.ask()
            costs = []
            lens = []

            assign_backup_eq_new()  # backup current policy
            for solution in solutions:
                set_pi_flat_params(solution)
                losses = []
                # cost = compute_pol_losses(ob_segs['ob'], ob_segs['ob'], mean_pi_actions(ob_segs['ob'])[0])
                cost = compute_pol_losses(ob, ob,
                                          mean_pi_actions(ob)[0], cur_lrmult,
                                          atarg)
                costs.append(cost[0])
                assign_new_eq_backup()
            # Weights decay
            l2_decay = compute_weight_decay(0.99, solutions)
            costs += l2_decay
            # costs, real_costs = fitness_normalization(costs)
            costs, real_costs = fitness_rank(costs)
            es.tell_real_seg(solutions=solutions,
                             function_values=costs,
                             real_f=real_costs,
                             segs=None)
            logger.log("best_fitness:" + str(best_fitness) +
                       " current best fitness:" + str(es.result[1]))
            best_solution = es.result[0]
            best_fitness = es.result[1]
            logger.log("Best Solution Fitness:" + str(best_fitness))
            set_pi_flat_params(best_solution)
        sigma = es.sigma

        iters_so_far += 1
        episodes_so_far += sum(lens)
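The policy update above drives the cma package through a project-specific tell_real_seg call, but the control flow is the standard CMA-ES ask/tell loop. For reference, a minimal sketch using only the stock cma API (the fitness function and hyper-parameters are illustrative):

import cma
import numpy as np

def cmaes_minimize(fitness_fn, x0, sigma0=0.5, popsize=16, maxiter=50):
    # Plain ask/tell loop; CMA-ES minimizes, so lower fitness is better.
    es = cma.CMAEvolutionStrategy(
        x0, sigma0, {'popsize': popsize, 'maxiter': maxiter, 'verb_disp': 0})
    while not es.stop():
        solutions = es.ask()                        # sample a population of parameter vectors
        costs = [fitness_fn(x) for x in solutions]  # evaluate each candidate
        es.tell(solutions, costs)                   # update mean, covariance and step size
    return es.result[0], es.result[1]               # best parameters and their cost

# e.g. cmaes_minimize(lambda x: float(np.sum(np.square(x))), np.zeros(10))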
Example #18
0
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
                        scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope,
                              reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        if imitate:
            imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action")
        # EMDQN
        value_t_ph = tf.placeholder(tf.float32, [None], name='value_t')
        value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1')
        value_tp1_masked = (1.0 - done_mask_ph) * value_tp1_ph
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        # q_t_normalized = q_t - tf.max(q_t,)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute RHS of bellman equation
        q_target = rew_t_ph + gamma * value_tp1_masked

        # compute the error (potentially clipped)
        td_error = q_target - (q_t_selected + value_t_ph)
        td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error))
        # EMDQN
        print(q_t.shape)
        if imitate:
            imitation_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=imitate_act_t_ph, logits=q_t),
                                       axis=1)
            print(imitation_loss.shape)
            errors = U.huber_loss(td_error) + imitation_loss
        else:
            errors = U.huber_loss(td_error)
        total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors))

        value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph))
        value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph))
        q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected))
        summaries = [td_summary, total_summary, value_summary, value_tp1_summary, q_summary]
        if imitate:
            imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss))
            summaries.append(imitate_summary)
        summary = tf.summary.merge(summaries)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # inputs for the train function
        inputs = [
            obs_t_input,
            act_t_ph,
            rew_t_ph,
            done_mask_ph,
            importance_weights_ph,
            value_t_ph,
            value_tp1_ph
        ]
        if imitate:
            inputs.append(imitate_act_t_ph)
        # Create callable functions
        # EMDQN
        train = U.function(
            inputs=inputs,
            outputs=[td_error, summary],
            updates=[optimize_expr]
        )

        return act_f, train
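U.huber_loss, applied to the TD errors throughout these examples, is the standard Huber loss: quadratic for small errors and linear beyond delta, which keeps large TD errors from dominating the gradient. A sketch of the usual definition (delta=1.0):

import tensorflow as tf

def huber_loss(x, delta=1.0):
    # 0.5 * x^2 where |x| <= delta, and delta * (|x| - 0.5 * delta) elsewhere
    return tf.where(tf.abs(x) < delta,
                    tf.square(x) * 0.5,
                    delta * (tf.abs(x) - 0.5 * delta))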
Example #19
0
def co_build_train(make_obs_ph,
                   q_func,
                   num_actions,
                   optimizer,
                   grad_norm_clipping=None,
                   gamma=1.0,
                   double_q=True,
                   scope="deepq",
                   reuse=None,
                   using_control_sharing=True):

    act_f = co_build_act(make_obs_ph,
                         q_func,
                         num_actions,
                         scope=scope,
                         reuse=reuse,
                         using_control_sharing=using_control_sharing)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
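The clipping branch above clips each gradient tensor to grad_norm_clipping independently with tf.clip_by_norm. An alternative some implementations prefer is clipping the global norm across all variables; a sketch of that variant, reusing the names from the example above (this is a variation, not what this example does):

# Global-norm variant of the gradient-clipping branch (a sketch).
grads_and_vars = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
gradients, variables = zip(*grads_and_vars)
clipped, _ = tf.clip_by_global_norm(gradients, grad_norm_clipping)
optimize_expr = optimizer.apply_gradients(list(zip(clipped, variables)))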
Example #20
0
def build_train_imitation(make_obs_ph,
                          q_func,
                          num_actions,
                          optimizer,
                          grad_norm_clipping=None,
                          gamma=1.0,
                          double_q=False,
                          scope="deepq",
                          reuse=None,
                          param_noise=False,
                          param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse, the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act_imitation(make_obs_ph,
                                    q_func,
                                    num_actions,
                                    scope=scope,
                                    reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)  # Q(s,a;θi)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 -
                             done_mask_ph) * q_tp1_best  # maxQ(s',a';θi-)

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

# -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-! OBSERVER !-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-

# TED's set up placeholders
        ment_obs_t_input = make_obs_ph("ment_obs_t")
        ment_act_t_ph = tf.placeholder(tf.int32, [None], name="ment_action")
        ment_obs_tp1_input = make_obs_ph("ment_obs_tp1")
        old_error_ph = tf.placeholder(tf.float32,
                                      shape=[None],
                                      name="old_error")
        old_imp_weights_ph = tf.placeholder(tf.float32, [None],
                                            name="old_imp_weights")

        # TED's q network evaluation
        aug_q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
        aug_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/q_func")

        # TED's target q network evaluation
        aug_q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func",
                           reuse=True)
        aug_target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # TED's q scores for actions which we know were selected in the given state.
        aug_q_t_selected = tf.reduce_sum(
            aug_q_t * tf.one_hot(act_t_ph, num_actions), 1)  # Q(s,a;θi)

        aug_q_tp1_selected = tf.reduce_sum(
            q_tp1 * tf.one_hot(ment_act_t_ph, num_actions), 1)  # Q(s',am;θi)
        aug_q_tp1_selected_masked = (1.0 - done_mask_ph) * aug_q_tp1_selected

        # TED's compute estimate of best possible value starting from state at t + 1
        if double_q:
            aug_q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
            aug_q_tp1_best_using_online_net = tf.argmax(
                aug_q_tp1_using_online_net, 1)
            aug_q_tp1_best = tf.reduce_sum(
                aug_q_tp1 *
                tf.one_hot(aug_q_tp1_best_using_online_net, num_actions), 1)
        else:
            aug_q_tp1_best = tf.reduce_max(aug_q_tp1, 1)
        aug_q_tp1_best_masked = (
            1.0 - done_mask_ph) * aug_q_tp1_best  # maxQ(s',a';θi-)

        # TED's compute RHS of bellman equation
        aug_q_t_selected_target = rew_t_ph + gamma * tf.maximum(
            aug_q_tp1_best_masked, aug_q_tp1_selected_masked)
        # aug_q_t_selected_target = rew_t_ph + gamma * aug_q_tp1_best_masked

        # TED's compute the error (potentially clipped)
        aug_td_error = aug_q_t_selected - tf.stop_gradient(
            aug_q_t_selected_target)
        aug_errors = U.huber_loss(aug_td_error)
        aug_weighted_error = tf.reduce_mean(importance_weights_ph * aug_errors)
        # aug_weighted_error = tf.Print(aug_weighted_error, [tf.shape(importance_weights_ph)], "AGENT WEIGHTED ERROR: ")

        # TED's compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(aug_weighted_error,
                                                    var_list=aug_q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            aug_optimize_expr = optimizer.apply_gradients(gradients)
        else:
            aug_optimize_expr = optimizer.minimize(aug_weighted_error,
                                                   var_list=aug_q_func_vars)

# -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-! OBSERVER !-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-

# -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- MENTOR -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-

# TED's mentor's q network evaluation
        ment_q_t = q_func(ment_obs_t_input.get(),
                          num_actions,
                          scope="q_func",
                          reuse=True)  # reuse parameters from act
        ment_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/q_func")

        # TED's mentor's target q network evaluation
        ment_q_tp1 = q_func(ment_obs_tp1_input.get(),
                            num_actions,
                            scope="target_q_func",
                            reuse=True)
        ment_target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # TED's mentor's q scores for action am which we know was selected in the given state.
        ment_q_t_selected = tf.reduce_sum(
            ment_q_t * tf.one_hot(ment_act_t_ph, num_actions),
            1)  # Q(sm,am;θi)

        ment_q_tp1_selected = tf.reduce_sum(
            ment_q_tp1 * tf.one_hot(ment_act_t_ph, num_actions),
            1)  # Q(sm',am;θi-)
        ment_q_tp1_selected_masked = (1.0 - done_mask_ph) * ment_q_tp1_selected

        # TED's compute estimate of best possible value starting from state at t + 1
        if double_q:
            ment_q_tp1_using_online_net = q_func(ment_obs_tp1_input.get(),
                                                 num_actions,
                                                 scope="q_func",
                                                 reuse=True)
            ment_q_tp1_best_using_online_net = tf.argmax(
                ment_q_tp1_using_online_net, 1)
            ment_q_tp1_best = tf.reduce_sum(
                ment_q_tp1 *
                tf.one_hot(ment_q_tp1_best_using_online_net, num_actions), 1)
        else:
            ment_q_tp1_best = tf.reduce_max(ment_q_tp1, 1)
        ment_q_tp1_best_masked = (
            1.0 - done_mask_ph) * ment_q_tp1_best  # maxQ(sm',a';θi-)

        # TED's compute RHS of bellman equation
        ment_q_t_selected_target = rew_t_ph + gamma * tf.maximum(
            ment_q_tp1_best_masked, ment_q_tp1_selected_masked)

        # TED's compute the error (potentially clipped)
        ment_td_error = ment_q_t_selected - tf.stop_gradient(
            ment_q_t_selected_target)
        ment_errors = U.huber_loss(ment_td_error)
        ment_weighted_error = tf.reduce_mean(importance_weights_ph *
                                             ment_errors)
        # ment_weighted_error = tf.Print(ment_weighted_error, [tf.shape(importance_weights_ph)], "MENTOR WEIGHTED ERROR: ")

        # TED's compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(ment_weighted_error,
                                                    var_list=ment_q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            ment_optimize_expr = optimizer.apply_gradients(gradients)
        else:
            ment_optimize_expr = optimizer.minimize(ment_weighted_error,
                                                    var_list=ment_q_func_vars)


# -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!- MENTOR -!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-!-

        def temp_func1():
            return aug_td_error, aug_optimize_expr
            # return td_error, optimize_expr
        def temp_func2():
            return ment_td_error, ment_optimize_expr

        old_errors = U.huber_loss(old_error_ph)
        old_weighted_error = tf.reduce_mean(old_imp_weights_ph * old_errors)

        final_td_error, final_optimize_expr = tf.cond(
            tf.greater((ment_weighted_error - old_weighted_error)**2,
                       (aug_weighted_error - old_weighted_error)**2),
            temp_func1, temp_func2)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        # TED's create callable functions
        trainAugmented = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, ment_obs_t_input, ment_obs_tp1_input,
            ment_act_t_ph, old_error_ph, old_imp_weights_ph
        ],
                                    outputs=final_td_error,
                                    updates=[final_optimize_expr])

        return act_f, train, trainAugmented, update_target, {
            'q_values': q_values
        }
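A standalone NumPy sketch (hypothetical numbers) of the augmented Bellman target used in this example: the bootstrap term is the larger of the target network's best value and its value for the mentor's action, both masked by the done flag.

import numpy as np

gamma = 0.99
rew = np.array([0.0, 1.0])                         # hypothetical rewards
done = np.array([0.0, 1.0])                        # hypothetical done flags
q_tp1_target = np.array([[0.5, 1.2, 0.3],          # hypothetical Q_target(s', .) rows
                         [2.0, 0.1, 0.4]])
ment_act = np.array([2, 0])                        # hypothetical mentor actions a_m

mask = 1.0 - done
q_tp1_best = q_tp1_target.max(axis=1)                            # max_a' Q_target(s', a')
q_tp1_mentor = q_tp1_target[np.arange(len(ment_act)), ment_act]  # Q_target(s', a_m)
aug_target = rew + gamma * np.maximum(mask * q_tp1_best, mask * q_tp1_mentor)
print(aug_target)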
예제 #21
0
def build_train(make_obs_ph,
                q_func,
                hr_func,
                num_actions,
                rl_optimizer,
                hr_optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    rl_optimizer: tf.train.Optimizer
        rl_optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            hr_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          hr_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # feedback placeholders
        obs_fb = make_obs_ph("obs_fb")
        act_fb_ph = tf.placeholder(tf.int32, [None], name="action_fb")
        feedback_ph = tf.placeholder(tf.float32, [None], name="feedback")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = rl_optimizer.compute_gradients(weighted_error,
                                                       var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = rl_optimizer.apply_gradients(gradients)
        else:
            optimize_expr = rl_optimizer.minimize(weighted_error,
                                                  var_list=q_func_vars)

        # update feedback function approximator (HR)
        batch_size = tf.shape(obs_fb.get())[0]
        pred_feedbacks = hr_func(obs_fb.get(),
                                 num_actions,
                                 scope="hr_func",
                                 reuse=True)
        indices = tf.stack([tf.range(batch_size), act_fb_ph], axis=-1)
        pred_feedbacks = tf.gather_nd(pred_feedbacks, indices)
        feedback_loss = tf.reduce_mean(
            -(feedback_ph * tf.log(pred_feedbacks) +
              (1 - feedback_ph) * tf.log(1 - pred_feedbacks)))
        fb_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=tf.get_variable_scope().name +
                                         "/hr_func")
        feedback_train_op = hr_optimizer.minimize(feedback_loss,
                                                  var_list=fb_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train_rl = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                              outputs=td_error,
                              updates=[optimize_expr])
        train_hr = U.function(inputs=[obs_fb, act_fb_ph, feedback_ph],
                              outputs=[pred_feedbacks, feedback_loss],
                              updates=[feedback_train_op])
        _evaluate_hr = U.function(inputs=[obs_fb, act_fb_ph, feedback_ph],
                                  outputs=[pred_feedbacks, feedback_loss],
                                  updates=[])

        def evaluate_hr(obs, actions, feedbacks):
            assert len(obs) == len(actions) == len(feedbacks)
            total_acc = []
            total_loss = []
            fb_batch_size = 5
            for i in range(0, len(obs) - fb_batch_size, fb_batch_size):
                obs_batch = obs[i:i + fb_batch_size]
                action_batch = actions[i:i + fb_batch_size]
                feedback_batch = feedbacks[i:i + fb_batch_size]
                pred, loss = _evaluate_hr(obs_batch, action_batch,
                                          feedback_batch)
                acc = (np.round(pred) == feedback_batch).mean()
                total_acc.append(acc)
                total_loss.append(loss)
            return np.mean(total_acc), np.mean(total_loss)

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train_rl, train_hr, evaluate_hr, update_target, {
            'q_values': q_values
        }
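A standalone NumPy sketch (hypothetical numbers) of the feedback (HR) loss trained above: gather the predicted feedback probability for the action that actually received feedback, then take the binary cross-entropy against the observed feedback; predictions are assumed to already lie in (0, 1).

import numpy as np

pred_all = np.array([[0.9, 0.2, 0.4],     # hypothetical hr_func output per action
                     [0.1, 0.7, 0.3]])
act_fb = np.array([0, 1])                 # hypothetical actions that received feedback
feedback = np.array([1.0, 0.0])           # hypothetical human feedback labels

pred = pred_all[np.arange(len(act_fb)), act_fb]    # NumPy analogue of the gather_nd above
feedback_loss = -np.mean(feedback * np.log(pred)
                         + (1.0 - feedback) * np.log(1.0 - pred))
print(feedback_loss)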
예제 #22
0
def build_train_mer(input_type,
                    obs_shape,
                    model_func,
                    num_actions,
                    optimizer,
                    grad_norm_clipping=None,
                    gamma=1.0,
                    scope="mfec",
                    num_neg=10,
                    latent_dim=32,
                    alpha=0.1,
                    beta=1e2,
                    theta=10,
                    loss_type=["contrast"],
                    knn=4,
                    c_loss_type="margin",
                    b=100,
                    batch_size=32,
                    reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if c_loss_type != "infonce":
        assert num_neg == 1
    # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func",
    #                             reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders

        # EMDQN
        # tau = tf.placeholder(tf.float32, [1], name='tau')
        # momentum = tf.placeholder(tf.float32, [1], name='momentum')

        # make_obs_ph = lambda name: input_type(obs_shape, batch_size, name=name),
        magic_num = tf.get_variable(name='magic', shape=[1])
        obs_input_query = U.ensure_tf_input(
            input_type(obs_shape, None, name="obs_query"))
        obs_input_positive = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_pos"))
        obs_input_negative = U.ensure_tf_input(
            input_type(obs_shape, batch_size * num_neg, name="enc_obs_neg"))
        obs_input_neighbour = U.ensure_tf_input(
            input_type(obs_shape, batch_size * knn, name="enc_obs_neighbour"))

        obs_input_uniformity_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_u"))
        obs_input_uniformity_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_uni_v"))

        obs_input_weighted_product_u = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_u"))
        obs_input_weighted_product_v = U.ensure_tf_input(
            input_type(obs_shape, batch_size, name="enc_obs_wp_v"))

        value_input_weighted_product_u = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_u")
        value_input_weighted_product_v = tf.placeholder(tf.float32,
                                                        [batch_size],
                                                        name="value_v")

        value_input_query = tf.placeholder(tf.float32, [batch_size],
                                           name="value")
        value_input_neighbour = tf.placeholder(tf.float32, [batch_size, knn],
                                               name="neighbour_value")
        action_embedding = tf.Variable(tf.random_normal(
            [num_actions, latent_dim], stddev=1),
                                       name="action_embedding")
        action_input = tf.placeholder(tf.int32, [batch_size], name="action")
        action_input_causal = tf.placeholder(tf.int32, [batch_size],
                                             name="action_causal")
        reward_input_causal = tf.placeholder(tf.float32, [batch_size],
                                             name="reward_causal")

        inputs = [obs_input_query]
        if "contrast" in loss_type:
            inputs += [obs_input_positive, obs_input_negative]
        if "regression" in loss_type:
            inputs += [value_input_query]
        if "linear_model" in loss_type:
            inputs += [action_input]
            if "contrast" not in loss_type:
                inputs += [obs_input_positive]
        if "fit" in loss_type:
            # if "contrast" not in loss_type:
            #     inputs+=[]
            inputs += [obs_input_neighbour, value_input_neighbour]
            if "regression" not in loss_type:
                inputs += [value_input_query]
        if "weight_product" in loss_type:
            inputs += [
                obs_input_uniformity_u, obs_input_uniformity_v,
                obs_input_weighted_product_u, obs_input_weighted_product_v,
                value_input_weighted_product_u, value_input_weighted_product_v
            ]
        if "causality" in loss_type:
            inputs += [reward_input_causal, action_input_causal]
        z_old = model_func(obs_input_query.get(),
                           num_actions,
                           scope="target_model_func",
                           reuse=False)

        z = model_func(obs_input_query.get(),
                       num_actions,
                       scope="model_func",
                       reuse=tf.AUTO_REUSE)

        z_pos = model_func(obs_input_positive.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_neg = model_func(obs_input_negative.get(),
                           num_actions,
                           scope="model_func",
                           reuse=True)

        z_uni_u = model_func(obs_input_uniformity_u.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_uni_v = model_func(obs_input_uniformity_v.get(),
                             num_actions,
                             scope="model_func",
                             reuse=True)
        z_wp_u = model_func(obs_input_weighted_product_u.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)
        z_wp_v = model_func(obs_input_weighted_product_v.get(),
                            num_actions,
                            scope="model_func",
                            reuse=True)

        z_pos = tf.reshape(z_pos, [-1, latent_dim])
        z_tar = tf.reshape(z, [-1, latent_dim])
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [-1, latent_dim])
            contrast_loss, contrast_summary = contrastive_loss_fc(
                z_tar,
                z_pos,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            symmetry_loss, symmetry_summary = contrastive_loss_fc(
                z_pos,
                z_tar,
                z_neg,
                c_type=c_loss_type,
                num_neg=num_neg,
                batch_size=batch_size,
                emb_dim=latent_dim)
            contrast_loss += symmetry_loss
        z_neighbour = model_func(obs_input_neighbour.get(),
                                 num_actions,
                                 scope="model_func",
                                 reuse=True)

        # fit loss
        z_neighbour = tf.reshape(z_neighbour, [-1, knn, latent_dim])
        square_dist = tf.square(
            tf.tile(tf.expand_dims(z_tar, 1), [1, knn, 1]) - z_neighbour)
        neighbour_dist = tf.reduce_sum(square_dist, axis=2)
        neighbour_coeff = tf.math.softmax(-neighbour_dist / b, axis=1)
        coeff_sum = tf.reduce_mean(tf.reduce_sum(neighbour_coeff, axis=1))
        value_input_neighbour_mean = tf.reduce_mean(value_input_neighbour)
        fit_value = tf.reduce_sum(tf.multiply(neighbour_coeff,
                                              value_input_neighbour),
                                  axis=1)
        fit_loss = tf.reduce_mean(tf.abs(fit_value - value_input_query))

        # causality loss
        reward_input_causal = tf.reshape(reward_input_causal, [1, -1])
        reward_tile = tf.tile(reward_input_causal, [batch_size, 1])
        # reward_mask = (reward_tile - tf.transpose(reward_tile)) ** 2
        reward_mask = 1 - tf.cast(
            tf.equal((reward_tile - tf.transpose(reward_tile)),
                     tf.constant(0.)), tf.float32)
        action_input_causal = tf.reshape(action_input_causal, [1, -1])
        action_tile = tf.tile(action_input_causal, [batch_size, 1])
        action_mask = tf.cast(
            tf.equal((action_tile - tf.transpose(action_tile)),
                     tf.constant(0)), tf.float32)
        total_mask = tf.multiply(reward_mask, action_mask)
        z_tile = tf.tile(tf.expand_dims(z_tar, 1), [1, batch_size, 1])
        z_diff = z_tile - tf.transpose(z_tile, perm=[1, 0, 2])
        distance = tf.reduce_sum(z_diff**2, axis=2)
        exp_distance = tf.exp(-distance)
        causal_find_rate = (tf.reduce_sum(total_mask)) / (batch_size**2 -
                                                          batch_size)
        causal_loss = tf.reduce_sum(tf.multiply(exp_distance, total_mask))

        # regularization loss
        regularization_loss = -tf.maximum(
            1., tf.reduce_mean(U.huber_loss(z_tar, 0.01)))
        regression_loss = tf.reduce_mean(
            tf.squared_difference(tf.norm(z_tar, axis=1), alpha *
                                  value_input_query)) + regularization_loss

        # linear model loss
        action_embeded = tf.matmul(tf.one_hot(action_input, num_actions),
                                   action_embedding)
        model_loss = tf.reduce_mean(
            tf.squared_difference(action_embeded + z_tar,
                                  z_pos)) + 0.01 * regularization_loss

        # weighted product loss
        uniformity_loss = tf.reduce_sum(
            tf.exp(2 * tf.reduce_sum(tf.multiply(z_uni_u, z_uni_v), axis=1) -
                   2))
        value_weight = (value_input_weighted_product_u -
                        value_input_weighted_product_v)**2
        # angle = acos_safe(tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1))
        angle = tf.reduce_sum(tf.multiply(z_wp_u, z_wp_v), axis=1)
        weighted_product = tf.multiply(value_weight, angle)
        wp_loss = tf.reduce_sum(weighted_product)

        total_loss = 0
        if "contrast" in loss_type:
            total_loss += contrast_loss
        if "regression" in loss_type:
            total_loss += beta * regression_loss
        if "linear_model" in loss_type:
            total_loss += theta * model_loss
        if "fit" in loss_type:
            total_loss += beta * fit_loss
        if "causality" in loss_type:
            total_loss += theta * causal_loss
        if "weight_product" in loss_type:
            total_loss += 0.1 * uniformity_loss
            total_loss += wp_loss
        model_func_vars = U.scope_vars(U.absolute_scope_name("model_func"))
        model_func_vars_update = copy.copy(model_func_vars)
        if "linear_model" in loss_type:
            model_func_vars_update.append(action_embedding)

        target_model_func_vars = U.scope_vars(
            U.absolute_scope_name("target_model_func"))

        update_target_expr = []
        for var in model_func_vars:
            print(var.name, var.shape)
        for var_target in target_model_func_vars:
            print(var_target.name, var_target.shape)

        for var, var_target in zip(
                sorted(model_func_vars, key=lambda v: v.name),
                sorted(target_model_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(
                optimizer,
                total_loss,
                var_list=model_func_vars_update,
                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_loss,
                                               var_list=model_func_vars_update)
        # Create callable functions
        # update_target_fn will be called periodically to copy Q network to target Q network
        z_var_summary = tf.summary.scalar(
            "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1)))
        if "contrast" in loss_type:
            z_neg = tf.reshape(z_neg, [batch_size, num_neg, latent_dim])
            negative_summary = tf.summary.scalar(
                "negative_dist",
                tf.reduce_mean(emb_dist(z_tar, z_neg[:, 0, :])))
        positive_summary = tf.summary.scalar(
            "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos)))
        if "contrast" in loss_type:
            contrast_loss_summary = tf.summary.scalar(
                "contrast loss", tf.reduce_mean(contrast_loss))
        regularization_loss_summary = tf.summary.scalar(
            "regularization loss", tf.reduce_mean(regularization_loss))
        regression_loss_summary = tf.summary.scalar(
            "regression loss", tf.reduce_mean(regression_loss))
        model_loss_summary = tf.summary.scalar("model loss",
                                               tf.reduce_mean(model_loss))
        fit_loss_summary = tf.summary.scalar("fit loss",
                                             tf.reduce_mean(fit_loss))
        fit_value_summary = tf.summary.scalar("fit value",
                                              tf.reduce_mean(fit_value))
        neighbour_value_summary = tf.summary.scalar(
            "neighbour value", value_input_neighbour_mean)
        coeff_summary = tf.summary.scalar("coeff sum", coeff_sum)
        square_dist_summary = tf.summary.scalar("square_dist",
                                                tf.reduce_mean(square_dist))
        z_neighbour_summary = tf.summary.scalar("z_neighbour_mean",
                                                tf.reduce_mean(z_neighbour))
        # fit_loss_summary = tf.summary.scalar("fit loss", tf.reduce_mean(fit_loss))
        # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss))
        causal_efficiency_summary = tf.summary.scalar("causal efficiency",
                                                      causal_find_rate)
        causal_loss_summary = tf.summary.scalar("causal loss", causal_loss)
        # reward_mask_summary = tf.summary.scalar("reward mask summary", debug_reward_mask)
        # action_mask_summary = tf.summary.scalar("action mask summary", debug_action_mask)
        uniformity_loss_summary = tf.summary.scalar("uniform loss",
                                                    uniformity_loss)
        wp_loss_summary = tf.summary.scalar("weighted product loss", wp_loss)
        total_loss_summary = tf.summary.scalar("total loss",
                                               tf.reduce_mean(total_loss))

        summaries = [
            z_var_summary, total_loss_summary, regularization_loss_summary
        ]

        if "contrast" in loss_type:
            summaries += [
                negative_summary, positive_summary, contrast_loss_summary
            ]
            summaries += contrast_summary
        if "regression" in loss_type:
            summaries.append(regression_loss_summary)
        if "linear_model" in loss_type:
            summaries.append(model_loss_summary)
            if "contrast" not in loss_type:
                summaries.append(positive_summary)
        if "fit" in loss_type:
            summaries.append(fit_loss_summary)
            summaries.append(fit_value_summary)
            summaries.append(neighbour_value_summary)
            summaries.append(coeff_summary)
            summaries.append(square_dist_summary)
            summaries.append(z_neighbour_summary)
        if "causality" in loss_type:
            summaries.append(causal_efficiency_summary)
            summaries.append(causal_loss_summary)
            # summaries.append(reward_mask_summary)
            # summaries.append(action_mask_summary)
        if "weight_product" in loss_type:
            summaries.append(uniformity_loss_summary)
            summaries.append(wp_loss_summary)
        summary = tf.summary.merge(summaries)
        outputs = [total_loss, summary]
        train = U.function(inputs=inputs,
                           outputs=outputs,
                           updates=[optimize_expr])

        eval = U.function(inputs=inputs, outputs=outputs, updates=[])
        z_func = U.function(
            inputs=[obs_input_query],
            outputs=[z_old],
        )
        norm_func = U.function(inputs=[obs_input_query],
                               outputs=[tf.norm(z_tar, axis=1)])
        update_target_func = U.function([], [], updates=[update_target_expr])
        return z_func, train, eval, norm_func, update_target_func
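A standalone NumPy sketch (hypothetical numbers) of the "fit" term above: neighbour values are mixed with a softmax over negative squared latent distances (temperature b), and the mixture is compared to the query value with an L1 penalty.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

b = 100.0
z_tar = np.array([[0.1, 0.2], [0.3, -0.1]])                    # hypothetical query embeddings
z_neighbour = np.array([[[0.0, 0.1], [0.2, 0.2], [0.1, 0.3]],  # hypothetical knn=3 neighbours
                        [[0.3, 0.0], [0.2, -0.1], [0.4, -0.2]]])
value_neighbour = np.array([[1.0, 0.8, 1.2],
                            [0.2, 0.1, 0.3]])
value_query = np.array([1.0, 0.2])

neighbour_dist = np.sum((z_tar[:, None, :] - z_neighbour) ** 2, axis=2)
neighbour_coeff = softmax(-neighbour_dist / b, axis=1)
fit_value = np.sum(neighbour_coeff * value_neighbour, axis=1)
fit_loss = np.mean(np.abs(fit_value - value_query))
print(fit_loss)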
예제 #23
0
    def loss_q_prioritize(self, states, q_target, actions, coef_q, weights):
        q_values = self.q_estimation(states)
        q_values = tf.reduce_sum(tf.multiply(q_values, actions), axis=1)
        loss = coef_q * tf.reduce_mean(weights * U.huber_loss(q_values - q_target))
        return loss
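Restated as a standalone NumPy sketch (hypothetical numbers, with U.huber_loss replaced by an explicit Huber function): pick Q(s, a) with the one-hot action mask, then average the importance-weighted Huber loss of the residual and scale by coef_q.

import numpy as np

def huber(x, delta=1.0):
    return np.where(np.abs(x) < delta, 0.5 * x ** 2, delta * (np.abs(x) - 0.5 * delta))

coef_q = 1.0
q_all = np.array([[0.5, 1.5], [2.0, 0.3]])     # hypothetical q_estimation(states)
actions = np.array([[0.0, 1.0], [1.0, 0.0]])   # hypothetical one-hot actions
q_target = np.array([1.2, 1.8])                # hypothetical targets
weights = np.array([0.7, 1.0])                 # hypothetical priority weights

q_values = np.sum(q_all * actions, axis=1)
loss = coef_q * np.mean(weights * huber(q_values - q_target))
print(loss)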
예제 #24
0
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
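A standalone NumPy sketch (hypothetical numbers) of the Double Q target built above: the online network picks argmax_a' Q(s', a'), the target network evaluates that action, and terminal transitions are masked out.

import numpy as np

gamma = 0.99
rew = np.array([0.0, 1.0])                          # hypothetical rewards
done = np.array([0.0, 1.0])                         # hypothetical done flags
q_tp1_online = np.array([[0.2, 0.9], [1.1, 0.3]])   # hypothetical online net at s'
q_tp1_target = np.array([[0.4, 0.7], [1.0, 0.5]])   # hypothetical target net at s'

best_a = q_tp1_online.argmax(axis=1)
q_tp1_best = q_tp1_target[np.arange(len(best_a)), best_a]
q_t_selected_target = rew + gamma * (1.0 - done) * q_tp1_best
print(q_t_selected_target)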
예제 #25
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                distributed=False,
                v_min=-10.0,
                v_max=10.0,
                atoms=51):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.
    distributed: bool
        whether or not distributed version is enabled.
    v_min: float
        lower boundary for value, only works when distributed version is enabled.
    v_max: float
        upper boundary for value, only works when distributed version is enabled.
    atoms: int
        number of atoms, only works when distributed version is enabled.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    print("build train use distributed? ", distributed)
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            distributed=distributed,
            v_min=v_min,
            v_max=v_max,
            atoms=atoms)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          distributed=distributed,
                          v_min=v_min,
                          v_max=v_max,
                          atoms=atoms)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")
        distributed_target_ph = tf.placeholder(tf.float32, [None, atoms],
                                               name="dis_target")

        # q network evaluation
        if not distributed:
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
            # target q network evaluation
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func")
        else:
            q_t = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)  # reuse parameters from act
            q_tp1 = q_func(obs_tp1_input.get(),
                           num_actions,
                           scope="target_q_func")

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        if not distributed:
            q_t_selected = tf.reduce_sum(
                q_t * tf.one_hot(act_t_ph, num_actions), 1)
        else:
            probability_qt = tf.nn.softmax(q_t)
            q_t_selected = tf.reduce_sum(
                q_t *
                tf.tile(tf.expand_dims(tf.one_hot(act_t_ph, num_actions), 2),
                        [1, 1, atoms]), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            print("use double")
            if not distributed:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best_using_online_net = tf.argmax(
                    q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 *
                    tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
            else:
                q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                                num_actions,
                                                scope="q_func",
                                                reuse=True)
                q_tp1_best = get_distibute_q(q_tp1_using_online_net, v_min,
                                             v_max, atoms, obs_tp1_input)
                a_tp1_best = tf.argmax(q_tp1_best, 1)
                probability_qt1 = tf.nn.softmax(q_tp1_using_online_net)
                q_tp1_best = tf.reduce_sum(
                    probability_qt1 * tf.tile(
                        tf.expand_dims(tf.one_hot(a_tp1_best, num_actions), 2),
                        [1, 1, atoms]), 1)
        else:
            print("not use double")
            if not distributed:
                q_tp1_best = tf.reduce_max(q_tp1, 1)
            else:
                if distributed:
                    q_tp1_best = get_distibute_q(q_tp1, v_min, v_max, atoms,
                                                 obs_tp1_input)
                    a_tp1_best = tf.argmax(q_tp1_best, 1)
                    probability_qt1 = tf.nn.softmax(q_tp1)
                    q_tp1_best = tf.reduce_sum(
                        probability_qt1 * tf.tile(
                            tf.expand_dims(tf.one_hot(a_tp1_best, num_actions),
                                           2), [1, 1, atoms]), 1)

        mask = 1.0 - done_mask_ph
        if not distributed:
            q_tp1_best_masked = mask * q_tp1_best
        else:
            q_tp1_best_masked = q_tp1_best

        # compute RHS of bellman equation
        if not distributed:
            q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
            # compute the error (potentially clipped)
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = U.huber_loss(td_error)
        else:
            clip_target = tf.clip_by_value(distributed_target_ph, 1e-8, 1.0)
            clip_select = tf.clip_by_value(tf.nn.softmax(q_t_selected), 1e-8,
                                           1.0)
            # use kl divergence
            td_error = tf.reduce_sum(
                clip_target * (tf.log(clip_target) - tf.log(clip_select)),
                axis=-1)
            errors = tf.nn.softmax_cross_entropy_with_logits(
                labels=distributed_target_ph, logits=q_t_selected)

        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        if distributed:
            train = U.function(inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph, distributed_target_ph
            ],
                               outputs=td_error,
                               updates=[optimize_expr])
        else:
            train = U.function(inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
            ],
                               outputs=td_error,
                               updates=[optimize_expr])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        q_tp1_best_final = U.function([obs_tp1_input], q_tp1_best)

        return act_f, train, update_target, {
            'q_values': q_values,
            'q_t1_best': q_tp1_best_final
        }
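
The distributed branch above is trained against distributed_target_ph with a cross-entropy loss, but this snippet never shows how that target distribution is produced. For orientation only, a categorical projection in the style of C51 (Bellemare et al., 2017) could be computed on the host as in the NumPy sketch below; the function name, argument shapes, and edge-case handling are assumptions, not code from this repository.

import numpy as np

def project_distribution(next_probs, rewards, dones, gamma, v_min, v_max, atoms):
    # next_probs: (batch, atoms) softmax over atoms for the greedy next action
    # rewards, dones: (batch,) floats; dones is 1.0 for terminal transitions
    delta_z = (v_max - v_min) / (atoms - 1)
    z = v_min + delta_z * np.arange(atoms)          # fixed support z_0 .. z_{atoms-1}
    # apply the Bellman update to every atom, then clamp back onto the support
    tz = np.clip(rewards[:, None] + gamma * (1.0 - dones[:, None]) * z,
                 v_min, v_max)
    b = (tz - v_min) / delta_z                      # fractional atom index
    lower = np.floor(b).astype(np.int64)
    upper = np.ceil(b).astype(np.int64)
    # if b lands exactly on an atom, move one bound so its mass is not dropped
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < atoms - 1) & (lower == upper)] += 1
    target = np.zeros_like(next_probs)
    for i in range(next_probs.shape[0]):            # simple, unvectorised
        np.add.at(target[i], lower[i], next_probs[i] * (upper[i] - b[i]))
        np.add.at(target[i], upper[i], next_probs[i] * (b[i] - lower[i]))
    return target                                   # feed as distributed_target_ph
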
Example #26
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                grad_norm_clipping=None,
                gamma=1.0,
                deterministic_filter=False,
                random_filter=False,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func,
            deterministic_filter=deterministic_filter,
            random_filter=random_filter)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse,
                          deterministic_filter=deterministic_filter,
                          random_filter=random_filter)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        lr_ph = tf.placeholder(tf.float32, name="lr")
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(U.data_type, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(U.data_type, [None], name="done")
        importance_weights_ph = tf.placeholder(U.data_type, [None],
                                               name="weight")

        board_size = obs_t_input.get().get_shape().as_list()[1]

        obs_t = transform_obses(obs_t_input.get())
        obs_tp1 = transform_obses(obs_tp1_input.get())
        act_t = transform_actions(act_t_ph, board_size)

        if deterministic_filter:
            invalid_masks_tp1 = build_invalid_masks(obs_tp1)

        # q network evaluation
        q_t = q_func(obs_t, num_actions, scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1, num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(act_t, num_actions, dtype=U.data_type), axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1,
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)

            if deterministic_filter:
                q_tp1_using_online_net = build_q_filter(
                    q_tp1_using_online_net, invalid_masks_tp1)

            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net,
                                                    1,
                                                    output_type=U.index_type)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
                                   num_actions,
                                   dtype=U.data_type), 1)
        else:
            if deterministic_filter:
                q_tp1 = build_q_filter(q_tp1, invalid_masks_tp1)

            q_tp1_best = tf.reduce_max(q_tp1, axis=1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        weighted_error = tf.reduce_mean(importance_weights_ph *
                                        U.huber_loss(td_error))
        regularizer = tf.add_n([tf.nn.l2_loss(var)
                                for var in q_func_vars]) * 0.0001
        total_error = weighted_error + regularizer

        # optimizer = tf.train.MomentumOptimizer(
        #     learning_rate=lr_ph, momentum=0.9)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                total_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(total_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            lr_ph, obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input,
            done_mask_ph, importance_weights_ph
        ],
                           outputs=[td_error, weighted_error, total_error],
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
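
For context, the three callables returned above are usually wired together in a loop like the sketch below. This is only an illustration: model, env, replay_buffer, and the placeholder-building lambda are assumptions rather than code from this repository, and the exact exploration keyword arguments accepted by act depend on this repository's build_act; U is the same utility module used by the snippets above.

import numpy as np
import tensorflow as tf

act, train, update_target, debug = build_train(
    make_obs_ph=lambda name: tf.placeholder(
        tf.float32, [None] + list(env.observation_space.shape), name=name),
    q_func=model,                      # assumed network returning (batch, num_actions)
    num_actions=env.action_space.n,
    grad_norm_clipping=10,
    gamma=0.99,
    double_q=True)

sess = U.get_session()
sess.run(tf.global_variables_initializer())
update_target()                        # start with target net == online net

obs, lr = env.reset(), 1e-4
for t in range(100000):
    action = act(np.array(obs)[None])[0]            # exploration arguments omitted
    new_obs, rew, done, _ = env.step(action)
    replay_buffer.add(obs, action, rew, new_obs, float(done))
    obs = env.reset() if done else new_obs

    if t > 1000 and t % 4 == 0:
        o_t, a_t, r_t, o_tp1, d_t = replay_buffer.sample(32)
        weights = np.ones_like(r_t)                 # uniform weights, no prioritisation
        # argument order follows the inputs list above: lr first, weights last
        train(lr, o_t, a_t, r_t, o_tp1, d_t, weights)
    if t % 500 == 0:
        update_target()
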
Example #27
0
    def __init__(self,
                 obs_dim,
                 n_acts,
                 seed,
                 lr,
                 gamma,
                 double_q=True,
                 grad_val_clipping=None,
                 grad_norm_clipping=None):
        # grad_val_clipping=0.5,
        # grad_norm_clipping=5.0):

        sess = U.get_session()
        self.sess = sess
        set_global_seeds(seed)

        # create placeholders for the input data for the current and next timesteps
        cur_input = create_input_placeholders(obs_dim, 'cur_input')
        next_input = create_input_placeholders(obs_dim, 'next_input')

        # create placeholders for the output data for the current timestep
        cur_output = create_output_placeholders(n_acts, 'cur_out')

        # calculate the q value for the chosen action
        q_vals_main_cur = get_model(cur_input['obs_ph'], n_acts, 'main')
        # sum over the one-hot action mask; reduce_max would return 0 instead of
        # the selected Q-value whenever that Q-value is negative
        q_a = tf.reduce_sum(tf.cast(cur_output['act_ph'], dtype=tf.float32) *
                            q_vals_main_cur,
                            axis=-1)

        # calculate the q value for the target network
        q_vals_target_next = get_model(next_input['obs_ph'], n_acts, 'target')
        if double_q:
            q_vals_main_next = get_model(next_input['obs_ph'], n_acts, 'main')
            next_act_main = tf.argmax(q_vals_main_next, axis=-1)
            # likewise, sum over the one-hot mask rather than taking the max
            q_vals_target_next_best = tf.reduce_sum(
                q_vals_target_next * tf.one_hot(next_act_main, n_acts),
                axis=-1)
        else:
            q_vals_target_next_best = tf.reduce_max(q_vals_target_next,
                                                    axis=-1)

        done_mask = 1 - cur_output['done_ph']
        q_target = done_mask * gamma * q_vals_target_next_best
        q_target = cur_output['rew_ph'] + q_target

        # create the loss function
        td_error = q_a - tf.stop_gradient(q_target)
        adjusted_square_error = U.huber_loss(td_error)
        loss = tf.reduce_mean(adjusted_square_error)

        # make target update operation
        main_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='main')
        target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='target')
        assign_ops_target = [
            tf.assign(target_var, main_var)
            for target_var, main_var in zip(target_vars, main_vars)
        ]
        target_update_op = tf.group(*assign_ops_target)

        def update_target():
            sess.run(target_update_op)

        # make train function
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients = list(gradients)
        # optional gradient clipping; a no-op with the default None arguments
        if grad_val_clipping:
            for i, grad in enumerate(gradients):
                if grad is not None:
                    gradients[i] = tf.clip_by_value(grad, -grad_val_clipping,
                                                    grad_val_clipping)
        if grad_norm_clipping:
            gradients, global_norm = tf.clip_by_global_norm(
                gradients, grad_norm_clipping)
        train_op = optimizer.apply_gradients(zip(gradients, variables))

        def train(batch):
            feed_dict = {
                cur_input['obs_ph']: batch['cur_obs'],
                next_input['obs_ph']: batch['next_obs'],
                cur_output['act_ph']: batch['acts'],
                cur_output['rew_ph']: batch['rews'],
                cur_output['done_ph']: batch['done'],
            }
            sess.run(train_op, feed_dict=feed_dict)

        self.train = train
        self.update_target = update_target

        self.save = functools.partial(U.save_variables, sess=sess)
        self.load = functools.partial(U.load_variables, sess=sess)
        print("Initialized Model")