Example #1
0
    def _build_train_ops(self):
        """Build losses, optimizers, train ops and summaries for actor and critic.

        Creates the learning-rate placeholders ``self.lr_c`` / ``self.lr_a``,
        a critic loss (mean squared ``self.td_error``), an actor loss
        (``self.td_error`` with gradients stopped, times the sparse softmax
        cross-entropy between ``self.actor`` logits and the actions
        ``self.a``), one Adam optimizer per network and, when
        ``self.clip_norm`` is truthy, per-tensor gradient-norm clipping.
        Scalar summaries are merged into ``self.merged_summary`` and the
        global variable initializer is run on ``self.sess``.
        """
        self.lr_c = tf.placeholder(tf.float32,
                                   shape=None,
                                   name='learning_rate_c')
        self.lr_a = tf.placeholder(tf.float32,
                                   shape=None,
                                   name='learning_rate_a')

        with tf.variable_scope('critic_train'):
            # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
            self.loss_c = tf.reduce_mean(tf.square(
                self.td_error))  # + 0.001 * self.reg_c
            self.optim_c = tf.train.AdamOptimizer(self.lr_c)
            self.grads_c = self.optim_c.compute_gradients(
                self.loss_c, self.critic_vars)
            if self.clip_norm:
                # compute_gradients yields (None, var) for variables the loss
                # does not depend on; tf.clip_by_norm cannot take None, so
                # such pairs are passed through untouched.
                self.grads_c = [
                    (grad if grad is None else
                     tf.clip_by_norm(grad, self.clip_norm), var)
                    for grad, var in self.grads_c
                ]

            self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

        with tf.variable_scope('actor_train'):
            # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
            # self.entropy_a =- tf.reduce_sum(self.actor * tf.log(self.actor))
            self.loss_a = tf.reduce_mean(
                tf.stop_gradient(self.td_error) *
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.actor, labels=self.a),
                name='loss_actor')  # + 0.001 * self.reg_a
            self.optim_a = tf.train.AdamOptimizer(self.lr_a)
            self.grads_a = self.optim_a.compute_gradients(
                self.loss_a, self.actor_vars)
            if self.clip_norm:
                # Same None guard as for the critic gradients.
                self.grads_a = [
                    (grad if grad is None else
                     tf.clip_by_norm(grad, self.clip_norm), var)
                    for grad, var in self.grads_a
                ]

            self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

        with tf.variable_scope('summary'):
            # Fed by the caller with the reward of the finished episode.
            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.summary = [
                tf.summary.scalar('loss/critic', self.loss_c),
                tf.summary.scalar('loss/actor', self.loss_a),
                tf.summary.scalar('episode_reward', self.ep_reward)
            ]
            # One scalar per variable holding the norm of its (possibly
            # clipped) gradient; variables without a gradient are skipped.
            self.summary += [
                tf.summary.scalar('grads/a_' + var.name, tf.norm(grad))
                for grad, var in self.grads_a if grad is not None
            ]
            self.summary += [
                tf.summary.scalar('grads/c_' + var.name, tf.norm(grad))
                for grad, var in self.grads_c if grad is not None
            ]
            self.merged_summary = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)

        self.train_ops = [self.train_op_a, self.train_op_c]

        self.sess.run(tf.global_variables_initializer())
Example #2
0
    def _build_train_ops(self):
        """Build the clipped-ratio actor loss, the critic loss and their train ops.

        Creates placeholders for both learning rates and for the ratio clip
        range.  The actor loss is the negated mean of
        ``min(adv * ratio, adv * clip(ratio, 1 - clip_range, 1 + clip_range))``
        with ``ratio = exp(logp_a - old_logp_a)``; the critic loss is the mean
        squared difference between ``self.v_target`` and ``self.critic``.
        Each loss gets its own Adam optimizer and, when ``self.clip_norm`` is
        truthy, per-tensor gradient-norm clipping.  Summaries are merged into
        ``self.merged_summary`` and the global variable initializer is run on
        ``self.sess``.
        """
        self.lr_a = tf.placeholder(tf.float32,
                                   shape=None,
                                   name='learning_rate_actor')
        self.lr_c = tf.placeholder(tf.float32,
                                   shape=None,
                                   name='learning_rate_critic')
        self.clip_range = tf.placeholder(tf.float32,
                                         shape=None,
                                         name='ratio_clip_range')

        with tf.variable_scope('actor_train'):
            ratio = tf.exp(self.logp_a - self.old_logp_a)
            ratio_clipped = tf.clip_by_value(ratio, 1.0 - self.clip_range,
                                             1.0 + self.clip_range)
            loss_a = -tf.reduce_mean(
                tf.minimum(self.adv * ratio, self.adv * ratio_clipped))

            optim_a = tf.train.AdamOptimizer(self.lr_a)
            grads_a = optim_a.compute_gradients(loss_a,
                                                var_list=self.actor_vars)
            if self.clip_norm:
                # compute_gradients yields (None, var) for variables the loss
                # does not depend on; tf.clip_by_norm cannot take None.
                grads_a = [
                    (g if g is None else tf.clip_by_norm(g, self.clip_norm), v)
                    for g, v in grads_a
                ]
            self.train_op_a = optim_a.apply_gradients(grads_a)

        with tf.variable_scope('critic_train'):
            loss_c = tf.reduce_mean(tf.square(self.v_target - self.critic))

            optim_c = tf.train.AdamOptimizer(self.lr_c)
            grads_c = optim_c.compute_gradients(loss_c,
                                                var_list=self.critic_vars)
            if self.clip_norm:
                # Same None guard as for the actor gradients.
                grads_c = [
                    (g if g is None else tf.clip_by_norm(g, self.clip_norm), v)
                    for g, v in grads_c
                ]
            self.train_op_c = optim_c.apply_gradients(grads_c)

        self.train_ops = [self.train_op_a, self.train_op_c]

        with tf.variable_scope('summary'):
            # Fed by the caller with the reward of the finished episode.
            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')

            self.summary = [
                tf.summary.scalar('loss/adv', tf.reduce_mean(self.adv)),
                tf.summary.scalar('loss/ratio', tf.reduce_mean(ratio)),
                tf.summary.scalar('loss/loss_actor', loss_a),
                tf.summary.scalar('loss/loss_critic', loss_c),
                tf.summary.scalar('episode_reward', self.ep_reward)
            ]

            # Per-variable gradient-norm summaries are disabled here:
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                 for g, v in grads_a if g is not None]
            # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
            #                 for g, v in grads_c if g is not None]

            self.merged_summary = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)

        self.sess.run(tf.global_variables_initializer())
Example #3
0
  def two_linear( self, xin, linear_size, residual, dropout_keep_prob, max_norm, batch_norm, dtype, idx ):
    """
    Build a block of two fully connected layers with an optional skip connection.

    Each layer is: matmul + bias, optional batch normalization, ReLU, dropout.

    Args
      xin: the batch that enters the block
      linear_size: integer. Number of units in each of the two layers
      residual: boolean. Whether to add xin to the output of the second layer
      dropout_keep_prob: value passed as the second positional argument of tf.nn.dropout
      max_norm: boolean. Whether to pass each weight matrix through tf.clip_by_norm(w, 1)
      batch_norm: boolean. Whether to batch-normalize after each linear layer
      dtype: type of the weights. Usually tf.float32
      idx: integer. Number of the block (used in scope, variable and layer names)
    Returns
      y: the batch after it leaves the block
    """
    suffix = str(idx)

    with vs.variable_scope( "two_linear_"+suffix ):

      # (weight name prefix, bias name prefix, batch-norm name prefix, input width)
      layer_specs = (
        ( "w2_", "b2_", "batch_normalization1", int(xin.get_shape()[1]) ),
        ( "w3_", "b3_", "batch_normalization2", linear_size ),
      )

      y = xin
      for w_prefix, b_prefix, bn_prefix, fan_in in layer_specs:
        weights = tf.get_variable( name=w_prefix+suffix, initializer=kaiming, shape=[fan_in, linear_size], dtype=dtype)
        bias = tf.get_variable( name=b_prefix+suffix, initializer=kaiming, shape=[linear_size], dtype=dtype)
        if max_norm:
          weights = tf.clip_by_norm(weights,1)

        y = tf.matmul(y, weights) + bias
        if batch_norm:
          y = tf.layers.batch_normalization(y,training=self.isTraining,name=bn_prefix+suffix)

        y = tf.nn.relu( y )
        y = tf.nn.dropout( y, dropout_keep_prob )

      # Skip connection around the two layers
      if residual:
        y = xin + y

    return y
Example #4
0
 def optimize_normal(self, loss, params):
     '''
     Build the Adam training op for `loss`.

     loss: the loss.
     params: the params need to be optimized.

     Sets self.global_step, self.lr and self.optimizer.  Returns the op that
     applies the gradients (clipped per tensor to self.max_grad_norm when
     that is not None); the op is created under a control dependency on the
     global-step increment, so running it also advances self.global_step.
     '''
     # The step counter must not be trainable; tf.Variable defaults to
     # trainable=True, which would put it in the trainable-variables
     # collection next to the model weights.
     self.global_step = tf.Variable(0, name='global_step', trainable=False)
     if self.is_update_lr:
         self.lr = self.update_lr()
     else:
         self.lr = tf.Variable(self.init_lr, trainable=False)
     self.optimizer = tf.train.AdamOptimizer(self.lr)
     grads_and_vars = self.optimizer.compute_gradients(loss, params)
     if self.max_grad_norm is not None:
         # compute_gradients yields (None, var) for variables the loss does
         # not depend on; tf.clip_by_norm cannot take None.
         clipped_grads_and_vars = [
             (grad if grad is None else
              tf.clip_by_norm(grad, self.max_grad_norm), var)
             for grad, var in grads_and_vars
         ]
     else:
         clipped_grads_and_vars = grads_and_vars
     inc = self.global_step.assign_add(1)
     with tf.control_dependencies([inc]):
         optimize = self.optimizer.apply_gradients(clipped_grads_and_vars)
     return optimize
    def _add_train_graph(self):
        """Define the training operation.

        Creates ``self.global_step``, a staircase exponentially decayed
        learning rate, a momentum optimizer over all trainable variables with
        per-tensor gradient-norm clipping at ``mc.MAX_GRAD_NORM``, histogram
        summaries for variables and gradients, and ``self.train_op`` (a no-op
        that depends on the gradient-application op).
        """
        mc = self.mc

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        lr = tf.train.exponential_decay(mc.LEARNING_RATE,
                                        self.global_step,
                                        mc.DECAY_STEPS,
                                        mc.LR_DECAY_FACTOR,
                                        staircase=True)

        tf.summary.scalar('learning_rate', lr)

        _add_loss_summaries(self.loss)

        opt = tf.train.MomentumOptimizer(learning_rate=lr,
                                         momentum=mc.MOMENTUM)
        grads_vars = opt.compute_gradients(self.loss, tf.trainable_variables())

        with tf.variable_scope('clip_gradient'):
            # Variables the loss does not depend on come back as (None, var);
            # tf.clip_by_norm cannot take None, so those pairs pass through.
            grads_vars = [
                (grad if grad is None else
                 tf.clip_by_norm(grad, mc.MAX_GRAD_NORM), var)
                for grad, var in grads_vars
            ]

        apply_gradient_op = opt.apply_gradients(grads_vars,
                                                global_step=self.global_step)

        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        for grad, var in grads_vars:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Expose a single named op that triggers the parameter update.
        with tf.control_dependencies([apply_gradient_op]):
            self.train_op = tf.no_op(name='train')
Example #6
0
def linear(input_features, output_size, weight_max_norm, weight_initializer,
           bias_initializer, name):
    """Creates (or reuses) the variables of a linear layer and applies it.

    Args:
        input_features: A tensor for input features. Shape = [..., feature_dim].
        output_size: An integer for the number of output nodes.
        weight_max_norm: A float for the maximum weight norm to clip at. A
            non-positive value disables clipping.
        weight_initializer: A function handle for kernel weight initializer.
        bias_initializer: A function handle for bias initializer.
        name: A string for the variable scope (opened with AUTO_REUSE).

    Returns:
        A tensor for the output logits. Shape = [..., output_size].
    """
    feature_dim = input_features.shape.as_list()[-1]
    clip_weights = weight_max_norm > 0.0

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        kernel = tf.get_variable(name='weight',
                                 shape=[feature_dim, output_size],
                                 initializer=weight_initializer)
        if clip_weights:
            kernel = tf.clip_by_norm(kernel, clip_norm=weight_max_norm)

        offset = tf.get_variable(name='bias',
                                 shape=[output_size],
                                 initializer=bias_initializer)

    # The product is built outside the variable scope, as in the original.
    projected = tf.linalg.matmul(input_features, kernel)
    return projected + offset
Example #7
0
def clip_gradients_by_norm(grads_and_vars, add_to_summary=False, clip_norm=10.):
    """Clip every gradient to a maximum norm and check it for invalid numbers.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs; a gradient may
            be None (e.g. when a module is not being trained) and is then
            passed through unchanged.
        add_to_summary: when True, record summaries of each gradient and of
            its absolute value, both before and after clipping.
        clip_norm: maximum norm handed to tf.clip_by_norm. Defaults to 10.,
            the value that was previously hard-coded.

    Returns:
        A list of (gradient, variable) pairs in which every non-None gradient
        is clipped and wrapped in tf.check_numerics.
    """

    def _summarize(pairs, prefix):
        # var.name[:-2] drops the last two characters of the variable name
        # (presumably the ':0' output suffix).
        for grad, var in pairs:
            if grad is not None:
                base = var.name[:-2]
                variable_summaries(grad, '{}/{}'.format(prefix, base), 'full')
                variable_summaries(tf.abs(grad),
                                   '{}/abs/{}'.format(prefix, base), 'full')

    if add_to_summary:
        _summarize(grads_and_vars, 'grad')

    # Clip by norm. Grad can be null when not training some modules.
    with tf.name_scope('clip_gradients_by_norm'):
        grads_and_vars = [(tf.check_numerics(tf.clip_by_norm(gv[0], clip_norm),
                                             'Invalid gradient'),
                           gv[1]) if gv[0] is not None else gv
                          for gv in grads_and_vars]

    if add_to_summary:
        # The abs summary used to reuse the 'clipped_grad/<name>' tag of the
        # plain summary; it now mirrors the pre-clip 'grad/abs/<name>' layout.
        _summarize(grads_and_vars, 'clipped_grad')

    return grads_and_vars
    def make_train_step(self):
        """Create ``self.ops['train_step']`` and initialize all global variables.

        Gradients of ``self.ops['loss']`` are taken with Adam with respect to
        the graph's trainable variables.  When the ``--freeze-graph-model``
        argument is set, trainable variables under the "graph_model" scope are
        excluded (and reported on stdout).  Every non-None gradient is clipped
        to ``self.params['clamp_gradient_norm']``.
        """
        graph = self.sess.graph
        trainable_vars = graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        if self.args.get('--freeze-graph-model'):
            frozen = set(
                graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope="graph_model"))
            kept = []
            for candidate in trainable_vars:
                if candidate in frozen:
                    print("Freezing weights of variable %s." % candidate.name)
                    continue
                kept.append(candidate)
            trainable_vars = kept

        # Optimize the loss, clipping each available gradient by norm.
        optimizer = tf.train.AdamOptimizer(self.params['learning_rate'])
        raw_pairs = optimizer.compute_gradients(self.ops['loss'],
                                                var_list=trainable_vars)
        clipped_grads = [
            (g, v) if g is None else
            (tf.clip_by_norm(g, self.params['clamp_gradient_norm']), v)
            for g, v in raw_pairs
        ]
        self.ops['train_step'] = optimizer.apply_gradients(clipped_grads)

        # Runs the initializer for all global variables, which includes the
        # slots the optimizer just introduced.
        #self.sess.run(tf.local_variables_initializer())
        self.sess.run(tf.global_variables_initializer())
def flatgrad(loss, var_list, clip_norm=None):
    """Return the gradient of `loss` w.r.t. `var_list` as one flat tensor.

    Args:
        loss: tensor to differentiate.
        var_list: variables to differentiate with respect to.
        clip_norm: if not None, each available gradient is clipped to this
            norm with tf.clip_by_norm before flattening.

    Returns:
        The per-variable gradients, each reshaped to ``[numel(v)]`` and
        concatenated along axis 0.  Variables without a gradient contribute
        zeros.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # tf.gradients returns None for variables `loss` does not depend on;
        # leave those for the zero-filling below instead of clipping None.
        grads = [
            grad if grad is None else tf.clip_by_norm(grad, clip_norm=clip_norm)
            for grad in grads
        ]
    return tf.concat(axis=0, values=[
        tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)])
        for (v, grad) in zip(var_list, grads)
    ])
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` over `var_list`, clipping each gradient's norm.

    With ``clip_val=None`` this is a plain ``optimizer.minimize``.  Otherwise
    every non-None gradient is clipped to `clip_val` with tf.clip_by_norm
    before being applied.  Returns the resulting training op.
    """
    if clip_val is None:
        return optimizer.minimize(objective, var_list=var_list)

    grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list)
    clipped = [
        (g if g is None else tf.clip_by_norm(g, clip_val), v)
        for g, v in grads_and_vars
    ]
    return optimizer.apply_gradients(clipped)
Example #11
0
def flatgrad(loss, var_list, clip_norm=None):
    """Return the gradient of `loss` w.r.t. `var_list` as one flat tensor.

    Args:
        loss: tensor to differentiate.
        var_list: variables to differentiate with respect to.
        clip_norm: if not None, each available gradient is clipped to this
            norm with tf.clip_by_norm before flattening.

    Returns:
        The per-variable gradients, each reshaped to ``[-1]`` and concatenated
        along axis 0.  Variables without a gradient contribute zeros.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # tf.gradients returns None for variables `loss` does not depend on;
        # leave those for the zero-filling below instead of clipping None.
        grads = [
            g if g is None else tf.clip_by_norm(g, clip_norm=clip_norm)
            for g in grads
        ]
    return tf.concat([
        tf.reshape(g if g is not None else tf.zeros_like(v), [-1])
        for v, g in zip(var_list, grads)
    ],
                     axis=0)
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` with `optimizer` over the variables in `var_list`.

    The gradient of each variable is clipped so that its norm does not exceed
    `clip_val`; variables without a gradient are passed through unchanged.
    Returns the op produced by ``optimizer.apply_gradients``.
    """

    def _clip(pair):
        grad, var = pair
        if grad is None:
            return pair
        return (tf.clip_by_norm(grad, clip_val), var)

    computed = optimizer.compute_gradients(objective, var_list=var_list)
    return optimizer.apply_gradients([_clip(pair) for pair in computed])
Example #13
0
    def __init__(self, sess, state_dim, action_dim, action_bound,
                 learning_rate, tau, batch_size):
        """Build the online and target actor networks and their update ops.

        Stores the constructor arguments, creates two actor networks via
        ``create_actor_network`` (online first, then target), an op list that
        moves each target variable towards its online counterpart with rate
        ``tau``, and an Adam op that applies the actor gradients derived from
        an externally fed action gradient.
        """
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size

        # Online actor: its variables are all trainable variables so far.
        self.inputs, self.out, self.scaled_out = self.create_actor_network()
        self.network_params = tf.trainable_variables()
        online_count = len(self.network_params)

        # Target actor: its variables are the ones created after the online
        # network's.
        (self.target_inputs, self.target_out,
         self.target_scaled_out) = self.create_actor_network()
        self.target_network_params = tf.trainable_variables()[online_count:]

        # Ops for periodically updating the target network with the online
        # network weights: target <- tau * online + (1 - tau) * target.
        soft_updates = []
        for i, target_var in enumerate(self.target_network_params):
            blended = (tf.multiply(self.network_params[i], self.tau) +
                       tf.multiply(target_var, 1. - self.tau))
            soft_updates.append(target_var.assign(blended))
        self.update_target_network_params = soft_updates

        # This gradient will be provided by the critic network.
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients: back-propagate the negated action gradient
        # through the actor, clip each tensor to norm 5.0, then divide by the
        # batch size.
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)
        self.clipped_gradients = [
            tf.clip_by_norm(raw_grad, 5.0)
            for raw_grad in self.unnormalized_actor_gradients
        ]
        self.actor_gradients = [
            tf.div(clipped, self.batch_size)
            for clipped in self.clipped_gradients
        ]

        # Optimization op, run after any ops registered in UPDATE_OPS.
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            adam = tf.train.AdamOptimizer(self.learning_rate)
            self.optimize = adam.apply_gradients(
                zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = online_count + len(
            self.target_network_params)
Example #14
0
        def langevin_step(counter, x_mod):
            """Perform one noisy gradient (or HMC) update of `x_mod`.

            Adds Gaussian noise to `x_mod`, evaluates the model energy on the
            noisy sample, and then either takes a gradient step of size
            FLAGS.step_lr on the temperature-scaled energy or, when FLAGS.hmc
            is set, replaces the sample with the result of `hmc`.  The result
            is clipped to [0, FLAGS.rescale].

            `model`, `weights`, `LABEL_SPLIT`, `j`, `FLAGS` and `hmc` are
            taken from the enclosing scope (not visible in this block).

            Returns the incremented `counter` and the updated `x_mod`.
            """
            # Perturb the current sample with zero-mean Gaussian noise.
            x_mod = x_mod + tf.random_normal(
                tf.shape(x_mod),
                mean=0.0,
                stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            # Energy of the noisy sample; the concat wraps a single tensor.
            # NOTE(review): energy_start is not used again in this function.
            energy_noise = energy_start = tf.concat([
                model.forward(x_mod,
                              weights[0],
                              label=LABEL_SPLIT[j],
                              reuse=True,
                              stop_at_grad=False,
                              stop_batch=True)
            ],
                                                    axis=0)

            # Gradient of the temperature-scaled energy w.r.t. the sample and
            # the label; only x_grad is used below.
            x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                              [x_mod, LABEL_SPLIT[j]])
            # NOTE(review): energy_noise_old is not used again in this function.
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            # Optional projection of the gradient: 'l2' clips its norm,
            # 'li' clips each element to [-proj_norm, proj_norm].
            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm,
                                              FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Choose the update rule: HMC on the energy, or a plain gradient
            # step using the (possibly projected) gradient computed above.
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            # Keep the sample inside [0, FLAGS.rescale].
            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod
Example #15
0
    def add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm
        Args:
            scope: (string) scope name, that specifies if target network or not

        Builds an Adam optimizer with learning rate self.lr, computes the
        gradients of self.loss with respect to the variables collected under
        `scope`, clips each gradient by norm to self.config.clip_val when
        self.config.grad_clip is set, and stores:
            self.train_op: op applying the (possibly clipped) gradients
            self.grad_norm: global norm of the gradients that are not None,
                taken after clipping
        """
        opt = tf.train.AdamOptimizer(learning_rate=self.lr)

        scope_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
        grads = opt.compute_gradients(self.loss, scope_vars)

        if self.config.grad_clip:
            # Variables the loss does not depend on come back as (None, var);
            # tf.clip_by_norm cannot take None, so those pairs pass through.
            grads = [
                (grad if grad is None else
                 tf.clip_by_norm(grad, self.config.clip_val), var)
                for grad, var in grads
            ]

        self.train_op = opt.apply_gradients(grads)
        self.grad_norm = tf.global_norm(
            [grad for grad, _ in grads if grad is not None])
Example #16
0
    def _compute_gradients(self, loss, var_list=None):
        """Compute gradients of `loss` and post-process them.

        Steps, in order:
          1. gradients from the wrapped optimizer (`self._tf_optimizer`);
          2. `self._deal_with_nan` on every gradient if
             `hub.clip_nan_protection` is set;
          3. scaling by `hub.clip_lr_multiplier` if it is below 1.0;
          4. clipping if `self._threshold` is positive, using `self._method`
             ('norm', 'value', 'avg_norm' or 'global_norm').

        Gradients that are None (variables `loss` does not depend on) are
        passed through steps 3 and 4 unchanged.

        Args:
            loss: a tf.Tensor to differentiate.
            var_list: variables to differentiate with respect to, forwarded
                to the wrapped optimizer.

        Returns:
            A list of (gradient, variable) pairs.
        """
        # Sanity check
        assert isinstance(loss, tf.Tensor)

        # Compute gradients using default method
        grads_and_vars = self._tf_optimizer.compute_gradients(
            loss, var_list=var_list)

        # Deal with NaN if necessary
        if hub.clip_nan_protection:
            grads_and_vars = [(self._deal_with_nan(grad), var)
                              for grad, var in grads_and_vars]

        # Apply lr decay if necessary
        lr_decay = hub.clip_lr_multiplier
        if lr_decay < 1.0:
            assert lr_decay > 0
            # None * float raises TypeError, so skip missing gradients.
            grads_and_vars = [
                (grad if grad is None else grad * lr_decay, var)
                for grad, var in grads_and_vars
            ]

        # Clip gradient if necessary
        if self._threshold > 0:
            bound = self._threshold
            if self._method in ('norm', 'value', 'avg_norm'):
                if self._method == 'norm':
                    method = lambda g: tf.clip_by_norm(g, bound)
                elif self._method == 'value':
                    method = lambda g: tf.clip_by_value(g, -bound, bound)
                else:
                    method = lambda g: tf.clip_by_average_norm(g, bound)
                # The element-wise clip ops cannot take None.
                grads_and_vars = [
                    (grad if grad is None else method(grad), var)
                    for grad, var in grads_and_vars
                ]
            else:
                assert self._method == 'global_norm'
                # tf.clip_by_global_norm ignores None entries itself.
                grads = [g for g, _ in grads_and_vars]
                clipped_grads, _ = tf.clip_by_global_norm(
                    grads, self._threshold)
                vars_ = [v for _, v in grads_and_vars]
                grads_and_vars = list(zip(clipped_grads, vars_))

        return grads_and_vars
Example #17
0
    def __init__(self,
                 state_size,
                 action_size,
                 alpha=0.01,
                 clip_norm=None,
                 minibatch_size=5,
                 **kwargs):
        """
        Parameters
        ----------
        state_size, action_size : int
            Size of the environment state space and action space
        alpha : float, optional
            Learning rate given to the RMSProp optimizer
        clip_norm : float, optional
            Maximum norm each gradient is clipped to; None (default)
            disables clipping
        minibatch_size : int, optional
            Stored (as int) in `self.k`
        **kwargs
            Additional keyword arguments passed to the parent constructor
        """
        super().__init__(state_size, action_size, **kwargs)

        self.k = int(minibatch_size)
        self.nextQ = tf.placeholder(dtype=tf.float32,
                                    shape=[None, action_size])

        # Sum of squared differences between the fed targets and the estimate.
        residual = self.nextQ - self.Q_est
        loss = tf.reduce_sum(tf.square(residual))

        trainer = tf.train.RMSPropOptimizer(alpha)
        trained_vars = [self.W, self.w_in, self.b_in]

        if clip_norm is None:
            self.updateModel = trainer.minimize(loss, var_list=trained_vars)
        else:
            cap_grads = [
                (tf.clip_by_norm(grad, clip_norm), var)
                for grad, var in trainer.compute_gradients(loss, trained_vars)
            ]
            self.updateModel = trainer.apply_gradients(cap_grads)

        self.var_init()
Example #18
0
    def add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm

        Args:
            scope: (string) name of the scope whose variables we are
                   differentiating with respect to

        Builds an Adam optimizer with learning rate self.lr, computes the
        gradients of self.loss with respect to the variables collected under
        `scope`, clips each gradient by norm to self.config.clip_val when
        self.config.grad_clip is set, and stores:
            self.train_op: op applying the (possibly clipped) gradients
            self.grad_norm: global norm of the unclipped gradients that are
                not None
        """
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        scope_variable = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
        grads_and_vars = optimizer.compute_gradients(self.loss, scope_variable)

        if self.config.grad_clip:
            # Variables the loss does not depend on come back as (None, var);
            # tf.clip_by_norm cannot take None, so those pairs pass through.
            clipped_grads_and_vars = [
                (grad if grad is None else
                 tf.clip_by_norm(grad, self.config.clip_val), var)
                for grad, var in grads_and_vars
            ]
        else:
            # Without clipping the raw gradients are applied; previously this
            # name was left undefined and apply_gradients raised NameError.
            clipped_grads_and_vars = grads_and_vars

        self.train_op = optimizer.apply_gradients(clipped_grads_and_vars)
        self.grad_norm = tf.global_norm(
            [grad for grad, _ in grads_and_vars if grad is not None])
Example #19
0
def clip_by_norm(v, clip_norm):
    """Clip `v` with tf.clip_by_norm, taking the norm over every axis but the last."""
    rank = len(v.get_shape())
    leading_axes = list(range(rank - 1))
    return tf.clip_by_norm(v, clip_norm, axes=leading_axes)
Example #20
0
def dpg(q_max, a_max, dqda_clipping=None, clip_norm=False, name="DpgLearning"):
    """Builds the Deterministic Policy Gradient (DPG) actor loss.

    Only the `actor` loss is produced here; the `critic` can be trained
    separately by minimizing the `value_ops.td_learning` loss.

    Reference: "Deterministic Policy Gradient Algorithms" by Silver, Lever,
    Heess, Degris, Wierstra, Riedmiller
    (http://proceedings.mlr.press/v32/silver14.pdf).

    Args:
        q_max: Tensor of Q-values produced by the Q network for the
            (state, a_max) pair, shape `[B]`.
        a_max: Tensor holding the optimal action, shape
            `[B, action_dimension]`.
        dqda_clipping: `int` or `float`. When given, dq/da is clipped to
            `[-dqda_clipping, dqda_clipping]` element-wise, or by norm when
            `clip_norm` is set.
        clip_norm: If true, clip dq/da by its vector norm along the last
            dimension instead of component-wise (the default).
        name: name to prefix ops created within this op.

    Returns:
        A namedtuple with fields:

        * `loss`: a tensor containing the batch of losses, shape `[B]`.
        * `extra`: a namedtuple with fields:
            * `q_max`: Tensor holding the optimal Q values, `[B]`.
            * `a_max`: Tensor holding the optimal action,
              `[B, action_dimension]`.
            * `dqda`: Tensor holding the (possibly clipped) derivative dq/da,
              `[B, action_dimension]`.

    Raises:
        ValueError: If `q_max` doesn't depend on `a_max` or if
            `dqda_clipping <= 0`.
    """
    with tf.name_scope(name, values=[q_max, a_max]):

        # dq/da; None means q_max is not connected to a_max in the graph.
        (dqda,) = tf.gradients([q_max], [a_max])
        if dqda is None:
            raise ValueError("q_max needs to be a function of a_max")

        if dqda_clipping is not None:
            if dqda_clipping <= 0:
                raise ValueError(
                    "dqda_clipping should be bigger than 0, {} found".format(
                        dqda_clipping))
            dqda = (tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
                    if clip_norm else
                    tf.clip_by_value(dqda, -1. * dqda_clipping,
                                     dqda_clipping))

        # The target is a constant for backprop, so the loss gradient flows
        # only through a_max (the actor), not through the Q network.
        target_a = tf.stop_gradient(dqda + a_max)
        squared_gap = tf.square(target_a - a_max)
        loss = 0.5 * tf.reduce_sum(squared_gap, axis=-1)

        extra = DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda)
        return base_ops.LossOutput(loss, extra)
Example #21
0
    def build_model(self):
        """Build the item-GRU / behavior-embedding graph and, when training, the train op.

        Creates the input placeholders (`self.X`, `self.Y`, `self.Behavior`,
        the per-layer state placeholders), a stacked GRU over item embeddings,
        a behavior embedding lookup, and a dense tanh layer over their
        concatenation.  In training mode the output weights are sampled at
        the indices in `self.Y`, `self.cost` is computed and `self.train_op`
        is built with Adam, an exponentially decayed learning rate (floored
        at 1e-5) and optional per-tensor gradient-norm clipping at
        `self.grad_cap`.  In inference mode only `self.yhat` over all items
        is built.
        """
        self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
        if self.n_samples:
            self.Y = tf.placeholder(tf.int32,
                                    [self.batch_size + self.n_samples],
                                    name='output')
        else:
            self.Y = tf.placeholder(tf.int32, [self.batch_size], name='output')
        self.Behavior = tf.placeholder(tf.int32, [self.batch_size],
                                       name='behavior')
        self.state_item = [
            tf.placeholder(tf.float32, [self.batch_size, self.item_size],
                           name='rnn_state') for _ in range(self.layers)
        ]
        self.state_beha = [
            tf.placeholder(tf.float32, [self.batch_size, self.behavior_size],
                           name='rnn_state') for _ in range(self.layers)
        ]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.alpha = tf.get_variable('alpha',
                                     shape=[1],
                                     initializer=tf.constant_initializer(0.5))

        with tf.variable_scope('gru_layer_item'):
            # A sigma of 0 selects a default derived from the embedding shape.
            sigma = self.sigma if self.sigma != 0 else np.sqrt(
                6.0 / (self.n_items + self.item_size))
            if self.init_as_normal:
                initializer = tf.random_normal_initializer(mean=0,
                                                           stddev=sigma)
            else:
                initializer = tf.random_uniform_initializer(minval=-sigma,
                                                            maxval=sigma)
            embedding_item = tf.get_variable('embedding_item',
                                             [self.n_items, self.item_size],
                                             initializer=initializer)
            cell_item = rnn_cell.GRUCell(self.item_size,
                                         activation=self.hidden_act)
            drop_cell_item = rnn_cell.DropoutWrapper(
                cell_item, output_keep_prob=self.dropout_p_hidden)
            stacked_cell_item = rnn_cell.MultiRNNCell([drop_cell_item] *
                                                      self.layers)
            inputs_item = tf.nn.embedding_lookup(embedding_item, self.X)
            output_item, state_item = stacked_cell_item(
                inputs_item, tuple(self.state_item))
            self.final_state_item = state_item

        with tf.variable_scope('gru_layer_beha'):
            # Reuses the initializer chosen for the item embedding above.
            embedding_behavior = tf.get_variable(
                'embedding_behavior', [self.n_behaviors, self.behavior_size],
                initializer=initializer)
            inputs_beha = tf.nn.embedding_lookup(embedding_behavior,
                                                 self.Behavior)

        with tf.variable_scope('output'):
            output = tf.concat([output_item, inputs_beha], axis=1)
            output = tf.layers.dense(output,
                                     self.latent_size,
                                     activation='tanh')
            softmax_W = tf.get_variable('softmax_w',
                                        [self.n_items, self.latent_size],
                                        initializer=initializer)
            softmax_b = tf.get_variable(
                'softmax_b', [self.n_items],
                initializer=tf.constant_initializer(0.0))

        if not self.is_training:
            # Inference: score every item; no loss or train op is built.
            logits = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
            self.yhat = self.final_activation(logits)
            return

        # Use other examples of the minibatch as negative samples.
        sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
        sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)
        logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
        self.yhat = self.final_activation(logits)
        self.cost = self.loss_function(self.yhat)

        self.lr = tf.maximum(
            1e-5,
            tf.train.exponential_decay(self.learning_rate,
                                       self.global_step,
                                       self.decay_steps,
                                       self.decay,
                                       staircase=True))

        # Try different optimizers.
        # optimizer = tf.train.AdagradOptimizer(self.lr)
        optimizer = tf.train.AdamOptimizer(self.lr)
        # optimizer = tf.train.AdadeltaOptimizer(self.lr)
        # optimizer = tf.train.RMSPropOptimizer(self.lr)

        tvars = tf.trainable_variables()
        gvs = optimizer.compute_gradients(self.cost, tvars)
        if self.grad_cap > 0:
            # Trainable variables the cost does not depend on come back as
            # (None, var); tf.clip_by_norm cannot take None.
            capped_gvs = [
                (grad if grad is None else
                 tf.clip_by_norm(grad, self.grad_cap), var)
                for grad, var in gvs
            ]
        else:
            capped_gvs = gvs
        self.train_op = optimizer.apply_gradients(capped_gvs,
                                                  global_step=self.global_step)
Example #22
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Build the estimator spec for training or evaluation.

    Runs the model on `features`, computes the detection loss plus the L2
    regularization loss, and then, depending on `mode`, wires up the optimizer
    (TRAIN) or the COCO evaluation metrics (EVAL).

    `params` is mutated in place: `params['is_training_bn']` is always set, and
    `params['nms_configs']['max_nms_inputs']` is set in EVAL mode.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class targets
        and box targets which are dense label maps. The labels are generated from
        get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN and EVAL.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the model outputs class logits and box regression outputs. Only
        used when `params['use_keras_model']` is falsy.
      variable_filter_fn: the filter function that takes trainable_variables and
        returns the variable list after applying the filter rule.

    Returns:
      A `tf.estimator.tpu.TPUEstimatorSpec` when `params['strategy'] == 'tpu'`,
      otherwise a `tf.estimator.EstimatorSpec`.

    Raises:
      RuntimeError: if both ckpt and backbone_ckpt are set (checked in TRAIN
        mode only).
      ValueError: in TRAIN mode, if `params['optimizer']` is neither 'sgd' nor
        'adam' (case-insensitive).
    """
    is_tpu = params['strategy'] == 'tpu'
    if params['img_summary_steps']:
        utils.image('input_image', features, is_tpu)
    training_hooks = []
    # Recorded in params so downstream code can read the training flag.
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        # Keras path: build the network here and convert its per-level output
        # lists into dicts keyed by feature level (min_level..max_level).
        # Note the local `model` shadows the `model` argument of _model_fn.
        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        # Legacy path: bind the config to the caller-supplied model function.
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    # Cast every level's outputs to float32 before computing the losses
    # (the model may have been built under a different precision).
    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                  labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    # Training-only scalar summaries.
    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate, is_tpu)
        utils.scalar('trainloss/cls_loss', cls_loss, is_tpu)
        utils.scalar('trainloss/box_loss', box_loss, is_tpu)
        utils.scalar('trainloss/det_loss', det_loss, is_tpu)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss, is_tpu)
        utils.scalar('trainloss/loss', total_loss, is_tpu)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs, is_tpu)

    # `ema` and `ema_vars` exist only when moving_average_decay is truthy;
    # every later use is guarded by the same condition.
    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if is_tpu:
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                # None gradients are passed through unchanged.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                # The logged norm is measured after both clipping steps.
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads), is_tpu)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        # The EMA update is chained after the optimizer step, so running the
        # returned train_op performs both.
        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                # NumPy NMS path: run per-class NMS image by image through
                # tf.numpy_function, then stack the per-image results.
                detections_bs = []
                nms_configs = params['nms_configs']
                for index in range(kwargs['boxes'].shape[0]):
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                # Per detection: image id, box columns 1 and 0, the
                # differences (col 3 - col 1) and (col 2 - col 0), score, class.
                # NOTE(review): presumably boxes are [ymin, xmin, ymax, xmax],
                # making this [id, x, y, width, height, score, class] - confirm.
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                # NOTE(review): the op name below looks like a typo of
                # 'detections'; it is a runtime graph name, so it is left as-is.
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detnections')

            if params.get('testdev_dir', None):
                # Test-dev evaluation: a zero placeholder tensor is passed in
                # place of groundtruth data.
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groudtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        # Tile the losses into [batch_size, 1] tensors before handing them to
        # metric_fn.
        # NOTE(review): presumably needed because eval metric inputs must carry
        # a batch dimension - confirm.
        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        # Mutates the caller's params dict.
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        # (fn, kwargs) pair; invoked directly below on the non-TPU path.
        eval_metrics = (metric_fn, metric_fn_inputs)

    # 'ckpt' takes precedence here, but setting both is rejected below.
    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if is_tpu:
        # On TPU the scaffold function and the (fn, kwargs) metrics pair are
        # passed through uncalled.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM; it will slow down the running.
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        # Periodically log the step and the loss components.
        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        # Off TPU, the metric function and the scaffold function are called
        # here, while the spec is being built.
        eval_metric_ops = (eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
Example #23
0
    def build_model(self):
        """Build the GRU-based network graph and, when training, its train op.

        Creates the input placeholders ``self.X``, ``self.Y``, ``self.state``
        (plus ``self.U`` when ``self.user_data`` is set) and the non-trainable
        ``self.global_step`` variable, then sets:

        * ``self.final_state``: state returned by the stacked GRU cells.
        * ``self.yhat``: ``self.final_activation`` applied to the logits.
        * ``self.cost``, ``self.lr``, ``self.train_op`` (training only).

        When ``self.is_training`` is true the logits are computed only against
        the output rows selected by ``self.Y``; otherwise they are computed
        against every item.

        ``self.optimizer`` is rebound from its string name ('adagrad', 'adam',
        'adadelta' or 'rmsprop') to the constructed optimizer instance. Any
        non-string value is used as-is.

        Raises:
            ValueError: if ``self.optimizer`` is a string that does not name a
                supported optimizer.
        """
        if self.user_data:
            self.U = tf.placeholder(tf.int32, [self.batch_size],
                                    name='user_id')
        self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
        self.Y = tf.placeholder(tf.int32, [self.batch_size + self.n_samples],
                                name='output')
        self.state = [
            tf.placeholder(tf.float32, [self.batch_size, self.rnn_size],
                           name='rnn_state') for _ in range(self.layers)
        ]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        with tf.variable_scope('gru_layer'):

            # Parameter initialization: fall back to a size-dependent bound
            # when no explicit sigma was configured.
            sigma = self.sigma if self.sigma != 0 else np.sqrt(
                6.0 / (self.n_items + self.rnn_size))
            if self.init_as_normal:
                initializer = tf.random_normal_initializer(mean=0,
                                                           stddev=sigma)
            else:
                initializer = tf.random_uniform_initializer(minval=-sigma,
                                                            maxval=sigma)
            embedding = tf.get_variable('embedding',
                                        [self.n_items, self.rnn_size],
                                        initializer=initializer)
            softmax_W = tf.get_variable('softmax_w',
                                        [self.n_items, self.rnn_size],
                                        initializer=initializer)
            softmax_b = tf.get_variable(
                'softmax_b', [self.n_items],
                initializer=tf.constant_initializer(0.0))
            if self.user_data:
                user_embedding = tf.get_variable('user_embedding',
                                                 [self.n_users, self.rnn_size],
                                                 initializer=initializer)

            # One GRU cell per hidden layer, each wrapped with output dropout.
            cells = []
            for i in range(self.layers):
                cell = rnn_cell.GRUCell(num_units=self.rnn_size,
                                        activation=self.hidden_act)
                cell = rnn_cell.DropoutWrapper(
                    cell, output_keep_prob=self.dropout_p_hidden)
                cells.append(cell)

            multi_cell = rnn_cell.MultiRNNCell(cells)

            # Look up the input item embeddings and run one step of the stack.
            inputs = tf.nn.embedding_lookup(embedding, self.X)
            output, state = multi_cell(inputs, tuple(self.state))
            if self.user_data:
                # Concatenate the user embedding with the GRU output and
                # project back down to rnn_size.
                user_output = tf.nn.embedding_lookup(user_embedding, self.U)
                user_concatenated = tf.concat([output, user_output], axis=1)
                output = tf.layers.dense(user_concatenated,
                                         self.rnn_size,
                                         activation='tanh')
            self.final_state = state

        if not self.is_training:
            # Inference: score against every item; no loss or train op needed.
            logits = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
            self.yhat = self.final_activation(logits)
            return

        # Training: use other examples of the minibatch as negative samples,
        # so only the output weights selected by self.Y are scored.
        sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
        sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)
        logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
        self.yhat = self.final_activation(logits)
        self.cost = self.loss_function(self.yhat)

        # Staircase exponential decay, floored at 1e-5.
        self.lr = tf.maximum(
            1e-5,
            tf.train.exponential_decay(self.learning_rate,
                                       self.global_step,
                                       self.decay_steps,
                                       self.decay,
                                       staircase=True))

        # Resolve the optimizer name into an optimizer instance.
        if self.optimizer == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(self.lr)
        elif self.optimizer == 'adam':
            self.optimizer = tf.train.AdamOptimizer(self.lr)
        elif self.optimizer == 'adadelta':
            self.optimizer = tf.train.AdadeltaOptimizer(self.lr)
        elif self.optimizer == 'rmsprop':
            self.optimizer = tf.train.RMSPropOptimizer(self.lr)
        elif isinstance(self.optimizer, str):
            # Fail with a clear message rather than an AttributeError on a str
            # when compute_gradients is called below.
            raise ValueError(
                "Unsupported optimizer '{}'; expected one of 'adagrad', "
                "'adam', 'adadelta', 'rmsprop'".format(self.optimizer))

        tvars = tf.trainable_variables()
        gvs = self.optimizer.compute_gradients(self.cost, tvars)
        if self.grad_cap > 0:
            # Variables not connected to the cost have a None gradient, which
            # tf.clip_by_norm cannot handle; pass those through untouched
            # (apply_gradients skips them).
            capped_gvs = [
                (tf.clip_by_norm(grad, self.grad_cap)
                 if grad is not None else None, var)
                for grad, var in gvs
            ]
        else:
            capped_gvs = gvs
        self.train_op = self.optimizer.apply_gradients(
            capped_gvs, global_step=self.global_step)
    def __init__(self,
                 linear_size,
                 num_layers,
                 residual,
                 batch_norm,
                 max_norm,
                 batch_size,
                 learning_rate,
                 summaries_dir,
                 predict_14=False,
                 dtype=tf.float32):
        """Creates the linear + relu model

        Builds the whole graph: input placeholders, the first linear layer,
        `num_layers` blocks from `self.two_linear`, the output layer, the mean
        squared error loss, summaries, the Adam update op and a saver.

        Args
          linear_size: integer. number of units in each layer of the model
          num_layers: integer. number of bilinear blocks in the model
          residual: boolean. Whether to add residual connections
          batch_norm: boolean. Whether to use batch normalization
          max_norm: boolean. Whether to clip weights to a norm of 1
          batch_size: integer. The size of the batches used during training
          learning_rate: float. Learning rate to start with
          summaries_dir: String. Directory where to log progress
          predict_14: boolean. Whether to predict 14 instead of 17 joints
          dtype: the data type to use to store internal variables
        """

        # There are in total 17 joints in H3.6M and 16 in MPII (and therefore in stacked
        # hourglass detections). We settled with 16 joints in 2d just to make models
        # compatible (e.g. you can train on ground truth 2d and test on SH detections).
        # This does not seem to have an effect on prediction performance.
        self.HUMAN_2D_SIZE = 16 * 2

        # In 3d all the predictions are zero-centered around the root (hip) joint, so
        # we actually predict only 16 joints. The error is still computed over 17 joints,
        # because if one uses, e.g. Procrustes alignment, there is still error in the
        # hip to account for!
        # There is also an option to predict only 14 joints, which makes our results
        # directly comparable to those in https://arxiv.org/pdf/1611.09010.pdf
        self.HUMAN_3D_SIZE = 14 * 3 if predict_14 else 16 * 3

        self.input_size = self.HUMAN_2D_SIZE
        self.output_size = self.HUMAN_3D_SIZE

        # Fed at run time: batch-norm mode flag and dropout keep probability.
        self.isTraining = tf.placeholder(tf.bool, name="isTrainingflag")
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # Summary writers for train and test runs
        self.train_writer = tf.summary.FileWriter(
            os.path.join(summaries_dir, 'train'))
        self.test_writer = tf.summary.FileWriter(
            os.path.join(summaries_dir, 'test'))

        self.linear_size = linear_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype,
                                         name="learning_rate")
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        decay_steps = 100000  # empirical
        decay_rate = 0.96  # empirical
        # From here on self.learning_rate is the decayed tensor; the base
        # variable created above is no longer reachable through this attribute.
        self.learning_rate = tf.train.exponential_decay(
            self.learning_rate, self.global_step, decay_steps, decay_rate)

        # === Transform the inputs ===
        with vs.variable_scope("inputs"):

            # in=2d poses, out=3d poses
            enc_in = tf.placeholder(dtype,
                                    shape=[None, self.input_size],
                                    name="enc_in")
            dec_out = tf.placeholder(dtype,
                                     shape=[None, self.output_size],
                                     name="dec_out")

            self.encoder_inputs = enc_in
            self.decoder_outputs = dec_out

        # === Create the linear + relu combos ===
        with vs.variable_scope("linear_model"):

            # === First layer, brings dimensionality up to linear_size ===
            w1 = tf.get_variable(name="w1",
                                 initializer=kaiming,
                                 shape=[self.HUMAN_2D_SIZE, linear_size],
                                 dtype=dtype)
            b1 = tf.get_variable(name="b1",
                                 initializer=kaiming,
                                 shape=[linear_size],
                                 dtype=dtype)
            if max_norm:
                w1 = tf.clip_by_norm(w1, 1)
            y3 = tf.matmul(enc_in, w1) + b1

            if batch_norm:
                y3 = tf.layers.batch_normalization(y3,
                                                   training=self.isTraining,
                                                   name="batch_normalization")
            y3 = tf.nn.relu(y3)
            y3 = tf.nn.dropout(y3, self.dropout_keep_prob)

            # === Create multiple bi-linear layers ===
            for idx in range(num_layers):
                y3 = self.two_linear(y3, linear_size, residual,
                                     self.dropout_keep_prob, max_norm,
                                     batch_norm, dtype, idx)

            # === Last linear layer has HUMAN_3D_SIZE in output ===
            w4 = tf.get_variable(name="w4",
                                 initializer=kaiming,
                                 shape=[linear_size, self.HUMAN_3D_SIZE],
                                 dtype=dtype)
            b4 = tf.get_variable(name="b4",
                                 initializer=kaiming,
                                 shape=[self.HUMAN_3D_SIZE],
                                 dtype=dtype)
            if max_norm:
                w4 = tf.clip_by_norm(w4, 1)
            y = tf.matmul(y3, w4) + b4
            # === End linear model ===

        # Store the outputs here
        self.outputs = y
        self.loss = tf.reduce_mean(tf.square(y - dec_out))
        self.loss_summary = tf.summary.scalar('loss/loss', self.loss)

        # To keep track of the loss in mm (fed from outside the graph)
        self.err_mm = tf.placeholder(tf.float32, name="error_mm")
        self.err_mm_summary = tf.summary.scalar("loss/error_mm", self.err_mm)

        # Gradients and update operation for training the model.
        opt = tf.train.AdamOptimizer(self.learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Make the collected UPDATE_OPS (e.g. batch-norm statistics) run
        # before each training update.
        with tf.control_dependencies(update_ops):

            # Update all the trainable parameters
            gradients = opt.compute_gradients(self.loss)
            # `gradients` is a list of (gradient, variable) pairs; any entry
            # that is itself None is replaced by an empty list.
            self.gradients = [[] if gv is None else gv for gv in gradients]
            self.updates = opt.apply_gradients(gradients,
                                               global_step=self.global_step)

        # Keep track of the learning rate
        self.learning_rate_summary = tf.summary.scalar(
            'learning_rate/learning_rate', self.learning_rate)

        # To save the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
Example #25
0
    def run_epoch(self, fileName):
        """Build the train/test DKT graphs and run the whole training loop.

        Loads the sequences from `fileName`, creates a session (stored on
        `self.sess`), builds a training and a weight-sharing test
        `TensorFlowDKT` model (stored on `self.train_dkt` / `self.test_dkt`),
        then trains for `config.trainConfig.epochs` epochs. The test set is
        evaluated every `evaluate_every` steps and a checkpoint is written
        every `checkpoint_every` steps.

        Args:
            fileName: path of the data file handed to `DataGenerator`.
        """
        # Configuration object holding the hyper-parameters.
        config = Config()

        # Data generator; gen_attr() produces the train and test splits.
        dataGen = DataGenerator(fileName, config)
        dataGen.gen_attr()

        # Both collections have the form
        #   [ [[skill_id, correctness], [skill_id, correctness], ...], [...], ... ]
        # i.e. one answer sequence per student (e.g. 3384 training students
        # and 843 test students in the original author's data).
        train_seqs = dataGen.train_seqs
        test_seqs = dataGen.test_seqs

        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False
        ))
        self.sess = sess

        with sess.as_default():
            # Training model: creates the variables under the "dkt" scope.
            with tf.name_scope("train"):
                with tf.variable_scope("dkt", reuse=None):
                    train_dkt = TensorFlowDKT(config)

            # Test model: reuses the variables created by the training model.
            with tf.name_scope("test"):
                with tf.variable_scope("dkt", reuse=True):
                    test_dkt = TensorFlowDKT(config)

            self.train_dkt = train_dkt
            self.test_dkt = test_dkt

            global_step = tf.Variable(0, name="global_step", trainable=False)
            self.global_step = global_step

            # Optimizer; the objective is the training model's loss.
            optimizer = tf.train.AdamOptimizer(config.trainConfig.learning_rate)
            raw_grads_and_vars = optimizer.compute_gradients(train_dkt.loss)

            # Clip every available gradient by norm; variables without a
            # gradient are dropped from the list.
            grads_and_vars = [
                (tf.clip_by_norm(grad, config.trainConfig.max_grad_norm), var)
                for grad, var in raw_grads_and_vars
                if grad is not None
            ]

            # Final node of the training graph.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name="train_op")

            # Histogram and sparsity summaries for every clipped gradient
            # (None gradients were already filtered out above).
            grad_summaries = []
            for grad, var in grads_and_vars:
                hist_summary = tf.summary.histogram("{}/grad/hist".format(var.name), grad)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(var.name), tf.nn.zero_fraction(grad))
                grad_summaries += [hist_summary, sparsity_summary]
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("writing to {}".format(out_dir))

            # Training summaries.
            train_loss_summary = tf.summary.scalar("loss", train_dkt.loss)
            train_summary_op = tf.summary.merge([train_loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Test summaries.
            test_loss_summary = tf.summary.scalar("loss", test_dkt.loss)
            dev_summary_op = tf.summary.merge([test_loss_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables())

            sess.run(tf.global_variables_initializer())

            print("初始化完毕,开始训练")
            for _epoch in range(config.trainConfig.epochs):
                np.random.shuffle(train_seqs)
                for train_batch in dataGen.next_batch(train_seqs):
                    # Each batch is a mapping; per the original author's notes
                    # (example sizes) it contains:
                    #   1) input_x: (32, 1109, 248) - 32 students, sequence
                    #      length 1109, one-hot over 124 skills
                    #   2) target_id: (32, 1109) - skill ids, zero-padded
                    #   3) target_correctness: whether each target was correct
                    #   4) seq_len: (32,) - per-student sequence lengths
                    #   5) max_len: scalar - the maximum of seq_len
                    self.train_step(train_batch, train_op, train_summary_op, train_summary_writer)

                    current_step = tf.train.global_step(sess, global_step)

                    # Periodic evaluation on the test sequences.
                    if current_step % config.trainConfig.evaluate_every == 0:
                        print("\nEvaluation:")

                        losses, accuracys, aucs = [], [], []
                        precisions, recalls = [], []
                        for test_batch in dataGen.next_batch(test_seqs):
                            # writer=None is passed, so dev_summary_writer is
                            # not handed to dev_step here.
                            loss, accuracy, auc, precision, recall = self.dev_step(test_batch, dev_summary_op, writer=None)
                            losses.append(loss)
                            accuracys.append(accuracy)
                            aucs.append(auc)
                            precisions.append(precision)
                            recalls.append(recall)

                        time_str = datetime.datetime.now().isoformat()
                        print("dev: {}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".
                              format(time_str, current_step, mean(losses), mean(accuracys), mean(aucs), mean(precisions), mean(recalls)))

                    # Periodic checkpoint.
                    if current_step % config.trainConfig.checkpoint_every == 0:
                        path = saver.save(sess, "model/my-model", global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                        print("Saved model checkpoint to {}\n".format(path))
# Policy-gradient term: the per-sample loss minus the baseline is treated as a
# constant (stop_gradient) and weights the sequence log-probability.
policy_gradient_loss = tf.reduce_mean(
    tf.stop_gradient(final_loss_per_sample - baseline) * log_seq_prob)

# Full objective: policy-gradient term + average sample loss, plus the
# weighted entropy and L2 regularizers exposed by the model.
total_training_loss = policy_gradient_loss + avg_sample_loss
total_loss = tf.add_n([
    total_training_loss, lambda_entropy * nmn3_model.entropy_reg,
    weight_decay * nmn3_model.l2_reg
])

# Train with Adam (default hyper-parameters)
solver = tf.train.AdamOptimizer()
gradients = solver.compute_gradients(total_loss)

# Clip each gradient by L2 norm. compute_gradients returns a None gradient for
# any variable not connected to the loss; tf.clip_by_norm cannot take None, so
# those pairs are passed through unchanged (apply_gradients skips them).
gradients = [
    (tf.clip_by_norm(g, max_grad_l2_norm) if g is not None else None, v)
    for g, v in gradients
]
solver_op = solver.apply_gradients(gradients)

# Training operation
# Partial-run can't fetch training operations, so as a workaround expose a
# fetchable constant that depends on the solver and baseline updates.
with tf.control_dependencies([solver_op, baseline_update_op]):
    train_step = tf.constant(0)

# Write summary to TensorBoard; the placeholders below are fed with values
# computed outside the graph.
os.makedirs(log_dir, exist_ok=True)
log_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
loss_ph = tf.placeholder(tf.float32, [])
entropy_ph = tf.placeholder(tf.float32, [])
accuracy_ph = tf.placeholder(tf.float32, [])
baseline_ph = tf.placeholder(tf.float32, [])
Example #27
0
def get_train_ops(loss,
                  tf_variables,
                  train_step,
                  clip_mode=None,
                  grad_bound=None,
                  l2_reg=1e-4,
                  lr_warmup_val=None,
                  lr_warmup_steps=100,
                  lr_init=0.1,
                  lr_dec_start=0,
                  lr_dec_every=10000,
                  lr_dec_rate=0.1,
                  lr_dec_min=None,
                  lr_cosine=False,
                  lr_max=None,
                  lr_min=None,
                  lr_T_0=None,
                  lr_T_mul=None,
                  num_train_batches=None,
                  optim_algo=None,
                  sync_replicas=False,
                  num_aggregate=None,
                  num_replicas=None,
                  get_grad_norms=False,
                  moving_average=None):
    """Build the training op, the learning-rate tensor and gradient norms.

    Args:
      loss: scalar loss tensor to minimize.
      tf_variables: variables to differentiate `loss` against and to update.
      train_step: global-step variable; passed as `global_step` to
        `apply_gradients` and used to drive the learning-rate schedule.
      clip_mode: "global", "norm", or None.
      grad_bound: clipping threshold; required when `clip_mode` is set.
      l2_reg: coefficient of the sum-of-squares penalty over `tf_variables`
        added to `loss`; the penalty is skipped when this is not positive.
      lr_warmup_val: if not None, learning rate used while
        `train_step < lr_warmup_steps`.
      lr_warmup_steps: number of warm-up steps (see `lr_warmup_val`).
      lr_init, lr_dec_start, lr_dec_every, lr_dec_rate: parameters of the
        staircase exponential decay used when `lr_cosine` is false.
      lr_dec_min: optional lower bound on the exponentially decayed rate.
      lr_cosine: use a cosine schedule with restarts instead of exponential
        decay; requires `lr_max`, `lr_min`, `lr_T_0`, `lr_T_mul` and
        `num_train_batches`.
      lr_max, lr_min: upper and lower learning rates of the cosine schedule.
      lr_T_0: initial period (in epochs) of the cosine schedule.
      lr_T_mul: factor the period is multiplied by at every restart.
      num_train_batches: batches per epoch, used to turn `train_step` into
        an epoch index.
      optim_algo: "momentum", "sgd" or "adam".
      sync_replicas: wrap the optimizer in `tf.train.SyncReplicasOptimizer`;
        requires `num_aggregate` and `num_replicas`.
      num_aggregate: `replicas_to_aggregate` for the sync optimizer.
      num_replicas: `total_num_replicas` for the sync optimizer.
      get_grad_norms: also return the per-variable gradient norms.
      moving_average: store the moving average of parameters; if not None it
        is the decay of `tf.contrib.opt.MovingAverageOptimizer`.

    Returns:
      `(train_op, learning_rate, grad_norm, opt)`, followed by a dict mapping
      variable name to its (pre-clipping) gradient norm when
      `get_grad_norms` is true.

    Raises:
      NotImplementedError: if `clip_mode` is not a supported value.
      ValueError: if `optim_algo` is not a supported value.
    """

    if l2_reg > 0:
        l2_losses = []
        for var in tf_variables:
            l2_losses.append(tf.reduce_sum(var**2))
        l2_loss = tf.add_n(l2_losses)
        loss += l2_reg * l2_loss  # loss = loss + l2_reg * l2_loss

    grads = tf.gradients(loss, tf_variables)
    # Norms are measured before any clipping is applied.
    grad_norm = tf.global_norm(grads)

    grad_norms = {}
    for v, g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        if isinstance(g, tf.IndexedSlices):
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values**2))
        else:
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g**2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if g is None:
                    # Variable does not influence the loss; keep the None so
                    # the gradient list stays aligned with tf_variables.
                    c_g = None
                elif isinstance(g, tf.IndexedSlices):
                    # IndexedSlices takes (values, indices, dense_shape).
                    c_g = tf.IndexedSlices(
                        tf.clip_by_norm(g.values, grad_bound), g.indices,
                        g.dense_shape)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # Append the clipped gradient, not the raw one.
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError("Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, ("Need num_train_batches to use"
                                               " lr_cosine")

        # train_step counts batches, so integer division gives the epoch.
        curr_epoch = train_step // num_train_batches

        last_reset = tf.Variable(0,
                                 dtype=tf.int32,
                                 trainable=False,
                                 name="last_reset")
        T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i")
        T_curr = curr_epoch - last_reset

        def _update():
            # Period elapsed: restart the schedule and lengthen the period.
            update_last_reset = tf.assign(last_reset,
                                          curr_epoch,
                                          use_locking=True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
                lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        def _no_update():
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        learning_rate = tf.cond(tf.greater_equal(T_curr, T_i), _update,
                                _no_update)
    else:
        learning_rate = tf.train.exponential_decay(
            lr_init,
            tf.maximum(train_step - lr_dec_start, 0),
            lr_dec_every,
            lr_dec_rate,
            staircase=True)
        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps),
                                lambda: lr_warmup_val, lambda: learning_rate)

    if optim_algo == "momentum":
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         0.9,
                                         use_locking=True,
                                         use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate,
                                                use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=0.0,
                                     epsilon=1e-3,
                                     use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync."
        assert num_replicas is not None, "Need num_replicas to sync."

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_aggregate,
            total_num_replicas=num_replicas,
            use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(
            opt, average_decay=moving_average)

    train_op = opt.apply_gradients(zip(grads, tf_variables),
                                   global_step=train_step)

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    else:
        return train_op, learning_rate, grad_norm, opt
Example #28
0
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Builds the EffNetV2 logits, the cross-entropy + L2 loss and, depending on
    `mode`, either the training op or the evaluation metrics.

    Args:
      features: A dict of `Tensor` of batched images and other features, or
        the image `Tensor` itself.
      labels: a Tensor or a dict of Tensor representing the batched labels.
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. This function also reads `params['config']`,
        `params['image_size']` and `params['steps_per_epoch']`.

    Returns:
      A `TPUEstimatorSpec` for the model
    """
    logging.info('params=%s', params)
    images = features['image'] if isinstance(features, dict) else features
    labels = labels['label'] if isinstance(labels, dict) else labels
    config = params['config']
    image_size = params['image_size']
    utils.scalar('model/resolution', image_size)

    if config.model.data_format == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (config.train.ema_decay > 0)
    if FLAGS.use_tpu and not config.model.bn_type:
        config.model.bn_type = 'tpu_bn'
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)

    def build_model(in_images):
        """Build model using the model_name given through the command line."""
        config.model.num_classes = config.data.num_classes
        model = effnetv2_model.EffNetV2Model(config.model.model_name,
                                             config.model)
        logits = model(in_images, training=is_training)[0]
        return logits

    # Snapshot params/flops before building so the backbone's own share can
    # be obtained by subtraction afterwards.
    pre_num_params, pre_num_flops = utils.num_params_flops(
        readable_format=True)

    if config.runtime.mixed_precision:
        precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16'
        logits = utils.build_model_with_precision(precision, build_model,
                                                  images, is_training)
        # The loss below is computed in float32.
        logits = tf.cast(logits, tf.float32)
    else:
        logits = build_model(images)

    num_params, num_flops = utils.num_params_flops(readable_format=True)
    num_params = num_params - pre_num_params
    num_flops = (num_flops - pre_num_flops) / params['batch_size']
    logging.info('backbone params/flops = %.4f M / %.4f B', num_params,
                 num_flops)
    utils.scalar('model/params', num_params)
    utils.scalar('model/flops', num_flops)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    if config.train.loss_type == 'sigmoid':
        cross_entropy = tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(labels, dtype=logits.dtype),
            logits=logits,
            label_smoothing=config.train.label_smoothing)
    elif config.train.loss_type == 'custom':
        xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(
            labels, dtype=logits.dtype),
                                                       logits=logits)
        cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1))
    else:
        if config.data.multiclass:
            logging.info('use multi-class loss: %s', config.data.multiclass)
            # Normalize each row of labels so it sums to one.
            labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1))
        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=config.train.label_smoothing)

    train_steps = max(config.train.min_steps,
                      config.train.epochs * params['steps_per_epoch'])
    global_step = tf.train.get_global_step()
    # Weight decay grows linearly with training progress when
    # weight_decay_inc is non-zero.
    weight_decay_inc = config.train.weight_decay_inc * (
        tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32))
    weight_decay = (1 + weight_decay_inc) * config.train.weight_decay
    utils.scalar('train/weight_decay', weight_decay)
    # Add weight decay to the loss for non-batch-normalization variables.
    matcher = re.compile(config.train.weight_decay_exclude)
    l2loss = weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables() if not matcher.match(v.name)
    ])
    loss = cross_entropy + l2loss
    utils.scalar('loss/l2reg', l2loss)
    utils.scalar('loss/xent', cross_entropy)

    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        utils.scalar('train/epoch', current_epoch)

        # Learning rates are scaled linearly with batch size (base = 256).
        scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0)
        scaled_lr_min = config.train.lr_min * (config.train.batch_size / 256.0)
        learning_rate = utils.WarmupLearningRateSchedule(
            scaled_lr,
            steps_per_epoch=params['steps_per_epoch'],
            decay_epochs=config.train.lr_decay_epoch,
            warmup_epochs=config.train.lr_warmup_epoch,
            decay_factor=config.train.lr_decay_factor,
            lr_decay_type=config.train.lr_sched,
            total_steps=train_steps,
            minimal_lr=scaled_lr_min)(global_step)
        utils.scalar('train/lr', learning_rate)
        optimizer = utils.build_optimizer(
            learning_rate, optimizer_name=config.train.optimizer)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # filter trainable variables if needed.
        var_list = tf.trainable_variables()
        if config.train.varsexp:
            vars2 = [
                v for v in var_list if re.match(config.train.varsexp, v.name)
            ]
            if len(vars2) == len(var_list):
                # Report the pattern that was actually applied as the filter.
                logging.warning('%s has no match.', config.train.varsexp)
            logging.info('Filter variables: orig=%d, final=%d, delta=%d',
                         len(var_list), len(vars2),
                         len(var_list) - len(vars2))
            var_list = vars2

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if config.train.gclip:
            logging.info('clip gradients norm by %f', config.train.gclip)
            grads_and_vars = optimizer.compute_gradients(loss, var_list)
            with tf.name_scope('gclip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                utils.scalar('train/gnorm', tf.linalg.global_norm(grads))
                # Skip None gradients here, as the clipping below does.
                utils.scalar(
                    'train/gnormmax',
                    tf.math.reduce_max(
                        [tf.norm(g) for g in grads if g is not None]))
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(config.train.gclip)
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss,
                                              global_step,
                                              var_list=var_list)

        if has_moving_average_decay:
            # Update the moving averages only after the optimizer step ran.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not config.runtime.skip_host_call:
            host_call = utils.get_tpu_host_call(
                global_step, FLAGS.model_dir,
                config.runtime.iterations_per_loop)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

            Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            metrics = {}
            if config.data.multiclass:
                metrics['eval/global_ap'] = tf.metrics.auc(
                    labels,
                    tf.nn.sigmoid(logits),
                    curve='PR',
                    num_thresholds=200,
                    summation_method='careful_interpolation',
                    name='global_ap')

                # Convert labels to set: be careful, tf.metrics.xx_at_k are horrible.
                labels = tf.cast(labels, dtype=tf.int64)
                label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1),
                                                 axis=-1)
                all_labels_set = tf.range(0, labels.shape[-1], dtype=tf.int64)
                all_labels_set = tf.expand_dims(all_labels_set, axis=0)
                # Positive positions keep their class index; the others are
                # filled with the argmax class index.
                labels_set = labels * all_labels_set + (
                    1 - labels) * label_to_repeat

                metrics['eval/precision@1'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=1)
                metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=1)
                metrics['eval/precision@5'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=5)
                metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=5)

            # always add accuracy.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            metrics['eval/acc_top1'] = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5)
            metrics['model/resolution'] = tf.metrics.mean(image_size)
            metrics['model/flops'] = tf.metrics.mean(num_flops)
            metrics['model/params'] = tf.metrics.mean(num_params)
            return metrics

        eval_metrics = (metric_fn, [labels, logits])

    if has_moving_average_decay and not is_training:

        def scaffold_fn():  # read ema for eval jobs.
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    elif config.train.ft_init_ckpt and is_training:

        def scaffold_fn():
            logging.info('restore variables from %s',
                         config.train.ft_init_ckpt)
            var_map = utils.get_ckpt_var_map(
                ckpt_path=config.train.ft_init_ckpt,
                skip_mismatch=True,
                init_ema=config.train.ft_init_ema)
            tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map)
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
 def clip_if_not_none(grad, clip_norm=5.):
     """Return `grad` clipped to L2 norm `clip_norm`; pass None through."""
     # A None gradient is handed back untouched instead of being clipped.
     return None if grad is None else tf.clip_by_norm(grad, clip_norm)
Example #30
0
def gradient_clipping(optimizer, computed_loss, learning_rate, beta1,
                      clip_norm=5):
    """Build an op that applies per-tensor norm-clipped gradients.

    Args:
      optimizer: object providing `compute_gradients(loss)` and
        `apply_gradients(grads_and_vars)`.
      computed_loss: loss passed to `optimizer.compute_gradients`.
      learning_rate: unused; kept so existing callers keep working.
      beta1: unused; kept so existing callers keep working.
      clip_norm: maximum L2 norm allowed for each gradient tensor.

    Returns:
      Whatever `optimizer.apply_gradients` returns for the clipped pairs.
    """
    grads_and_vars = optimizer.compute_gradients(computed_loss)
    clipped_grads = [
        # A None gradient (variable unrelated to the loss) is passed through
        # unchanged rather than handed to tf.clip_by_norm.
        (grad if grad is None else tf.clip_by_norm(grad, clip_norm), var)
        for grad, var in grads_and_vars
    ]
    return optimizer.apply_gradients(clipped_grads)