Example #1
def bacthnorm(inputs, scope, epsilon=1e-05, momentum=0.99, is_training=True):
    inputs_shape = inputs.get_shape().as_list()  # input shape as a list
    params_shape = inputs_shape[-1:]  # parameter shape: the last (channel) dimension
    axis = list(range(len(inputs_shape) - 1))

    with tf.variable_scope(scope):
        beta = create_variable("beta", params_shape,
                               initializer=tf.zeros_initializer())
        gamma = create_variable("gamma", params_shape,
                                initializer=tf.ones_initializer())
        # moving mean: not trainable, used at inference time
        moving_mean = create_variable("moving_mean", params_shape,
                            initializer=tf.zeros_initializer(), trainable=False)
        # moving variance: not trainable, used at inference time
        moving_variance = create_variable("moving_variance", params_shape,
                            initializer=tf.ones_initializer(), trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(inputs, axes=axis)  # batch mean and variance
        # update the moving averages, folding in the previous value:
        # x_t = a * x_{t-1} + (1 - a) * x_now
        update_move_mean = moving_averages.assign_moving_average(moving_mean,
                                                mean, decay=momentum)
        update_move_variance = moving_averages.assign_moving_average(moving_variance,
                                                variance, decay=momentum)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_mean)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon)
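
The moving-average comment above is a plain exponential moving average of the batch statistics. A minimal NumPy sketch of that update rule (standalone and illustrative; decay plays the role of momentum in the function above):

import numpy as np

def ema_update(moving, batch_value, decay=0.99):
    # moving <- decay * moving + (1 - decay) * batch_value,
    # the same rule moving_averages.assign_moving_average applies in the graph.
    return decay * moving + (1.0 - decay) * batch_value

moving_mean = np.zeros(4)
for _ in range(2000):
    batch_mean = np.random.randn(4) + 5.0  # pretend per-batch mean
    moving_mean = ema_update(moving_mean, batch_mean)
# moving_mean is now close to the true mean (~5.0)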
Example #2
def batch_norm(x, decay=0.999, epsilon=1e-03, is_training=True,
               scope="scope"):
    x_shape = x.get_shape()
    num_inputs = x_shape[-1]
    reduce_dims = list(range(len(x_shape) - 1))
    with tf.variable_scope(scope):
        beta = create_var("beta", [num_inputs,],
                               initializer=tf.zeros_initializer())
        gamma = create_var("gamma", [num_inputs,],
                                initializer=tf.ones_initializer())
        # for inference
        moving_mean = create_var("moving_mean", [num_inputs,],
                                 initializer=tf.zeros_initializer(),
                                 trainable=False)
        moving_variance = create_var("moving_variance", [num_inputs],
                                     initializer=tf.ones_initializer(),
                                     trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(x, axes=reduce_dims)
        update_move_mean = moving_averages.assign_moving_average(moving_mean,
                                                mean, decay=decay)
        update_move_variance = moving_averages.assign_moving_average(moving_variance,
                                                variance, decay=decay)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_mean)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon)
Example #3
 def _batch_norm_without_layers(self, input_layer, decay, use_scale,
                                epsilon):
     """Batch normalization on `input_layer` without tf.layers."""
     shape = input_layer.shape
     num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
     beta = self.get_variable(
         'beta', [num_channels],
         tf.float32,
         tf.float32,
         initializer=tf.zeros_initializer())
     if use_scale:
         gamma = self.get_variable(
             'gamma', [num_channels],
             tf.float32,
             tf.float32,
             initializer=tf.ones_initializer())
     else:
         gamma = tf.constant(1.0, tf.float32, [num_channels])
     moving_mean = tf.get_variable(
         'moving_mean', [num_channels],
         tf.float32,
         initializer=tf.zeros_initializer(),
         trainable=False)
     moving_variance = tf.get_variable(
         'moving_variance', [num_channels],
         tf.float32,
         initializer=tf.ones_initializer(),
         trainable=False)
     if self.phase_train:
         bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
             input_layer,
             gamma,
             beta,
             epsilon=epsilon,
             data_format=self.data_format,
             is_training=True)
         mean_update = moving_averages.assign_moving_average(
             moving_mean, batch_mean, decay=decay, zero_debias=False)
         variance_update = moving_averages.assign_moving_average(
             moving_variance,
             batch_variance,
             decay=decay,
             zero_debias=False)
         tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
         tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
     else:
         bn, _, _ = tf.nn.fused_batch_norm(
             input_layer,
             gamma,
             beta,
             mean=moving_mean,
             variance=moving_variance,
             epsilon=epsilon,
             data_format=self.data_format,
             is_training=False)
     return bn
Example #4
    def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None):
        """A normalizer that ensures that observations are approximately distributed according to
        a standard Normal distribution (i.e. have mean zero and variance one).

        Args:
            size (int): the size of the observation to be normalized
            eps (float): a small constant that avoids underflows
            default_clip_range (float): normalized observations are clipped to be in
                [-default_clip_range, default_clip_range]
            sess (object): the TensorFlow session to be used
        """
        self.size = size
        self.eps = eps
        self.default_clip_range = default_clip_range
        self.sess = sess if sess is not None else tf.get_default_session()

        self.local_sum = np.zeros(self.size, np.float32)
        self.local_sumsq = np.zeros(self.size, np.float32)
        self.local_count = np.zeros(1, np.float32)

        self.sum_tf = tf.get_variable(
            initializer=tf.zeros_initializer(), shape=self.local_sum.shape, name='sum',
            trainable=False, dtype=tf.float32)
        self.sumsq_tf = tf.get_variable(
            initializer=tf.zeros_initializer(), shape=self.local_sumsq.shape, name='sumsq',
            trainable=False, dtype=tf.float32)
        self.count_tf = tf.get_variable(
            initializer=tf.ones_initializer(), shape=self.local_count.shape, name='count',
            trainable=False, dtype=tf.float32)
        self.mean = tf.get_variable(
            initializer=tf.zeros_initializer(), shape=(self.size,), name='mean',
            trainable=False, dtype=tf.float32)
        self.std = tf.get_variable(
            initializer=tf.ones_initializer(), shape=(self.size,), name='std',
            trainable=False, dtype=tf.float32)
        self.count_pl = tf.placeholder(name='count_pl', shape=(1,), dtype=tf.float32)
        self.sum_pl = tf.placeholder(name='sum_pl', shape=(self.size,), dtype=tf.float32)
        self.sumsq_pl = tf.placeholder(name='sumsq_pl', shape=(self.size,), dtype=tf.float32)

        self.update_op = tf.group(
            self.count_tf.assign_add(self.count_pl),
            self.sum_tf.assign_add(self.sum_pl),
            self.sumsq_tf.assign_add(self.sumsq_pl)
        )
        self.recompute_op = tf.group(
            tf.assign(self.mean, self.sum_tf / self.count_tf),
            tf.assign(self.std, tf.sqrt(tf.maximum(
                tf.square(self.eps),
                self.sumsq_tf / self.count_tf - tf.square(self.sum_tf / self.count_tf)
            ))),
        )
        self.lock = threading.Lock()
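
For reference, recompute_op above uses the streaming identities mean = sum / count and var = sumsq / count - mean^2, with eps guarding the standard deviation from below. A small NumPy sketch of the same computation (the function name is illustrative, not part of the class):

import numpy as np

def recompute_stats(total_sum, total_sumsq, count, eps=1e-2):
    mean = total_sum / count
    # std = sqrt(max(eps^2, E[x^2] - E[x]^2)), mirroring recompute_op above
    var = total_sumsq / count - np.square(mean)
    std = np.sqrt(np.maximum(np.square(eps), var))
    return mean, std

x = np.random.randn(10000, 3) * 2.0 + 1.0
mean, std = recompute_stats(x.sum(axis=0), np.square(x).sum(axis=0), float(x.shape[0]))
# mean is close to 1.0 and std is close to 2.0 per column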
Example #5
def initialize_model(sess, train_data_flat, train_labels):
  """Reproduce model from train-on-mnist/mnist_lbfgs"""

  dtype = tf.float64
  batchSize = 100
  learningRate = 0.1

  W = tf.Variable(tf.ones_initializer((1024, 10), dtype=dtype))
  b = tf.Variable(tf.ones_initializer((1, 10), dtype=dtype))
  x = tf.Variable(tf.zeros_initializer((batchSize, 1024), dtype=dtype))
  targets = tf.Variable(tf.zeros_initializer((batchSize, 10), dtype=dtype))
  logits = tf.matmul(x, W) + b

  # cross entropy expects batch dimension to be first, transpose inputs
  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, targets)
  cross_entropy_loss = tf.reduce_mean(cross_entropy)
  Wnorm = tf.reduce_sum(tf.square(W))
  bnorm = tf.reduce_sum(tf.square(b))
  loss = cross_entropy_loss + (bnorm + Wnorm)/2
  loss_handle_op = tf.get_session_handle(loss)

  # grads = tf.gradients(loss, [W, b])
  opt = tf.train.GradientDescentOptimizer(learning_rate=learningRate)
  grads_and_vars = opt.compute_gradients(loss, [W, b])
  train_step = opt.apply_gradients(grads_and_vars)

  W_grad = grads_and_vars[0][0]
  b_grad = grads_and_vars[1][0]
  flat_grad = concat_flatten([tf.transpose(W_grad), b_grad])
  flat_grad_handle_op = tf.get_session_handle(flat_grad)
  flat_params = concat_flatten([tf.transpose(W), b])

  # initialize x and targets
  x_placeholder = tf.placeholder(dtype=dtype)
  x_init = x.assign(x_placeholder)

  # initialize labels
  labels_placeholder = tf.placeholder(shape=(batchSize), dtype=tf.int32)
  # Lua labels are off-by-one hence -1
  labels_onehot = tf.one_hot(labels_placeholder - 1, 10, dtype=dtype)
  targets_init = targets.assign(labels_onehot)

  sess.run(x_init, feed_dict={x_placeholder:train_data_flat[:batchSize]})
  sess.run(targets_init, feed_dict={labels_placeholder:
                                    train_labels[:batchSize]})
  sess.run([W.initializer, b.initializer])
  [(Wgrad, W), (bgrad, b)] = grads_and_vars
  return [loss, loss_handle_op, flat_params, flat_grad, flat_grad_handle_op,
          W, b, train_step]
Example #6
def layer_norm(x: tf.Tensor, epsilon: float = 1e-6) -> tf.Tensor:
    """Layer normalize the tensor x, averaging over the last dimension.

    Implementation based on tensor2tensor.

    Arguments:
        x: The ``Tensor`` to normalize.
        epsilon: The smoothing parameter of the normalization.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope("LayerNorm"):
        gamma = get_variable(
            name="gamma",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.ones_initializer())
        beta = get_variable(
            name="beta",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(
            tf.square(x - mean),
            axis=[-1],
            keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * gamma + beta
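
As a sanity check of the formula above, here is a hedged NumPy version of the same last-axis layer normalization, with gamma and beta left at their initial values of one and zero:

import numpy as np

def layer_norm_np(x, gamma=1.0, beta=0.0, epsilon=1e-6):
    # normalize over the last dimension, as in the TF code above
    mean = x.mean(axis=-1, keepdims=True)
    variance = np.mean(np.square(x - mean), axis=-1, keepdims=True)
    norm_x = (x - mean) / np.sqrt(variance + epsilon)
    return norm_x * gamma + beta

out = layer_norm_np(np.random.randn(2, 5))
# each row of `out` now has mean ~0 and variance ~1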
Example #7
  def call(self, x, h):
    channels = x.shape[self._feature_axis].value

    with tf.variable_scope('gates'):
      inputs = tf.concat([x, h], axis=self._feature_axis)
      n = channels + self._filters
      m = 2 * self._filters if self._filters > 1 else 2
      W = tf.get_variable('kernel', self._kernel + [n, m])
      y = tf.nn.convolution(inputs, W, 'SAME', data_format=self._data_format)
      if self._normalize:
        r, u = tf.split(y, 2, axis=self._feature_axis)
        r = tf.contrib.layers.layer_norm(r)
        u = tf.contrib.layers.layer_norm(u)
      else:
        y += tf.get_variable('bias', [m], initializer=tf.ones_initializer())
        r, u = tf.split(y, 2, axis=self._feature_axis)
      r, u = tf.sigmoid(r), tf.sigmoid(u)

      # TODO
      #tf.summary.histogram('reset_gate', r)
      #tf.summary.histogram('update_gate', u)

    with tf.variable_scope('candidate'):
      inputs = tf.concat([x, r * h], axis=self._feature_axis)
      n = channels + self._filters
      m = self._filters
      W = tf.get_variable('kernel', self._kernel + [n, m])
      y = tf.nn.convolution(inputs, W, 'SAME', data_format=self._data_format)
      if self._normalize:
        y = tf.contrib.layers.layer_norm(y)
      else:
        y += tf.get_variable('bias', [m], initializer=tf.zeros_initializer())
      h = u * h + (1 - u) * self._activation(y)

    return h, h
Example #8
 def _network_template(self, state):
   # This dummy network allows us to deterministically anticipate that
   # action 0 will be selected by an argmax.
   inputs = tf.constant(
       np.zeros((state.shape[0], stack_size)), dtype=tf.float32)
   # In Rainbow we are dealing with a distribution over Q-values,
   # which are represented as num_atoms bins, ranging from -vmax to vmax.
   # The output layer will have num_actions * num_atoms elements,
   # so each group of num_atoms weights represent the logits for a
   # particular action. By setting 1s everywhere, except for the first
   # num_atoms (representing the logits for the first action), which are
   # set to np.arange(num_atoms), we are ensuring that the first action
   # places higher weight on higher Q-values; this results in the first
   # action being chosen.
   first_row = np.tile(np.ones(self._num_atoms), self.num_actions - 1)
   first_row = np.concatenate((np.arange(self._num_atoms), first_row))
   bottom_rows = np.tile(
       np.ones(self.num_actions * self._num_atoms), (stack_size - 1, 1))
   weights_initializer = np.concatenate(([first_row], bottom_rows))
   net = slim.fully_connected(
       inputs,
       self.num_actions * self._num_atoms,
       weights_initializer=tf.constant_initializer(weights_initializer),
       biases_initializer=tf.ones_initializer(),
       activation_fn=None)
   logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms])
   probabilities = tf.contrib.layers.softmax(logits)
   qs = tf.reduce_sum(self._support * probabilities, axis=2)
   return self._get_network_type()(qs, logits, probabilities)
Example #9
    def test_basic_rnn_cell(self):
        """see test_basic_rnn_cell.png for the graph"""
        batch_size = 1
        input_shape = [batch_size, 2]
        state_shape = [batch_size, 3]
        num_units = 4  # should be equal to state_shape[1] to be recurrent

        input_value = np.random.rand(*input_shape)
        state_value = np.random.rand(*state_shape)
        np_result = TestRNNCells._basic_linear(input_value, state_value, num_units)

        with tf.Session() as sess:
            with tf.variable_scope('test_basic_rnn_cell', initializer=tf.ones_initializer()):
                inputs = tf.placeholder(tf.float32, input_shape, 'inputs')
                prev_state = tf.placeholder(tf.float32, state_shape, 'prev_state')

                cell = tf.contrib.rnn.BasicRNNCell(num_units)
                output_op, new_state_op = cell(inputs, prev_state)

                self.assertIsInstance(output_op, tf.Tensor)

                tf.summary.FileWriter('/tmp/test_basic_rnn_cell', sess.graph)
                sess.run(tf.global_variables_initializer())

                output, new_state = sess.run([output_op, new_state_op],
                                             feed_dict={
                                                 inputs: input_value,
                                                 prev_state: state_value
                                             })

                self.assertIsInstance(output, np.ndarray)
                self.assertEqual(output.shape, (batch_size, num_units))
                self.assertTrue(np.array_equal(output, new_state))
                np.testing.assert_array_almost_equal(np_result, output)
Example #10
File: ops.py Project: gdahia/DLF
def conv2d_zeros(x,
                 width,
                 filter_size=[3, 3],
                 stride=[1, 1],
                 pad="SAME",
                 logscale_factor=3,
                 skip=1,
                 edge_bias=True,
                 name=None):
    with tf.variable_scope(name, "conv2d"):
        if edge_bias and pad == "SAME":
            x = add_edge_padding(x, filter_size)
            pad = 'VALID'

        n_in = int(x.get_shape()[3])
        stride_shape = [1] + stride + [1]
        filter_shape = filter_size + [n_in, width]
        w = tf.get_variable("W", filter_shape, tf.float32,
                            initializer=tf.zeros_initializer())
        if skip == 1:
            x = tf.nn.conv2d(x, w, stride_shape, pad, data_format='NHWC')
        else:
            assert stride[0] == 1 and stride[1] == 1
            x = tf.nn.atrous_conv2d(x, w, skip, pad)
        x += tf.get_variable("b", [1, 1, 1, width],
                             initializer=tf.ones_initializer())
        x *= tf.exp(tf.get_variable("logs",
                                    [1, width], initializer=tf.zeros_initializer()) * logscale_factor)
    return x
Example #11
    def get_logits(self, image):
        gauss_init = tf.random_normal_initializer(stddev=0.01)
        with argscope(Conv2D,
                      kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
                argscope([Conv2D, FullyConnected], activation=tf.nn.relu), \
                argscope([Conv2D, MaxPooling], data_format='channels_last'):
            # necessary padding to get 55x55 after conv1
            image = tf.pad(image, [[0, 0], [2, 2], [2, 2], [0, 0]])
            l = Conv2D('conv1', image, filters=96, kernel_size=11, strides=4, padding='VALID')
            # size: 55
            visualize_conv1_weights(l.variables.W)
            l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm1')
            l = MaxPooling('pool1', l, 3, strides=2, padding='VALID')
            # 27
            l = Conv2D('conv2', l, filters=256, kernel_size=5, split=2)
            l = tf.nn.lrn(l, 2, bias=1.0, alpha=2e-5, beta=0.75, name='norm2')
            l = MaxPooling('pool2', l, 3, strides=2, padding='VALID')
            # 13
            l = Conv2D('conv3', l, filters=384, kernel_size=3)
            l = Conv2D('conv4', l, filters=384, kernel_size=3, split=2)
            l = Conv2D('conv5', l, filters=256, kernel_size=3, split=2)
            l = MaxPooling('pool3', l, 3, strides=2, padding='VALID')

            l = FullyConnected('fc6', l, 4096,
                               kernel_initializer=gauss_init,
                               bias_initializer=tf.ones_initializer())
            l = Dropout(l, rate=0.5)
            l = FullyConnected('fc7', l, 4096, kernel_initializer=gauss_init)
            l = Dropout(l, rate=0.5)
        logits = FullyConnected('fc8', l, 1000, kernel_initializer=gauss_init)
        return logits
Example #12
def batch_norm(inputs, name_scope, is_training, epsilon=1e-3, decay=0.99):
    with tf.variable_scope(name_scope):
        size = inputs.get_shape().as_list()[1]

        gamma = tf.get_variable(
            'gamma', [size], initializer=tf.constant_initializer(0.1))
        # beta = tf.get_variable('beta', [size], initializer=tf.constant_initializer(0))
        beta = tf.get_variable('beta', [size])

        pop_mean = tf.get_variable('pop_mean', [size],
                                   initializer=tf.zeros_initializer(), trainable=False)
        pop_var = tf.get_variable('pop_var', [size],
                                  initializer=tf.ones_initializer(), trainable=False)
        batch_mean, batch_var = tf.nn.moments(inputs, [0])

        train_mean_op = tf.assign(
            pop_mean, pop_mean * decay + batch_mean * (1 - decay))
        train_var_op = tf.assign(
            pop_var, pop_var * decay + batch_var * (1 - decay))

        def batch_statistics():
            with tf.control_dependencies([train_mean_op, train_var_op]):
                return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon)

        def pop_statistics():
            return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, epsilon)

        # control flow
        return tf.cond(is_training, batch_statistics, pop_statistics)
Example #13
def batch_norm(x, name_scope, training, epsilon=1e-3, decay=0.999):
    """Assume 2d [batch, values] tensor"""

    with tf.variable_scope(name_scope):
        size = x.get_shape().as_list()[1]

        scale = tf.get_variable('scale', [size],
            initializer=tf.constant_initializer(0.1))
        offset = tf.get_variable('offset', [size])

        pop_mean = tf.get_variable('pop_mean', [size],
            initializer=tf.zeros_initializer(),
            trainable=False)
        pop_var = tf.get_variable('pop_var', [size],
            initializer=tf.ones_initializer(),
            trainable=False)
        batch_mean, batch_var = tf.nn.moments(x, [0])

        train_mean_op = tf.assign(
            pop_mean,
            pop_mean * decay + batch_mean * (1 - decay))
        train_var_op = tf.assign(
            pop_var,
            pop_var * decay + batch_var * (1 - decay))

        def batch_statistics():
            with tf.control_dependencies([train_mean_op, train_var_op]):
                return tf.nn.batch_normalization(x, batch_mean, batch_var, offset, scale, epsilon)

        def population_statistics():
            return tf.nn.batch_normalization(x, pop_mean, pop_var, offset, scale, epsilon)

        return tf.cond(training, batch_statistics, population_statistics)
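
Because the final tf.cond expects `training` to be a boolean tensor rather than a Python bool, this function is typically driven through a placeholder. An illustrative TF 1.x usage sketch (tensor and scope names are made up for the example):

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
y = batch_norm(x, 'bn_demo', training=is_training)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.randn(8, 4).astype(np.float32)
    y_train = sess.run(y, {x: batch, is_training: True})   # batch statistics
    y_infer = sess.run(y, {x: batch, is_training: False})  # population statistics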
Example #14
 def __init__ (self, name, inputs, training, data_format, start=None, end=None, weights=None,
                     weight_scope=None, fake=False):
     super(BatchNorm, self).__init__(name = name, start=start, end=end)
     self.fake = fake
     if not self.fake:
         if weights is not None:
             params_name = weight_scope + '/' + str(name) + '/batch_normalization/'
             np_dict = load_pkl_obj(weights)
             beta_np = np_dict[params_name+'beta:0']
             gamma_np = np_dict[params_name+'gamma:0']
             moving_mean_np = np_dict[params_name+'moving_mean:0']
             moving_variance_np = np_dict[params_name+'moving_variance:0']
             in_shp = inputs.shape.as_list()[1]
             if not beta_np.shape[0] == in_shp:
                 beta_np = np.resize(beta_np, (in_shp,))
                 gamma_np = np.resize(gamma_np, (in_shp,))
                 moving_mean_np = np.resize(moving_mean_np, (in_shp))
                 moving_variance_np = np.resize(moving_variance_np, (in_shp))
             beta_initializer = tf.constant_initializer(beta_np)
             gamma_initializer = tf.constant_initializer(gamma_np)
             moving_mean_initializer = tf.constant_initializer(moving_mean_np)
             moving_variance_initializer = tf.constant_initializer(moving_variance_np)            
         else:
             beta_initializer = tf.zeros_initializer()
             gamma_initializer = tf.ones_initializer()
             moving_mean_initializer = tf.zeros_initializer()
             moving_variance_initializer = tf.ones_initializer()            
         with tf.variable_scope(self._name):
             self.output=tf.layers.batch_normalization(inputs=inputs,
                                                     axis=1 if data_format == 'channels_first' else 3,
                                                     momentum=_BATCH_NORM_DECAY,
                                                     epsilon=_BATCH_NORM_EPSILON,
                                                     center=True,
                                                     scale=True,
                                                     training=training,
                                                     beta_initializer=beta_initializer,
                                                     gamma_initializer=gamma_initializer,
                                                     moving_mean_initializer=moving_mean_initializer,
                                                     moving_variance_initializer=moving_variance_initializer,
                                                     fused=True )
         self._tf_name = self.output.name.split('/')[0] + '/' + self.output.name.split('/')[1]
     else:
         assert isinstance(inputs, Fake)
         self.output=Fake(inputs.shape)
         self.param=Fake(inputs.shape[1] * 4)
     self.description.append('BatchNorm')
     self.description.append(self.get_memory_footprint())
Example #15
def main(_):
  ed.set_seed(42)

  # DATA
  x_data = build_toy_dataset(FLAGS.N)

  # MODEL
  pi = Dirichlet(concentration=tf.ones(FLAGS.K))
  mu = Normal(0.0, 1.0, sample_shape=[FLAGS.K, FLAGS.D])
  sigma = InverseGamma(concentration=1.0, rate=1.0,
                       sample_shape=[FLAGS.K, FLAGS.D])
  c = Categorical(logits=tf.log(pi) - tf.log(1.0 - pi), sample_shape=FLAGS.N)
  x = Normal(loc=tf.gather(mu, c), scale=tf.gather(sigma, c))

  # INFERENCE
  qpi = Empirical(params=tf.get_variable(
      "qpi/params",
      [FLAGS.T, FLAGS.K],
      initializer=tf.constant_initializer(1.0 / FLAGS.K)))
  qmu = Empirical(params=tf.get_variable("qmu/params",
                                         [FLAGS.T, FLAGS.K, FLAGS.D],
                                         initializer=tf.zeros_initializer()))
  qsigma = Empirical(params=tf.get_variable("qsigma/params",
                                            [FLAGS.T, FLAGS.K, FLAGS.D],
                                            initializer=tf.ones_initializer()))
  qc = Empirical(params=tf.get_variable("qc/params",
                                        [FLAGS.T, FLAGS.N],
                                        initializer=tf.zeros_initializer(),
                                        dtype=tf.int32))

  gpi = Dirichlet(concentration=tf.constant([1.4, 1.6]))
  gmu = Normal(loc=tf.constant([[1.0, 1.0], [-1.0, -1.0]]),
               scale=tf.constant([[0.5, 0.5], [0.5, 0.5]]))
  gsigma = InverseGamma(concentration=tf.constant([[1.1, 1.1], [1.1, 1.1]]),
                        rate=tf.constant([[1.0, 1.0], [1.0, 1.0]]))
  gc = Categorical(logits=tf.zeros([FLAGS.N, FLAGS.K]))

  inference = ed.MetropolisHastings(
      latent_vars={pi: qpi, mu: qmu, sigma: qsigma, c: qc},
      proposal_vars={pi: gpi, mu: gmu, sigma: gsigma, c: gc},
      data={x: x_data})

  inference.initialize()

  sess = ed.get_session()
  tf.global_variables_initializer().run()

  for _ in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)

    t = info_dict['t']
    if t == 1 or t % inference.n_print == 0:
      qpi_mean, qmu_mean = sess.run([qpi.mean(), qmu.mean()])
      print("")
      print("Inferred membership probabilities:")
      print(qpi_mean)
      print("Inferred cluster means:")
      print(qmu_mean)
Example #16
 def build(self, _):
   self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
                                initializer=tf.ones_initializer(dtype=tf.float32),
                                dtype=tf.float32)
   self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
                               initializer=tf.zeros_initializer(dtype=tf.float32),
                               dtype=tf.float32)
   self.built = True
Example #17
def create_graph(device0, device1):
  """Create graph that keeps var1 on device0, var2 on device1 and adds them"""
  
  tf.reset_default_graph()
  dtype=tf.int32
  params_size = 250*1000*FLAGS.data_mb # 1MB is 250k integers

  with tf.device(device0):
    var1 = tf.get_variable("var1", [params_size], dtype,
                             initializer=tf.ones_initializer())
  with tf.device(device1):
    var2 = tf.get_variable("var2", [params_size], dtype,
                           initializer=tf.ones_initializer())
    add_op = var1.assign_add(var2)
    
  init_op = tf.global_variables_initializer()
  return init_op, add_op
Example #18
def make_params():
  params_size = 250*1000*FLAGS.data_mb # 1MB is 250k integers
  dtype=tf.int32
  ps_device = get_ps_device(0)
  with tf.device(ps_device):
    params = tf.get_variable("params", [params_size], dtype,
                             initializer=tf.ones_initializer())
  return params
Example #19
def bn(x, c):
    x_shape = x.get_shape()
    params_shape = x_shape[-1:]

    if c['use_bias']:
        bias = _get_variable('bias', params_shape,
                             initializer=tf.zeros_initializer)
        return x + bias


    axis = list(range(len(x_shape) - 1))

    beta = _get_variable('beta',
                         params_shape,
                         initializer=tf.zeros_initializer)
    gamma = _get_variable('gamma',
                          params_shape,
                          initializer=tf.ones_initializer())

    moving_mean = _get_variable('moving_mean',
                                params_shape,
                                initializer=tf.zeros_initializer,
                                trainable=False)
    moving_variance = _get_variable('moving_variance',
                                    params_shape,
                                    initializer=tf.ones_initializer(),
                                    trainable=False)

    # These ops will only be performed when training.
    mean, variance = tf.nn.moments(x, axis)
    update_moving_mean = moving_averages.assign_moving_average(moving_mean,
                                                               mean, BN_DECAY)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, BN_DECAY)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)

    mean, variance = control_flow_ops.cond(
        c['is_training'], lambda: (mean, variance),
        lambda: (moving_mean, moving_variance))

    x = tf.nn.batch_normalization(x, mean, variance, beta, gamma, BN_EPSILON)
    #x.set_shape(inputs.get_shape()) ??

    return x
Example #20
 def __init__(self, capacity):
   s = ()
   d = tf.int32
   super().__init__(capacity - 1, [d], [s])
   self._first = tf.get_variable(name="var1",
                                 initializer=tf.ones_initializer(),
                                 shape=s, dtype=d, use_resource=False)
   self._size = tf.get_variable(name="size", shape=(),
                                initializer=tf.zeros_initializer(),
                                dtype=tf.int32, use_resource=False)
Example #21
def layer_norm(x, nmaps, prefix, epsilon=1e-5):
  """Layer normalize the 4D tensor x, averaging over the last dimension."""
  with tf.variable_scope(prefix):
    scale = tf.get_variable("layer_norm_scale", [nmaps],
                            initializer=tf.ones_initializer())
    bias = tf.get_variable("layer_norm_bias", [nmaps],
                           initializer=tf.zeros_initializer())
    mean, variance = tf.nn.moments(x, [3], keep_dims=True)
    norm_x = (x - mean) / tf.sqrt(variance + epsilon)
    return norm_x * scale + bias
Example #22
  def testInitializers(self):
    inputs = tf.placeholder(tf.float32, shape=[self.batch_size, self.in_size])
    prev_state = tf.placeholder(tf.float32,
                                shape=[self.batch_size, self.hidden_size])

    with self.assertRaisesRegexp(KeyError, "Invalid initializer keys.*"):
      snt.VanillaRNN(name="rnn",
                     hidden_size=self.hidden_size,
                     initializers={"invalid": None})

    err = "Initializer for 'w' is not a callable function"
    with self.assertRaisesRegexp(TypeError, err):
      snt.VanillaRNN(name="rnn",
                     hidden_size=self.hidden_size,
                     initializers={"in_to_hidden": {"w": tf.zeros([10, 10])}})

    # Nested initializer.
    valid_initializers = {
        "in_to_hidden": {
            "w": tf.ones_initializer(),
        },
        "hidden_to_hidden": {
            "b": tf.ones_initializer(),
        }
    }

    vanilla_rnn = snt.VanillaRNN(name="rnn",
                                 hidden_size=self.hidden_size,
                                 initializers=valid_initializers)

    vanilla_rnn(inputs, prev_state)
    init = tf.global_variables_initializer()

    with self.test_session() as sess:
      sess.run(init)
      w_v, b_v = sess.run([
          vanilla_rnn.in_to_hidden_linear.w,
          vanilla_rnn.hidden_to_hidden_linear.b,
      ])
      self.assertAllClose(w_v, np.ones([self.in_size, self.hidden_size]))
      self.assertAllClose(b_v, np.ones([self.hidden_size]))
Example #23
 def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
   """Batch normalization on `input_layer` without tf.layers."""
   # We make this function as similar as possible to the
   # tf.contrib.layers.batch_norm, to minimize the differences between using
   # layers and not using layers.
   shape = input_layer.shape
   num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
   beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                            initializer=tf.zeros_initializer())
   if use_scale:
     gamma = self.get_variable('gamma', [num_channels], tf.float32,
                               tf.float32, initializer=tf.ones_initializer())
   else:
     gamma = tf.constant(1.0, tf.float32, [num_channels])
   # For moving variables, we use tf.get_variable instead of self.get_variable,
   # since self.get_variable returns the result of tf.cast which we cannot
   # assign to.
   moving_mean = tf.get_variable('moving_mean', [num_channels],
                                 tf.float32,
                                 initializer=tf.zeros_initializer(),
                                 trainable=False)
   moving_variance = tf.get_variable('moving_variance', [num_channels],
                                     tf.float32,
                                     initializer=tf.ones_initializer(),
                                     trainable=False)
   if self.phase_train:
     bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
         input_layer, gamma, beta, epsilon=epsilon,
         data_format=self.data_format, is_training=True)
     mean_update = moving_averages.assign_moving_average(
         moving_mean, batch_mean, decay=decay, zero_debias=False)
     variance_update = moving_averages.assign_moving_average(
         moving_variance, batch_variance, decay=decay, zero_debias=False)
     tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
     tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
   else:
     bn, _, _ = tf.nn.fused_batch_norm(
         input_layer, gamma, beta, mean=moving_mean,
         variance=moving_variance, epsilon=epsilon,
         data_format=self.data_format, is_training=False)
   return bn
Example #24
def batch_normalization_layer(signal):
    batch_mean, batch_variance = tf.nn.moments(
        signal, list(range(signal.get_shape().ndims - 1)))

    gamma = tf.get_variable(
        'gamma', batch_mean.get_shape(), tf.float32, tf.ones_initializer())
    beta = tf.get_variable(
        'beta', batch_mean.get_shape(), tf.float32, tf.zeros_initializer())
    signal = signal - batch_mean
    signal /= tf.sqrt(batch_variance + 0.0001)
    signal = gamma * signal + beta
    return signal
Example #25
    def test_fully_connected(self):
        input_size = 3
        layer_size = 2
        inputs = [[.1, .2, .3], [.4, .5, .6]]  # batch size (=2) * input_size
        activation_fn = tf.sigmoid
        weight_init = tf.ones_initializer()
        bias_init = tf.ones_initializer()

        w = weight_init([input_size, layer_size])
        b = bias_init([layer_size])  # equivalent to [1, layer_size]

        x = tf.placeholder(tf.float32, [None, input_size])
        infer = tf.contrib.layers.fully_connected(x, layer_size,
                                                  activation_fn=activation_fn,
                                                  weights_initializer=weight_init,
                                                  biases_initializer=bias_init)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            output = sess.run(infer, feed_dict={x: inputs})
            expect = sess.run(activation_fn(tf.matmul(inputs, w) + b))

            self.assertTrue(np.array_equal(output, expect))
Example #26
def complex_model(X,y,is_training):

    N, H, W, C = X.shape

    #initialization
    Wconv1 = tf.get_variable("Wconv1", [7, 7, 3, 32], initializer = tf.contrib.layers.xavier_initializer())
    bconv1 = tf.get_variable("bconv1", [32, ], initializer = tf.zeros_initializer())
    gamma1 = tf.get_variable("gamma1", [32, ], initializer = tf.ones_initializer())
    beta1 = tf.get_variable("beta1", [32, ], initializer = tf.zeros_initializer())
    running_mean = tf.get_variable("running_mean", [32, ], initializer = tf.zeros_initializer())
    running_variance = tf.get_variable("running_variance", [32, ], initializer = tf.ones_initializer())
    W1 = tf.get_variable("W1", [8192, 10], initializer = tf.contrib.layers.xavier_initializer())
    b1 = tf.get_variable("b1", [10, ], initializer = tf.zeros_initializer())

    #construct CG
    A1 = tf.nn.conv2d(X, Wconv1, strides=[1, 1, 1 ,1], padding='SAME') + bconv1
    A1b = tf.layers.batch_normalization(A1, training=is_training)
    H1 = tf.nn.relu(A1b)
    #tf.nn.max_pool(value, ksize, strides, padding, data_format='NHWC', name=None)
    H1P = tf.nn.max_pool(H1, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
    #H1D = tf.layers.dropout(H1P, 0.25, training=is_training)
    H1_reshaped = tf.reshape(H1P, [-1, 8192])
    y_out = tf.matmul(H1_reshaped, W1) + b1
    return y_out
Example #27
def test_tf():
  tf.reset_default_graph()
  arr = tf.Variable(tf.ones_initializer(N), dtype=dtype)
  result = tf.reduce_sum(arr)
  result_fetch = tf.group(result)
  sess = tf.Session()
  sess.run(arr.initializer)
  times = []
  for i in range(iters):
    start_time = time.time()
    sess.run(result_fetch)
    end_time = time.time()
    times.append(end_time-start_time)

  return np.asarray(times)
Example #28
def weight_normalization(weight, scope='weight_norm'):
  """based upon openai's https://github.com/openai/generating-reviews-discovering-sentiment/blob/master/encoder.py"""

  weight_shape_list = weight.get_shape().as_list()
  if len(weight.get_shape()) == 2: #I think you want to sum on axis [0,1,2]
    g_shape = [weight_shape_list[1]]
  else:
    raise ValueError('dimensions unacceptable for weight normalization')

  with tf.variable_scope(scope):

    g = tf.get_variable('g_scalar', shape=g_shape, initializer = tf.ones_initializer())
    weight = g * tf.nn.l2_normalize(weight, dim=0)

    return weight
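
The function above rescales each column (axis 0) of a 2-D weight matrix to unit L2 norm and multiplies it by a learned per-column gain g, initialized to one. A hedged NumPy equivalent:

import numpy as np

def weight_norm_np(weight, g):
    # unit L2 norm per column (axis 0), then scale by the per-column gain g
    col_norm = np.sqrt(np.sum(np.square(weight), axis=0, keepdims=True)) + 1e-12
    return g * (weight / col_norm)

w = np.random.randn(128, 64)
g = np.ones(64)  # matches the ones_initializer above
w_n = weight_norm_np(w, g)
# every column of w_n now has (approximately) unit L2 norm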
Example #29
    def __init__(self, dims_out, name=None, eps=1e-5):
        if name is None:
            name = 'layer_norm'
        else:
            name = '{:s}_layer_norm'.format(name)

        with tf.variable_scope(name, values=[dims_out]):
            self.offset = tf.get_variable(name='offset',
                                          shape=[dims_out],
                                          dtype=tf.float32,
                                          initializer=tf.zeros_initializer())
            self.scale = tf.get_variable(name='scale',
                                         shape=[dims_out],
                                         dtype=tf.float32,
                                         initializer=tf.ones_initializer())
            self.eps = tf.constant(eps)
Example #30
 def _network_template(self, state):
   # This dummy network allows us to deterministically anticipate that
   # action 0 will be selected by an argmax.
   inputs = tf.constant(
       np.zeros((state.shape[0], stack_size)), dtype=tf.float32)
   # This weights_initializer gives action 0 a higher weight, ensuring
   # that it gets picked by the argmax.
   weights_initializer = np.tile(
       np.arange(self.num_actions, 0, -1), (stack_size, 1))
   q = slim.fully_connected(
       inputs,
       self.num_actions,
       weights_initializer=tf.constant_initializer(weights_initializer),
       biases_initializer=tf.ones_initializer(),
       activation_fn=None)
   return self._get_network_type()(q)
Example #31
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False):
    """
    Mostly equivalent to `tf.layers.batch_normalization`, but different in
    the following:

    1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from `TowerContext`.
    4. Support the `internal_update` option.

    Args:
        internal_update (bool): if False, add EMA update ops to
            `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer
            by control dependencies.

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        1. About multi-GPU training: moving averages across GPUs are not aggregated.
           Batch statistics are computed independently.  This is consistent with most frameworks.
        2. Combinations of ``training`` and ``ctx.is_training``:
            * ``training == ctx.is_training``: standard BN, EMA are
                maintained during training and used during inference. This is
                the default.
            * ``training and not ctx.is_training``: still use batch statistics in inference.
            * ``not training and ctx.is_training``: use EMA to normalize in
                training. This is useful when you load a pre-trained BN and
                don't want to fine tune the EMA. EMA will not be updated in
                this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
    with rename_get_variable(
            {'moving_mean': 'mean/EMA',
             'moving_variance': 'variance/EMA'}):
        if TF_version >= 1.5:
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                virtual_batch_size=virtual_batch_size,
                fused=True
            )
        else:
            assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True
            )
        xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

    # Maintaining the EMA on only one GPU is OK, even in replicated mode,
    # because the EMA is not used at training time.
    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    if not ctx.is_main_training_tower or internal_update:
        restore_collection(coll_bk)

    if training and internal_update:
        assert layer.updates
        with tf.control_dependencies(layer.updates):
            ret = tf.identity(xn, name='output')
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(
        moving_mean=layer.moving_mean,
        mean=layer.moving_mean,  # for backward-compatibility
        moving_variance=layer.moving_variance,
        variance=layer.moving_variance)  # for backward-compatibility
    if scale:
        vh.gamma = layer.gamma
    if center:
        vh.beta = layer.beta
    return ret
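
With internal_update=False, the EMA update ops end up in tf.GraphKeys.UPDATE_OPS, so the caller has to run them together with the train op, i.e. the usual TF 1.x pattern (a sketch; `loss` and the optimizer choice are assumptions, not part of the function above):

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)  # `loss` assumed to exist
# With internal_update=True the layer adds these dependencies itself,
# so train_op can be built directly from the loss.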
Example #32
            padding='SAME',  # "same" padding
            activation=None,  # None
            kernel_initializer=tf.truncated_normal_initializer(stddev=5e-2,
                                                               seed=100),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=lamC),
            name='conv1')

        conv1 = tf.layers.batch_normalization(
            conv1,
            axis=-1,
            momentum=0.99,
            epsilon=epsilon,
            center=True,
            scale=True,
            beta_initializer=tf.zeros_initializer(),
            gamma_initializer=tf.ones_initializer(),
            moving_mean_initializer=tf.zeros_initializer(),
            moving_variance_initializer=tf.ones_initializer(),
            training=training,
            name='bn1')

        # apply relu
        conv1_bn_relu = tf.nn.relu(conv1, name='relu1')

    with tf.name_scope('conv1.1') as scope:
        conv11 = tf.layers.conv2d(
            conv1_bn_relu,  # Input data
            filters=32,  # 32 filters
            kernel_size=(3, 3),  # Kernel size: 5x5
            strides=(1, 1),  # Stride: 2
            padding='SAME',  # "same" padding
Example #33
def CNN_3d_change(x, out_channels_0, out_channels_1, add_relu=True):
    '''Add two 3D convolution layers, each followed by an optional ELU and max pooling.

    Args:
        x: a tensor with shape [batch, in_depth, in_height, in_width, in_channels]
        out_channels_0: number of output channels of the first conv layer
        out_channels_1: number of output channels of the second conv layer
        add_relu: whether to apply the ELU non-linearity after each convolution

    Returns:
        a flattened tensor with shape [batch, num_features]

    Raises:
    '''
    in_channels = x.shape[-1]
    weights_0 = tf.get_variable(
        name='filter_0',
        shape=[3, 3, 3, in_channels, out_channels_0],
        dtype=tf.float32,
        #initializer=tf.random_normal_initializer(0, 0.05))
        initializer=tf.random_uniform_initializer(-0.01, 0.01))
    bias_0 = tf.get_variable(name='bias_0',
                             shape=[out_channels_0],
                             dtype=tf.float32,
                             initializer=tf.zeros_initializer())
    #Todo
    g_0 = tf.get_variable(name='scale_0',
                          shape=[out_channels_0],
                          dtype=tf.float32,
                          initializer=tf.ones_initializer())
    weights_0 = tf.reshape(g_0, [1, 1, 1, out_channels_0
                                 ]) * tf.nn.l2_normalize(weights_0, [0, 1, 2])

    conv_0 = tf.nn.conv3d(x,
                          weights_0,
                          strides=[1, 1, 1, 1, 1],
                          padding="VALID")
    print('conv_0 shape: %s' % conv_0.shape)
    conv_0 = conv_0 + bias_0
    #######
    '''
    with tf.variable_scope('layer_0'):
        conv_0 = op.layer_norm(conv_0, axis=[1, 2, 3, 4])
        print('layer_norm in cnn')
    '''
    if add_relu:
        conv_0 = tf.nn.elu(conv_0)

    pooling_0 = tf.nn.max_pool3d(conv_0,
                                 ksize=[1, 2, 3, 3, 1],
                                 strides=[1, 2, 3, 3, 1],
                                 padding="VALID")
    print('pooling_0 shape: %s' % pooling_0.shape)

    #layer_1
    weights_1 = tf.get_variable(
        name='filter_1',
        shape=[2, 2, 2, out_channels_0, out_channels_1],
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-0.01, 0.01))

    bias_1 = tf.get_variable(name='bias_1',
                             shape=[out_channels_1],
                             dtype=tf.float32,
                             initializer=tf.zeros_initializer())

    g_1 = tf.get_variable(name='scale_1',
                          shape=[out_channels_1],
                          dtype=tf.float32,
                          initializer=tf.ones_initializer())
    weights_1 = tf.reshape(g_1, [1, 1, 1, out_channels_1
                                 ]) * tf.nn.l2_normalize(weights_1, [0, 1, 2])

    conv_1 = tf.nn.conv3d(pooling_0,
                          weights_1,
                          strides=[1, 1, 1, 1, 1],
                          padding="VALID")
    print('conv_1 shape: %s' % conv_1.shape)
    conv_1 = conv_1 + bias_1
    #with tf.variable_scope('layer_1'):
    #    conv_1 = op.layer_norm(conv_1, axis=[1, 2, 3, 4])

    if add_relu:
        conv_1 = tf.nn.elu(conv_1)

    pooling_1 = tf.nn.max_pool3d(conv_1,
                                 ksize=[1, 3, 3, 3, 1],
                                 strides=[1, 3, 3, 3, 1],
                                 padding="VALID")
    print('pooling_1 shape: %s' % pooling_1.shape)

    return tf.contrib.layers.flatten(pooling_1)
Example #34
def batch_norm_lasagne(x,
                       is_training,
                       reuse,
                       decay=0.9,
                       epsilon=1e-4,
                       updates_collections=tf.GraphKeys.UPDATE_OPS,
                       outputs_collections=None,
                       trainable=True,
                       name='bn'):
    with tf.variable_scope(name, reuse=reuse) as curr_scope:
        beta = tf.get_variable(name='beta',
                               initializer=tf.constant(
                                   0.0, shape=[x.get_shape()[-1]]),
                               trainable=trainable)
        gamma = tf.get_variable(name='gamma',
                                initializer=tf.constant(
                                    1.0, shape=[x.get_shape()[-1]]),
                                trainable=trainable)

        moving_mean = tf.get_variable(name='moving_mean',
                                      shape=[x.get_shape()[-1]],
                                      initializer=tf.zeros_initializer(),
                                      trainable=False)

        moving_inv_std = tf.get_variable(name='moving_inv_std',
                                         shape=[x.get_shape()[-1]],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)

        input_shape = helper.get_input_shape(x)
        moments_axes = list(range(len(input_shape) - 1))

        def mean_inv_std_with_update():
            mean, variance = tf.nn.moments(x,
                                           moments_axes,
                                           shift=moving_mean,
                                           name='bn-moments')
            inv_std = math_ops.rsqrt(variance + epsilon)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay, zero_debias=False)
            update_moving_inv_std = moving_averages.assign_moving_average(
                moving_inv_std, inv_std, decay, zero_debias=False)
            with tf.control_dependencies(
                [update_moving_mean, update_moving_inv_std]):
                m, v = tf.identity(mean), tf.identity(inv_std)
                return m, v

        def mean_inv_std_with_pending_update():
            mean, variance = tf.nn.moments(x,
                                           moments_axes,
                                           shift=moving_mean,
                                           name='bn-moments')
            inv_std = math_ops.rsqrt(variance + epsilon)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay, zero_debias=False)
            update_moving_inv_std = moving_averages.assign_moving_average(
                moving_inv_std, inv_std, decay, zero_debias=False)
            tf.add_to_collection(updates_collections, update_moving_mean)
            tf.add_to_collection(updates_collections, update_moving_inv_std)
            return mean, inv_std

        mean_inv_std_with_relevant_update = \
            mean_inv_std_with_pending_update if updates_collections is not None else mean_inv_std_with_update

        (mean,
         inv_std) = mean_inv_std_with_relevant_update() if is_training else (
             moving_mean, moving_inv_std)

        def _batch_normalization(x, mean, inv, offset, scale):
            with tf.name_scope(name, "batchnorm",
                               [x, mean, inv, scale, offset]):
                if scale is not None:
                    inv *= scale
                return x * inv + (offset - mean * inv
                                  if offset is not None else -mean * inv)

        output = _batch_normalization(x, mean, inv_std, beta, gamma)
        return _collect_named_outputs(outputs_collections,
                                      curr_scope.original_name_scope, name,
                                      output)
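
The _batch_normalization helper above is an algebraic refactoring of the usual formula: with inv = inv_std * scale, the expression x * inv + (offset - mean * inv) equals (x - mean) * inv_std * scale + offset. A quick NumPy check with illustrative values:

import numpy as np

x = np.random.randn(8, 4)
mean, var = x.mean(axis=0), x.var(axis=0)
inv_std = 1.0 / np.sqrt(var + 1e-4)
gamma, beta = np.full(4, 1.5), np.full(4, 0.3)

standard = (x - mean) * inv_std * gamma + beta
inv = inv_std * gamma
refactored = x * inv + (beta - mean * inv)
assert np.allclose(standard, refactored)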
Example #35
    def __init__(self, cov_func, lik_func, num_train, inducing_inputs, args):
        """Create a new variational inference object which will keep track of all variables.

        Args:
            cov_func: covariance function (kernel function)
            lik_func: likelihood function
            num_train: the number of training examples
            inducing_inputs: the initial values for the inducing_inputs or just the number of
                             inducing inputs
            args: additional parameters: num_components, diag_post, use_loo, num_samples,
                  optimize_inducing
        """

        # self.mean = mean_func
        self.cov = cov_func
        self.lik = lik_func
        self.num_train = num_train
        self.num_latents = len(self.cov)
        self.args = args

        # Initialize inducing inputs if they are provided
        if isinstance(inducing_inputs, int):
            # Only the number of inducing inputs is given -> just specify the shape
            num_inducing = inducing_inputs
            inducing_params = {
                'shape':
                [self.num_latents, num_inducing, self.cov[0].input_dim],
                'dtype': tf.float32
            }
        else:
            # Repeat the inducing inputs for all latent processes if we haven't been given
            # individually specified inputs per process.
            if inducing_inputs.ndim == 2:
                inducing_inputs = np.tile(inducing_inputs[np.newaxis, :, :],
                                          reps=[self.num_latents, 1, 1])
            # Initialize with the given values
            inducing_params = {
                'initializer': tf.constant(inducing_inputs, dtype=tf.float32)
            }
            num_inducing = inducing_inputs.shape[-2]

        num_components = args['num_components']
        # Initialize all variables
        with tf.variable_scope(None, "variational_inference"):
            # Define all parameters that get optimized directly in raw form. Some parameters get
            # transformed internally to maintain certain pre-conditions.

            self.inducing_inputs = tf.get_variable("inducing_inputs",
                                                   **inducing_params)

            zeros = tf.zeros_initializer(dtype=tf.float32)
            self.raw_weights = tf.get_variable("raw_weights", [num_components],
                                               initializer=zeros)
            self.means = tf.get_variable(
                "means", [num_components, self.num_latents, num_inducing],
                initializer=zeros)
            if args['diag_post']:
                self.raw_covars = tf.get_variable(
                    "raw_covars",
                    [num_components, self.num_latents, num_inducing],
                    initializer=tf.ones_initializer())
            else:
                self.raw_covars = tf.get_variable(
                    "raw_covars",
                    shape=[num_components, self.num_latents] +
                    util.tri_vec_shape(num_inducing),
                    initializer=zeros)
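
# --- Added sketch (not from the original class, assumes `import tensorflow as tf`):
# how the raw parameters above are typically transformed before use. The diagonal
# case is made positive with exp; the full case assumes a fill_triangular-style
# helper (tf.contrib.distributions.fill_triangular in TF 1.x) that unpacks the
# tri_vec into a lower-triangular Cholesky factor.
def _post_covars_sketch(raw_covars, diag_post):
    if diag_post:
        return tf.exp(raw_covars)                        # positive diagonal covariances
    chol = tf.contrib.distributions.fill_triangular(raw_covars)
    return tf.matmul(chol, chol, transpose_b=True)       # Sigma = L L^T, PSD by construction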
Esempio n. 36
0
def _view_pool_with_classes(view_features, y, n_classes, is_training, reuse=False):

    with tf.variable_scope("view_pool_with_classes", reuse=reuse) as scope:

        W = tf.get_variable(name="weights", shape=[1, n_classes], dtype=tf.float64, initializer=tf.ones_initializer())
        b = tf.get_variable(name="biases", shape=[n_classes], dtype=tf.float64, initializer=tf.zeros_initializer())

        #W1 = tf.get_variable(name="weights1", shape=[n_classes, n_classes], dtype=tf.float64, initializer=tf.ones_initializer())
        #b1 = tf.get_variable(name="biases1", shape=[n_classes], dtype=tf.float64, initializer=tf.zeros_initializer())

        y = tf.cast(y, tf.float64)
        y = tf.expand_dims(y, 1)
        mask = tf.add(tf.matmul(y, W), b)
        #mask = tf.sigmoid(mask)
        #mask = tf.add(tf.matmul(mask, W1), b1)
        #mask = tf.sigmoid(mask)
        vp = tf.stack(view_features, 0)
        vp = tf.reduce_mean(vp, 0)

        vp = tf.multiply(vp, mask)

        return vp
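
# --- Added usage sketch (hypothetical shapes, assumes `import tensorflow as tf`):
# three per-view logit tensors of shape [batch, n_classes] are averaged and then
# re-weighted by the label-conditioned mask.
n_classes_demo = 40
views_demo = [tf.random_normal([8, n_classes_demo], dtype=tf.float64) for _ in range(3)]
labels_demo = tf.constant([0, 1, 2, 3, 4, 5, 6, 7])      # one integer label per example
pooled_demo = _view_pool_with_classes(views_demo, labels_demo, n_classes_demo, is_training=True)
# pooled_demo: [8, n_classes]; pass reuse=True on later calls to share W and b.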
Esempio n. 37
0
a = tf.constant([1.0, 2.0], name='a')
b = tf.constant([2.0, 3.0], name='b')

g1 = tf.Graph()
with g1.as_default():
    v = tf.get_variable('v', shape=[1], initializer=tf.zeros_initializer())

with tf.Session(graph=g1) as sess:
    tf.global_variables_initializer().run()
    with tf.variable_scope("", reuse=True):
        print(sess.run(tf.get_variable('v')))

g2 = tf.Graph()
with g2.as_default():
    v = tf.get_variable('v', shape=[2, 2], initializer=tf.ones_initializer())

with tf.Session(graph=g2) as sess:
    tf.global_variables_initializer().run()
    with tf.variable_scope("", reuse=True):
        print(sess.run(tf.get_variable('v')))

# g = tf.Graph()
# with g.device('/gpu:0'):
#     result = a+b

weights = tf.Variable(tf.random_normal([2, 3], mean=0, stddev=2))
biases = tf.Variable(tf.zeros([3]))
w2 = tf.Variable(weights.initialized_value())
w3 = tf.Variable(weights.initialized_value() * 2.0)
Esempio n. 38
0
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
  """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec
  """
  assert len(problem_names) == len(hparams.problem_instances)
  decode_hp = decode_hparams

  # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
  dp = devices.data_parallelism(hparams)

  tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  # Add input statistics for incoming features.
  with tf.name_scope("input_stats"):
    for (k, v) in six.iteritems(features):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
        tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
        tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
        nonpadding = tf.to_float(tf.not_equal(v, 0))
        nonpadding_tokens = tf.reduce_sum(nonpadding)
        if k == "targets":
          targets_nonpadding_tokens = nonpadding_tokens
        tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
        tf.summary.scalar("%s_nonpadding_fraction" % k,
                          tf.reduce_mean(nonpadding))

  # Get multi-problem logits and loss based on features["problem_choice"].
  loss_variable_names = []

  def nth_model(n):
    """Build the model for the n-th problem, plus some added variables."""
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)
    if mode == tf.estimator.ModeKeys.PREDICT:
      return model_class.infer(
          features,
          beam_size=decode_hp.beam_size,
          top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
          alpha=decode_hp.alpha,
          decode_length=decode_hp.extra_length)
    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
      logits, losses_dict = model_class.eval_autoregressive(features)
    else:
      logits, losses_dict = model_class(
          features, skip=(skipping_is_on and skip_this_one))
    with tf.variable_scope("losses_avg"):
      total_loss, ops = 0.0, []
      for loss_key, loss_value in six.iteritems(losses_dict):
        loss_name = "problem_%d/%s_loss" % (n, loss_key)
        loss_moving_avg = tf.get_variable(
            loss_name, initializer=100.0, trainable=False)
        loss_variable_names.append(loss_name)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
        total_loss += loss_value
      try:  # Total loss avg might be reused or not, we try both.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          # Total loss was already constructed on input.
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
      except ValueError:
        loss_moving_avg = tf.get_variable(
            "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
      ops.append(
          loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
    with tf.variable_scope("train_stats"):  # Count steps for this problem.
      problem_steps = tf.get_variable(
          "problem_%d_steps" % n, initializer=0, trainable=False)
      ops.append(problem_steps.assign_add(1))
    with tf.control_dependencies(ops):  # Make sure the ops run.
      # Ensure the loss is a scalar here.
      total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss, logits]

  model_output = input_fn_builder.cond_on_index(
      nth_model,
      index_tensor=features["problem_choice"],
      max_idx=len(hparams.problems) - 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # If beam searching, model_output will be a dict with keys "outputs" and
    # "scores".
    if isinstance(model_output, dict):
      outputs = model_output["outputs"]
      scores = model_output["scores"]
    else:
      outputs = model_output
      scores = None

    batched_problem_choice = (
        features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0],), dtype=tf.int32))
    predictions = {
        "outputs": outputs,
        "scores": scores,
        "inputs": features.get("inputs", None),
        "targets": features.get("infer_targets", None),
        "problem_choice": batched_problem_choice,
    }
    _del_dict_nones(predictions)

    export_out = {"outputs": predictions["outputs"]}
    if "scores" in predictions:
      export_out["scores"] = predictions["scores"]

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs={
            "output": tf.estimator.export.PredictOutput(export_out)
        })

  total_loss, logits = model_output

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics_fns = metrics.create_evaluation_metrics(
        hparams.problem_instances, hparams)

    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=total_loss)

  assert mode == tf.estimator.ModeKeys.TRAIN

  # Set learning rate
  learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
      hparams, num_worker_replicas=worker_replicas, num_train_steps=train_steps)
  learning_rate /= math.sqrt(float(worker_replicas))

  # Get global step
  global_step = tf.train.get_or_create_global_step()

  # Some training statistics.
  with tf.name_scope("training_stats"):
    tf.summary.scalar("learning_rate", learning_rate)
    for n in xrange(len(hparams.problems)):
      names_and_vars = []
      with tf.variable_scope("losses_avg", reuse=True):
        total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
        names_and_vars.append(("total_loss", total_loss_var))
      with tf.variable_scope("losses_avg", reuse=True):
        for loss_name in loss_variable_names:
          if loss_name.startswith("problem_%d/" % n):
            loss_var = tf.get_variable(loss_name)
            loss_suffix = loss_name[loss_name.index("/") + 1:]
            names_and_vars.append((loss_suffix, loss_var))
      for (loss_name, loss_var) in names_and_vars:
        tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
      with tf.variable_scope("train_stats", reuse=True):
        nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
      tf.summary.scalar("problem_%d_frequency" % n,
                        tf.to_float(nth_steps) /
                        (tf.to_float(global_step) + 1.0))

  # Add weight decay and noise.
  total_size, weight_decay_loss = 0, 0.0
  all_weights = {v.name: v for v in tf.trainable_variables()}
  for v_name in sorted(list(all_weights)):
    v = all_weights[v_name]
    v_size = int(np.prod(np.array(v.shape.as_list())))
    total_size += v_size
    if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
      # Add weight regularization if set and the weight is not a bias (dim>1).
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        v_loss = tf.nn.l2_loss(v) / v_size
      weight_decay_loss += v_loss
    is_body = len(v_name) > 5 and v_name[:5] == "body/"
    if hparams.weight_noise > 0.0 and is_body:
      # Add weight noise if set in hparams.
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        scale = learning_rate * 0.001
        noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale
        noise_op = v.assign_add(noise)
      with tf.control_dependencies([noise_op]):
        total_loss = tf.identity(total_loss)
  if hparams.weight_decay > 0.0:
    total_loss += weight_decay_loss * hparams.weight_decay

  # The new data reader occasionally emits very small batches, which
  # cause the examples in those batches to be grossly overweighted.
  # We decrease the loss proportionally to the ratio of the size of this
  # batch to the size of the largest training batch ever.
  # TODO(noam): to be more sophisticated, we could keep separate
  # maxima based on problem choice.
  max_nonpadding_var = tf.get_variable(
      "max_nonpadding",
      shape=[],
      initializer=tf.ones_initializer(),
      trainable=False)
  max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
  with tf.control_dependencies([tf.assign(max_nonpadding_var, max_nonpadding)]):
    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
  total_loss *= small_batch_multiplier

  # Log variable sizes
  _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
  diet_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  _log_variable_sizes(diet_vars, "Diet Variables")

  # Optimize
  train_op = optimize.optimize(total_loss, learning_rate, hparams)

  # Remove summaries that will fail to run because they are in conditionals.
  # TODO(cwhipkey): Test with this code removed, later in 2017.
  summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
  for i in reversed(range(len(summaries))):
    if summaries[i].name.startswith("cond_"):
      del summaries[i]

  tf.logging.info("Global model_fn finished.")
  return tf.estimator.EstimatorSpec(
      mode,
      predictions={"problem_choice": features["problem_choice"]},
      loss=total_loss,
      train_op=train_op)
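
# --- Added sketch: a minimal Estimator model_fn (not the Tensor2Tensor
# implementation) showing the same three-way contract as the function above:
# predictions for PREDICT, loss + metrics for EVAL, loss + train_op for TRAIN.
# Feature/param names here are illustrative only.
def tiny_model_fn(features, labels, mode, params):
  logits = tf.layers.dense(features["inputs"], params["num_classes"])
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode, predictions={"classes": tf.argmax(logits, axis=-1)})
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.EVAL:
    metrics = {"accuracy": tf.metrics.accuracy(labels, tf.argmax(logits, axis=-1))}
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
  train_op = tf.train.AdamOptimizer(params["learning_rate"]).minimize(
      loss, global_step=tf.train.get_or_create_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)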
Esempio n. 39
0
def conv_gn(input_tensor,
            kernel_size,
            filters,
            strides,
            name,
            relu=False,
            center=False,
            scale=False,
            channel_wise=True,
            group=32,
            group_channel=8,
            padding='same',
            biased=False,
            reuse=tf.AUTO_REUSE,
            dilation=1):
    assert len(input_tensor.get_shape()) == 4

    # convolution
    res = tf.layers.conv2d(input_tensor,
                           kernel_size=kernel_size,
                           filters=filters,
                           padding=padding,
                           strides=strides,
                           reuse=reuse,
                           name=name,
                           dilation_rate=dilation)
    # group normalization
    x = tf.transpose(res, [0, 3, 1, 2])
    shape = tf.shape(x)
    N = shape[0]
    C = int(x.get_shape()[1])
    H = shape[2]
    W = shape[3]
    if channel_wise:
        G = max(1, C // group_channel)
    else:
        G = min(group, C)

    # normalization
    x = tf.reshape(x, [N, G, C // G, H, W])
    mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True)
    x = (x - mean) / tf.sqrt(var + 1e-5)

    # per channel scale and bias (gamma and beta)
    with tf.variable_scope(name + '/gn', reuse=reuse):
        if scale:
            gamma = tf.get_variable('gamma', [C],
                                    dtype=tf.float32,
                                    initializer=tf.ones_initializer())
        else:
            gamma = tf.constant(1.0, shape=[C])
        if center:
            beta = tf.get_variable('beta', [C],
                                   dtype=tf.float32,
                                   initializer=tf.zeros_initializer())
        else:
            beta = tf.constant(0.0, shape=[C])
    gamma = tf.reshape(gamma, [1, C, 1, 1])
    beta = tf.reshape(beta, [1, C, 1, 1])
    output = tf.reshape(x, [-1, C, H, W]) * gamma + beta

    # transpose: [bs, c, h, w] to [bs, h, w, c] following the paper
    output = tf.transpose(output, [0, 2, 3, 1])

    if relu:
        output = tf.nn.relu(output, name + '/relu')
    return output
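
# --- Added usage sketch (hypothetical tensors): a 3x3, stride-1 convolution
# followed by group normalization with learned gamma/beta and a ReLU.
images_demo = tf.placeholder(tf.float32, [None, 64, 64, 3])
features_demo = conv_gn(images_demo, kernel_size=3, filters=32, strides=1,
                        name='conv0', relu=True, center=True, scale=True)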
Esempio n. 40
0
    def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None):
        """A normalizer that ensures that observations are approximately distributed according to
        a standard Normal distribution (i.e. have mean zero and variance one).

        Args:
            size (int): the size of the observation to be normalized
            eps (float): a small constant that avoids underflows
            default_clip_range (float): normalized observations are clipped to be in
                [-default_clip_range, default_clip_range]
            sess (object): the TensorFlow session to be used
        """
        self.size = size
        self.eps = eps
        self.default_clip_range = default_clip_range
        self.sess = sess if sess is not None else tf.get_default_session()

        self.local_sum = np.zeros(self.size, np.float32)
        self.local_sumsq = np.zeros(self.size, np.float32)
        self.local_count = np.zeros(1, np.float32)

        self.sum_tf = tf.compat.v1.get_variable(
            initializer=tf.zeros_initializer(),
            shape=self.local_sum.shape,
            name='sum',
            trainable=False,
            dtype=tf.float32)
        self.sumsq_tf = tf.compat.v1.get_variable(
            initializer=tf.zeros_initializer(),
            shape=self.local_sumsq.shape,
            name='sumsq',
            trainable=False,
            dtype=tf.float32)
        self.count_tf = tf.compat.v1.get_variable(
            initializer=tf.ones_initializer(),
            shape=self.local_count.shape,
            name='count',
            trainable=False,
            dtype=tf.float32)
        self.mean = tf.compat.v1.get_variable(
            initializer=tf.zeros_initializer(),
            shape=(self.size, ),
            name='mean',
            trainable=False,
            dtype=tf.float32)
        self.std = tf.compat.v1.get_variable(initializer=tf.ones_initializer(),
                                             shape=(self.size, ),
                                             name='std',
                                             trainable=False,
                                             dtype=tf.float32)
        self.count_pl = tf.compat.v1.placeholder(name='count_pl',
                                                 shape=(1, ),
                                                 dtype=tf.float32)
        self.sum_pl = tf.compat.v1.placeholder(name='sum_pl',
                                               shape=(self.size, ),
                                               dtype=tf.float32)
        self.sumsq_pl = tf.compat.v1.placeholder(name='sumsq_pl',
                                                 shape=(self.size, ),
                                                 dtype=tf.float32)

        self.update_op = tf.group(self.count_tf.assign_add(self.count_pl),
                                  self.sum_tf.assign_add(self.sum_pl),
                                  self.sumsq_tf.assign_add(self.sumsq_pl))
        self.recompute_op = tf.group(
            tf.compat.v1.assign(self.mean, self.sum_tf / self.count_tf),
            tf.compat.v1.assign(
                self.std,
                tf.sqrt(
                    tf.maximum(
                        tf.square(self.eps), self.sumsq_tf / self.count_tf -
                        tf.square(self.sum_tf / self.count_tf)))),
        )
        self.lock = threading.Lock()
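
# --- Added sketch (assumes the class above is named Normalizer and that
# `import numpy as np` is in scope): push one batch of statistics through the
# placeholders, then refresh mean/std. It bypasses the local_* buffers and is
# only meant to illustrate how update_op and recompute_op fit together.
def push_stats_sketch(normalizer, new_obs):
    # new_obs: np.ndarray of shape [batch, size]
    batch_sum = new_obs.sum(axis=0).astype(np.float32)
    batch_sumsq = np.square(new_obs).sum(axis=0).astype(np.float32)
    batch_count = np.array([new_obs.shape[0]], dtype=np.float32)
    normalizer.sess.run(normalizer.update_op, feed_dict={
        normalizer.sum_pl: batch_sum,
        normalizer.sumsq_pl: batch_sumsq,
        normalizer.count_pl: batch_count,
    })
    normalizer.sess.run(normalizer.recompute_op)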
Esempio n. 41
0
class OptimizationConstrainsTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters([
      (0.5, 0.5),
      (17.3, 17.3),
      (tf.constant_initializer(3.14), 3.14),
      (tf.ones_initializer(), 1.0)
  ])
  def testLagrangeMultInit(self, initializer, exp_lag_mul):
    cons = optimization_constraints.OptimizationConstraints()
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)
    cons.add(lhs > rhs, initializer=initializer)()
    l = cons.lagrange_multipliers[0]
    with tf.train.MonitoredSession() as sess:
      lag_mul = sess.run(l)
    self.assertAllClose(lag_mul, exp_lag_mul)

  @mock.patch.object(optimization_constraints, '_parametrize')
  def testRateDefaults(self, mocked_parametrized):
    mocked_parametrized.side_effect = (
        lambda x, rate: scale_gradient.scale_gradient(x, -rate))
    rate = 0.1
    cons = optimization_constraints.OptimizationConstraints(rate=rate)
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)
    x = cons.add(lhs < rhs)()
    v = tf.all_variables()[0]
    dxdl = tf.gradients(x, v)
    with tf.train.MonitoredSession() as sess:
      grads = sess.run(dxdl)
    self.assertAllClose(grads[0], rate)

  @mock.patch.object(optimization_constraints, '_parametrize')
  def testRateOverrides(self, mocked_parametrized):
    mocked_parametrized.side_effect = (
        lambda x, rate: scale_gradient.scale_gradient(x, -rate))
    rate = 7.3
    cons = optimization_constraints.OptimizationConstraints()
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)
    x = cons.add(lhs < rhs, rate=rate)()
    v = tf.all_variables()[0]
    dxdl = tf.gradients(x, v)
    with tf.train.MonitoredSession() as sess:
      grads = sess.run(dxdl)
    self.assertAllClose(grads[0], rate)

  def testValidRangeDefaults(self):
    valid_range = (1.0, 2.0)
    cons = optimization_constraints.OptimizationConstraints(
        valid_range=valid_range)
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)
    cons.add(lhs < rhs, initializer=3.0)()
    with tf.train.MonitoredSession() as sess:
      lag_mul = sess.run(cons.lagrange_multipliers[0])
    self.assertAllClose(lag_mul, valid_range[1])

  def testValidRangeOverrides(self):
    cons = optimization_constraints.OptimizationConstraints()
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)
    valid_range = (1.0, 2.0)
    cons.add(lhs < rhs, initializer=3.0, valid_range=valid_range)()
    with tf.train.MonitoredSession() as sess:
      lag_mul = sess.run(cons.lagrange_multipliers[0])
    self.assertAllClose(lag_mul, valid_range[1])

  @mock.patch.object(
      optimization_constraints.OptimizationConstraints, 'add_geq')
  @mock.patch.object(
      optimization_constraints.OptimizationConstraints, 'add_leq')
  def testOpIdentification(self, mocked_add_leq, mocked_add_geq):
    calls_to_add_leq = [0]
    def mock_add_leq(*args, **kwargs):
      del args
      del kwargs
      calls_to_add_leq[0] += 1
    mocked_add_leq.side_effect = mock_add_leq

    calls_to_add_geq = [0]
    def mock_add_geq(*args, **kwargs):
      del args
      del kwargs
      calls_to_add_geq[0] += 1
    mocked_add_geq.side_effect = mock_add_geq

    cons = optimization_constraints.OptimizationConstraints()
    lhs = tf.zeros_like(1.0)
    rhs = tf.ones_like(1.0)

    self.assertEqual(calls_to_add_leq[0], 0)
    self.assertEqual(calls_to_add_geq[0], 0)
    cons.add(lhs < rhs)
    self.assertEqual(calls_to_add_leq[0], 1)
    self.assertEqual(calls_to_add_geq[0], 0)
    cons.add(lhs <= rhs)
    self.assertEqual(calls_to_add_leq[0], 2)
    self.assertEqual(calls_to_add_geq[0], 0)
    cons.add(lhs > rhs)
    self.assertEqual(calls_to_add_geq[0], 1)
    self.assertEqual(calls_to_add_leq[0], 2)
    cons.add(lhs >= rhs)
    self.assertEqual(calls_to_add_geq[0], 2)
    self.assertEqual(calls_to_add_leq[0], 2)

  def testMinimalRun(self):
    x = basic.TrainableVariable(
        shape=(), initializers={'w': tf.ones_initializer()})()
    x2 = x ** 2.0
    min_value = 0.5
    constr = optimization_constraints.OptimizationConstraints().add(
        x > min_value)

    self.assertFalse(constr._is_connected)
    loss = moving_average.MovingAverage()(
        x2 + tf.random.normal((), stddev=1.0)) + constr()

    self.assertTrue(constr._is_connected)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
      constr.add(x > min_value)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
      constr.add_geq(x, min_value)
    with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
      constr.add_leq(min_value < x)

    opt = tf.train.AdamOptimizer(1e-2, beta1=0.0)
    update = opt.minimize(loss)
    with tf.control_dependencies([update]):
      x2 = tf.identity(x2)

    with tf.train.MonitoredSession() as sess:
      for _ in range(500):
        v, _ = sess.run([x2, update])
    self.assertAllClose(v, min_value**2)
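
# --- Added usage sketch, based on testMinimalRun above (the API is assumed
# from these tests): violations of a constraint are penalised via a learned
# Lagrange multiplier that is added to the main loss.
x_demo = tf.get_variable('x_demo', shape=(), initializer=tf.ones_initializer())
primary_loss = tf.square(x_demo)
constraints = optimization_constraints.OptimizationConstraints()
constraints.add(x_demo > 0.5)                  # registers a >= constraint on x_demo
loss_demo = primary_loss + constraints()       # calling the module connects it
train_op_demo = tf.train.AdamOptimizer(1e-2).minimize(loss_demo)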
Esempio n. 42
0
else:
    shuffle = False
    repeat = 1
    train = False

train_flag = tf.placeholder(tf.bool, name='train_flag')
input_img, label_gt = imgs_input_fn(path_tfrecords_train,
                                    perform_shuffle=shuffle,
                                    repeat_count=repeat,
                                    batch_size=5)

conv1 = tf.layers.conv2d(inputs=input_img,
                         filters=64,
                         kernel_size=[5, 5],
                         kernel_initializer=tf.random_normal_initializer(),
                         bias_initializer=tf.ones_initializer(),
                         padding="valid",
                         activation=tf.nn.relu,
                         name='conv1')
drop1 = tf.layers.dropout(inputs=conv1, rate=0.5, training=train_flag)
conv2 = tf.layers.conv2d(inputs=drop1,
                         filters=256,
                         kernel_size=[5, 5],
                         kernel_initializer=tf.random_normal_initializer(),
                         bias_initializer=tf.ones_initializer(),
                         padding="valid",
                         activation=tf.nn.relu,
                         name='conv2')
drop2 = tf.layers.dropout(inputs=conv2, rate=0.5, training=train_flag)
conv3 = tf.layers.conv2d(inputs=drop2,
                         filters=768,
Esempio n. 43
0
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               moving_vars='moving_vars',
               activation=None,
               is_training=True,
               trainable=True,
               restore=True,
               scope=None,
               reuse=None):
  """Adds a Batch Normalization layer.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]
            or [batch_size, channels].
    decay: decay for the moving average.
    center: If True, subtract beta. If False, beta is not created and ignored.
    scale: If True, multiply by gamma. If False, gamma is
      not used. When the next layer is linear (also e.g. ReLU), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: small float added to variance to avoid dividing by zero.
    moving_vars: collection to store the moving_mean and moving_variance.
    activation: activation function.
    is_training: whether or not the model is in training mode.
    trainable: whether or not the variables should be trainable or not.
    restore: whether or not the variables should be marked for restore.
    scope: Optional scope for variable_scope.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.

  Returns:
    a tensor representing the output of the operation.

  """
  inputs_shape = inputs.get_shape()
  with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse):
    axis = list(range(len(inputs_shape) - 1))
    params_shape = inputs_shape[-1:]
    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if center:
      beta = variables.variable('beta',
                                params_shape,
                                initializer=tf.zeros_initializer(),
                                trainable=trainable,
                                restore=restore)
    if scale:
      gamma = variables.variable('gamma',
                                 params_shape,
                                 initializer=tf.ones_initializer(),
                                 trainable=trainable,
                                 restore=restore)
    # Create moving_mean and moving_variance add them to
    # GraphKeys.MOVING_AVERAGE_VARIABLES collections.
    moving_collections = [moving_vars, tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
    moving_mean = variables.variable('moving_mean',
                                     params_shape,
                                     initializer=tf.zeros_initializer(),
                                     trainable=False,
                                     restore=restore,
                                     collections=moving_collections)
    moving_variance = variables.variable('moving_variance',
                                         params_shape,
                                         initializer=tf.ones_initializer(),
                                         trainable=False,
                                         restore=restore,
                                         collections=moving_collections)
    if is_training:
      # Calculate the moments based on the individual batch.
      mean, variance = tf.nn.moments(inputs, axis)

      update_moving_mean = moving_averages.assign_moving_average(
          moving_mean, mean, decay)
      tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
      update_moving_variance = moving_averages.assign_moving_average(
          moving_variance, variance, decay)
      tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)
    else:
      # Just use the moving_mean and moving_variance.
      mean = moving_mean
      variance = moving_variance
    # Normalize the activations.
    outputs = tf.nn.batch_normalization(
        inputs, mean, variance, beta, gamma, epsilon)
    outputs.set_shape(inputs.get_shape())
    if activation:
      outputs = activation(outputs)
    return outputs
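
# --- Added sketch: the moving-average update ops collected above have to run
# during training; a common pattern (UPDATE_OPS_COLLECTION and `loss` are
# assumed to exist in the calling code) is to make the train op depend on them.
update_ops = tf.get_collection(UPDATE_OPS_COLLECTION)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)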
Esempio n. 44
0
def build_net(in_dim, n_hidden, data_type, link='square', total_size=None,
              bw_indiv=1.0, indiv_y_bol=False, kernel='ard', initialse='identity',
              seed=23, dtype=tf.float32, landmarks=None, log_y=False,  device_name=None,
              avg_label=1.0, **others):
    with tf.device(device_name): 
        if avg_label - 1.0 < 0: # HACK FIX FOR MORE GENERAL DATA.
            #print('Alternate Intialisation')
            ard_mat_init_scale = 0.15 # For malaria
            mean_scale = sqrt(avg_label - ard_mat_init_scale*2.0)
        else:
            mean_scale = sqrt(avg_label - 1.0) # i.e. predict baseline at start.
            ard_mat_init_scale = 0.5
        net = Network(in_dim, data_type, n_hidden=n_hidden, link=link, kernel=kernel,
                      indiv_bol=indiv_y_bol, dtype=dtype, seed=seed, log_y=log_y, 
                      ard_mat_init_scale=ard_mat_init_scale)
        inputs = net.inputs
        params = net.params
        land_size = n_hidden
        cst = partial(tf.cast, dtype=dtype)
        # Model parameters
        initializer = tf.initializers.random_normal(seed=seed, dtype=dtype) # normal initialiser 
        z_initializer = tf.zeros_initializer(dtype=dtype)
        o_initializer = tf.ones_initializer(dtype=dtype)
        #initializer = tf.keras.initializers.he_normal(seed=seed)
        if initialse == 'identity':
            triangle_vec = tf.constant(triangular_vec(None, n=land_size), dtype=dtype)
        elif initialse == 'kernel':
            if kernel == 'additive':
                init_kernel = net.kernel(landmarks, landmarks, stddev_ard=bw_indiv[:-2], scale_ard=ard_mat_init_scale, 
                                                               stddev_mat=bw_indiv[-2:], scale_mat=ard_mat_init_scale, 
                                                               tensorf=False)
            elif kernel in ['rbf', 'ard']:
                init_kernel = net.kernel(landmarks, landmarks, stddev=bw_indiv, scale=1.0, tensorf=False)
            L = np.linalg.cholesky(init_kernel)
            #print('L', L)
            triangle_vec = tf.constant(triangular_vec(L, n=land_size), dtype=dtype)
        # Initialise with L = I for safe inversion at start.
        #print('bw_indiv', bw_indiv)
        #print('mean_scale', mean_scale)
        params['L'] = tf.Variable(triangle_vec, name= 'L', dtype=dtype)
        params['mean'] = tf.Variable(mean_scale * o_initializer([land_size, 1]), name = 'mean', dtype=dtype)
        params['prior_mean'] = tf.Variable(z_initializer([1]), name = 'prior_mean', dtype=dtype)

        if kernel in ['ard', 'additive']:
            params['log_bw'] = tf.Variable(tf.log(tf.constant(bw_indiv, dtype=dtype)), name = 'log_bw_sq')
        elif kernel == 'rbf':
            #print('Vary Bandwidth RBF')
            params['log_bw'] = tf.Variable(tf.log(tf.constant(bw_indiv, dtype=dtype)), name = 'log_bw_sq')

        n_bags = cst(tf.shape(inputs['sizes'])[0])
        n_indiv = cst(tf.shape(inputs['X'])[0])

        scale = tf.exp(params['log_scale'])
        stddev = tf.exp(params['log_bw'])
        
        landmarks = inputs['landmarks']
        #stddev = tf.Print(stddev, [stddev], message='bw', summarize=100)
        if kernel in ['ard', 'rbf']:
            k_ww = net.kernel(landmarks, landmarks, stddev=stddev, scale=scale)
            k_wz = net.kernel(landmarks, inputs['X'], stddev=stddev, scale=scale) #K_wz
            #k_wz = tf.Print(k_wz, [k_wz])
            term_0_diag = scale * tf.ones([tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype) #k_zz diagonal
        elif kernel == 'additive':
            scale_mat = tf.exp(params['log_scale_m'])
            k_ww = net.kernel(landmarks, landmarks, stddev_ard=stddev[:-2], scale_ard=scale, 
                                                    stddev_mat=stddev[-2:], scale_mat=scale_mat)
            k_wz = net.kernel(landmarks, inputs['X'], stddev_ard=stddev[:-2], scale_ard=scale, 
                                                      stddev_mat=stddev[-2:], scale_mat=scale_mat)
            term_0_diag = (scale + scale_mat) * tf.ones([tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype) 
        
        chol_k = tf.cholesky(k_ww)
        k_ww_inv = tf.matrix_inverse(k_ww) # K_ww^-1
        triangular = fill_triangular(params['L']) #\Sigma_u=LL^T
        Sigma_u = tf.matmul(triangular, tf.transpose(triangular)) # Sigma_u = L L^T

        k_inv_k_wz = tf.matmul(k_ww_inv, k_wz) # K_ww^-1 K_wz
        mean_diff = params['mean'] - params['prior_mean']
        # mu_prior + K_zw K_ww^-1 (mu_u - mu_prior)
        net.mu = mu = params['prior_mean'] + tf.squeeze(tf.matmul(tf.transpose(k_inv_k_wz), mean_diff)) 

        inputs_int = tf.concat([tf.constant([0], tf.int32), tf.cumsum(tf.cast(inputs['sizes'], tf.int32))], 0)
        if kernel in ['ard', 'rbf']:
            term_1_vec = tf.map_fn(fn=lambda k: term_1_func(net, mu, inputs, stddev, scale, k_wz, Sigma_u,
                                                            inputs_int[k], inputs_int[k+1], k_inv_k_wz),
                                       elems=tf.range(tf.cast(n_bags, dtype=tf.int32)),
                                       dtype=dtype)
        elif kernel == 'additive':
            term_1_vec = tf.map_fn(fn=lambda k: term_1_func_additive(net, mu, inputs, stddev, 
                                                                     scale, scale_mat, k_wz, Sigma_u,
                                                                     inputs_int[k], inputs_int[k+1], k_inv_k_wz),
                                       elems=tf.range(tf.cast(n_bags, dtype=tf.int32)),
                                       dtype=dtype)
        #term_1_vec = tf.Print(term_1_vec, [term_1_vec], '1')
        # We do not use multiple outputs; instead we recompute the diagonal, since multiple outputs are CPU-only...
        term_1 = tf.reduce_sum(tf.multiply(term_1_vec, inputs['y']))
        # sum mu^2
        mu_square = tf.multiply(mu, mu)
        # Diagonal: transpose the first factor, multiply elementwise, then sum across rows (axis=0)
        term_1_diag = tf.reduce_sum( tf.multiply(k_wz, k_inv_k_wz), axis=0) #diag K_zw K_ww^-1 k_wz
        k_zw_k_inv_S = tf.matmul(tf.transpose(k_inv_k_wz), Sigma_u) # k_zw K_ww^-1 Sigma_u
        term_2_diag = tf.reduce_sum(tf.multiply(tf.transpose(k_zw_k_inv_S), k_inv_k_wz), axis=0)
        # diagonal as [n_indiv]
        net.Sigma_diag = Sigma_diag = term_0_diag - term_1_diag + term_2_diag
        net.indiv = indiv = Sigma_diag + mu_square # E(X^2) is just normal second moment.
        term_2 = tf.reduce_sum(tf.multiply(indiv, inputs['indiv_pop']))
        # sum of all pop * (mu_square + sigma_diag)
        #indiv = tf.Print(indiv, [indiv, inputs['indiv_y']], message='indiv', summarize=5)

        #pop_mu = tf.multiply(inputs['indiv_pop'], tf.exp(mu))
        #pool_pop_mu = tf.squeeze(net.bag_pool(tf.expand_dims(pop_mu, 1))) #[n_bags]
        #term_1 = tf.reduce_sum(tf.multiply(inputs['y'], tf.log(pool_pop_mu)))

        # Term 2 \sum \sum p^i_j exp(\mu^i_j + Sigma^i_j/2)
        #pop_mu_sig = tf.multiply(inputs['indiv_pop'], tf.exp(mu + 0.5 * Sigma_diag))
        #term_2 = tf.reduce_sum(pop_mu_sig)

        # Term 3
        tfd = tf.contrib.distributions
        mvn_q = tfd.MultivariateNormalTriL(loc=tf.squeeze(params['mean']), scale_tril=triangular)
        mvn_u = tfd.MultivariateNormalTriL(loc=tf.tile(params['prior_mean'], [land_size]), scale_tril=chol_k)
        term_3 = tf.distributions.kl_divergence(mvn_q, mvn_u)
        
        #term_1 = tf.Print(term_1, [term_1/n_bags], message='1')
        #term_2 = tf.Print(term_2, [term_2/n_bags], message='2')
        #term_3 = tf.Print(term_3, [term_3/total_size], message='3')

        # Stirling's approximation to enable comparison across losses (\sum log(y_j!))
        zeros = tf.zeros_like(inputs['y']) # create a tensor all ones
        mask = tf.greater(inputs['y'], zeros) # boolean tensor, mask[i] = True iff x[i] > 1
        non_zero_y = tf.boolean_mask(inputs['y'], mask)
        #non_zero_y = tf.Print(non_zero_y, [non_zero_y, inputs['y']], summarize=100)
        term_4 = tf.reduce_sum(tf.multiply(non_zero_y, tf.log(non_zero_y)) - non_zero_y + 0.5 * tf.log(2.0 * pi * non_zero_y))
        #term_4 = tf.Print(term_4, [term_4/n_bags], message='4')
        
        net.loss  = -1.0/n_bags * (term_1 - term_2 - term_4) + term_3/total_size

        #if MAP:
        #net.indiv = indiv = tf.exp(mu - Sigma_diag)
        #else:
        net.indiv_se = net.square_err(inputs['indiv_true_y'], indiv)
        net.indiv_nll = net.nll_term(inputs['indiv_y'], indiv)

        #indiv = tf.Print(indiv, [indiv], summarize =200, message='indiv')
        #indiv_mean = tf.exp(mu + 0.5 * Sigma_diag)
        net.indiv_y = indiv_y_pop = tf.multiply(inputs['indiv_pop'], indiv)
        indiv_y_pop = tf.expand_dims(indiv_y_pop, 1)
        net.bag_y = bag_y = tf.squeeze(net.bag_pool(indiv_y_pop))
        #bag_y = tf.Print(bag_y, [bag_y, inputs['y']], message='bag', summarize=5)
        net.bag_se = net.square_err(inputs['y'], bag_y, bags=True)
        net.bag_nll = net.nll_term(inputs['y'], bag_y, bags=True)

        #indiv_y_mean = tf.multiply(inputs['indiv_pop'], tf.exp(mu + 0.5 * Sigma_diag))
        #indiv_y_var = tf.multiply(tf.exp(Sigma_diag) - 1.0, tf.exp( 2.0* mu + Sigma_diag) )
        #indiv_y = tf.Print(indiv_y, [indiv_y_mean, inputs['indiv_y'], indiv_y_var], summarize=2)
        #net.bag_se = tf.reduce_sum(tf.square(bag_y - inputs['y']))
        #if indiv_y_bol:
        #    net.indiv_se = tf.reduce_sum(tf.square(indiv_y - inputs['indiv_y']))
        # Can add net.print_out
    return net
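
# --- Added sketch of the Cholesky parameterisation used above: a flat vector
# with n*(n+1)/2 entries is unpacked into a lower-triangular L (here via
# tf.contrib.distributions.fill_triangular, assumed equivalent to the imported
# fill_triangular), and the covariance is recovered as Sigma = L L^T.
n_demo = 4
raw_demo = tf.ones([n_demo * (n_demo + 1) // 2], dtype=tf.float32)
L_demo = tf.contrib.distributions.fill_triangular(raw_demo)   # [n, n] lower triangular
Sigma_demo = tf.matmul(L_demo, tf.transpose(L_demo))          # positive semi-definite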
Esempio n. 45
0
    def _build(self, inp, is_training=True, test_local_stats=False):
        """Applies the batch norm operation to an input tensor

        Parameters
        ----------
        inp : tf.Tensor
            input tensor for this module
        is_training : bool, optional
            flag to specify whether this is training. If so, batch statistics are used and the moving averages
            are updated
        test_local_stats : bool, optional
            flag to use batch statistics during test time

        Returns
        -------
        tf.Tensor
            normalized tensor

        """

        if self.param_shape is None:
            self.param_shape = inp.get_shape().as_list()[-1]
        assert self.param_shape == inp.get_shape().as_list()[-1], \
            'Input shape must match parameter shape - was initialised for another shape'

        if self.axis is None:
            self.axis = list(np.arange(len(inp.get_shape().as_list()) - 1))
        assert len(self.axis) == len(inp.get_shape().as_list()) - 1, \
            'Input shape must match axis - was initialised for another shape'

        use_batch_stats = is_training | test_local_stats

        self._beta = tf.get_variable('beta', self.param_shape, tf.float32,
                                     initializer=tf.zeros_initializer(),
                                     collections=self.TRAINABLE_COLLECTIONS) if self.offset else None
        self._gamma = tf.get_variable('gamma', self.param_shape, tf.float32,
                                      initializer=tf.ones_initializer(),
                                      collections=self.TRAINABLE_COLLECTIONS) if self.scale else None

        if self.offset:
            self.variables.append(self._beta)
        if self.scale:
            self.variables.append(self._gamma)

        self._mm = tf.get_variable('moving_mean', self.param_shape, tf.float32,
                                   initializer=tf.zeros_initializer(), trainable=False,
                                   collections=self.MOVING_COLLECTIONS)
        self._mv = tf.get_variable('moving_variance', self.param_shape, tf.float32,
                                   initializer=tf.ones_initializer(), trainable=False,
                                   collections=self.MOVING_COLLECTIONS)

        if use_batch_stats:
            mean, variance = tf.nn.moments(inp, self.axis, name='moments')

            # fix for negative variances - see https://github.com/tensorflow/tensorflow/issues/3290
            variance = tf.maximum(variance, tf.constant(0.))

            if is_training:
                update_mean_op = moving_averages.assign_moving_average(
                    variable=self._mm,
                    value=mean,
                    decay=self.decay_rate,
                    zero_debias=False,
                    name="update_moving_mean").op
                update_variance_op = moving_averages.assign_moving_average(
                    variable=self._mv,
                    value=variance,
                    decay=self.decay_rate,
                    zero_debias=False,
                    name="update_moving_variance").op

                with tf.control_dependencies([update_mean_op, update_variance_op]):
                    mean = tf.identity(mean)
                    variance = tf.identity(variance)
        else:
            mean = tf.identity(self._mm)
            variance = tf.identity(self._mv)

        outp = tf.nn.batch_normalization(inp, mean, variance, self._beta, self._gamma, self.eps, name="bn")

        return outp
Esempio n. 46
0
 def __init__(self, name=None):
     super(Network, self).__init__(name=name)
     self._layer = tf.keras.layers.Dense(
         3, kernel_initializer=tf.ones_initializer(), name='logits')
Esempio n. 47
0
    def _layer_stack(self,
                     x,
                     layers,
                     encoder_output=None,
                     self_attention_mask=None,
                     encdec_attention_mask=None,
                     losses=None,
                     step_num=None,
                     encdec_tensors=None,
                     states=None):
        """Encoder or decoder stack.

    Args:
      x: a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
      layers: a list of strings
      encoder_output: an optional mtf.Tensor with shape
        [<batch_dims>, encoder_length_dim, model_dim]
      self_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, memory_length_dim] containing values 0 or -inf.
      encdec_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, encoder_length_dim] containing values 0 or -inf.
      losses: a list to be appended-to
      step_num: an optional mtf integer Scalar (used in incremental mode)
      encdec_tensors: an optional list of num_layers tuples, each of the form
        (q_var, o_var, k, v), (used in incremental mode)
      states: an optional list of Tensors (used in incremental mode)
    Returns:
      a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
    Raises:
      ValueError: if hparams make no sense
    """
        hparams = self._hparams
        is_incremental = (step_num is not None)

        def layer_prepostprocess_dropout(x):
            if is_incremental:
                return x
            return mtf.dropout(
                x,
                keep_prob=1.0 - hparams.layer_prepostprocess_dropout,
                noise_shape=mtf.Shape(self.batch_dims + [self.model_dim]))

        num_layers = len(layers)
        num_layer_norms = num_layers + 1
        layer_norms_dim = mtf.Dimension("layer_norms", num_layer_norms)
        layer_norm_combined_var = mtf.get_variable(
            x.mesh,
            "layer_norm_scale",
            mtf.Shape([layer_norms_dim, self.model_dim]),
            initializer=tf.ones_initializer(),
            activation_dtype=x.dtype)
        layer_norm_vars = mtf.unstack(layer_norm_combined_var, layer_norms_dim)

        def normalize(x):
            scale = layer_norm_vars.pop(0)
            variance = mtf.reduce_mean(mtf.square(x),
                                       reduced_dim=self.model_dim)
            return x * mtf.rsqrt(variance + hparams.norm_epsilon) * scale

        if is_incremental:
            states = list(states)
            new_states = []
        tf.logging.info("states = %s" % (states, ))

        for lnum, layer_type in enumerate(layers):
            with tf.variable_scope("%s_%d" % (layer_type, lnum)):
                if layer_type == "att":
                    # Self attention layer
                    if is_incremental:
                        y, new_k, new_v = mtf.layers.multihead_self_attention_incremental(
                            normalize(x),
                            prev_k=states.pop(0),
                            prev_v=states.pop(0),
                            step_num=step_num,
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="att")
                        new_states.append(new_k)
                        new_states.append(new_v)
                        x += y
                    else:
                        x += layer_prepostprocess_dropout(
                            mtf.layers.multihead_attention(
                                normalize(x),
                                None,
                                self_attention_mask,
                                self.kv_dim,
                                self.heads_dim,
                                dropout=hparams.attention_dropout,
                                dropout_broadcast_dims=[self.length_dim],
                                master_dtype=self.master_dtype,
                                slice_dtype=self.slice_dtype,
                                name="att"))
                elif layer_type == "enc_att":
                    # Encoder-Decoder attention layer
                    if is_incremental:
                        # Encoder-Decoder attention layer
                        q_var, o_var, k, v = encdec_tensors[lnum]
                        x += mtf.layers.multihead_encdec_attention_incremental(
                            normalize(x),
                            q_var,
                            o_var,
                            k,
                            v,
                            encdec_attention_mask,
                            name="enc_att")
                    else:
                        x += layer_prepostprocess_dropout(
                            mtf.layers.multihead_attention(
                                normalize(x),
                                encoder_output,
                                encdec_attention_mask,
                                self.kv_dim,
                                self.heads_dim,
                                dropout=hparams.attention_dropout,
                                dropout_broadcast_dims=[self.length_dim],
                                master_dtype=self.master_dtype,
                                slice_dtype=self.slice_dtype,
                                name="enc_att"))
                elif layer_type == "local_att":
                    if is_incremental:
                        y, new_k, new_v = mtf.layers.masked_local_attention_1d_incremental(
                            normalize(x),
                            prev_k=states.pop(0),
                            prev_v=states.pop(0),
                            step_num=step_num,
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="local_att")
                        new_states.append(new_k)
                        new_states.append(new_v)
                        x += y
                    else:
                        x += layer_prepostprocess_dropout(
                            mtf.layers.masked_local_attention_1d(
                                normalize(x),
                                self.kv_dim,
                                self.heads_dim,
                                window_size=hparams.
                                local_attention_window_size,
                                master_dtype=self.master_dtype,
                                slice_dtype=self.slice_dtype,
                                length_per_split=mtf.
                                tensor_dim_to_size_per_split(
                                    hparams.layout, hparams.mesh_shape,
                                    self.max_length_dim),
                                name="local_att"))
                elif layer_type == "compressed_att":
                    if is_incremental:
                        raise ValueError(
                            "compressed_att incremental not implemented")
                    else:
                        x += layer_prepostprocess_dropout(
                            mtf.layers.
                            multihead_self_attention_memory_compressed(
                                normalize(x),
                                mask_right=True,
                                compression_factor=hparams.compression_factor,
                                kv_channels=self.kv_dim,
                                heads=self.heads_dim,
                                dropout=hparams.attention_dropout,
                                dropout_broadcast_dims=[self.length_dim],
                                master_dtype=self.master_dtype,
                                slice_dtype=self.slice_dtype,
                                name="compressed_att"))
                else:
                    if is_incremental:
                        # insert length dimension.
                        x_shape = x.shape
                        shape_with_length = mtf.Shape(
                            x_shape.dims[:-1] + [mtf.Dimension("length", 1)] +
                            x_shape.dims[-1:])
                        x = mtf.reshape(x, shape_with_length)
                    # ffn layer
                    x += layer_prepostprocess_dropout(
                        self._feedforward_layer(normalize(x),
                                                layer_type,
                                                losses=losses))
                    if is_incremental:
                        # remove length dimension
                        x = mtf.reshape(x, x_shape)

        x = layer_prepostprocess_dropout(normalize(x))
        assert not layer_norm_vars
        if is_incremental:
            return x, new_states
        else:
            return x
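
# --- Added sketch: a plain-TensorFlow (non-mtf) version of the scale-only
# normalization implemented by `normalize` above, i.e. an RMS-style layer norm
# over the last (model) dimension with a learned per-channel scale.
def rms_norm_sketch(x, epsilon=1e-6, name="rms_norm"):
    with tf.variable_scope(name):
        scale = tf.get_variable("scale", [x.shape[-1].value],
                                initializer=tf.ones_initializer())
        variance = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
        return x * tf.rsqrt(variance + epsilon) * scale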
Esempio n. 48
0
    def _decoder_layer_stack_incremental(self,
                                         x,
                                         step_num,
                                         encdec_tensors,
                                         self_attention_k,
                                         self_attention_v,
                                         encdec_attention_mask=None):
        """Decoder layer stack during inference.

    We are processing only one position at a time.

    The self-attention keys and values have already been computed for
    previous positions.  In addition to the decoder output, we need to
    produce the updated self-attention keys and values.

    If there is an encoder, then additional Tensors are supplied in
    encdec_tensors, which give us the keys and values for encoder-decoder
    attention as well as the weight matrices q_var and o_var.

    Args:
      x: a mtf.Tensor with shape [<batch_dims>, model_dim]
      step_num: an mtf integer Scalar
      encdec_tensors: an optional list of num_layers tuples, each of the form
        (q_var, o_var, k, v)
      self_attention_k: an optional list of num_layers Tensors each with shape
        [batch, heads, memory_length, kv_channels]
      self_attention_v: an optional list of num_layers Tensors each with shape
        [batch, heads, memory_length, kv_channels]
      encdec_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, encoder_length_dim] containing values 0 or -inf.

    Returns:
      y: a mtf.Tensor with shape [<batch_dims>, model_dim]
      new_self_attention_k: a list of num_layers mtf.Tensors, with the same
        shapes as the elements of self_attention_k
      new_self_attention_v: a list of num_layers mtf.Tensors, with the same
        shapes as the elements of self_attention_v

    Raises:
      ValueError: if hparams make no sense
    """
        hparams = self._hparams
        num_layers = hparams.num_decoder_layers
        num_layer_norms = num_layers * (2 if encdec_tensors is None else 3) + 1
        layer_norms_dim = mtf.Dimension("layer_norms", num_layer_norms)
        layer_norm_combined_var = mtf.get_variable(
            x.mesh,
            "layer_norm_scale",
            mtf.Shape([layer_norms_dim, self.model_dim]),
            initializer=tf.ones_initializer(),
            activation_dtype=x.dtype)
        layer_norm_vars = mtf.unstack(layer_norm_combined_var, layer_norms_dim)

        def normalize(x):
            scale = layer_norm_vars.pop(0)
            variance = mtf.reduce_mean(mtf.square(x),
                                       reduced_dim=self.model_dim)
            return x * mtf.rsqrt(variance + hparams.norm_epsilon) * scale

        new_self_attention_k = []
        new_self_attention_v = []
        for layer in range(num_layers):
            with tf.variable_scope("layer_%d" % layer):
                # Self attention layer
                y, new_k, new_v = mtf_layers.multihead_self_attention_incremental(
                    normalize(x),
                    prev_k=self_attention_k[layer],
                    prev_v=self_attention_v[layer],
                    step_num=step_num,
                    name="self_attention")
                new_self_attention_k.append(new_k)
                new_self_attention_v.append(new_v)
                x += y
                if encdec_tensors is not None:
                    # Encoder-Decoder attention layer
                    q_var, o_var, k, v = encdec_tensors[layer]
                    x += mtf_layers.multihead_encdec_attention_incremental(
                        normalize(x),
                        q_var,
                        o_var,
                        k,
                        v,
                        encdec_attention_mask,
                        name="encdec_attention")
                # ffn layer
                x += self._feedforward_layer(normalize(x), hparams)
        x = normalize(x)
        assert not layer_norm_vars
        return x, new_self_attention_k, new_self_attention_v
Esempio n. 49
0
	def __init__(self, config, batch_ops, is_train=True):

		# Model name
		model_name = 'layer_norm'

		# Model inputs
		imgs = batch_ops['imgs']
		ABCD = tf.cast(batch_ops['ABCD'], dtype=tf.int64)
		not_D = tf.cast(batch_ops['not_D'], dtype=tf.int64)

		# Dimensions
		batch_size = int(config.batch_size)
		N_foils = int(not_D.shape[1])

		# Get latent codes for all images
		A_latent, B_latent, C_latent, D_latent, all_foil_latent = encode_analogy_objs(imgs, ABCD, not_D)
		N_latent = int(A_latent.shape[1])

		# Normalization
		# Small constant (for avoiding division by zero)
		eps = 1e-8
		# Normalization parameters
		A_latent_mean, A_latent_var = tf.nn.moments(A_latent, 1)
		A_latent_SD = tf.sqrt(A_latent_var + eps)
		B_latent_mean, B_latent_var = tf.nn.moments(B_latent, 1)
		B_latent_SD = tf.sqrt(B_latent_var + eps)
		C_latent_mean, C_latent_var = tf.nn.moments(C_latent, 1)
		C_latent_SD = tf.sqrt(C_latent_var + eps)
		D_latent_mean, D_latent_var = tf.nn.moments(D_latent, 1)
		D_latent_SD = tf.sqrt(D_latent_var + eps)
		# Scale and shift parameters
		with tf.variable_scope('norm_params', reuse=tf.AUTO_REUSE) as scope:
			scale = tf.get_variable('scale', N_latent, initializer=tf.ones_initializer())
			shift = tf.get_variable('shift', N_latent, initializer=tf.zeros_initializer())
		# Normalize
		A_layer_norm = (((A_latent - tf.expand_dims(A_latent_mean,1)) / tf.expand_dims(A_latent_SD,1)) * scale) + shift
		B_layer_norm = (((B_latent - tf.expand_dims(B_latent_mean,1)) / tf.expand_dims(B_latent_SD,1)) * scale) + shift
		C_layer_norm = (((C_latent - tf.expand_dims(C_latent_mean,1)) / tf.expand_dims(C_latent_SD,1)) * scale) + shift
		D_layer_norm = (((D_latent - tf.expand_dims(D_latent_mean,1)) / tf.expand_dims(D_latent_SD,1)) * scale) + shift

		# [A, B, C, D] -> LSTM
		log.info('[A,B,C,D] -> LSTM...')
		D_score = scoring_model(A_layer_norm, B_layer_norm, C_layer_norm, D_layer_norm)

		# [A, B, C, foils] -> LSTM
		log.info('[A,B,C,foils] -> LSTM...')
		all_foil_score = []
		for foil in range(N_foils):
			# Extract latent rep for this foil
			this_foil_latent = all_foil_latent[:,foil,:]
			# Normalization
			# Normalization parameters
			foil_latent_mean, foil_latent_var = tf.nn.moments(this_foil_latent, 1)
			foil_latent_SD = tf.sqrt(foil_latent_var + eps)
			# Normalize
			foil_layer_norm = (((this_foil_latent - tf.expand_dims(foil_latent_mean,1)) / tf.expand_dims(foil_latent_SD,1)) * scale) + shift
			# Get score
			foil_score = scoring_model(A_layer_norm, B_layer_norm, C_layer_norm, foil_layer_norm)
			# Accumulate foil scores
			all_foil_score.append(foil_score)

		# Concatenate all scores
		all_foil_score = tf.concat(all_foil_score, axis=1)
		all_scores = tf.concat([D_score, all_foil_score], axis=1)
		all_scores_softmax = tf.nn.softmax(all_scores)

		# Loss
		log.info("Loss (cross-entropy over candidate scores)...")
		targets = tf.concat([tf.ones(D_score.shape), tf.zeros(all_foil_score.shape)], axis=1)
		self.train_loss, accuracy, correct_preds = build_cross_entropy_loss(all_scores, targets)
		accuracy = accuracy * 100.0

		# Model outputs
		self.all_out = {
						'accuracy': accuracy}
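
A minimal NumPy sketch of the per-example layer normalization used on each latent code above (mean and standard deviation over the feature axis, then a shared scale and shift); names and shapes are illustrative:

import numpy as np

def layer_norm_latent(latent, scale, shift, eps=1e-8):
    # latent: [batch, N_latent]; normalize each row, then apply the shared scale/shift
    mean = latent.mean(axis=1, keepdims=True)
    sd = np.sqrt(latent.var(axis=1, keepdims=True) + eps)
    return (latent - mean) / sd * scale + shift

latent = np.random.randn(4, 16).astype(np.float32)
out = layer_norm_latent(latent, np.ones(16), np.zeros(16))
print(out.mean(axis=1))   # approximately zero for every example
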
Esempio n. 50
0
def ones_init():
    return tf.ones_initializer()
Esempio n. 51
0
File: gp.py Project: thomkeh/AutoGP
    def __init__(
            self,
            inducing_inputs,
            cov_func,
            inf_func,
            # mean_func=mean.ZeroOffset(),
            lik_func,
            num_components=1,
            diag_post=False,
            inducing_outputs=None):
        """
        Args:
            inducing_inputs: ndarray
                An array of initial inducing input locations. Dimensions: num_inducing * input_dim.
            cov_func: list of subclasses of kernels.Kernel
                A list of one kernel per latent function.
            inf_func: subclass of inf.Inference
                An object that performs inference (used to build the nelbo, loo_loss, and predictions).
            lik_func: subclass of likelihoods.Likelihood
                An object representing the likelihood function p(y|f).
            num_components: int
                The number of mixture of Gaussian components.
            diag_post: bool
                True if the mixture of Gaussians uses a diagonal covariance, False otherwise.
            inducing_outputs: ndarray, optional
                Initial inducing outputs; converted to a constant tensor when provided, otherwise 0 is used.
        """
        # Get the actual functions if they were initialized as strings.
        self.inf = inf_func
        assert isinstance(self.inf, inf.Inference)
        num_latent = cov_func.num_latent_functions()

        # Repeat the inducing inputs for all latent processes if we haven't been given individually
        # specified inputs per process.
        if inducing_inputs.ndim == 2:
            inducing_inputs = np.tile(inducing_inputs[np.newaxis, :, :],
                                      [num_latent, 1, 1])

        # Initialize all model dimension constants.
        num_inducing = inducing_inputs.shape[-2]
        self.input_dim = inducing_inputs.shape[-1]

        # Define all parameters that get optimized directly in raw form. Some parameters get
        # transformed internally to maintain certain pre-conditions.
        self.raw_weights = tf.get_variable("raw_weights", [num_components],
                                           initializer=tf.zeros_initializer())
        self.raw_means = tf.get_variable(
            "raw_means", [num_components, num_latent, num_inducing],
            initializer=tf.zeros_initializer())
        if diag_post:
            self.raw_covars = tf.get_variable(
                "raw_covars", [num_components, num_latent, num_inducing],
                initializer=tf.ones_initializer())
        else:
            self.raw_covars = tf.get_variable(
                "raw_covars", [num_components, num_latent] +
                util.tri_vec_shape(num_inducing),
                initializer=tf.zeros_initializer())
        self.raw_inducing_inputs = tf.get_variable("raw_inducing_inputs",
                                                   initializer=tf.constant(
                                                       inducing_inputs,
                                                       dtype=tf.float32))
        self.raw_likelihood_params = lik_func.get_params()
        self.raw_kernel_params = cov_func.get_params()
        raw_inducing_outputs = 0 if inducing_outputs is None else tf.constant(
            inducing_outputs, dtype=tf.float32)

        # Define placeholder variables for training and predicting.
        self.num_train = tf.placeholder(tf.float32, shape=[], name="num_train")
        self.train_inputs = tf.placeholder(tf.float32,
                                           shape=[None, self.input_dim],
                                           name="train_inputs")
        self.train_outputs = tf.placeholder(tf.float32,
                                            shape=[None, None],
                                            name="train_outputs")
        self.test_inputs = tf.placeholder(tf.float32,
                                          shape=[None, self.input_dim],
                                          name="test_inputs")

        # Now build our computational graph.
        self.nelbo, self.loo_loss, self.predictions = self.inf.inference(
            self.raw_weights, self.raw_means, self.raw_covars,
            self.raw_inducing_inputs, self.train_inputs, self.train_outputs,
            self.num_train, self.test_inputs, raw_inducing_outputs)

        # config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        # Do all the tensorflow bookkeeping.
        self.session = tf.Session()
        self.optimizer = None
        self.train_step = None
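
In the non-diagonal case above, raw_covars stores each covariance factor as a packed triangular vector. A hedged NumPy sketch of unpacking such a vector into a lower-triangular matrix, assuming util.tri_vec_shape(n) returns [n * (n + 1) // 2]; the exact transform AutoGP applies internally may differ:

import numpy as np

def unpack_tri_vec(vec, n):
    # place the n*(n+1)//2 packed entries into the lower triangle of an n x n matrix
    mat = np.zeros((n, n), dtype=vec.dtype)
    rows, cols = np.tril_indices(n)
    mat[rows, cols] = vec
    return mat

n = 3
vec = np.arange(1.0, n * (n + 1) // 2 + 1.0)   # 6 packed entries
print(unpack_tri_vec(vec, n))
# [[1. 0. 0.]
#  [2. 3. 0.]
#  [4. 5. 6.]]
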
Esempio n. 52
0
 def build(self, _):
   self.scale = tf.get_variable("layer_norm_scale", [self.hidden_size],
                                initializer=tf.ones_initializer())
   self.bias = tf.get_variable("layer_norm_bias", [self.hidden_size],
                               initializer=tf.zeros_initializer())
   self.built = True
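
A hedged sketch of the kind of call that usually accompanies a build like this one: standard layer normalization using the scale and bias created above. The epsilon value and the choice of the last axis are assumptions here, not taken from the original class:

import tensorflow as tf

def layer_norm_call(x, scale, bias, epsilon=1e-6):
    # normalize over the last (hidden) dimension, then apply the learned scale and bias
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    return norm_x * scale + bias
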
Esempio n. 53
0
def emb_score(config, input_tensor, input_ids, 
				output_weights,
				input_mask, **kargs):

	input_shape_list = bert_utils.get_shape_list(input_tensor, expected_rank=3)
	batch_size = input_shape_list[0]
	seq_length = input_shape_list[1]
	hidden_dims = input_shape_list[2]

	scope = kargs.get('scope', None)
	if scope:
		lm_scope = scope + '/' + 'cls/predictions'
	else:
		lm_scope = 'cls/predictions'

	tf.logging.info("**** mlm generator scope **** %s", str(lm_scope))

	# with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
	with tf.variable_scope(lm_scope, reuse=tf.AUTO_REUSE):
		if config.get('ln_type', 'postln') == 'preln':
			input_tensor = bert_modules.layer_norm(input_tensor)
		elif config.get('ln_type', 'postln') == 'postln':
			input_tensor = input_tensor
		else:
			input_tensor = input_tensor

		if config.get("embedding", "none_factorized") == "none_factorized":
			projection_width = config.hidden_size
			tf.logging.info("==not using embedding factorized==")
		else:
			projection_width = config.get('embedding_size', config.hidden_size)
			tf.logging.info("==using embedding factorized: embedding size: %s==", str(projection_width))

		if kargs.get("energy_pooling", "mi") == "mi":
			with tf.variable_scope("transform"):
				input_tensor = tf.layers.dense(
						input_tensor,
						units=projection_width,
						activation=bert_modules.get_activation(config.hidden_act),
						kernel_initializer=bert_modules.create_initializer(
								config.initializer_range))

				if config.get('ln_type', 'postln') == 'preln':
					input_tensor = input_tensor
				elif config.get('ln_type', 'postln') == 'postln':
					input_tensor = bert_modules.layer_norm(input_tensor)
				else:
					input_tensor = bert_modules.layer_norm(input_tensor)
			output_bias = tf.get_variable(
				"output_bias",
				shape=[config.vocab_size],
				initializer=tf.zeros_initializer())
			tf.logging.info("****** mi using mlm transform *******")
		elif kargs.get("energy_pooling", "mi") == "cls":
			with tf.variable_scope("transform_ebm"):
				# We "pool" the model by simply taking the hidden state corresponding
				# to the first token. We assume that this has been pre-trained
				first_token_tensor = tf.squeeze(input_tensor[:, 0:1, :], axis=1)
				input_tensor = tf.layers.dense(
						first_token_tensor,
						config.hidden_size,
						activation=tf.tanh, #bert_modules.get_activation(config.hidden_act),
						kernel_initializer=bert_modules.create_initializer(config.initializer_range))
				tf.logging.info("****** using cls pooling *******")
		else:
			with tf.variable_scope("transform_ebm"):
				input_tensor = tf.layers.dense(
				  input_tensor,
				  units=projection_width,
				  activation=tf.tanh, #bert_modules.get_activation(config.hidden_act),
				  kernel_initializer=bert_modules.create_initializer(
					  config.initializer_range))
			tf.logging.info("****** using other pooling transform *******")

	# with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
	if scope:
		ebm_scope = scope + '/' + 'ebm/predictions'
	else:
		ebm_scope = 'ebm/predictions'
	
	tf.logging.info("**** ebm generator scope **** %s", str(ebm_scope))

	print(input_tensor.get_shape(), "==input_tensor shape==")

	with tf.variable_scope(ebm_scope, reuse=tf.AUTO_REUSE):
		# assume the whole model is self-normalizing

		if kargs.get("normalized_constant", "constant") == 'zero_constant':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.zeros_initializer())

			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)

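			# one-hot lookup: select the normalization constant that matches each example's valid length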
			input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

			tf.logging.info("****** zero_constant logz *******")
		elif kargs.get("normalized_constant", "constant") == 'one_constant':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.ones_initializer())
			tf.logging.info("****** one_constant logz *******")
			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)

			input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

		elif kargs.get("normalized_constant", "constant") == 'constant_constant':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.constant_initializer(np.ones((config.max_position_embeddings))*200.0, tf.float32))
			tf.logging.info("****** constant_constant logz *******")
			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)

			input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

		elif kargs.get("normalized_constant", "constant") == 'log9_constant':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.constant_initializer(np.ones((config.max_position_embeddings))*np.log(9.0), tf.float32))
			tf.logging.info("****** log9_constant logz *******")
			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)

			input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

		elif kargs.get("normalized_constant", "constant") == 'logv_constant':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.constant_initializer(np.ones((config.max_position_embeddings))*np.log(config.vocab_size), tf.float32))
			tf.logging.info("****** logv_constant logz *******")
			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)

			input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

		elif kargs.get("normalized_constant", "constant") == 'logv_constant_ln':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[],
					initializer=tf.constant_initializer(np.log(config.vocab_size), tf.float32))

			input_normalized_constant = normalized_constant

		elif kargs.get("normalized_constant", "length_linear") == 'length_linear':
			normalized_constant = tf.get_variable(
					"ebm_normalized_constant",
					shape=[config.max_position_embeddings],
					initializer=tf.constant_initializer(np.arange((config.max_position_embeddings))+1, tf.float32),
					trainable=False)
			scale_weights = tf.get_variable(
					"ebm_normalized_constant_scale",
					shape=[config.max_position_embeddings],
					initializer=tf.constant_initializer(np.log(config.vocab_size)*np.ones((config.max_position_embeddings)), dtype=tf.float32),
					trainable=True)
			scale_bias = tf.get_variable(
					"ebm_normalized_constant_bias",
					shape=[config.max_position_embeddings],
					initializer=tf.zeros_initializer(),
					trainable=True)
			tf.logging.info("****** length linear logz *******")
			# normalized_constant = scale_bias + scale_weights * tf.pow(normalized_constant, 2)

			valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
			onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)
			
			length_part = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)
			length_scale_part = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), scale_weights)
			length_bias_part = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), scale_bias)

			input_normalized_constant = length_part*length_scale_part + length_bias_part

		# input_normalized_constant = tf.einsum("ab,b->a", tf.cast(onehot_length_ids, tf.float32), normalized_constant)

		# f_input_mask = tf.cast(tf.expand_dims(input_mask, axis=-1), tf.float32)

		if kargs.get("energy_pooling", "mi") == "mean_pooling":
			tf.logging.info("==apply mean pooling to get hidden states projections==")
			# for an input token sequence <start> a b c,
			# we only calculate energy on a, b, c; the <start> token does not
			# contribute to the final energy function
			# batch x dim
			pool_features = tf.einsum("abc,ab->ac", input_tensor[:, 1:], tf.cast(input_mask[:, 1:], tf.float32))
			pool_features /= (1e-10+tf.reduce_sum(tf.cast(input_mask[:, 1:], tf.float32), axis=1, keepdims=True))
			# tf.reduce_sum(input_tensor*f_input_mask, axis=1) #/ (1e-10+tf.reduce_sum(f_input_mask, axis=1))

			print(pool_features.get_shape(), "===pool_features shape===")
		elif kargs.get("energy_pooling", "mi") == "mi":
			tf.logging.info("==apply mi to get hidden states projections==")
			# input_tensor_norm = tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(input_tensor, 2), axis=-1))+1e-20, axis=-1)
			# input_tensor = input_tensor / tf.stop_gradient(input_tensor_norm)
			# output_weights_norm = tf.expand_dims(tf.sqrt(tf.reduce_sum(tf.pow(output_weights, 2), axis=-1))+1e-20, axis=-1)
			# output_weights = output_weights / tf.stop_gradient(output_weights_norm)
			# we calculate cosine distance to make mi bounded by [-1, 1]
			logits = tf.einsum("abc,dc->abd", input_tensor, output_weights) # batch x seq x vocab
			logits = tf.nn.bias_add(logits, output_bias)

			input_id_shape = bert_utils.get_shape_list(input_ids, [2,3])
			if len(input_id_shape) == 2:
				onehot_input_ids = tf.cast(tf.one_hot(tf.cast(input_ids, tf.int32), config.vocab_size), tf.float32) # batch x seq x vocab
				input_ori_ids = tf.cast(onehot_input_ids, tf.float32)
				print("==input ori ids shape== 2-dim", input_ori_ids.get_shape())
			else:
				input_ori_ids = tf.cast(input_ids, tf.float32)
				print("==input ori ids shape== 3-dim", input_ori_ids.get_shape())

			logits = tf.einsum("abd,abd->ab", logits, input_ori_ids)
			print(logits.get_shape(), "==pooled logits shape==")
			# with l2-normalize, we can bound logits to 1
			pool_features = tf.reduce_sum(logits[:, 1:]*tf.cast(input_mask[:, 1:], tf.float32), axis=1) #/ (1e-10+tf.reduce_sum(tf.cast(input_mask[:, 1:], tf.float32), axis=1))
			pool_features = tf.expand_dims(pool_features, axis=-1)
			print(pool_features.get_shape(), "==pooled feature shape==")

			if kargs.get("softplus_features", False):
				# softplus(-x) tends to 0 as pool_features goes to +inf
				# and grows without bound as pool_features goes to -inf
				pool_features = tf.nn.softplus(-pool_features)
				tf.logging.info("****** apply softplus transformation for pooled_features *******")

		elif kargs.get("energy_pooling", "mi") == "cls":
			with tf.variable_scope("transform"):
				pool_features = tf.layers.dense(
						input_tensor,
						units=1,
						use_bias=False,
						activation=None
						)
			tf.logging.info("****** apply linear transformation for pooled_features *******")
		# batch_size x hidden_dims

		if kargs.get('transform', True):

			if kargs.get("transformer_activation", "none") == 'softplus':
				with tf.variable_scope("transform"):
					ebm_scalar = tf.layers.dense(
							pool_features,
							units=1,
							use_bias=True,
							activation=tf.nn.softplus # map the scalar to [0, +inf)
							)
				tf.logging.info("****** apply softplus *******")
			elif kargs.get("transformer_activation", "none") == 'linear':
				tf.logging.info("****** apply linear projection *******")
				with tf.variable_scope("transform"):
					ebm_scalar = tf.layers.dense(
							pool_features,
							units=1,
							use_bias=True,
							activation=None # linear output, no constraint on the scalar
							)
			else:
				with tf.variable_scope("transform"):

					feature_shape = bert_utils.get_shape_list(pool_features, expected_rank=[1,2])

					pool_features = tf.layers.dense(
							pool_features,
							units=feature_shape[-1],
							activation=tf.nn.relu,
							)

					output_weights = tf.get_variable(
							"output_weights", [config.max_position_embeddings, feature_shape[-1]],
							initializer=tf.truncated_normal_initializer(stddev=0.02))

					output_bias = tf.get_variable(
							"output_bias", [config.max_position_embeddings], 
							initializer=tf.constant_initializer(-np.log(np.arange(config.max_position_embeddings).astype(np.float32)+1.0), dtype=tf.float32)
							)
				
					# batch x max_position_embeddings
					ebm_scalar_pos = tf.nn.relu(tf.matmul(pool_features, output_weights, transpose_b=True)) + output_bias
					
					pos_tensor = tf.cast(tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1), tf.int32)
					onehot_pos = tf.cast(tf.one_hot(tf.cast(pos_tensor, tf.int32), config.max_position_embeddings), tf.float32) # batch x seq x vocab
					ebm_scalar = tf.einsum("ab,ab->a", ebm_scalar_pos, onehot_pos)
					ebm_scalar = tf.expand_dims(ebm_scalar, axis=-1)

				tf.logging.info("****** apply linear projection *******")
			print("===ebm_scalar====", ebm_scalar.get_shape())

			ebm_scalar = tf.squeeze(ebm_scalar, axis=-1)
			print("===ebm_scalar====", ebm_scalar.get_shape())
			# ebm_scalar /= (1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
			
			# if kargs.get("energy_pooling", "mi") == "mean_pooling":
			
			print("===ebm_scalar====", ebm_scalar.get_shape())
			print("===input_normalized_constant====", input_normalized_constant.get_shape())

		else:
			ebm_scalar = tf.squeeze(pool_features, axis=-1)
			# ebm_scalar /= (1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
			print("===ebm_scalar====", ebm_scalar.get_shape())
			print("===input_normalized_constant====", input_normalized_constant.get_shape())

		if not kargs.get("prob_ln", False):
			tf.logging.info("****** sum of plogprob as sentence probability *******")
			# ebm_scalar /= (1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
		else:
			ebm_scalar /= (1e-10+tf.reduce_sum(tf.cast(input_mask[:, 1:], tf.float32), axis=-1))
			tf.logging.info("****** sum of plogprob with length normalization as sentence probability *******")
		print("===ebm_scalar====", ebm_scalar.get_shape())
		print("===input_normalized_constant====", input_normalized_constant.get_shape())

		# original ebm log-likelihood:
		# log(exp(-E(x))/Z) = -E(x) - log(Z)
		# here we use the pooled hidden states of the BERT encoder as the energy function,
		# which must be negated when plugged into the actual energy formulation

		if not kargs.get("use_tpu", False):
			tf.summary.scalar('ebm_scalar', 
							tf.reduce_mean(ebm_scalar))

		if kargs.get("logz_mode", "default") == 'default':
			tf.logging.info("****** default logz *******")
			logits = -ebm_scalar - input_normalized_constant - tf.log(1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
		elif kargs.get("logz_mode", "default") == 'standard':
			logits = ebm_scalar - input_normalized_constant
			tf.logging.info("****** standard logz *******")
		elif kargs.get("logz_mode", "default") == 'standard_minus':
			tf.logging.info("****** minus standard logz *******")
			logits = -ebm_scalar - input_normalized_constant
		elif kargs.get("logz_mode", "default") == 'constant':
			logits = -ebm_scalar - tf.log(1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
			tf.logging.info("****** constant logz *******")
		elif kargs.get("logz_mode", "self_normalizing") == 'self_normalizing':
			logits = -ebm_scalar
			tf.logging.info("****** self_normalizing *******")
		elif kargs.get("logz_mode", "none") == 'none':
			logits = ebm_scalar
			tf.logging.info("****** none logz *******")
		else:
			tf.logging.info("****** linear logz *******")
			logits = ebm_scalar - input_normalized_constant * tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1)
		print("=ebm logits shape==", logits.get_shape())

	return logits
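
A minimal NumPy sketch of the length-dependent log-partition lookup and the 'default' logits combination above; the one-hot einsum is just a per-example table lookup, and the shapes and constants here are illustrative:

import numpy as np

max_len, batch = 6, 3
normalized_constant = np.log(30000.0) * np.ones(max_len)      # per-length log Z
valid_seq_length = np.array([2, 4, 5])                        # from reduce_sum(input_mask)
onehot = np.eye(max_len)[valid_seq_length]                    # batch x max_len
input_normalized_constant = onehot @ normalized_constant      # == normalized_constant[valid_seq_length]

ebm_scalar = np.array([1.5, 0.7, 2.1])                        # energy per example
logits = -ebm_scalar - input_normalized_constant - np.log(1e-10 + valid_seq_length)
print(logits.shape)                                           # (3,)
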
Esempio n. 54
0
def BatchNorm3d(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):


    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        assert TF_version >= 1.4
        if ctx.is_main_training_tower: 
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=True,
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= 1.5:
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # maintain EMA only on one GPU
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  #backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  #backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]
        new_shape = None 
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= 1.8 and ctx.is_main_training_tower:
                logger.warn("A TensorFlow bug will cause cross-GPU BatchNorm to fail.")

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  
            moving_variance=moving_var,
            variance=moving_var)  
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
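
When sync_statistics is set, the code above averages the per-tower E[x] and E[x^2] and only then forms the variance. A minimal NumPy sketch of that aggregation for towers with equal batch sizes:

import numpy as np

tower_batches = [np.random.randn(8, 16) for _ in range(4)]     # equal per-tower batch size
mean_per_tower = np.stack([b.mean(axis=0) for b in tower_batches])
sq_per_tower = np.stack([(b ** 2).mean(axis=0) for b in tower_batches])

global_mean = mean_per_tower.mean(axis=0)                      # all-reduce average of E[x]
global_var = sq_per_tower.mean(axis=0) - global_mean ** 2      # E[x^2] - E[x]^2

full = np.concatenate(tower_batches, axis=0)
print(np.allclose(global_mean, full.mean(axis=0)))             # True
print(np.allclose(global_var, full.var(axis=0)))               # True
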
Esempio n. 55
0
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

report = model.fit(X_train,
                   y_train,
                   validation_data=(X_test, y_test),
                   epochs=1)
"""## ONES"""

i_layer = Input(shape=input_shape)
h_layer = Conv2D(64, (3, 3),
                 strides=2,
                 activation='relu',
                 kernel_initializer=tf.ones_initializer())(i_layer)
h_layer = Flatten()(h_layer)
h_layer = Dropout(0.4)(h_layer)
h_layer = Dense(128,
                activation='relu',
                kernel_initializer=tf.ones_initializer())(h_layer)
h_layer = Dropout(0.4)(h_layer)
o_layer = Dense(classes, activation='softmax')(h_layer)

model = Model(i_layer, o_layer)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

report = model.fit(X_train,
Esempio n. 56
0
    def add_logits_op(self, data_batch, size_batch, reuse=False):
        with tf.variable_scope('logits', reuse=reuse):
            data_embeddings = self.add_embeddings_op(data_batch)

            with tf.name_scope('recurrent_layer'):

                def make_cell(input_size):
                    lstm_cell = tf.nn.rnn_cell.LSTMCell(self.config.num_units)
                    drop_cell = tf.nn.rnn_cell.DropoutWrapper(
                        lstm_cell,
                        state_keep_prob=self.lstm_state_dropout_placeholder,
                        output_keep_prob=self.lstm_output_dropout_placeholder,
                        variational_recurrent=True,
                        input_size=input_size,
                        dtype=tf.float32)

                    return drop_cell

                input_sizes = [
                    self.config.embedding_size, self.config.num_units,
                    self.config.num_units
                ]
                self.cell = tf.nn.rnn_cell.MultiRNNCell([
                    make_cell(input_sizes[i])
                    for i in range(self.config.num_layers)
                ])

                self.initial_state = self.cell.zero_state(
                    tf.shape(data_batch)[0], tf.float32)

                outputs, final_state = tf.nn.dynamic_rnn(
                    self.cell,
                    data_embeddings,
                    sequence_length=size_batch,
                    initial_state=self.initial_state,
                    dtype=tf.float32)

            with tf.name_scope('logits'):
                flat_outputs = tf.reshape(outputs, [-1, self.config.num_units])

                weights = tf.get_variable(
                    'weights',
                    initializer=tf.contrib.layers.xavier_initializer(),
                    shape=(self.config.num_units, self.config.embedding_size),
                    dtype=tf.float32)

                bias = tf.get_variable('bias',
                                       initializer=tf.ones_initializer(),
                                       shape=(self.config.embedding_size),
                                       dtype=tf.float32)

                flat_inputs = tf.matmul(flat_outputs, weights) + bias

                bias_logits = tf.get_variable(
                    'bias_logits',
                    initializer=tf.ones_initializer(),
                    shape=(self.config.vocab_size),
                    dtype=tf.float32)

                flat_logits = tf.matmul(
                    flat_inputs, tf.transpose(self.embeddings)) + bias_logits

                batch_size = tf.shape(data_batch)[0]
                max_len = tf.shape(data_batch)[1]

                logits = tf.reshape(
                    flat_logits, [batch_size, max_len, self.config.vocab_size])

            return logits, final_state
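
The output projection above ties the logits to the embedding matrix: hidden states are mapped back to embedding size and then multiplied by the transposed embeddings. A small NumPy sketch of that flow; the dimension values are illustrative:

import numpy as np

batch, max_len, num_units, emb_size, vocab = 2, 5, 32, 16, 100
outputs = np.random.randn(batch, max_len, num_units)
weights = np.random.randn(num_units, emb_size)
bias = np.ones(emb_size)
embeddings = np.random.randn(vocab, emb_size)
bias_logits = np.ones(vocab)

flat_outputs = outputs.reshape(-1, num_units)           # [batch * max_len, num_units]
flat_inputs = flat_outputs @ weights + bias             # back to embedding space
flat_logits = flat_inputs @ embeddings.T + bias_logits  # tied output projection
logits = flat_logits.reshape(batch, max_len, vocab)
print(logits.shape)                                     # (2, 5, 100)
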
Esempio n. 57
0
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful)
    in the following:

    1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored.
    2. Default value for `momentum` and `epsilon` is different.
    3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten.
    4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals.
    5. Support the `sync_statistics` option, which is very useful in small-batch models.

    Args:
        internal_update (bool): if False, add EMA update ops to
          `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies.
          They are very similar in speed, but `internal_update=True` can be used
          when you have conditionals in your model, or when you have multiple networks to train.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics: either None or "nccl". By default (None), it uses statistics of the input tensor to normalize.
          When set to "nccl", this layer must be used under tensorpack multi-gpu trainers,
          and it then uses per-machine (multiple GPU) statistics to normalize.

          Note that this implementation averages the per-tower E[x] and E[x^2] among towers to compute
          global mean&variance. The result is the global mean&variance only if each tower has the same batch size.

          This option has no effect when not training.
          This option is also known as "Cross-GPU BatchNorm" as mentioned in https://arxiv.org/abs/1711.07240.
          Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222

    Variable Names:

    * ``beta``: the bias term. Will be zero-inited by default.
    * ``gamma``: the scale term. Will be one-inited by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN, EMA are maintained during training
          and used during inference. This is the default.
        * ``training and not ctx.is_training``: still use batch statistics in inference.
        * ``not training and ctx.is_training``: use EMA to normalize in
          training. This is useful when you load a pre-trained BN and
          don't want to fine tune the EMA. EMA will not be updated in
          this case.
    """
    # parse shapes
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    # parse training/ctx
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if not training and ctx.is_training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:  # only warn in first tower
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
        # Using moving_mean/moving_variance in training, which means we
        # loaded a pre-trained BN and only fine-tuning the affine part.

    if sync_statistics is None or not (training and ctx.is_training):
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                fused=(ndims == 4 and axis in [1, 3]),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= 1.5:
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # maintain EMA only on one GPU is OK, even in replicated mode.
        # because during training, EMA isn't used
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])

        new_shape = None  # don't need to reshape unless ...
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= 1.8 and ctx.is_main_training_tower:
                logger.warn("A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                            "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360")

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            # Require https://github.com/uber/horovod/pull/331
            # Proof-of-concept, not ready yet.
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)
        batch_var = batch_mean_square - tf.square(batch_mean)
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            # Using fused_batch_norm(is_training=False) is actually slightly faster,
            # but hopefully this call will be JITed in the future.
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,  # for backward-compatibility
            moving_variance=moving_var,
            variance=moving_var)  # for backward-compatibility
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
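
update_bn_ema is assumed here to apply the usual exponential moving average to the batch statistics; a minimal NumPy sketch of that update rule (the actual tensorpack helper may differ in details such as zero-debiasing):

import numpy as np

def ema_update(moving, batch_stat, momentum=0.9):
    # keep `momentum` of the old value and mix in the current batch statistic
    return momentum * moving + (1.0 - momentum) * batch_stat

moving_mean = np.zeros(4)
for _ in range(100):
    batch_mean = np.random.randn(4) * 0.01 + 1.0
    moving_mean = ema_update(moving_mean, batch_mean)
print(moving_mean)   # converges towards roughly 1.0
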
Esempio n. 58
0
def layer_norm(inputs,
               center=True,
               scale=True,
               activation_fn=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               begin_norm_axis=1,
               begin_params_axis=-1,
               scope=None):
    # https://github.com/pytorch/fairseq/blob/5d543f9b19e76772386903d4eeebdceaeb3d1b69/fairseq/modules/layer_norm.py#L9
    # https://github.com/NVIDIA/apex/blob/3ef01faef2492b3e650f44ecc510f3a8f2426783/csrc/layer_norm_cuda_kernel.cu#L303
    # https://github.com/tensorflow/tensorflow/blob/r1.14/tensorflow/python/ops/nn_impl.py#L1240
    """Custom Layer Normalization with changeable epsilon."""
    with tf.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse):
        inputs_shape = inputs.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, inputs_rank))
        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, inputs_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta = tf.get_variable('beta',
                                   shape=params_shape,
                                   dtype=dtype,
                                   initializer=tf.zeros_initializer(),
                                   trainable=trainable)
        if scale:
            gamma = tf.get_variable('gamma',
                                    shape=params_shape,
                                    dtype=dtype,
                                    initializer=tf.ones_initializer(),
                                    trainable=trainable)
        # By default, compute the moments across all the dimensions except the one
        # with index 0.
        norm_axes = list(range(begin_norm_axis, inputs_rank))
        mean, variance = tf.nn.moments(inputs, norm_axes, keep_dims=True)
        # Compute layer normalization using the batch_normalization function.
        # Note that epsilon must be increased for float16 due to the limited
        # representable range.
        variance_epsilon = (FLAGS.ln_eps if dtype != tf.float16 else max(
            FLAGS.ln_eps, 1e-3))
        outputs = tf.nn.batch_normalization(inputs,
                                            mean,
                                            variance,
                                            offset=beta,
                                            scale=gamma,
                                            variance_epsilon=variance_epsilon)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return outputs
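
The float16 note above matters because fp16 has roughly three decimal digits of precision near 1.0, so a tiny epsilon can be lost entirely when added to the variance. A quick NumPy check:

import numpy as np

var = np.float16(1.0)
print(var + np.float16(1e-5) == var)   # True: 1e-5 is below fp16 resolution near 1.0
print(var + np.float16(1e-3) == var)   # False: 1e-3 survives the addition
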
Esempio n. 59
0
 def get_coordinate(i):
     return tf.get_variable("x_{}".format(i),
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.ones_initializer())
Esempio n. 60
0
    def _layer_stack(self,
                     x,
                     num_layers,
                     encoder_output=None,
                     self_attention_mask=None,
                     encdec_attention_mask=None,
                     losses=None):
        """Encoder or decoder stack.

    Args:
      x: a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
      num_layers: an integer
      encoder_output: an optional mtf.Tensor with shape
        [<batch_dims>, encoder_length_dim, model_dim]
      self_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, memory_length_dim] containing values 0 or -inf.
      encdec_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, encoder_length_dim] containing values 0 or -inf.
      losses: a list to be appended-to
    Returns:
      a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
    Raises:
      ValueError: if hparams make no sense
    """
        hparams = self._hparams

        def layer_prepostprocess_dropout(x):
            return mtf.dropout(
                x,
                keep_prob=1.0 - hparams.layer_prepostprocess_dropout,
                noise_shape=mtf.Shape(self.batch_dims + [self.model_dim]))

        num_layer_norms = num_layers * (2 if encoder_output is None else 3) + 1
        layer_norms_dim = mtf.Dimension("layer_norms", num_layer_norms)
        layer_norm_combined_var = mtf.get_variable(
            x.mesh,
            "layer_norm_scale",
            mtf.Shape([layer_norms_dim, self.model_dim]),
            initializer=tf.ones_initializer(),
            activation_dtype=x.dtype)
        layer_norm_vars = mtf.unstack(layer_norm_combined_var, layer_norms_dim)

        def normalize(x):
            scale = layer_norm_vars.pop(0)
            variance = mtf.reduce_mean(mtf.square(x),
                                       reduced_dim=self.model_dim)
            return x * mtf.rsqrt(variance + hparams.norm_epsilon) * scale

        for layer in range(num_layers):
            with tf.variable_scope("layer_%d" % layer):
                # Self attention layer
                x += layer_prepostprocess_dropout(
                    mtf_layers.multihead_attention(
                        normalize(x),
                        None,
                        self_attention_mask,
                        self.kv_dim,
                        self.heads_dim,
                        dropout=hparams.attention_dropout,
                        dropout_broadcast_dims=[self.length_dim],
                        name="self_attention"))
                if encoder_output is not None:
                    # Encoder-Decoder attention layer
                    x += layer_prepostprocess_dropout(
                        mtf_layers.multihead_attention(
                            normalize(x),
                            encoder_output,
                            encdec_attention_mask,
                            self.kv_dim,
                            self.heads_dim,
                            dropout=hparams.attention_dropout,
                            dropout_broadcast_dims=[self.length_dim],
                            name="encdec_attention"))
                # ffn layer
                x += layer_prepostprocess_dropout(
                    self._feedforward_layer(normalize(x), losses=losses))
        x = layer_prepostprocess_dropout(normalize(x))
        assert not layer_norm_vars
        return x