def bacthnorm(inputs, scope, epsilon=1e-05, momentum=0.99, is_training=True):
    """Batch-normalize `inputs` over every axis except the last one.

    Creates `beta`, `gamma`, `moving_mean` and `moving_variance` under
    `scope` through `create_variable`; each has the size of the last axis.

    Args:
        inputs: tensor whose static shape can be listed with `as_list()`.
        scope: variable scope that holds the normalization variables.
        epsilon: variance epsilon passed to `tf.nn.batch_normalization`.
        momentum: decay used when updating the moving statistics.
        is_training: Python truth value. When true, batch statistics are
            used and the moving-average update ops are added to
            `UPDATE_OPS_COLLECTION`; otherwise the moving statistics are used.

    Returns:
        The normalized tensor.
    """
    static_shape = inputs.get_shape().as_list()  # static shape of the input
    channel_shape = static_shape[-1:]  # one parameter per last-axis entry
    reduce_axes = list(range(len(static_shape) - 1))

    with tf.variable_scope(scope):
        beta = create_variable(
            "beta", channel_shape, initializer=tf.zeros_initializer())
        gamma = create_variable(
            "gamma", channel_shape, initializer=tf.ones_initializer())
        # Moving mean: not trained, read at inference time.
        moving_mean = create_variable(
            "moving_mean", channel_shape,
            initializer=tf.zeros_initializer(), trainable=False)
        # Moving variance: not trained either.
        moving_variance = create_variable(
            "moving_variance", channel_shape,
            initializer=tf.ones_initializer(), trainable=False)

    if not is_training:
        mean, variance = moving_mean, moving_variance
    else:
        # Statistics of the current batch.
        mean, variance = tf.nn.moments(inputs, axes=reduce_axes)
        # Moving averages blend the previous value with the current one:
        # x_t = a * x_{t-1} + (1 - a) * x_now
        mean_update = moving_averages.assign_moving_average(
            moving_mean, mean, decay=momentum)
        variance_update = moving_averages.assign_moving_average(
            moving_variance, variance, decay=momentum)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, mean_update)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, variance_update)

    return tf.nn.batch_normalization(
        inputs, mean, variance, beta, gamma, epsilon)
def batch_norm(x, decay=0.999, epsilon=1e-03, is_training=True, scope="scope"):
    """Batch-normalize `x` over every axis except the last one.

    Args:
        x: input tensor; its last dimension sizes the per-channel variables.
        decay: decay of the moving mean / variance.
        epsilon: variance epsilon passed to `tf.nn.batch_normalization`.
        is_training: Python truth value selecting batch statistics (and
            registering moving-average updates in `tf.GraphKeys.UPDATE_OPS`)
            or the stored moving statistics.
        scope: variable scope in which the variables are created.

    Returns:
        The normalized tensor.
    """
    static_shape = x.get_shape()
    channels = static_shape[-1]
    axes = list(range(len(static_shape) - 1))

    with tf.variable_scope(scope):
        beta = create_var(
            "beta", [channels], initializer=tf.zeros_initializer())
        gamma = create_var(
            "gamma", [channels], initializer=tf.ones_initializer())
        # The moving statistics are only read at inference time.
        moving_mean = create_var(
            "moving_mean", [channels],
            initializer=tf.zeros_initializer(), trainable=False)
        moving_variance = create_var(
            "moving_variance", [channels],
            initializer=tf.ones_initializer(), trainable=False)

    if not is_training:
        mean, variance = moving_mean, moving_variance
    else:
        mean, variance = tf.nn.moments(x, axes=axes)
        mean_update = moving_averages.assign_moving_average(
            moving_mean, mean, decay=decay)
        variance_update = moving_averages.assign_moving_average(
            moving_variance, variance, decay=decay)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)

    return tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon)
def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Apply fused batch normalization to `input_layer` without tf.layers.

    Args:
        input_layer: tensor laid out according to `self.data_format`.
        decay: decay of the moving mean / variance.
        use_scale: when true a `gamma` variable is created, otherwise a
            constant scale of 1.0 is used.
        epsilon: epsilon forwarded to `tf.nn.fused_batch_norm`.

    Returns:
        The normalized tensor.
    """
    channel_axis = 3 if self.data_format == 'NHWC' else 1
    num_channels = input_layer.shape[channel_axis]

    beta = self.get_variable(
        'beta', [num_channels], tf.float32, tf.float32,
        initializer=tf.zeros_initializer())
    if not use_scale:
        gamma = tf.constant(1.0, tf.float32, [num_channels])
    else:
        gamma = self.get_variable(
            'gamma', [num_channels], tf.float32, tf.float32,
            initializer=tf.ones_initializer())

    # The moving statistics are created with tf.get_variable directly.
    moving_mean = tf.get_variable(
        'moving_mean', [num_channels], tf.float32,
        initializer=tf.zeros_initializer(), trainable=False)
    moving_variance = tf.get_variable(
        'moving_variance', [num_channels], tf.float32,
        initializer=tf.ones_initializer(), trainable=False)

    if not self.phase_train:
        normalized, _, _ = tf.nn.fused_batch_norm(
            input_layer, gamma, beta,
            mean=moving_mean, variance=moving_variance,
            epsilon=epsilon, data_format=self.data_format,
            is_training=False)
        return normalized

    normalized, batch_mean, batch_variance = tf.nn.fused_batch_norm(
        input_layer, gamma, beta, epsilon=epsilon,
        data_format=self.data_format, is_training=True)
    # Register one moving-average update per statistic (mean first).
    stat_pairs = (
        (moving_mean, batch_mean),
        (moving_variance, batch_variance),
    )
    for moving_stat, batch_stat in stat_pairs:
        update_op = moving_averages.assign_moving_average(
            moving_stat, batch_stat, decay=decay, zero_debias=False)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op)
    return normalized
def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None):
    """Set up running statistics used to normalize observations.

    Keeps local NumPy accumulators (sum, sum of squares, count) next to
    non-trainable TensorFlow variables holding the global sum, sum of
    squares, count, mean and standard deviation, and builds the ops that
    fold new totals in (`update_op`) and refresh mean / std
    (`recompute_op`).

    Args:
        size (int): the size of the observation to be normalized
        eps (float): lower bound on the standard deviation; the variance
            is clamped to at least `eps ** 2` before the square root
        default_clip_range (float): stored on the instance as the default
            clipping range for normalized observations
        sess (object): the TensorFlow session to be used; falls back to
            the default session when None
    """
    self.size = size
    self.eps = eps
    self.default_clip_range = default_clip_range
    self.sess = tf.get_default_session() if sess is None else sess

    # Local accumulators.
    self.local_sum = np.zeros(self.size, np.float32)
    self.local_sumsq = np.zeros(self.size, np.float32)
    self.local_count = np.zeros(1, np.float32)

    def _stat_variable(name, shape, initializer):
        # All statistics live in non-trainable float32 variables.
        return tf.get_variable(
            initializer=initializer, shape=shape, name=name,
            trainable=False, dtype=tf.float32)

    self.sum_tf = _stat_variable(
        'sum', self.local_sum.shape, tf.zeros_initializer())
    self.sumsq_tf = _stat_variable(
        'sumsq', self.local_sumsq.shape, tf.zeros_initializer())
    # The count starts at one, so the first mean/std division is defined.
    self.count_tf = _stat_variable(
        'count', self.local_count.shape, tf.ones_initializer())
    self.mean = _stat_variable(
        'mean', (self.size,), tf.zeros_initializer())
    self.std = _stat_variable(
        'std', (self.size,), tf.ones_initializer())

    # Placeholders through which new totals are fed.
    self.count_pl = tf.placeholder(
        name='count_pl', shape=(1,), dtype=tf.float32)
    self.sum_pl = tf.placeholder(
        name='sum_pl', shape=(self.size,), dtype=tf.float32)
    self.sumsq_pl = tf.placeholder(
        name='sumsq_pl', shape=(self.size,), dtype=tf.float32)

    add_count = self.count_tf.assign_add(self.count_pl)
    add_sum = self.sum_tf.assign_add(self.sum_pl)
    add_sumsq = self.sumsq_tf.assign_add(self.sumsq_pl)
    self.update_op = tf.group(add_count, add_sum, add_sumsq)

    # mean = sum / count; std = sqrt(max(eps^2, E[x^2] - E[x]^2))
    assign_mean = tf.assign(self.mean, self.sum_tf / self.count_tf)
    variance_floor = tf.square(self.eps)
    variance = (self.sumsq_tf / self.count_tf
                - tf.square(self.sum_tf / self.count_tf))
    assign_std = tf.assign(
        self.std, tf.sqrt(tf.maximum(variance_floor, variance)))
    self.recompute_op = tf.group(assign_mean, assign_std)

    self.lock = threading.Lock()
def initialize_model(sess, train_data_flat, train_labels):
    """Reproduce model from train-on-mnist/mnist_lbfgs.

    Builds a single linear layer (1024 inputs, 10 classes) with softmax
    cross-entropy plus an L2 penalty on `W` and `b`, loads the first batch
    of data and labels into variables, and initializes `W` and `b`.

    Args:
        sess: session used to run the assignment and initializer ops.
        train_data_flat: training inputs; the first 100 rows are loaded.
        train_labels: 1-based integer labels; the first 100 are loaded.

    Returns:
        A list `[loss, loss_handle_op, flat_params, flat_grad,
        flat_grad_handle_op, W, b, train_step]`.
    """
    dtype = tf.float64
    batchSize = 100
    learningRate = 0.1

    # Initial values are built with tf.ones / tf.zeros: tf.ones_initializer
    # and tf.zeros_initializer are initializer classes in TF 1.x and cannot
    # be called with a shape to produce a tensor.
    W = tf.Variable(tf.ones((1024, 10), dtype=dtype))
    b = tf.Variable(tf.ones((1, 10), dtype=dtype))
    x = tf.Variable(tf.zeros((batchSize, 1024), dtype=dtype))
    targets = tf.Variable(tf.zeros((batchSize, 10), dtype=dtype))
    logits = tf.matmul(x, W) + b

    # Named arguments: TF 1.x rejects positional logits/labels here.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=targets)
    cross_entropy_loss = tf.reduce_mean(cross_entropy)
    Wnorm = tf.reduce_sum(tf.square(W))
    bnorm = tf.reduce_sum(tf.square(b))
    loss = cross_entropy_loss + (bnorm + Wnorm)/2
    loss_handle_op = tf.get_session_handle(loss)

    opt = tf.train.GradientDescentOptimizer(learning_rate=learningRate)
    grads_and_vars = opt.compute_gradients(loss, [W, b])
    train_step = opt.apply_gradients(grads_and_vars)

    W_grad = grads_and_vars[0][0]
    b_grad = grads_and_vars[1][0]
    flat_grad = concat_flatten([tf.transpose(W_grad), b_grad])
    flat_grad_handle_op = tf.get_session_handle(flat_grad)
    flat_params = concat_flatten([tf.transpose(W), b])

    # Load the first batch of inputs into x.
    x_placeholder = tf.placeholder(dtype=dtype)
    x_init = x.assign(x_placeholder)

    # Load the first batch of labels into targets.
    labels_placeholder = tf.placeholder(shape=(batchSize,), dtype=tf.int32)
    # Lua labels are off-by-one hence -1
    labels_onehot = tf.one_hot(labels_placeholder - 1, 10, dtype=dtype)
    targets_init = targets.assign(labels_onehot)

    sess.run(x_init, feed_dict={x_placeholder: train_data_flat[:batchSize]})
    sess.run(targets_init,
             feed_dict={labels_placeholder: train_labels[:batchSize]})
    sess.run([W.initializer, b.initializer])

    return [loss, loss_handle_op, flat_params, flat_grad,
            flat_grad_handle_op, W, b, train_step]
def layer_norm(x: tf.Tensor, epsilon: float = 1e-6) -> tf.Tensor:
    """Layer-normalize `x` along its last dimension.

    Implementation based on tensor2tensor. A learned `gamma` (scale) and
    `beta` (offset), each sized like the last dimension of `x`, are
    created in the "LayerNorm" variable scope.

    Arguments:
        x: The ``Tensor`` to normalize.
        epsilon: The smoothing parameter added to the variance.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope("LayerNorm"):
        gamma = get_variable(
            name="gamma",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.ones_initializer())
        beta = get_variable(
            name="beta",
            shape=[x.get_shape()[-1]],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        last_axis = [-1]
        mean = tf.reduce_mean(x, axis=last_axis, keepdims=True)
        squared_deviation = tf.square(x - mean)
        variance = tf.reduce_mean(
            squared_deviation, axis=last_axis, keepdims=True)
        inverse_std = tf.rsqrt(variance + epsilon)
        normalized = (x - mean) * inverse_std
        return normalized * gamma + beta
def call(self, x, h):
    """Run one step of the convolutional GRU.

    Args:
        x: input tensor for this step.
        h: previous hidden state.

    Returns:
        A pair `(h, h)` holding the new hidden state twice.
    """
    feature_axis = self._feature_axis
    channels = x.shape[feature_axis].value

    with tf.variable_scope('gates'):
        stacked = tf.concat([x, h], axis=feature_axis)
        in_channels = channels + self._filters
        if self._filters > 1:
            gate_channels = 2 * self._filters
        else:
            gate_channels = 2
        kernel = tf.get_variable(
            'kernel', self._kernel + [in_channels, gate_channels])
        gates = tf.nn.convolution(
            stacked, kernel, 'SAME', data_format=self._data_format)
        if self._normalize:
            reset, update = tf.split(gates, 2, axis=feature_axis)
            reset = tf.contrib.layers.layer_norm(reset)
            update = tf.contrib.layers.layer_norm(update)
        else:
            # The gate bias starts at one.
            gates += tf.get_variable(
                'bias', [gate_channels], initializer=tf.ones_initializer())
            reset, update = tf.split(gates, 2, axis=feature_axis)
        reset, update = tf.sigmoid(reset), tf.sigmoid(update)

        # TODO: histogram summaries for the reset and update gates.

    with tf.variable_scope('candidate'):
        stacked = tf.concat([x, reset * h], axis=feature_axis)
        in_channels = channels + self._filters
        out_channels = self._filters
        kernel = tf.get_variable(
            'kernel', self._kernel + [in_channels, out_channels])
        candidate = tf.nn.convolution(
            stacked, kernel, 'SAME', data_format=self._data_format)
        if self._normalize:
            candidate = tf.contrib.layers.layer_norm(candidate)
        else:
            candidate += tf.get_variable(
                'bias', [out_channels], initializer=tf.zeros_initializer())
        # Blend the previous state with the activated candidate.
        h = update * h + (1 - update) * self._activation(candidate)

    return h, h
def _network_template(self, state):
    """Build a dummy distributional network that always prefers action 0.

    The input is replaced by zeros, so the output depends only on the
    initial weights and biases of a single fully connected layer.

    In Rainbow we deal with a distribution over Q-values represented as
    num_atoms bins. The output layer has num_actions * num_atoms
    elements, each group of num_atoms being the logits of one action.
    All weights are 1 except the first num_atoms of the first row, which
    are np.arange(num_atoms); this puts more weight on the higher
    Q-values for the first action, so that action gets chosen.
    """
    inputs = tf.constant(
        np.zeros((state.shape[0], stack_size)), dtype=tf.float32)

    num_outputs = self.num_actions * self._num_atoms
    other_actions = np.tile(np.ones(self._num_atoms), self.num_actions - 1)
    first_row = np.concatenate((np.arange(self._num_atoms), other_actions))
    bottom_rows = np.tile(np.ones(num_outputs), (stack_size - 1, 1))
    weights_initializer = np.concatenate(([first_row], bottom_rows))

    net = slim.fully_connected(
        inputs,
        num_outputs,
        weights_initializer=tf.constant_initializer(weights_initializer),
        biases_initializer=tf.ones_initializer(),
        activation_fn=None)
    logits = tf.reshape(net, [-1, self.num_actions, self._num_atoms])
    probabilities = tf.contrib.layers.softmax(logits)
    qs = tf.reduce_sum(self._support * probabilities, axis=2)
    network_type = self._get_network_type()
    return network_type(qs, logits, probabilities)
def test_basic_rnn_cell(self):
    """Check BasicRNNCell against the NumPy reference `_basic_linear`.

    See test_basic_rnn_cell.png for the graph. All variables are
    initialized to ones through the enclosing variable scope.
    """
    batch_size = 1
    input_shape = [batch_size, 2]
    state_shape = [batch_size, 3]
    num_units = 4  # should be equal to state_shape[1] to be recurrent

    input_value = np.random.rand(*input_shape)
    state_value = np.random.rand(*state_shape)
    expected = TestRNNCells._basic_linear(
        input_value, state_value, num_units)

    with tf.Session() as sess:
        ones_init = tf.ones_initializer()
        with tf.variable_scope('test_basic_rnn_cell', initializer=ones_init):
            inputs = tf.placeholder(tf.float32, input_shape, 'inputs')
            prev_state = tf.placeholder(
                tf.float32, state_shape, 'prev_state')
            cell = tf.contrib.rnn.BasicRNNCell(num_units)
            output_op, new_state_op = cell(inputs, prev_state)
            self.assertIsInstance(output_op, tf.Tensor)

            # Dump the graph so it can be inspected.
            tf.summary.FileWriter('/tmp/test_basic_rnn_cell', sess.graph)

            sess.run(tf.global_variables_initializer())
            feed = {inputs: input_value, prev_state: state_value}
            output, new_state = sess.run(
                [output_op, new_state_op], feed_dict=feed)

            self.assertIsInstance(output, np.ndarray)
            self.assertEqual(output.shape, (batch_size, num_units))
            # The cell returns its output as the new state.
            self.assertTrue(np.array_equal(output, new_state))
            np.testing.assert_array_almost_equal(expected, output)
def conv2d_zeros(x, width, filter_size=[3, 3], stride=[1, 1], pad="SAME",
                 logscale_factor=3, skip=1, edge_bias=True, name=None):
    """2-D convolution with a zero-initialized kernel and a learned log-scale.

    Args:
        x: input tensor; dimension 3 is read as the input channel count.
        width: number of output channels.
        filter_size: spatial kernel size as a two-element list.
        stride: spatial strides as a two-element list.
        pad: padding mode; "SAME" with `edge_bias` is replaced by explicit
            edge padding followed by a "VALID" convolution.
        logscale_factor: multiplier applied to the `logs` variable before
            exponentiation.
        skip: dilation rate; 1 uses a regular convolution, anything else
            an atrous convolution (which requires unit strides).
        edge_bias: whether to apply `add_edge_padding` for "SAME" padding.
        name: optional variable scope name (default scope "conv2d").

    Returns:
        The convolved, biased and rescaled tensor.
    """
    with tf.variable_scope(name, "conv2d"):
        if edge_bias and pad == "SAME":
            x = add_edge_padding(x, filter_size)
            pad = 'VALID'

        in_channels = int(x.get_shape()[3])
        strides = [1] + stride + [1]
        kernel_shape = filter_size + [in_channels, width]
        kernel = tf.get_variable(
            "W", kernel_shape, tf.float32,
            initializer=tf.zeros_initializer())

        if skip != 1:
            assert stride[0] == 1 and stride[1] == 1
            x = tf.nn.atrous_conv2d(x, kernel, skip, pad)
        else:
            x = tf.nn.conv2d(x, kernel, strides, pad, data_format='NHWC')

        # NOTE(review): W and logs start at zero but b starts at one, so
        # the initial output is 1 rather than 0 - confirm this is intended
        # given the function name.
        bias = tf.get_variable(
            "b", [1, 1, 1, width], initializer=tf.ones_initializer())
        x += bias
        log_scale = tf.get_variable(
            "logs", [1, width], initializer=tf.zeros_initializer())
        x *= tf.exp(log_scale * logscale_factor)
    return x
def get_logits(self, image):
    """Build the AlexNet-style network and return its 1000-way logits.

    Args:
        image: input batch in channels-last layout.

    Returns:
        The output of the final fully connected layer `fc8`.
    """
    gauss_init = tf.random_normal_initializer(stddev=0.01)
    conv_init = tf.variance_scaling_initializer(scale=2.)
    # Shared local-response-normalization settings.
    lrn_args = dict(bias=1.0, alpha=2e-5, beta=0.75)

    with argscope(Conv2D, kernel_initializer=conv_init), \
            argscope([Conv2D, FullyConnected], activation=tf.nn.relu), \
            argscope([Conv2D, MaxPooling], data_format='channels_last'):
        # necessary padding to get 55x55 after conv1
        image = tf.pad(image, [[0, 0], [2, 2], [2, 2], [0, 0]])
        net = Conv2D('conv1', image, filters=96, kernel_size=11,
                     strides=4, padding='VALID')  # size: 55
        visualize_conv1_weights(net.variables.W)
        net = tf.nn.lrn(net, 2, name='norm1', **lrn_args)
        net = MaxPooling('pool1', net, 3, strides=2, padding='VALID')  # 27

        net = Conv2D('conv2', net, filters=256, kernel_size=5, split=2)
        net = tf.nn.lrn(net, 2, name='norm2', **lrn_args)
        net = MaxPooling('pool2', net, 3, strides=2, padding='VALID')  # 13

        net = Conv2D('conv3', net, filters=384, kernel_size=3)
        net = Conv2D('conv4', net, filters=384, kernel_size=3, split=2)
        net = Conv2D('conv5', net, filters=256, kernel_size=3, split=2)
        net = MaxPooling('pool3', net, 3, strides=2, padding='VALID')

        # fc6 is the only layer here with a ones-initialized bias.
        net = FullyConnected('fc6', net, 4096,
                             kernel_initializer=gauss_init,
                             bias_initializer=tf.ones_initializer())
        net = Dropout(net, rate=0.5)
        net = FullyConnected('fc7', net, 4096,
                             kernel_initializer=gauss_init)
        net = Dropout(net, rate=0.5)
        logits = FullyConnected('fc8', net, 1000,
                                kernel_initializer=gauss_init)
    return logits
def batch_norm(inputs, name_scope, is_training, epsilon=1e-3, decay=0.99):
    """Batch-normalize a tensor over axis 0 with a graph-time train switch.

    Args:
        inputs: tensor whose dimension 1 sizes the per-feature variables.
        name_scope: variable scope holding `gamma`, `beta`, `pop_mean`
            and `pop_var`.
        is_training: boolean predicate handed to `tf.cond`; selects batch
            statistics (and the population update) or population
            statistics.
        epsilon: variance epsilon for `tf.nn.batch_normalization`.
        decay: decay of the population mean / variance.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope(name_scope):
        size = inputs.get_shape().as_list()[1]

        gamma = tf.get_variable(
            'gamma', [size], initializer=tf.constant_initializer(0.1))
        # beta is created without an explicit initializer, so it uses the
        # scope's default one (a constant-zero initializer was an earlier
        # alternative).
        beta = tf.get_variable('beta', [size])

        pop_mean = tf.get_variable(
            'pop_mean', [size],
            initializer=tf.zeros_initializer(), trainable=False)
        pop_var = tf.get_variable(
            'pop_var', [size],
            initializer=tf.ones_initializer(), trainable=False)
        batch_mean, batch_var = tf.nn.moments(inputs, [0])

        def batch_statistics():
            # The assign ops must be created inside this branch: tf.cond
            # only gates ops built within its branch functions, so ops
            # created outside would not be tied to `is_training`.
            train_mean_op = tf.assign(
                pop_mean, pop_mean * decay + batch_mean * (1 - decay))
            train_var_op = tf.assign(
                pop_var, pop_var * decay + batch_var * (1 - decay))
            with tf.control_dependencies([train_mean_op, train_var_op]):
                return tf.nn.batch_normalization(
                    inputs, batch_mean, batch_var, beta, gamma, epsilon)

        def pop_statistics():
            return tf.nn.batch_normalization(
                inputs, pop_mean, pop_var, beta, gamma, epsilon)

        # control flow
        return tf.cond(is_training, batch_statistics, pop_statistics)
def batch_norm(x, name_scope, training, epsilon=1e-3, decay=0.999):
    """Batch-normalize a 2d [batch, values] tensor.

    Args:
        x: tensor assumed to be 2d [batch, values]; dimension 1 sizes the
            per-feature variables.
        name_scope: variable scope holding `scale`, `offset`, `pop_mean`
            and `pop_var`.
        training: boolean predicate handed to `tf.cond`; selects batch
            statistics (and the population update) or population
            statistics.
        epsilon: variance epsilon for `tf.nn.batch_normalization`.
        decay: decay of the population mean / variance.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope(name_scope):
        size = x.get_shape().as_list()[1]

        scale = tf.get_variable(
            'scale', [size], initializer=tf.constant_initializer(0.1))
        # offset uses the scope's default initializer.
        offset = tf.get_variable('offset', [size])

        pop_mean = tf.get_variable(
            'pop_mean', [size],
            initializer=tf.zeros_initializer(), trainable=False)
        pop_var = tf.get_variable(
            'pop_var', [size],
            initializer=tf.ones_initializer(), trainable=False)
        batch_mean, batch_var = tf.nn.moments(x, [0])

        def batch_statistics():
            # The assign ops must be created inside this branch: tf.cond
            # only gates ops built within its branch functions, so ops
            # created outside would not be tied to `training`.
            train_mean_op = tf.assign(
                pop_mean, pop_mean * decay + batch_mean * (1 - decay))
            train_var_op = tf.assign(
                pop_var, pop_var * decay + batch_var * (1 - decay))
            with tf.control_dependencies([train_mean_op, train_var_op]):
                return tf.nn.batch_normalization(
                    x, batch_mean, batch_var, offset, scale, epsilon)

        def population_statistics():
            return tf.nn.batch_normalization(
                x, pop_mean, pop_var, offset, scale, epsilon)

        return tf.cond(training, batch_statistics, population_statistics)
def __init__(self, name, inputs, training, data_format, start=None, end=None,
             weights=None, weight_scope=None, fake=False):
    """Create a batch-normalization layer, optionally from saved weights.

    Args:
        name: layer name, forwarded to the base class.
        inputs: input tensor, or a `Fake` instance when `fake` is true.
        training: training flag forwarded to
            `tf.layers.batch_normalization`.
        data_format: 'channels_first' normalizes axis 1, anything else
            axis 3.
        start: forwarded to the base class.
        end: forwarded to the base class.
        weights: optional pickle path read with `load_pkl_obj`; it must
            contain beta, gamma, moving_mean and moving_variance under
            `<weight_scope>/<name>/batch_normalization/`.
        weight_scope: scope prefix of the stored parameter names.
        fake: when true no ops are built and only `Fake` placeholders for
            the output and parameters are recorded.
    """
    super(BatchNorm, self).__init__(name=name, start=start, end=end)
    self.fake = fake
    if not self.fake:
        # Axis that holds the channels; used both for normalization and to
        # size the loaded parameters.
        channel_axis = 1 if data_format == 'channels_first' else 3
        if weights is not None:
            params_name = (weight_scope + '/' + str(name)
                           + '/batch_normalization/')
            np_dict = load_pkl_obj(weights)
            beta_np = np_dict[params_name + 'beta:0']
            gamma_np = np_dict[params_name + 'gamma:0']
            moving_mean_np = np_dict[params_name + 'moving_mean:0']
            moving_variance_np = np_dict[params_name + 'moving_variance:0']
            # Read the channel count from the normalized axis (previously
            # always axis 1, which is wrong for channels-last inputs).
            in_shp = inputs.shape.as_list()[channel_axis]
            if not beta_np.shape[0] == in_shp:
                # Stored parameters do not match the channel count: resize.
                beta_np = np.resize(beta_np, (in_shp,))
                gamma_np = np.resize(gamma_np, (in_shp,))
                moving_mean_np = np.resize(moving_mean_np, (in_shp,))
                moving_variance_np = np.resize(moving_variance_np, (in_shp,))
            beta_initializer = tf.constant_initializer(beta_np)
            gamma_initializer = tf.constant_initializer(gamma_np)
            moving_mean_initializer = tf.constant_initializer(moving_mean_np)
            moving_variance_initializer = tf.constant_initializer(
                moving_variance_np)
        else:
            beta_initializer = tf.zeros_initializer()
            gamma_initializer = tf.ones_initializer()
            moving_mean_initializer = tf.zeros_initializer()
            moving_variance_initializer = tf.ones_initializer()

        with tf.variable_scope(self._name):
            self.output = tf.layers.batch_normalization(
                inputs=inputs,
                axis=channel_axis,
                momentum=_BATCH_NORM_DECAY,
                epsilon=_BATCH_NORM_EPSILON,
                center=True,
                scale=True,
                training=training,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                moving_mean_initializer=moving_mean_initializer,
                moving_variance_initializer=moving_variance_initializer,
                fused=True)
            # Keep the first two components of the output tensor's name.
            name_parts = self.output.name.split('/')
            self._tf_name = name_parts[0] + '/' + name_parts[1]
    else:
        assert isinstance(inputs, Fake)
        self.output = Fake(inputs.shape)

    self.param = Fake(inputs.shape[1] * 4)
    self.description.append('BatchNorm')
    self.description.append(self.get_memory_footprint())
def main(_):
    """Fit a mixture of Gaussians with Metropolis-Hastings in Edward.

    Builds the model, empirical approximations and proposal distributions,
    runs the sampler and periodically prints the inferred membership
    probabilities and cluster means.
    """
    ed.set_seed(42)

    # DATA
    x_data = build_toy_dataset(FLAGS.N)

    # MODEL
    num_clusters, num_dims = FLAGS.K, FLAGS.D
    pi = Dirichlet(concentration=tf.ones(FLAGS.K))
    mu = Normal(0.0, 1.0, sample_shape=[num_clusters, num_dims])
    sigma = InverseGamma(concentration=1.0, rate=1.0,
                         sample_shape=[num_clusters, num_dims])
    c = Categorical(logits=tf.log(pi) - tf.log(1.0 - pi),
                    sample_shape=FLAGS.N)
    x = Normal(loc=tf.gather(mu, c), scale=tf.gather(sigma, c))

    # INFERENCE
    # One Empirical approximation (T stored samples) per latent variable.
    qpi_params = tf.get_variable(
        "qpi/params", [FLAGS.T, FLAGS.K],
        initializer=tf.constant_initializer(1.0 / FLAGS.K))
    qpi = Empirical(params=qpi_params)
    qmu_params = tf.get_variable(
        "qmu/params", [FLAGS.T, FLAGS.K, FLAGS.D],
        initializer=tf.zeros_initializer())
    qmu = Empirical(params=qmu_params)
    qsigma_params = tf.get_variable(
        "qsigma/params", [FLAGS.T, FLAGS.K, FLAGS.D],
        initializer=tf.ones_initializer())
    qsigma = Empirical(params=qsigma_params)
    qc_params = tf.get_variable(
        "qc/params", [FLAGS.T, FLAGS.N],
        initializer=tf.zeros_initializer(), dtype=tf.int32)
    qc = Empirical(params=qc_params)

    # Proposal distributions.
    gpi = Dirichlet(concentration=tf.constant([1.4, 1.6]))
    gmu = Normal(loc=tf.constant([[1.0, 1.0], [-1.0, -1.0]]),
                 scale=tf.constant([[0.5, 0.5], [0.5, 0.5]]))
    gsigma = InverseGamma(
        concentration=tf.constant([[1.1, 1.1], [1.1, 1.1]]),
        rate=tf.constant([[1.0, 1.0], [1.0, 1.0]]))
    gc = Categorical(logits=tf.zeros([FLAGS.N, FLAGS.K]))

    latent_vars = {pi: qpi, mu: qmu, sigma: qsigma, c: qc}
    proposal_vars = {pi: gpi, mu: gmu, sigma: gsigma, c: gc}
    inference = ed.MetropolisHastings(
        latent_vars=latent_vars,
        proposal_vars=proposal_vars,
        data={x: x_data})

    inference.initialize()
    sess = ed.get_session()
    tf.global_variables_initializer().run()

    for _ in range(inference.n_iter):
        info_dict = inference.update()
        inference.print_progress(info_dict)

        t = info_dict['t']
        # Report only on the first step and every n_print steps.
        if t != 1 and t % inference.n_print != 0:
            continue
        qpi_mean, qmu_mean = sess.run([qpi.mean(), qmu.mean()])
        print("")
        print("Inferred membership probabilities:")
        print(qpi_mean)
        print("Inferred cluster means:")
        print(qmu_mean)
def build(self, _):
    """Create the layer-norm scale and bias variables.

    Both variables are float32 vectors of length `self.hidden_size`; the
    scale starts at one and the bias at zero. The argument is ignored.
    """
    param_shape = [self.hidden_size]
    ones_init = tf.ones_initializer(dtype=tf.float32)
    self.scale = tf.get_variable(
        "layer_norm_scale", param_shape,
        initializer=ones_init, dtype=tf.float32)
    zeros_init = tf.zeros_initializer(dtype=tf.float32)
    self.bias = tf.get_variable(
        "layer_norm_bias", param_shape,
        initializer=zeros_init, dtype=tf.float32)
    self.built = True
def create_graph(device0, device1):
    """Create graph that keeps var1 on device0, var2 on device1 and adds them.

    Args:
        device0: device string for `var1`.
        device1: device string for `var2`.

    Returns:
        A pair `(init_op, add_op)`: the global variable initializer and
        the op that adds `var2` into `var1`.
    """
    tf.reset_default_graph()
    dtype = tf.int32
    # 1MB is 250k integers
    params_size = 250 * 1000 * FLAGS.data_mb

    placed = []
    for var_name, device in (("var1", device0), ("var2", device1)):
        with tf.device(device):
            placed.append(tf.get_variable(
                var_name, [params_size], dtype,
                initializer=tf.ones_initializer()))
    var1, var2 = placed

    add_op = var1.assign_add(var2)
    init_op = tf.global_variables_initializer()
    return init_op, add_op
def make_params():
    """Create the int32 `params` variable on parameter-server device 0.

    Returns:
        A ones-initialized variable holding `FLAGS.data_mb` megabytes of
        integers.
    """
    # 1MB is 250k integers
    num_ints = 250 * 1000 * FLAGS.data_mb
    int_dtype = tf.int32
    with tf.device(get_ps_device(0)):
        return tf.get_variable(
            "params", [num_ints], int_dtype,
            initializer=tf.ones_initializer())
def bn(x, c):
    """Batch-normalize `x`, or only add a bias when `c['use_bias']` is set.

    Args:
        x: input tensor; the last dimension sizes the per-channel
            variables.
        c: config mapping. `c['use_bias']` switches to a plain learned
            bias; `c['is_training']` is the predicate that selects batch
            statistics or the moving statistics.

    Returns:
        `x + bias` when `c['use_bias']` is true, otherwise the
        batch-normalized tensor.
    """
    x_shape = x.get_shape()
    params_shape = x_shape[-1:]

    # Initializers are passed as instances throughout, matching the
    # tf.ones_initializer() calls already used for gamma and
    # moving_variance (the zeros initializer used to be passed as a class).
    if c['use_bias']:
        bias = _get_variable('bias', params_shape,
                             initializer=tf.zeros_initializer())
        return x + bias

    axis = list(range(len(x_shape) - 1))

    beta = _get_variable('beta', params_shape,
                         initializer=tf.zeros_initializer())
    gamma = _get_variable('gamma', params_shape,
                          initializer=tf.ones_initializer())

    moving_mean = _get_variable('moving_mean', params_shape,
                                initializer=tf.zeros_initializer(),
                                trainable=False)
    moving_variance = _get_variable('moving_variance', params_shape,
                                    initializer=tf.ones_initializer(),
                                    trainable=False)

    # These ops will only be performed when training.
    mean, variance = tf.nn.moments(x, axis)
    update_moving_mean = moving_averages.assign_moving_average(
        moving_mean, mean, BN_DECAY)
    update_moving_variance = moving_averages.assign_moving_average(
        moving_variance, variance, BN_DECAY)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)

    # Pick batch statistics while training, moving statistics otherwise.
    mean, variance = control_flow_ops.cond(
        c['is_training'],
        lambda: (mean, variance),
        lambda: (moving_mean, moving_variance))

    x = tf.nn.batch_normalization(x, mean, variance, beta, gamma, BN_EPSILON)

    return x
def __init__(self, capacity):
    """Initialize the base class and the two scalar int32 index variables.

    Args:
        capacity: requested capacity; the base class receives
            `capacity - 1` with a single scalar int32 component.
    """
    scalar_shape = ()
    index_dtype = tf.int32
    super().__init__(capacity - 1, [index_dtype], [scalar_shape])
    # `_first` starts at one and `_size` at zero; both are non-resource
    # variables.
    self._first = tf.get_variable(
        name="var1",
        initializer=tf.ones_initializer(),
        shape=scalar_shape,
        dtype=index_dtype,
        use_resource=False)
    self._size = tf.get_variable(
        name="size",
        shape=(),
        initializer=tf.zeros_initializer(),
        dtype=tf.int32,
        use_resource=False)
def layer_norm(x, nmaps, prefix, epsilon=1e-5):
    """Layer normalize the 4D tensor x, averaging over the last dimension.

    Args:
        x: tensor to normalize; statistics are taken over axis 3.
        nmaps: length of the scale and bias vectors.
        prefix: variable scope for `layer_norm_scale` / `layer_norm_bias`.
        epsilon: constant added to the variance before the square root.

    Returns:
        The normalized tensor, scaled and shifted by the learned vectors.
    """
    with tf.variable_scope(prefix):
        scale = tf.get_variable(
            "layer_norm_scale", [nmaps], initializer=tf.ones_initializer())
        bias = tf.get_variable(
            "layer_norm_bias", [nmaps], initializer=tf.zeros_initializer())
        mean, variance = tf.nn.moments(x, [3], keep_dims=True)
        centered = x - mean
        std = tf.sqrt(variance + epsilon)
        normalized = centered / std
        return normalized * scale + bias
def testInitializers(self):
    """Check initializer validation and use in `snt.VanillaRNN`.

    Invalid keys and non-callable initializers must be rejected; valid
    nested initializers must determine the created variables' values.
    """
    inputs = tf.placeholder(
        tf.float32, shape=[self.batch_size, self.in_size])
    prev_state = tf.placeholder(
        tf.float32, shape=[self.batch_size, self.hidden_size])

    # An unknown top-level key is rejected.
    with self.assertRaisesRegexp(KeyError, "Invalid initializer keys.*"):
        snt.VanillaRNN(name="rnn",
                       hidden_size=self.hidden_size,
                       initializers={"invalid": None})

    # A tensor is not a callable initializer.
    err = "Initializer for 'w' is not a callable function"
    with self.assertRaisesRegexp(TypeError, err):
        bad_initializers = {"in_to_hidden": {"w": tf.zeros([10, 10])}}
        snt.VanillaRNN(name="rnn",
                       hidden_size=self.hidden_size,
                       initializers=bad_initializers)

    # Nested initializer.
    valid_initializers = {
        "in_to_hidden": {"w": tf.ones_initializer()},
        "hidden_to_hidden": {"b": tf.ones_initializer()},
    }
    vanilla_rnn = snt.VanillaRNN(name="rnn",
                                 hidden_size=self.hidden_size,
                                 initializers=valid_initializers)
    vanilla_rnn(inputs, prev_state)
    init = tf.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init)
        fetches = [
            vanilla_rnn.in_to_hidden_linear.w,
            vanilla_rnn.hidden_to_hidden_linear.b,
        ]
        w_v, b_v = sess.run(fetches)
        self.assertAllClose(w_v, np.ones([self.in_size, self.hidden_size]))
        self.assertAllClose(b_v, np.ones([self.hidden_size]))
def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers.

    This function is kept as similar as possible to
    tf.contrib.layers.batch_norm, to minimize the differences between
    using layers and not using layers.

    Args:
        input_layer: tensor laid out according to `self.data_format`.
        decay: decay of the moving mean / variance.
        use_scale: create a `gamma` variable when true, otherwise use a
            constant scale of 1.0.
        epsilon: epsilon forwarded to `tf.nn.fused_batch_norm`.

    Returns:
        The normalized tensor.
    """
    in_shape = input_layer.shape
    if self.data_format == 'NHWC':
        num_channels = in_shape[3]
    else:
        num_channels = in_shape[1]
    param_shape = [num_channels]

    beta = self.get_variable(
        'beta', param_shape, tf.float32, tf.float32,
        initializer=tf.zeros_initializer())
    if use_scale:
        gamma = self.get_variable(
            'gamma', param_shape, tf.float32, tf.float32,
            initializer=tf.ones_initializer())
    else:
        gamma = tf.constant(1.0, tf.float32, param_shape)

    # For moving variables, we use tf.get_variable instead of
    # self.get_variable, since self.get_variable returns the result of
    # tf.cast which we cannot assign to.
    moving_mean = tf.get_variable(
        'moving_mean', param_shape, tf.float32,
        initializer=tf.zeros_initializer(), trainable=False)
    moving_variance = tf.get_variable(
        'moving_variance', param_shape, tf.float32,
        initializer=tf.ones_initializer(), trainable=False)

    if self.phase_train:
        outputs = tf.nn.fused_batch_norm(
            input_layer, gamma, beta, epsilon=epsilon,
            data_format=self.data_format, is_training=True)
        bn, batch_mean, batch_variance = outputs
        mean_update = moving_averages.assign_moving_average(
            moving_mean, batch_mean, decay=decay, zero_debias=False)
        variance_update = moving_averages.assign_moving_average(
            moving_variance, batch_variance, decay=decay,
            zero_debias=False)
        for update_op in (mean_update, variance_update):
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op)
    else:
        outputs = tf.nn.fused_batch_norm(
            input_layer, gamma, beta,
            mean=moving_mean, variance=moving_variance,
            epsilon=epsilon, data_format=self.data_format,
            is_training=False)
        bn = outputs[0]
    return bn
def batch_normalization_layer(signal):
    """Normalize `signal` with batch statistics and a learned affine map.

    Mean and variance are taken over every axis except the last; `gamma`
    (ones) and `beta` (zeros) are created with the shape of the mean.

    Args:
        signal: input tensor with a known static rank.

    Returns:
        `gamma * normalized + beta`.
    """
    reduce_axes = list(range(signal.get_shape().ndims - 1))
    batch_mean, batch_variance = tf.nn.moments(signal, reduce_axes)
    stat_shape = batch_mean.get_shape()
    gamma = tf.get_variable(
        'gamma', stat_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable(
        'beta', batch_mean.get_shape(), tf.float32, tf.zeros_initializer())
    centered = signal - batch_mean
    # 0.0001 keeps the divisor away from zero.
    normalized = centered / tf.sqrt(batch_variance + 0.0001)
    return gamma * normalized + beta
def test_fully_connected(self):
    """Compare `fully_connected` with a hand-written sigmoid(x @ w + b).

    Weights and biases are all ones, so both computations must agree
    exactly.
    """
    input_size, layer_size = 3, 2
    # batch size (=2) * input_size
    inputs = [[.1, .2, .3], [.4, .5, .6]]
    activation_fn = tf.sigmoid
    weight_init = tf.ones_initializer()
    bias_init = tf.ones_initializer()

    # Reference parameters built from the same initializers.
    w = weight_init([input_size, layer_size])
    b = bias_init([layer_size])  # equivalent to [1, layer_size]

    x = tf.placeholder(tf.float32, [None, input_size])
    infer = tf.contrib.layers.fully_connected(
        x, layer_size,
        activation_fn=activation_fn,
        weights_initializer=weight_init,
        biases_initializer=bias_init)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        output = sess.run(infer, feed_dict={x: inputs})
        reference = activation_fn(tf.matmul(inputs, w) + b)
        expect = sess.run(reference)
        self.assertTrue(np.array_equal(output, expect))
def complex_model(X, y, is_training):
    """Build conv -> batch norm -> ReLU -> 2x2 max-pool -> affine.

    Args:
        X: input batch; its shape must unpack into four dimensions.
        y: labels (not used when building the graph).
        is_training: training flag for `tf.layers.batch_normalization`.

    Returns:
        The output of the final affine layer (10 units).
    """
    # Unpacking requires a four-dimensional input; the values are unused.
    _batch, _height, _width, _channels = X.shape

    # initialization
    xavier = tf.contrib.layers.xavier_initializer
    Wconv1 = tf.get_variable("Wconv1", [7, 7, 3, 32], initializer=xavier())
    bconv1 = tf.get_variable(
        "bconv1", [32], initializer=tf.zeros_initializer())
    # The next four variables are created but not used below.
    gamma1 = tf.get_variable(
        "gamma1", [32], initializer=tf.ones_initializer())
    beta1 = tf.get_variable(
        "beta1", [32], initializer=tf.zeros_initializer())
    running_mean = tf.get_variable(
        "running_mean", [32], initializer=tf.zeros_initializer())
    running_variance = tf.get_variable(
        "running_variance", [32], initializer=tf.ones_initializer())
    W1 = tf.get_variable("W1", [8192, 10], initializer=xavier())
    b1 = tf.get_variable("b1", [10], initializer=tf.zeros_initializer())

    # construct the computation graph
    conv = tf.nn.conv2d(X, Wconv1, strides=[1, 1, 1, 1], padding='SAME')
    pre_norm = conv + bconv1
    normed = tf.layers.batch_normalization(pre_norm, training=is_training)
    activated = tf.nn.relu(normed)
    pooled = tf.nn.max_pool(
        activated, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
        padding='VALID')
    flattened = tf.reshape(pooled, [-1, 8192])
    y_out = tf.matmul(flattened, W1) + b1
    return y_out
def test_tf():
    """Time `iters` runs of a `reduce_sum` over a ones-filled variable.

    Reads the module-level `N` (variable shape), `dtype` and `iters`.

    Returns:
        A NumPy array with the wall-clock duration of each run in seconds.
    """
    tf.reset_default_graph()
    # tf.ones builds the initial value directly; tf.ones_initializer is an
    # initializer class in TF 1.x and cannot be called with a shape.
    arr = tf.Variable(tf.ones(N, dtype=dtype), dtype=dtype)
    result = tf.reduce_sum(arr)
    # Grouping means only the computation is timed, not fetching the value.
    result_fetch = tf.group(result)

    times = []
    # The context manager closes the session when the timing is done.
    with tf.Session() as sess:
        sess.run(arr.initializer)
        for _ in range(iters):
            start_time = time.time()
            sess.run(result_fetch)
            end_time = time.time()
            times.append(end_time - start_time)
    return np.asarray(times)
def weight_normalization(weight, scope='weight_norm'):
    """Apply weight normalization to a 2-D weight matrix.

    Based upon openai's
    https://github.com/openai/generating-reviews-discovering-sentiment/blob/master/encoder.py

    Args:
        weight: rank-2 tensor; it is L2-normalized along dimension 0 and
            rescaled by a learned vector `g_scalar` with one entry per
            column.
        scope: variable scope for `g_scalar`.

    Returns:
        The re-parameterized weight tensor.

    Raises:
        ValueError: if `weight` is not rank 2.
    """
    dims = weight.get_shape().as_list()
    if len(weight.get_shape()) != 2:
        raise ValueError('dimensions unacceptable for weight normalization')
    # Original author's note: "I think you want to sum on axis [0,1,2]".
    g_shape = [dims[1]]

    with tf.variable_scope(scope):
        g = tf.get_variable(
            'g_scalar', shape=g_shape, initializer=tf.ones_initializer())
        direction = tf.nn.l2_normalize(weight, dim=0)
        weight = g * direction
    return weight
def __init__(self, dims_out, name=None, eps=1e-5):
    """Create the layer-norm offset, scale and epsilon.

    Args:
        dims_out: length of the offset and scale vectors.
        name: optional prefix; the scope is 'layer_norm' when None and
            '<name>_layer_norm' otherwise.
        eps: value stored as a constant tensor in `self.eps`.
    """
    scope_name = (
        'layer_norm' if name is None else '{:s}_layer_norm'.format(name))
    with tf.variable_scope(scope_name, values=[dims_out]):
        param_shape = [dims_out]
        self.offset = tf.get_variable(
            name='offset',
            shape=param_shape,
            dtype=tf.float32,
            initializer=tf.zeros_initializer())
        self.scale = tf.get_variable(
            name='scale',
            shape=param_shape,
            dtype=tf.float32,
            initializer=tf.ones_initializer())
        self.eps = tf.constant(eps)
def _network_template(self, state):
    """Build a dummy Q-network for which an argmax always picks action 0.

    The input is replaced by zeros, so the output depends only on the
    initial weights and biases of a single fully connected layer.
    """
    batch_size = state.shape[0]
    inputs = tf.constant(
        np.zeros((batch_size, stack_size)), dtype=tf.float32)
    # Each row counts down from num_actions to 1, giving action 0 the
    # highest weight so that it gets picked by the argmax.
    descending = np.arange(self.num_actions, 0, -1)
    weights_initializer = np.tile(descending, (stack_size, 1))
    q = slim.fully_connected(
        inputs,
        self.num_actions,
        weights_initializer=tf.constant_initializer(weights_initializer),
        biases_initializer=tf.ones_initializer(),
        activation_fn=None)
    network_type = self._get_network_type()
    return network_type(q)
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False): """ Mostly equivalent to `tf.layers.batch_normalization`, but different in the following: 1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from `TowerContext`. 4. Support the `internal_update` option. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. 
""" # parse shapes data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if axis is None: if ndims == 2: data_format = 'NHWC' axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_number() if not training and ctx.is_training: assert TF_version >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable( {'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA'}): if TF_version >= 1.5: layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, virtual_batch_size=virtual_batch_size, fused=True ) else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, fused=True ) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. 
# because training time doesn't use EMA if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta return ret
padding='SAME', # "same" padding activation=None, # None kernel_initializer=tf.truncated_normal_initializer(stddev=5e-2, seed=100), kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=lamC), name='conv1') conv1 = tf.layers.batch_normalization( conv1, axis=-1, momentum=0.99, epsilon=epsilon, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), moving_mean_initializer=tf.zeros_initializer(), moving_variance_initializer=tf.ones_initializer(), training=training, name='bn1') # apply relu conv1_bn_relu = tf.nn.relu(conv1, name='relu1') with tf.name_scope('conv1.1') as scope: conv11 = tf.layers.conv2d( conv1_bn_relu, # Input data filters=32, # 32 filters kernel_size=(3, 3), # Kernel size: 5x5 strides=(1, 1), # Stride: 2 padding='SAME', # "same" padding
def CNN_3d_change(x, out_channels_0, out_channels_1, add_relu=True):
    '''Two weight-normalised 3d convolution + max-pooling stages, flattened.

    Each stage creates ``filter_<i>``, ``bias_<i>`` and ``scale_<i>``
    variables, rescales the l2-normalised filter (normalised over its three
    spatial axes) by the scale vector, applies a VALID stride-1 ``conv3d``,
    adds the bias, optionally applies ELU and finishes with a VALID
    ``max_pool3d``.  Intermediate shapes are printed.

    Args:
        x: input tensor; its last dimension is used as the number of input
            channels (assumed [batch, in_depth, in_height, in_width,
            in_channels] - TODO confirm against callers)
        out_channels_0: number of filters of the first stage (3x3x3 kernel,
            pooling window and stride [1, 2, 3, 3, 1])
        out_channels_1: number of filters of the second stage (2x2x2 kernel,
            pooling window and stride [1, 3, 3, 3, 1])
        add_relu: when true an ELU (not a ReLU) follows each bias add

    Returns:
        the second pooling output flattened by ``tf.contrib.layers.flatten``
    '''

    def _conv_pool_stage(inputs, idx, kernel_dims, n_in, n_out, pool_window):
        # Variables are created in the order filter, bias, scale.
        kernel = tf.get_variable(
            name='filter_%d' % idx,
            shape=kernel_dims + [n_in, n_out],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(-0.01, 0.01))
        bias = tf.get_variable(
            name='bias_%d' % idx,
            shape=[n_out],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())
        gain = tf.get_variable(
            name='scale_%d' % idx,
            shape=[n_out],
            dtype=tf.float32,
            initializer=tf.ones_initializer())

        # Weight normalisation: one gain per output channel.
        kernel = tf.reshape(gain, [1, 1, 1, n_out]) * tf.nn.l2_normalize(kernel, [0, 1, 2])

        conv = tf.nn.conv3d(inputs, kernel, strides=[1, 1, 1, 1, 1], padding="VALID")
        print('conv_%d shape: %s' % (idx, conv.shape))
        conv = conv + bias
        if add_relu:
            conv = tf.nn.elu(conv)

        pooled = tf.nn.max_pool3d(conv,
                                  ksize=list(pool_window),
                                  strides=list(pool_window),
                                  padding="VALID")
        print('pooling_%d shape: %s' % (idx, pooled.shape))
        return pooled

    in_channels = x.shape[-1]
    pooling_0 = _conv_pool_stage(x, 0, [3, 3, 3], in_channels, out_channels_0,
                                 [1, 2, 3, 3, 1])
    pooling_1 = _conv_pool_stage(pooling_0, 1, [2, 2, 2], out_channels_0, out_channels_1,
                                 [1, 3, 3, 3, 1])
    return tf.contrib.layers.flatten(pooling_1)
def batch_norm_lasagne(x, is_training, reuse, decay=0.9, epsilon=1e-4,
                       updates_collections=tf.GraphKeys.UPDATE_OPS,
                       outputs_collections=None, trainable=True, name='bn'):
    """Batch normalization that tracks a moving *inverse standard deviation*.

    Unlike the usual formulation, the second moving statistic stored here is
    ``rsqrt(variance + epsilon)`` rather than the variance itself.

    Args:
        x: input tensor; statistics are reduced over every axis but the last.
        is_training: Python truth value; when true the batch statistics are
            used and the moving averages are updated, otherwise the stored
            moving statistics are used.
        reuse: forwarded to ``tf.variable_scope``.
        decay: decay of both moving averages.
        epsilon: added to the batch variance before taking ``rsqrt``.
        updates_collections: collection that receives the two update ops.
            When None, the updates are instead forced through control
            dependencies on the returned statistics.
        outputs_collections: forwarded to ``_collect_named_outputs``.
        trainable: whether ``beta`` and ``gamma`` are trainable.
        name: variable-scope name, also used as the output alias.

    Returns:
        Whatever ``_collect_named_outputs`` returns for the normalized
        output (presumably the output tensor itself - verify against the
        helper).
    """
    with tf.variable_scope(name, reuse=reuse) as curr_scope:
        beta = tf.get_variable(name='beta', initializer=tf.constant(
            0.0, shape=[x.get_shape()[-1]]), trainable=trainable)
        gamma = tf.get_variable(name='gamma', initializer=tf.constant(
            1.0, shape=[x.get_shape()[-1]]), trainable=trainable)

        moving_mean = tf.get_variable(name='moving_mean', shape=[x.get_shape()[-1]],
                                      initializer=tf.zeros_initializer(), trainable=False)
        moving_inv_std = tf.get_variable(name='moving_inv_std', shape=[x.get_shape()[-1]],
                                         initializer=tf.ones_initializer(), trainable=False)

        input_shape = helper.get_input_shape(x)
        # Reduce over all axes except the last one.
        moments_axes = list(range(len(input_shape) - 1))

        def mean_inv_std_with_update():
            # Batch statistics whose evaluation also triggers the
            # moving-average updates (via control dependencies).
            mean, variance = tf.nn.moments(x, moments_axes, shift=moving_mean, name='bn-moments')
            inv_std = math_ops.rsqrt(variance + epsilon)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay, zero_debias=False)
            update_moving_inv_std = moving_averages.assign_moving_average(
                moving_inv_std, inv_std, decay, zero_debias=False)
            with tf.control_dependencies(
                    [update_moving_mean, update_moving_inv_std]):
                m, v = tf.identity(mean), tf.identity(inv_std)
                return m, v

        def mean_inv_std_with_pending_update():
            # Batch statistics; the update ops are only registered in
            # `updates_collections` and have to be run by the caller.
            mean, variance = tf.nn.moments(x, moments_axes, shift=moving_mean, name='bn-moments')
            inv_std = math_ops.rsqrt(variance + epsilon)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay, zero_debias=False)
            update_moving_inv_std = moving_averages.assign_moving_average(
                moving_inv_std, inv_std, decay, zero_debias=False)
            tf.add_to_collection(updates_collections, update_moving_mean)
            tf.add_to_collection(updates_collections, update_moving_inv_std)
            return mean, inv_std

        mean_inv_std_with_relevant_update = \
            mean_inv_std_with_pending_update if updates_collections is not None else mean_inv_std_with_update

        # `is_training` is evaluated in Python, so only one branch is built.
        (mean, inv_std) = mean_inv_std_with_relevant_update() if is_training else (
            moving_mean, moving_inv_std)

        def _batch_normalization(x, mean, inv, offset, scale):
            # Computes x * inv + (offset - mean * inv), where inv is first
            # multiplied by `scale` when a scale is given.
            with tf.name_scope(name, "batchnorm", [x, mean, inv, scale, offset]):
                if scale is not None:
                    inv *= scale
                return x * inv + (offset - mean * inv if offset is not None else -mean * inv)

        output = _batch_normalization(x, mean, inv_std, beta, gamma)
        return _collect_named_outputs(outputs_collections, curr_scope.original_name_scope, name, output)
def __init__(self, cov_func, lik_func, num_train, inducing_inputs, args):
    """Set up the variational-inference object and its raw variables.

    Args:
        cov_func: covariance (kernel) functions; ``len(cov_func)`` is taken
            as the number of latent processes
        lik_func: likelihood function
        num_train: the number of training examples
        inducing_inputs: either an int (only the number of inducing inputs,
            so just the variable's shape is fixed) or an array of initial
            values; a 2-dimensional array is repeated for every latent
            process
        args: additional parameters; ``'num_components'`` and
            ``'diag_post'`` are read here, the whole mapping is stored in
            ``self.args``
    """
    # self.mean = mean_func
    self.cov = cov_func
    self.lik = lik_func
    self.num_train = num_train
    self.num_latents = len(self.cov)
    self.args = args

    if not isinstance(inducing_inputs, int):
        # Concrete values were given. Share one 2-D set of inducing inputs
        # between all latent processes unless per-process inputs were given.
        if inducing_inputs.ndim == 2:
            inducing_inputs = np.tile(inducing_inputs[np.newaxis, :, :],
                                      reps=[self.num_latents, 1, 1])
        inducing_params = {
            'initializer': tf.constant(inducing_inputs, dtype=tf.float32)
        }
        num_inducing = inducing_inputs.shape[-2]
    else:
        # Only a count was given -> the variable is described by its shape.
        num_inducing = inducing_inputs
        inducing_params = {
            'shape': [self.num_latents, num_inducing, self.cov[0].input_dim],
            'dtype': tf.float32
        }

    num_components = args['num_components']

    with tf.variable_scope(None, "variational_inference"):
        # All parameters below are optimized directly in raw form; some are
        # transformed internally to maintain certain pre-conditions.
        self.inducing_inputs = tf.get_variable("inducing_inputs", **inducing_params)

        zero_init = tf.zeros_initializer(dtype=tf.float32)
        self.raw_weights = tf.get_variable(
            "raw_weights", [num_components], initializer=zero_init)
        self.means = tf.get_variable(
            "means", [num_components, self.num_latents, num_inducing],
            initializer=zero_init)

        if args['diag_post']:
            # One raw entry per inducing point, initialised with ones.
            covars_shape = [num_components, self.num_latents, num_inducing]
            covars_init = tf.ones_initializer()
        else:
            # Trailing dimensions come from util.tri_vec_shape; zero init.
            covars_shape = ([num_components, self.num_latents]
                            + util.tri_vec_shape(num_inducing))
            covars_init = zero_init
        self.raw_covars = tf.get_variable(
            "raw_covars", shape=covars_shape, initializer=covars_init)
def _view_pool_with_classes(view_features, y, n_classes, is_training, reuse=False):
    """Average the per-view features and scale them by a mask derived from y.

    The mask is an affine map of ``y``: ``y`` is cast to float64, given a
    trailing axis of size 1, multiplied by a ``[1, n_classes]`` weight
    (initialised to ones) and shifted by a ``[n_classes]`` bias (initialised
    to zeros).

    Args:
        view_features: sequence of tensors, stacked along a new axis 0 and
            averaged over it.
        y: tensor that drives the mask.
        n_classes: width of the mask.
        is_training: not used by this function.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The mean over views multiplied element-wise by the mask.
    """
    with tf.variable_scope("view_pool_with_classes", reuse=reuse):
        mask_weights = tf.get_variable(name="weights",
                                       shape=[1, n_classes],
                                       dtype=tf.float64,
                                       initializer=tf.ones_initializer())
        mask_biases = tf.get_variable(name="biases",
                                      shape=[n_classes],
                                      dtype=tf.float64,
                                      initializer=tf.zeros_initializer())

        labels_column = tf.expand_dims(tf.cast(y, tf.float64), 1)
        mask = tf.add(tf.matmul(labels_column, mask_weights), mask_biases)

        pooled_views = tf.reduce_mean(tf.stack(view_features, 0), 0)
        return tf.multiply(pooled_views, mask)
# Two named constants in the default graph (only referenced by the
# commented-out device example further down).
a = tf.constant([1.0, 2.0], name='a')
b = tf.constant([2.0, 3.0], name='b')

# First graph: a variable 'v' of shape [1] initialised to zeros.
g1 = tf.Graph()
with g1.as_default():
    v = tf.get_variable('v', shape=[1], initializer=tf.zeros_initializer())

with tf.Session(graph=g1) as sess:
    tf.global_variables_initializer().run()
    # reuse=True looks up the existing 'v' instead of creating a new one.
    with tf.variable_scope("", reuse=True):
        print(sess.run(tf.get_variable('v')))

# Second graph: a variable with the same name 'v' but shape [2, 2] and ones.
g2 = tf.Graph()
with g2.as_default():
    v = tf.get_variable('v', shape=[2, 2], initializer=tf.ones_initializer())

with tf.Session(graph=g2) as sess:
    tf.global_variables_initializer().run()
    with tf.variable_scope("", reuse=True):
        print(sess.run(tf.get_variable('v')))

# g = tf.Graph()
# with g.device('/gpu:0'):
#     result = a+b

# Variables whose initial values are derived from another variable's
# initialised value.
weights = tf.Variable(tf.random_normal([2, 3], mean=0, stddev=2))
biases = tf.Variable(tf.zeros([3]))
w2 = tf.Variable(weights.initialized_value())
w3 = tf.Variable(weights.initialized_value() * 2.0)
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

    * TRAIN: Constructs loss and train_op
    * EVAL: Constructs the loss and eval metrics
    * PREDICT: Constructs the predictions

    Args:
      model: str, name of model.
      features: dict<feature name, Tensor>. Expected to have keys
        {inputs, targets, problem_choice}.
      mode: tf.estimator.ModeKeys.
      hparams: model HParams.
      problem_names: list of str, names of the problems.
      train_steps: int, total number of training steps. Used to compute
        learning rate decay.
      worker_id: int, id of this worker.
      worker_replicas: int, number of workers.
      eval_run_autoregressive: bool, whether to run evaluation
        autoregressively.
      decode_hparams: HParams for decode settings. Used when mode == PREDICT.

    Returns:
      tf.estimator.EstimatorSpec
    """
    assert len(problem_names) == len(hparams.problem_instances)
    decode_hp = decode_hparams

    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism(hparams)

    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                # Entries equal to 0 are counted as padding.
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(nonpadding)
                if k == "targets":
                    # NOTE(review): this is the only assignment of
                    # targets_nonpadding_tokens, yet the TRAIN path below
                    # reads it unconditionally - confirm "targets" is always
                    # a Tensor of rank > 1 when training.
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables."""
        model_class = registry.model(model)(
            hparams,
            mode,
            hparams.problems[n],
            n,
            dp,
            devices.ps_devices(all_workers=True),
            decode_hparams=decode_hparams)
        if mode == tf.estimator.ModeKeys.PREDICT:
            return model_class.infer(
                features,
                beam_size=decode_hp.beam_size,
                top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
                alpha=decode_hp.alpha,
                decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
            logits, losses_dict = model_class.eval_autoregressive(features)
        else:
            logits, losses_dict = model_class(
                features, skip=(skipping_is_on and skip_this_one))
        with tf.variable_scope("losses_avg"):
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                # Exponential moving average (0.9 / 0.1) of each loss term,
                # starting from 100.0.
                loss_moving_avg = tf.get_variable(
                    loss_name, initializer=100.0, trainable=False)
                loss_variable_names.append(loss_name)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
            except ValueError:
                loss_moving_avg = tf.get_variable(
                    "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable(
                "problem_%d_steps" % n, initializer=0, trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
        return [total_loss, logits]

    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs" and
        # "scores".
        if isinstance(model_output, dict):
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        # Repeat the problem choice once per row of features["inputs"].
        batched_problem_choice = (
            features["problem_choice"] * tf.ones(
                (tf.shape(features["inputs"])[0],), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        _del_dict_nones(predictions)

        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            hparams.problem_instances, hparams)

        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(
            mode,
            predictions={"predictions": logits},
            eval_metric_ops=eval_metrics,
            loss=total_loss)

    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
        hparams, num_worker_replicas=worker_replicas, num_train_steps=train_steps)
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        # Strip the "problem_<n>/" prefix for the summary tag.
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
            tf.summary.scalar("problem_%d_frequency" % n,
                              tf.to_float(nth_steps) /
                              (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            # Tie the noise update to the loss so that it actually runs.
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable(
        "max_nonpadding",
        shape=[],
        initializer=tf.ones_initializer(),
        trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    with tf.control_dependencies([tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    train_op = optimize.optimize(total_loss, learning_rate, hparams)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    # Iterate backwards so deleting an entry does not shift pending indices.
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)
def conv_gn(input_tensor, kernel_size, filters, strides, name, relu=False, center=False, scale=False,
            channel_wise=True, group=32, group_channel=8, padding='same', biased=False,
            reuse=tf.AUTO_REUSE, dilation=1):
    """2-D convolution followed by group normalization and an optional ReLU.

    Args:
        input_tensor: rank-4 input tensor (asserted).
        kernel_size, filters, strides, padding: forwarded to
            ``tf.layers.conv2d``.
        name: name of the convolution; the normalization variables live in
            the scope ``name + '/gn'`` and the ReLU is named
            ``name + '/relu'``.
        relu: when true, apply ``tf.nn.relu`` to the normalized output.
        center: when true, learn a per-channel ``beta``; otherwise a
            constant 0 is used.
        scale: when true, learn a per-channel ``gamma``; otherwise a
            constant 1 is used.
        channel_wise: when true, the number of groups is
            ``max(1, C // group_channel)``; otherwise ``min(group, C)``.
        group: number of groups used when ``channel_wise`` is false.
        group_channel: channels per group used when ``channel_wise`` is true.
        biased: accepted but not used by this implementation.
        reuse: forwarded to ``tf.layers.conv2d`` and the ``gn`` scope.
        dilation: forwarded as ``dilation_rate``.

    Returns:
        The normalized tensor, transposed back to the layout of the
        convolution output.
    """
    assert len(input_tensor.get_shape()) == 4

    # convolution
    res = tf.layers.conv2d(input_tensor, kernel_size=kernel_size, filters=filters, padding=padding,
                           strides=strides, reuse=reuse, name=name, dilation_rate=dilation)

    # group normalization: move the last axis of the conv output to axis 1
    x = tf.transpose(res, [0, 3, 1, 2])
    shape = tf.shape(x)
    N = shape[0]
    # Use the static channel count as a plain int: dividing a TensorShape
    # Dimension with `/` is rejected under Python 3 true division, and a
    # plain int is unambiguous for the shape lists built below.
    C = x.get_shape().as_list()[1]
    H = shape[2]
    W = shape[3]
    if channel_wise:
        G = max(1, C // group_channel)
    else:
        G = min(group, C)

    # normalization: statistics are taken per (sample, group)
    x = tf.reshape(x, [N, G, C // G, H, W])
    mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True)
    x = (x - mean) / tf.sqrt(var + 1e-5)

    # per channel scale and bias (gamma and beta)
    with tf.variable_scope(name + '/gn', reuse=reuse):
        if scale:
            gamma = tf.get_variable('gamma', [C], dtype=tf.float32, initializer=tf.ones_initializer())
        else:
            gamma = tf.constant(1.0, shape=[C])
        if center:
            beta = tf.get_variable('beta', [C], dtype=tf.float32, initializer=tf.zeros_initializer())
        else:
            beta = tf.constant(0.0, shape=[C])
    gamma = tf.reshape(gamma, [1, C, 1, 1])
    beta = tf.reshape(beta, [1, C, 1, 1])
    output = tf.reshape(x, [-1, C, H, W]) * gamma + beta

    # transpose: [bs, c, h, w] to [bs, h, w, c] following the paper
    output = tf.transpose(output, [0, 2, 3, 1])

    if relu:
        output = tf.nn.relu(output, name + '/relu')
    return output
def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None):
    """Set up a running mean/std normalizer for observations.

    The normalizer keeps local NumPy accumulators next to TensorFlow
    variables holding the running sum, sum of squares and count, and builds
    two ops: ``update_op`` adds the placeholder values to those variables,
    ``recompute_op`` refreshes ``mean`` and ``std`` from them.

    Args:
        size (int): the size of the observation to be normalized
        eps (float): lower bound of the recomputed standard deviation (the
            variance is floored at ``eps ** 2``)
        default_clip_range (float): stored for clipping normalized
            observations to [-default_clip_range, default_clip_range]
        sess (object): the TensorFlow session to be used; the default
            session is looked up when omitted
    """
    self.size = size
    self.eps = eps
    self.default_clip_range = default_clip_range
    self.sess = tf.get_default_session() if sess is None else sess

    # Local accumulators.
    self.local_sum = np.zeros(self.size, np.float32)
    self.local_sumsq = np.zeros(self.size, np.float32)
    self.local_count = np.zeros(1, np.float32)

    def _stat_variable(var_name, var_shape, var_init):
        # All statistics are non-trainable float32 variables.
        return tf.compat.v1.get_variable(initializer=var_init,
                                         shape=var_shape,
                                         name=var_name,
                                         trainable=False,
                                         dtype=tf.float32)

    self.sum_tf = _stat_variable('sum', self.local_sum.shape, tf.zeros_initializer())
    self.sumsq_tf = _stat_variable('sumsq', self.local_sumsq.shape, tf.zeros_initializer())
    # The count starts at one, not zero.
    self.count_tf = _stat_variable('count', self.local_count.shape, tf.ones_initializer())
    self.mean = _stat_variable('mean', (self.size, ), tf.zeros_initializer())
    self.std = _stat_variable('std', (self.size, ), tf.ones_initializer())

    # Placeholders feeding the increments of one update.
    self.count_pl = tf.compat.v1.placeholder(name='count_pl',
                                             shape=(1, ),
                                             dtype=tf.float32)
    self.sum_pl = tf.compat.v1.placeholder(name='sum_pl',
                                           shape=(self.size, ),
                                           dtype=tf.float32)
    self.sumsq_pl = tf.compat.v1.placeholder(name='sumsq_pl',
                                             shape=(self.size, ),
                                             dtype=tf.float32)

    add_count = self.count_tf.assign_add(self.count_pl)
    add_sum = self.sum_tf.assign_add(self.sum_pl)
    add_sumsq = self.sumsq_tf.assign_add(self.sumsq_pl)
    self.update_op = tf.group(add_count, add_sum, add_sumsq)

    # mean = sum / count ; std = sqrt(max(eps^2, E[x^2] - E[x]^2))
    assign_mean = tf.compat.v1.assign(self.mean, self.sum_tf / self.count_tf)
    variance_floor = tf.square(self.eps)
    mean_of_squares = self.sumsq_tf / self.count_tf
    square_of_mean = tf.square(self.sum_tf / self.count_tf)
    assign_std = tf.compat.v1.assign(
        self.std,
        tf.sqrt(tf.maximum(variance_floor, mean_of_squares - square_of_mean)))
    self.recompute_op = tf.group(assign_mean, assign_std)

    self.lock = threading.Lock()
class OptimizationConstrainsTest(tf.test.TestCase, parameterized.TestCase):
    """Tests for `optimization_constraints.OptimizationConstraints`."""

    @parameterized.parameters([
        (0.5, 0.5),
        (17.3, 17.3),
        (tf.constant_initializer(3.14), 3.14),
        (tf.ones_initializer(), 1.0)
    ])
    def testLagrangeMultInit(self, initializer, exp_lag_mul):
        # The Lagrange multiplier must start at the value given by
        # `initializer`, whether that is a number or a TF initializer.
        cons = optimization_constraints.OptimizationConstraints()
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        cons.add(lhs > rhs, initializer=initializer)()
        l = cons.lagrange_multipliers[0]
        with tf.train.MonitoredSession() as sess:
            lag_mul = sess.run(l)
        self.assertAllClose(lag_mul, exp_lag_mul)

    @mock.patch.object(optimization_constraints, '_parametrize')
    def testRateDefaults(self, mocked_parametrized):
        # `_parametrize` is replaced by a plain gradient scaling so that the
        # gradient w.r.t. the first variable exposes the rate in use; here
        # the rate comes from the constructor.
        mocked_parametrized.side_effect = (
            lambda x, rate: scale_gradient.scale_gradient(x, -rate))
        rate = 0.1
        cons = optimization_constraints.OptimizationConstraints(rate=rate)
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        x = cons.add(lhs < rhs)()
        v = tf.all_variables()[0]
        dxdl = tf.gradients(x, v)
        with tf.train.MonitoredSession() as sess:
            grads = sess.run(dxdl)
        self.assertAllClose(grads[0], rate)

    @mock.patch.object(optimization_constraints, '_parametrize')
    def testRateOverrides(self, mocked_parametrized):
        # Same as testRateDefaults, but the rate is passed to `add`.
        mocked_parametrized.side_effect = (
            lambda x, rate: scale_gradient.scale_gradient(x, -rate))
        rate = 7.3
        cons = optimization_constraints.OptimizationConstraints()
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        x = cons.add(lhs < rhs, rate=rate)()
        v = tf.all_variables()[0]
        dxdl = tf.gradients(x, v)
        with tf.train.MonitoredSession() as sess:
            grads = sess.run(dxdl)
        self.assertAllClose(grads[0], rate)

    def testValidRangeDefaults(self):
        # An initial value of 3.0 lies above the constructor-level valid
        # range (1.0, 2.0); the multiplier is expected to equal the upper
        # bound.
        valid_range = (1.0, 2.0)
        cons = optimization_constraints.OptimizationConstraints(
            valid_range=valid_range)
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        cons.add(lhs < rhs, initializer=3.0)()
        with tf.train.MonitoredSession() as sess:
            lag_mul = sess.run(cons.lagrange_multipliers[0])
        self.assertAllClose(lag_mul, valid_range[1])

    def testValidRangeOverrides(self):
        # Same as testValidRangeDefaults, but the range is passed to `add`.
        cons = optimization_constraints.OptimizationConstraints()
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        valid_range = (1.0, 2.0)
        cons.add(lhs < rhs, initializer=3.0, valid_range=valid_range)()
        with tf.train.MonitoredSession() as sess:
            lag_mul = sess.run(cons.lagrange_multipliers[0])
        self.assertAllClose(lag_mul, valid_range[1])

    # Stacked patches are applied bottom-up, so the `add_leq` mock is the
    # first mock argument and the `add_geq` mock the second.
    @mock.patch.object(
        optimization_constraints.OptimizationConstraints, 'add_geq')
    @mock.patch.object(
        optimization_constraints.OptimizationConstraints, 'add_leq')
    def testOpIdentification(self, mocked_add_leq, mocked_add_geq):
        # `add` must dispatch `<` / `<=` to add_leq and `>` / `>=` to
        # add_geq. One-element lists serve as counters the nested functions
        # can mutate.
        calls_to_add_leq = [0]
        def mock_add_leq(*args, **kwargs):
            del args
            del kwargs
            calls_to_add_leq[0] += 1
        mocked_add_leq.side_effect = mock_add_leq

        calls_to_add_geq = [0]
        def mock_add_geq(*args, **kwargs):
            del args
            del kwargs
            calls_to_add_geq[0] += 1
        mocked_add_geq.side_effect = mock_add_geq

        cons = optimization_constraints.OptimizationConstraints()
        lhs = tf.zeros_like(1.0)
        rhs = tf.ones_like(1.0)
        self.assertEqual(calls_to_add_leq[0], 0)
        self.assertEqual(calls_to_add_geq[0], 0)
        cons.add(lhs < rhs)
        self.assertEqual(calls_to_add_leq[0], 1)
        self.assertEqual(calls_to_add_geq[0], 0)
        cons.add(lhs <= rhs)
        self.assertEqual(calls_to_add_leq[0], 2)
        self.assertEqual(calls_to_add_geq[0], 0)
        cons.add(lhs > rhs)
        self.assertEqual(calls_to_add_geq[0], 1)
        self.assertEqual(calls_to_add_leq[0], 2)
        cons.add(lhs >= rhs)
        self.assertEqual(calls_to_add_geq[0], 2)
        self.assertEqual(calls_to_add_leq[0], 2)

    def testMinimalRun(self):
        # End-to-end check: minimising x**2 (plus noise) under the
        # constraint x > 0.5 should drive x**2 towards 0.5**2.
        x = basic.TrainableVariable(
            shape=(), initializers={'w': tf.ones_initializer()})()
        x2 = x ** 2.0
        min_value = 0.5
        constr = optimization_constraints.OptimizationConstraints().add(
            x > min_value)

        self.assertFalse(constr._is_connected)
        loss = moving_average.MovingAverage()(
            x2 + tf.random.normal((), stddev=1.0)) + constr()

        # Once the module has been connected, no constraint may be added.
        self.assertTrue(constr._is_connected)
        with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
            constr.add(x > min_value)
        with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
            constr.add_geq(x, min_value)
        with self.assertRaisesRegexp(ValueError, 'Cannot add further constraints'):
            constr.add_leq(min_value < x)

        opt = tf.train.AdamOptimizer(1e-2, beta1=0.0)
        update = opt.minimize(loss)
        # Make x2 depend on the update op.
        with tf.control_dependencies([update]):
            x2 = tf.identity(x2)

        with tf.train.MonitoredSession() as sess:
            for _ in range(500):
                v, _ = sess.run([x2, update])
        self.assertAllClose(v, min_value**2)
else: shuffle = False repeat = 1 train = False train_flag = tf.placeholder(tf.bool, name='train_flag') input_img, label_gt = imgs_input_fn(path_tfrecords_train, perform_shuffle=shuffle, repeat_count=repeat, batch_size=5) conv1 = tf.layers.conv2d(inputs=input_img, filters=64, kernel_size=[5, 5], kernel_initializer=tf.random_normal_initializer(), bias_initializer=tf.ones_initializer(), padding="valid", activation=tf.nn.relu, name='conv1') drop1 = tf.layers.dropout(inputs=conv1, rate=0.5, training=train_flag) conv2 = tf.layers.conv2d(inputs=drop1, filters=256, kernel_size=[5, 5], kernel_initializer=tf.random_normal_initializer(), bias_initializer=tf.ones_initializer(), padding="valid", activation=tf.nn.relu, name='conv2') drop2 = tf.layers.dropout(inputs=conv2, rate=0.5, training=train_flag) conv3 = tf.layers.conv2d(inputs=drop2, filters=768,
def batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001,
               moving_vars='moving_vars', activation=None, is_training=True,
               trainable=True, restore=True, scope=None, reuse=None):
    """Build a batch-normalization layer on top of `inputs`.

    Statistics are reduced over every axis except the last one, so all
    per-feature variables have the size of the last dimension of `inputs`.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels] or
        [batch_size, channels].
      decay: decay handed to the moving-average updates.
      center: when True a `beta` offset variable is created and applied;
        when False no `beta` exists and no offset is applied.
      scale: when True a `gamma` scale variable is created and applied;
        when False no `gamma` exists. Scaling can be left off when the next
        layer is linear (or e.g. a ReLU), since that layer can absorb it.
      epsilon: small float added to the variance to avoid dividing by zero.
      moving_vars: name of a collection that receives `moving_mean` and
        `moving_variance` (they are also put in
        `tf.GraphKeys.MOVING_AVERAGE_VARIABLES`).
      activation: optional callable applied to the normalized output.
      is_training: when truthy, normalize with the batch moments and register
        the moving-average update ops in `UPDATE_OPS_COLLECTION`; otherwise
        normalize with the stored moving statistics.
      trainable: forwarded to the `beta` / `gamma` variable constructors.
      restore: forwarded to every variable constructor.
      scope: optional scope for `tf.variable_scope` (default name
        'BatchNorm').
      reuse: forwarded to `tf.variable_scope`; a scope must be given for
        reuse to be meaningful.

    Returns:
      The normalized (and optionally activated) tensor, with the same static
      shape as `inputs`.
    """
    static_shape = inputs.get_shape()
    with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse):
        # Reduce over everything but the trailing (feature) axis.
        reduction_axes = list(range(len(static_shape) - 1))
        feature_shape = static_shape[-1:]

        # Optional learned offset and scale.
        beta = variables.variable(
            'beta', feature_shape,
            initializer=tf.zeros_initializer(),
            trainable=trainable,
            restore=restore) if center else None
        gamma = variables.variable(
            'gamma', feature_shape,
            initializer=tf.ones_initializer(),
            trainable=trainable,
            restore=restore) if scale else None

        # Running statistics: never trained, tracked in two collections.
        stat_collections = [moving_vars,
                            tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
        moving_mean = variables.variable(
            'moving_mean', feature_shape,
            initializer=tf.zeros_initializer(),
            trainable=False,
            restore=restore,
            collections=stat_collections)
        moving_variance = variables.variable(
            'moving_variance', feature_shape,
            initializer=tf.ones_initializer(),
            trainable=False,
            restore=restore,
            collections=stat_collections)

        if not is_training:
            # Inference: rely on the accumulated statistics.
            mean, variance = moving_mean, moving_variance
        else:
            # Training: use this batch's moments and schedule the updates of
            # the running statistics (mean first, then variance).
            mean, variance = tf.nn.moments(inputs, reduction_axes)
            for running_stat, batch_stat in ((moving_mean, mean),
                                             (moving_variance, variance)):
                update_op = moving_averages.assign_moving_average(
                    running_stat, batch_stat, decay)
                tf.add_to_collection(UPDATE_OPS_COLLECTION, update_op)

        normalized = tf.nn.batch_normalization(
            inputs, mean, variance, beta, gamma, epsilon)
        normalized.set_shape(inputs.get_shape())
        return activation(normalized) if activation else normalized
def build_net(in_dim, n_hidden, data_type, link='square', total_size=None,
              bw_indiv=1.0, indiv_y_bol=False, kernel='ard',
              initialse='identity', seed=23, dtype=tf.float32, landmarks=None,
              log_y=False, device_name=None, avg_label=1.0, **others):
    """Build the variational sparse-GP network and attach its loss and metrics.

    A `Network` is created, the variational parameters (`L`, `mean`,
    `prior_mean`, `log_bw`) are added to `net.params`, and the loss and the
    evaluation tensors are stored as attributes on the returned network.
    `params['log_scale']` (and `params['log_scale_m']` for the additive
    kernel) are read but not created here, so they are expected to be
    provided by `Network`.

    Args:
      in_dim: forwarded to `Network`.
      n_hidden: forwarded to `Network`; also used as the number of landmark
        points.
      data_type: forwarded to `Network`.
      link: forwarded to `Network`.
      total_size: divisor applied to the KL term of the loss. Required.
      bw_indiv: initial bandwidth(s); `log(bw_indiv)` initialises
        `params['log_bw']`. For the 'additive' kernel it is sliced as
        `[:-2]` / `[-2:]`, so it must be a sequence in that case.
      indiv_y_bol: forwarded to `Network` as `indiv_bol`.
      kernel: one of 'ard', 'rbf' or 'additive'.
      initialse: 'identity' or 'kernel'; selects how the triangular factor
        `L` is initialised.
      seed: forwarded to `Network`.
      dtype: dtype of the created variables and casts.
      landmarks: only used when `initialse == 'kernel'`, to evaluate the
        initial kernel matrix (presumably a NumPy array, since the result is
        passed to `np.linalg.cholesky` - confirm against `Network.kernel`).
      log_y: forwarded to `Network`.
      device_name: passed to `tf.device`.
      avg_label: average label, used to choose the initial mean scale.
      **others: accepted and ignored.

    Returns:
      The `Network` instance with `mu`, `Sigma_diag`, `indiv`, `loss`,
      `indiv_se`, `indiv_nll`, `indiv_y`, `bag_y`, `bag_se` and `bag_nll`
      set.

    Raises:
      ValueError: if `kernel` or `initialse` is not a supported value, or if
        `total_size` is None.
    """
    # Fail early with a clear message: previously an unsupported option left
    # a local unbound and surfaced later as a NameError, and a missing
    # total_size only failed at the very end of graph construction.
    if kernel not in ('ard', 'rbf', 'additive'):
        raise ValueError(
            "kernel must be 'ard', 'rbf' or 'additive', got %r" % (kernel,))
    if initialse not in ('identity', 'kernel'):
        raise ValueError(
            "initialse must be 'identity' or 'kernel', got %r" % (initialse,))
    if total_size is None:
        raise ValueError('total_size must be provided to scale the KL term')

    with tf.device(device_name):
        if avg_label - 1.0 < 0:
            # HACK: fix for more general data (average label below 1).
            ard_mat_init_scale = 0.15  # For malaria
            mean_scale = sqrt(avg_label - ard_mat_init_scale * 2.0)
        else:
            mean_scale = sqrt(avg_label - 1.0)  # i.e. predict baseline at start.
            ard_mat_init_scale = 0.5

        net = Network(in_dim, data_type, n_hidden=n_hidden, link=link,
                      kernel=kernel, indiv_bol=indiv_y_bol, dtype=dtype,
                      seed=seed, log_y=log_y,
                      ard_mat_init_scale=ard_mat_init_scale)
        inputs = net.inputs
        params = net.params
        land_size = n_hidden
        cst = partial(tf.cast, dtype=dtype)

        # Model parameters
        z_initializer = tf.zeros_initializer(dtype=dtype)
        o_initializer = tf.ones_initializer(dtype=dtype)

        if initialse == 'identity':
            # Initialise with L = I for safe inversion at start.
            triangle_vec = tf.constant(triangular_vec(None, n=land_size),
                                       dtype=dtype)
        else:  # initialse == 'kernel'
            if kernel == 'additive':
                init_kernel = net.kernel(landmarks, landmarks,
                                         stddev_ard=bw_indiv[:-2],
                                         scale_ard=ard_mat_init_scale,
                                         stddev_mat=bw_indiv[-2:],
                                         scale_mat=ard_mat_init_scale,
                                         tensorf=False)
            else:  # 'rbf' or 'ard'
                init_kernel = net.kernel(landmarks, landmarks,
                                         stddev=bw_indiv, scale=1.0,
                                         tensorf=False)
            L = np.linalg.cholesky(init_kernel)
            triangle_vec = tf.constant(triangular_vec(L, n=land_size),
                                       dtype=dtype)

        params['L'] = tf.Variable(triangle_vec, name='L', dtype=dtype)
        params['mean'] = tf.Variable(
            mean_scale * o_initializer([land_size, 1]),
            name='mean', dtype=dtype)
        params['prior_mean'] = tf.Variable(z_initializer([1]),
                                           name='prior_mean', dtype=dtype)
        # Every supported kernel uses the same log-bandwidth parameterisation.
        params['log_bw'] = tf.Variable(
            tf.log(tf.constant(bw_indiv, dtype=dtype)), name='log_bw_sq')

        n_bags = cst(tf.shape(inputs['sizes'])[0])
        n_indiv = cst(tf.shape(inputs['X'])[0])
        scale = tf.exp(params['log_scale'])
        stddev = tf.exp(params['log_bw'])
        # From here on the landmarks come from the network inputs, not from
        # the (initialisation-only) argument.
        landmarks = inputs['landmarks']

        if kernel in ['ard', 'rbf']:
            k_ww = net.kernel(landmarks, landmarks, stddev=stddev, scale=scale)
            k_wz = net.kernel(landmarks, inputs['X'],
                              stddev=stddev, scale=scale)  # K_wz
            term_0_diag = scale * tf.ones(
                [tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype)  # k_zz diagonal
        else:  # kernel == 'additive'
            scale_mat = tf.exp(params['log_scale_m'])
            k_ww = net.kernel(landmarks, landmarks,
                              stddev_ard=stddev[:-2], scale_ard=scale,
                              stddev_mat=stddev[-2:], scale_mat=scale_mat)
            k_wz = net.kernel(landmarks, inputs['X'],
                              stddev_ard=stddev[:-2], scale_ard=scale,
                              stddev_mat=stddev[-2:], scale_mat=scale_mat)
            term_0_diag = (scale + scale_mat) * tf.ones(
                [tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype)

        chol_k = tf.cholesky(k_ww)
        k_ww_inv = tf.matrix_inverse(k_ww)  # K_ww^-1

        triangular = fill_triangular(params['L'])
        Sigma_u = tf.matmul(triangular,
                            tf.transpose(triangular))  # Sigma_u = L L^T

        k_inv_k_wz = tf.matmul(k_ww_inv, k_wz)  # K_ww^-1 K_wz

        mean_diff = params['mean'] - params['prior_mean']
        # mu_prior + K_zw K_ww^-1 (mu_u - mu_prior)
        net.mu = mu = params['prior_mean'] + tf.squeeze(
            tf.matmul(tf.transpose(k_inv_k_wz), mean_diff))

        # Cumulative bag boundaries: bag k covers rows
        # inputs_int[k] .. inputs_int[k + 1].
        inputs_int = tf.concat(
            [tf.constant([0], tf.int32),
             tf.cumsum(tf.cast(inputs['sizes'], tf.int32))], 0)

        if kernel in ['ard', 'rbf']:
            term_1_vec = tf.map_fn(
                fn=lambda k: term_1_func(
                    net, mu, inputs, stddev, scale, k_wz, Sigma_u,
                    inputs_int[k], inputs_int[k + 1], k_inv_k_wz),
                elems=tf.range(tf.cast(n_bags, dtype=tf.int32)),
                dtype=dtype)
        else:  # kernel == 'additive'
            term_1_vec = tf.map_fn(
                fn=lambda k: term_1_func_additive(
                    net, mu, inputs, stddev, scale, scale_mat, k_wz, Sigma_u,
                    inputs_int[k], inputs_int[k + 1], k_inv_k_wz),
                elems=tf.range(tf.cast(n_bags, dtype=tf.int32)),
                dtype=dtype)

        # We do not do multiple outputs, instead we recompute diag, as
        # multiple outputs is CPU only...
        term_1 = tf.reduce_sum(tf.multiply(term_1_vec, inputs['y']))

        # sum mu^2
        mu_square = tf.multiply(mu, mu)
        # diag is transpose first one, elementwise multiply, sum across rows
        # axis=0
        term_1_diag = tf.reduce_sum(
            tf.multiply(k_wz, k_inv_k_wz), axis=0)  # diag K_zw K_ww^-1 k_wz
        k_zw_k_inv_S = tf.matmul(tf.transpose(k_inv_k_wz),
                                 Sigma_u)  # k_zw K_ww^-1 Sigma_u
        term_2_diag = tf.reduce_sum(
            tf.multiply(tf.transpose(k_zw_k_inv_S), k_inv_k_wz),
            axis=0)  # diagonal as [n_indiv]
        net.Sigma_diag = Sigma_diag = term_0_diag - term_1_diag + term_2_diag
        # E(X^2) is just normal second moment.
        net.indiv = indiv = Sigma_diag + mu_square
        # sum of all pop * (mu_square + sigma_diag)
        term_2 = tf.reduce_sum(tf.multiply(indiv, inputs['indiv_pop']))

        # Term 3: KL between the variational posterior and the prior at the
        # landmarks.
        tfd = tf.contrib.distributions
        mvn_q = tfd.MultivariateNormalTriL(loc=tf.squeeze(params['mean']),
                                           scale_tril=triangular)
        mvn_u = tfd.MultivariateNormalTriL(
            loc=tf.tile(params['prior_mean'], [land_size]),
            scale_tril=chol_k)
        term_3 = tf.distributions.kl_divergence(mvn_q, mvn_u)

        # Stirlings approximation to enable comparison across losses
        # (\sum log (y_j !))
        zeros = tf.zeros_like(inputs['y'])  # tensor of zeros shaped like y
        # boolean tensor, mask[i] = True iff y[i] > 0
        mask = tf.greater(inputs['y'], zeros)
        non_zero_y = tf.boolean_mask(inputs['y'], mask)
        term_4 = tf.reduce_sum(
            tf.multiply(non_zero_y, tf.log(non_zero_y)) - non_zero_y
            + 0.5 * tf.log(2.0 * pi * non_zero_y))

        net.loss = (-1.0 / n_bags * (term_1 - term_2 - term_4)
                    + term_3 / total_size)

        net.indiv_se = net.square_err(inputs['indiv_true_y'], indiv)
        net.indiv_nll = net.nll_term(inputs['indiv_y'], indiv)
        net.indiv_y = indiv_y_pop = tf.multiply(inputs['indiv_pop'], indiv)
        indiv_y_pop = tf.expand_dims(indiv_y_pop, 1)
        net.bag_y = bag_y = tf.squeeze(net.bag_pool(indiv_y_pop))
        net.bag_se = net.square_err(inputs['y'], bag_y, bags=True)
        net.bag_nll = net.nll_term(inputs['y'], bag_y, bags=True)
        # Can add net.print_out
    return net
def _build(self, inp, is_training=True, test_local_stats=False):
    """Applies the batch norm operation to an input tensor

    On the first call the parameter shape (size of the last input dimension)
    and the reduction axes (all but the last dimension) are recorded on the
    module; later calls must use inputs compatible with them.

    Parameters
    ----------
    inp : tf.Tensor
        input tensor for this module
    is_training : bool, optional
        flag to specify whether this is training. If so, batch statistics are
        used and the moving averages are updated
    test_local_stats : bool, optional
        flag to use batch statistics during test time

    Returns
    -------
    tf.Tensor
        normalized tensor

    Raises
    ------
    ValueError
        if the input does not match the parameter shape or the reduction axes
        the module was initialised with
    """
    inp_shape = inp.get_shape().as_list()

    if self.param_shape is None:
        self.param_shape = inp_shape[-1]
    # NOTE: these used to be `assert(cond, 'msg')`, which asserts a non-empty
    # tuple and therefore could never fail. They are real checks now.
    if self.param_shape != inp_shape[-1]:
        raise ValueError('Input shape must match parameter shape - was '
                         'initialised for another shape')

    if self.axis is None:
        self.axis = list(np.arange(len(inp_shape) - 1))
    if len(self.axis) != len(inp_shape) - 1:
        raise ValueError('Input shape must match axis - was initialised for '
                         'another shape')

    use_batch_stats = is_training or test_local_stats

    self._beta = tf.get_variable(
        'beta', self.param_shape, tf.float32,
        initializer=tf.zeros_initializer(),
        collections=self.TRAINABLE_COLLECTIONS) if self.offset else None
    # gamma is governed by `scale` (it was mistakenly gated on `offset`,
    # which created an unwanted gamma or appended None to `self.variables`).
    self._gamma = tf.get_variable(
        'gamma', self.param_shape, tf.float32,
        initializer=tf.ones_initializer(),
        collections=self.TRAINABLE_COLLECTIONS) if self.scale else None

    if self.offset:
        self.variables.append(self._beta)
    if self.scale:
        self.variables.append(self._gamma)

    # Moving statistics, never trained directly.
    self._mm = tf.get_variable(
        'moving_mean', self.param_shape, tf.float32,
        initializer=tf.zeros_initializer(), trainable=False,
        collections=self.MOVING_COLLECTIONS)
    self._mv = tf.get_variable(
        'moving_variance', self.param_shape, tf.float32,
        initializer=tf.ones_initializer(), trainable=False,
        collections=self.MOVING_COLLECTIONS)

    if use_batch_stats:
        mean, variance = tf.nn.moments(inp, self.axis, name='moments')

        # fix for negative variances - see
        # https://github.com/tensorflow/tensorflow/issues/3290
        variance = tf.maximum(variance, tf.constant(0.))

        if is_training:
            update_mean_op = moving_averages.assign_moving_average(
                variable=self._mm, value=mean, decay=self.decay_rate,
                zero_debias=False, name="update_moving_mean").op
            update_variance_op = moving_averages.assign_moving_average(
                variable=self._mv, value=variance, decay=self.decay_rate,
                zero_debias=False, name="update_moving_variance").op

            # Tie the moving-average updates to the statistics so that they
            # run whenever the normalized output is evaluated.
            with tf.control_dependencies([update_mean_op,
                                          update_variance_op]):
                mean = tf.identity(mean)
                variance = tf.identity(variance)
    else:
        mean = tf.identity(self._mm)
        variance = tf.identity(self._mv)

    outp = tf.nn.batch_normalization(inp, mean, variance, self._beta,
                                     self._gamma, self.eps, name="bn")
    return outp
def __init__(self, name=None):
    """Set up the module and its single dense layer.

    Args:
      name: optional module name, forwarded to the parent constructor.
    """
    super(Network, self).__init__(name=name)
    # Three-unit dense layer called 'logits' whose kernel starts at all ones.
    ones_kernel = tf.ones_initializer()
    self._layer = tf.keras.layers.Dense(
        units=3,
        kernel_initializer=ones_kernel,
        name='logits')
def _layer_stack(self,
                 x,
                 layers,
                 encoder_output=None,
                 self_attention_mask=None,
                 encdec_attention_mask=None,
                 losses=None,
                 step_num=None,
                 encdec_tensors=None,
                 states=None):
    """Encoder or decoder stack.

    Each entry of `layers` adds one residual sublayer whose input is first
    normalized; a final normalization (followed by dropout when not in
    incremental mode) is applied to the result. Incremental mode is selected
    by passing `step_num`.

    Args:
      x: a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
      layers: a list of strings; "att", "enc_att", "local_att" and
        "compressed_att" select attention sublayers, anything else is handed
        to `self._feedforward_layer`
      encoder_output: an optional mtf.Tensor with shape
        [<batch_dims>, encoder_length_dim, model_dim]
      self_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, memory_length_dim] containing values 0 or -inf.
      encdec_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, encoder_length_dim] containing values 0 or -inf.
      losses: a list to be appended-to
      step_num: an optional mtf integer Scalar (used in incremental mode)
      encdec_tensors: an optional list of num_layers tuples, each of the form
        (q_var, o_var, k, v), (used in incremental mode)
      states: an optional list of Tensors (used in incremental mode)

    Returns:
      a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]; in
      incremental mode a tuple of that tensor and the list of new states.

    Raises:
      ValueError: if hparams make no sense
    """
    hparams = self._hparams
    incremental = step_num is not None

    def _dropout(tensor):
        # Dropout is bypassed entirely in incremental mode.
        if incremental:
            return tensor
        return mtf.dropout(
            tensor,
            keep_prob=1.0 - hparams.layer_prepostprocess_dropout,
            noise_shape=mtf.Shape(self.batch_dims + [self.model_dim]))

    # One scale vector per sublayer plus one for the final normalization,
    # kept in a single combined variable and consumed front to back.
    norm_count = len(layers) + 1
    norms_dim = mtf.Dimension("layer_norms", norm_count)
    combined_scales = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([norms_dim, self.model_dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    pending_scales = mtf.unstack(combined_scales, norms_dim)

    def _normalize(tensor):
        # Takes the next unused scale; every call consumes exactly one.
        scale = pending_scales.pop(0)
        mean_square = mtf.reduce_mean(
            mtf.square(tensor), reduced_dim=self.model_dim)
        return tensor * mtf.rsqrt(mean_square + hparams.norm_epsilon) * scale

    if incremental:
        states = list(states)
        new_states = []
    tf.logging.info("states = %s" % (states, ))

    for index, kind in enumerate(layers):
        with tf.variable_scope("%s_%d" % (kind, index)):
            if kind == "att":
                # Self attention layer
                normed = _normalize(x)
                if not incremental:
                    x += _dropout(
                        mtf.layers.multihead_attention(
                            normed, None,
                            self_attention_mask, self.kv_dim, self.heads_dim,
                            dropout=hparams.attention_dropout,
                            dropout_broadcast_dims=[self.length_dim],
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="att"))
                else:
                    # Cached keys come first in `states`, then values.
                    cached_k = states.pop(0)
                    cached_v = states.pop(0)
                    y, new_k, new_v = (
                        mtf.layers.multihead_self_attention_incremental(
                            normed,
                            prev_k=cached_k,
                            prev_v=cached_v,
                            step_num=step_num,
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="att"))
                    new_states.extend([new_k, new_v])
                    x += y
            elif kind == "enc_att":
                # Encoder-Decoder attention layer
                if not incremental:
                    x += _dropout(
                        mtf.layers.multihead_attention(
                            _normalize(x), encoder_output,
                            encdec_attention_mask, self.kv_dim,
                            self.heads_dim,
                            dropout=hparams.attention_dropout,
                            dropout_broadcast_dims=[self.length_dim],
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="enc_att"))
                else:
                    q_var, o_var, k, v = encdec_tensors[index]
                    x += mtf.layers.multihead_encdec_attention_incremental(
                        _normalize(x), q_var, o_var, k, v,
                        encdec_attention_mask, name="enc_att")
            elif kind == "local_att":
                normed = _normalize(x)
                if not incremental:
                    split_length = mtf.tensor_dim_to_size_per_split(
                        hparams.layout, hparams.mesh_shape,
                        self.max_length_dim)
                    x += _dropout(
                        mtf.layers.masked_local_attention_1d(
                            normed,
                            self.kv_dim, self.heads_dim,
                            window_size=hparams.local_attention_window_size,
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            length_per_split=split_length,
                            name="local_att"))
                else:
                    cached_k = states.pop(0)
                    cached_v = states.pop(0)
                    y, new_k, new_v = (
                        mtf.layers.masked_local_attention_1d_incremental(
                            normed,
                            prev_k=cached_k,
                            prev_v=cached_v,
                            step_num=step_num,
                            master_dtype=self.master_dtype,
                            slice_dtype=self.slice_dtype,
                            name="local_att"))
                    new_states.extend([new_k, new_v])
                    x += y
            elif kind == "compressed_att":
                if incremental:
                    raise ValueError(
                        "compressed_att incremental not implemented")
                x += _dropout(
                    mtf.layers.multihead_self_attention_memory_compressed(
                        _normalize(x),
                        mask_right=True,
                        compression_factor=hparams.compression_factor,
                        kv_channels=self.kv_dim,
                        heads=self.heads_dim,
                        dropout=hparams.attention_dropout,
                        dropout_broadcast_dims=[self.length_dim],
                        master_dtype=self.master_dtype,
                        slice_dtype=self.slice_dtype,
                        name="compressed_att"))
            else:
                if incremental:
                    # insert length dimension.
                    original_shape = x.shape
                    with_length = mtf.Shape(
                        original_shape.dims[:-1]
                        + [mtf.Dimension("length", 1)]
                        + original_shape.dims[-1:])
                    x = mtf.reshape(x, with_length)
                # ffn layer
                x += _dropout(
                    self._feedforward_layer(
                        _normalize(x), kind, losses=losses))
                if incremental:
                    # remove length dimension
                    x = mtf.reshape(x, original_shape)

    x = _dropout(_normalize(x))
    # Every allocated scale must have been consumed by now.
    assert not pending_scales
    return (x, new_states) if incremental else x
def _decoder_layer_stack_incremental(self,
                                     x,
                                     step_num,
                                     encdec_tensors,
                                     self_attention_k,
                                     self_attention_v,
                                     encdec_attention_mask=None):
    """Decoder layer stack during inference.

    Only one position is processed per call. Self-attention keys and values
    for earlier positions are passed in, and the updated keys and values are
    returned alongside the decoder output.

    When an encoder is present, `encdec_tensors` carries, per layer, the keys
    and values for encoder-decoder attention together with the weight
    matrices q_var and o_var.

    Args:
      x: a mtf.Tensor with shape [<batch_dims>, model_dim]
      step_num: an mtf integer Scalar
      encdec_tensors: an optional list of num_layers tuples, each of the form
        (q_var, o_var, k, v)
      self_attention_k: an optional list of num_layers Tensors each with shape
        [batch, heads, memory_length, kv_channels]
      self_attention_v: an optional list of num_layers Tensors each with shape
        [batch, heads, memory_length, kv_channels]
      encdec_attention_mask: an optional mtf.Tensor with shape
        [batch, length_dim, encoder_length_dim] containing values 0 or -inf.

    Returns:
      y: a mtf.Tensor with shape [<batch_dims>, model_dim]
      new_self_attention_k: a list of num_layers mtf.Tensors, with the same
        shapes as the elements of self_attention_k
      new_self_attention_v: a list of num_layers mtf.Tensors, with the same
        shapes as the elements of self_attention_v

    Raises:
      ValueError: if hparams make no sense
    """
    hparams = self._hparams
    num_layers = hparams.num_decoder_layers
    has_encdec = encdec_tensors is not None

    # Each layer normalizes before self-attention and before the ffn, plus
    # before encoder-decoder attention when present; one more at the end.
    norms_per_layer = 3 if has_encdec else 2
    norm_count = num_layers * norms_per_layer + 1
    norms_dim = mtf.Dimension("layer_norms", norm_count)
    combined_scales = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([norms_dim, self.model_dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    pending_scales = mtf.unstack(combined_scales, norms_dim)

    def _normalize(tensor):
        # Takes the next unused scale; every call consumes exactly one.
        scale = pending_scales.pop(0)
        mean_square = mtf.reduce_mean(
            mtf.square(tensor), reduced_dim=self.model_dim)
        return tensor * mtf.rsqrt(mean_square + hparams.norm_epsilon) * scale

    updated_k = []
    updated_v = []
    for layer_index in range(num_layers):
        with tf.variable_scope("layer_%d" % layer_index):
            # Self attention layer
            attended, layer_k, layer_v = (
                mtf_layers.multihead_self_attention_incremental(
                    _normalize(x),
                    prev_k=self_attention_k[layer_index],
                    prev_v=self_attention_v[layer_index],
                    step_num=step_num,
                    name="self_attention"))
            updated_k.append(layer_k)
            updated_v.append(layer_v)
            x += attended
            if has_encdec:
                # Encoder-Decoder attention layer
                q_var, o_var, k, v = encdec_tensors[layer_index]
                x += mtf_layers.multihead_encdec_attention_incremental(
                    _normalize(x), q_var, o_var, k, v,
                    encdec_attention_mask, name="encdec_attention")
            # ffn layer
            x += self._feedforward_layer(_normalize(x), hparams)

    x = _normalize(x)
    # Every allocated scale must have been consumed by now.
    assert not pending_scales
    return x, updated_k, updated_v
def __init__(self, config, batch_ops, is_train=True):
    """Build the layer-normalized analogy scoring graph.

    The images are encoded into latent codes for A, B, C, D and the foils.
    Each code is normalized along axis 1 and passed through a shared
    scale/shift. [A, B, C, candidate] is then scored for the true D and for
    every foil, and a cross-entropy loss over the candidate scores is built.

    Sets `self.train_loss` and `self.all_out` (accuracy in percent).

    Args:
      config: configuration object; `config.batch_size` is read.
      batch_ops: dict with 'imgs', 'ABCD' and 'not_D' entries.
      is_train: not used in this constructor.
    """
    # Model name
    model_name = 'layer_norm'

    # Model inputs
    imgs = batch_ops['imgs']
    ABCD = tf.cast(batch_ops['ABCD'], dtype=tf.int64)
    not_D = tf.cast(batch_ops['not_D'], dtype=tf.int64)

    # Dimensions
    batch_size = int(config.batch_size)
    N_foils = int(not_D.shape[1])

    # Get latent codes for all images
    A_latent, B_latent, C_latent, D_latent, all_foil_latent = \
        encode_analogy_objs(imgs, ABCD, not_D)
    N_latent = int(A_latent.shape[1])

    # Small constant (for avoiding division by zero)
    eps = 1e-8

    def _mean_and_sd(latent):
        # Statistics taken along axis 1 of the latent code.
        mean, var = tf.nn.moments(latent, 1)
        return mean, tf.sqrt(var + eps)

    # Normalization parameters for A, B, C, D (in that order)
    abcd_latents = (A_latent, B_latent, C_latent, D_latent)
    abcd_stats = [_mean_and_sd(latent) for latent in abcd_latents]

    # Scale and shift parameters
    with tf.variable_scope('norm_params', reuse=tf.AUTO_REUSE):
        scale = tf.get_variable('scale', N_latent,
                                initializer=tf.ones_initializer())
        shift = tf.get_variable('shift', N_latent,
                                initializer=tf.zeros_initializer())

    def _scale_shift(latent, mean, sd):
        # Standardize, then apply the shared learned scale and shift.
        standardized = ((latent - tf.expand_dims(mean, 1))
                        / tf.expand_dims(sd, 1))
        return (standardized * scale) + shift

    # Normalize
    A_layer_norm, B_layer_norm, C_layer_norm, D_layer_norm = [
        _scale_shift(latent, mean, sd)
        for latent, (mean, sd) in zip(abcd_latents, abcd_stats)]

    # [A, B, C, D] -> LSTM
    log.info('[A,B,C,D] -> LSTM...')
    D_score = scoring_model(A_layer_norm, B_layer_norm, C_layer_norm,
                            D_layer_norm)

    # [A, B, C, foils] -> LSTM
    log.info('[A,B,C,foils] -> LSTM...')
    foil_scores = []
    for foil in range(N_foils):
        # Extract, normalize and score the latent rep for this foil
        this_foil_latent = all_foil_latent[:, foil, :]
        foil_mean, foil_sd = _mean_and_sd(this_foil_latent)
        foil_layer_norm = _scale_shift(this_foil_latent, foil_mean, foil_sd)
        foil_scores.append(
            scoring_model(A_layer_norm, B_layer_norm, C_layer_norm,
                          foil_layer_norm))

    # Concatenate all scores
    all_foil_score = tf.concat(foil_scores, axis=1)
    all_scores = tf.concat([D_score, all_foil_score], axis=1)
    all_scores_softmax = tf.nn.softmax(all_scores)

    # Loss
    log.info("Loss (cross-entropy over candidate scores)...")
    targets = tf.concat([tf.ones(D_score.shape),
                         tf.zeros(all_foil_score.shape)], axis=1)
    self.train_loss, accuracy, _correct_preds = build_cross_entropy_loss(
        all_scores, targets)
    accuracy = accuracy * 100.0

    # Model outputs
    self.all_out = {
        'accuracy': accuracy}
def ones_init():
    """Return a new `tf.ones_initializer` instance."""
    initializer = tf.ones_initializer()
    return initializer
def __init__(
        self,
        inducing_inputs,
        cov_func,
        inf_func,
        # mean_func=mean.ZeroOffset(),
        lik_func,
        num_components=1,
        diag_post=False,
        inducing_outputs=None):
    """Create the raw parameters, placeholders and inference graph.

    Args:
      inducing_inputs: ndarray
          Initial inducing input locations, num_inducing * input_dim. A 2-D
          array is repeated once per latent function; otherwise it is used
          as given.
      cov_func:
          Kernel object; it must provide `num_latent_functions()` and
          `get_params()`.
      inf_func: inf.Inference
          Inference object whose `inference(...)` method builds the
          objective and the predictions.
      lik_func:
          Likelihood object representing p(y|f); it must provide
          `get_params()`.
      num_components: int
          The number of mixture of Gaussian components.
      diag_post: bool
          True if the mixture of Gaussians uses a diagonal covariance,
          False otherwise.
      inducing_outputs:
          Optional inducing outputs. When given they are passed to the
          inference as a float32 constant, otherwise 0 is passed.
    """
    self.inf = inf_func
    assert isinstance(self.inf, inf.Inference)

    num_latent = cov_func.num_latent_functions()

    # Without individually specified inputs per process, share the same
    # inducing inputs across all latent processes.
    if inducing_inputs.ndim == 2:
        inducing_inputs = np.tile(inducing_inputs[np.newaxis, ...],
                                  [num_latent, 1, 1])

    # Model dimension constants.
    num_inducing = inducing_inputs.shape[-2]
    self.input_dim = inducing_inputs.shape[-1]

    # Parameters that are optimized directly in raw form. Some of them get
    # transformed internally to maintain certain pre-conditions.
    self.raw_weights = tf.get_variable(
        "raw_weights", [num_components],
        initializer=tf.zeros_initializer())
    self.raw_means = tf.get_variable(
        "raw_means", [num_components, num_latent, num_inducing],
        initializer=tf.zeros_initializer())

    # Diagonal posterior: one entry per inducing point, starting at ones.
    # Full posterior: a triangular vector per latent function, starting at
    # zeros.
    if diag_post:
        covars_shape = [num_components, num_latent, num_inducing]
        covars_init = tf.ones_initializer()
    else:
        covars_shape = ([num_components, num_latent]
                        + util.tri_vec_shape(num_inducing))
        covars_init = tf.zeros_initializer()
    self.raw_covars = tf.get_variable(
        "raw_covars", covars_shape, initializer=covars_init)

    self.raw_inducing_inputs = tf.get_variable(
        "raw_inducing_inputs",
        initializer=tf.constant(inducing_inputs, dtype=tf.float32))
    self.raw_likelihood_params = lik_func.get_params()
    self.raw_kernel_params = cov_func.get_params()

    if inducing_outputs is None:
        raw_inducing_outputs = 0
    else:
        raw_inducing_outputs = tf.constant(inducing_outputs,
                                           dtype=tf.float32)

    # Placeholders for training and predicting.
    self.num_train = tf.placeholder(tf.float32, shape=[], name="num_train")
    self.train_inputs = tf.placeholder(
        tf.float32, shape=[None, self.input_dim], name="train_inputs")
    self.train_outputs = tf.placeholder(
        tf.float32, shape=[None, None], name="train_outputs")
    self.test_inputs = tf.placeholder(
        tf.float32, shape=[None, self.input_dim], name="test_inputs")

    # Build the computational graph.
    self.nelbo, self.loo_loss, self.predictions = self.inf.inference(
        self.raw_weights,
        self.raw_means,
        self.raw_covars,
        self.raw_inducing_inputs,
        self.train_inputs,
        self.train_outputs,
        self.num_train,
        self.test_inputs,
        raw_inducing_outputs)

    # TensorFlow bookkeeping.
    self.session = tf.Session()
    self.optimizer = None
    self.train_step = None
def build(self, _):
    """Create the layer-norm scale and bias variables.

    Both variables have shape [self.hidden_size]; the scale starts at ones
    and the bias at zeros. The positional argument is ignored.
    """
    param_shape = [self.hidden_size]
    self.scale = tf.get_variable(
        "layer_norm_scale", param_shape,
        initializer=tf.ones_initializer())
    self.bias = tf.get_variable(
        "layer_norm_bias", param_shape,
        initializer=tf.zeros_initializer())
    self.built = True
def emb_score(config, input_tensor, input_ids, output_weights, input_mask, **kargs):
    """Score a token sequence with an energy-based head on encoder states.

    Args:
        config: model config read both through ``config.get(...)`` and through
            attributes (``hidden_size``, ``hidden_act``, ``initializer_range``,
            ``vocab_size``, ``max_position_embeddings``).
        input_tensor: rank-3 tensor (checked via ``expected_rank=3``);
            presumably [batch, seq, hidden].
        input_ids: rank-2 ids (one-hot encoded here) or a rank-3 tensor that
            is used directly as per-token vocabulary weights.
        output_weights: matrix contracted with the transformed states in the
            ``"mi"`` pooling mode (``"abc,dc->abd"``).
        input_mask: rank-2 mask; its row sums are used as sequence lengths.
        **kargs: options read with ``kargs.get``:
            scope: optional prefix for the variable scopes.
            energy_pooling: ``"mi"`` (default), ``"cls"`` or
                ``"mean_pooling"``.
            normalized_constant: ``"zero_constant"``, ``"one_constant"``,
                ``"constant_constant"``, ``"log9_constant"``,
                ``"logv_constant"``, ``"logv_constant_ln"`` or
                ``"length_linear"`` (used when the key is absent).
            softplus_features: apply ``softplus(-x)`` to the ``"mi"`` features.
            transform: project the pooled features (default True).
            transformer_activation: ``"softplus"``, ``"linear"`` or anything
                else for the per-length ReLU head.
            prob_ln: divide the energy by the number of unmasked tokens after
                the first position.
            use_tpu: skip the scalar summary when True.
            logz_mode: ``"default"``, ``"standard"``, ``"standard_minus"``,
                ``"constant"``, ``"self_normalizing"``, ``"none"``; any other
                value selects the length-scaled variant.

    Returns:
        The per-example logits tensor.

    Raises:
        ValueError: for an unsupported ``normalized_constant`` or
            ``energy_pooling`` value. These previously failed later with a
            NameError on an unbound local.
    """
    # Called only for the rank check requested through expected_rank.
    bert_utils.get_shape_list(input_tensor, expected_rank=3)

    scope = kargs.get('scope', None)
    energy_pooling = kargs.get("energy_pooling", "mi")
    # When the key is absent the original if/elif chain fell through to the
    # 'length_linear' branch, so that is the effective default.
    normalized_constant_mode = kargs.get("normalized_constant", "length_linear")
    logz_mode = kargs.get("logz_mode", "default")
    ln_type = config.get('ln_type', 'postln')

    if scope:
        lm_scope = scope + '/' + 'cls/predictions'
    else:
        lm_scope = 'cls/predictions'

    tf.logging.info("**** mlm generator scope **** %s", str(lm_scope))

    with tf.variable_scope(lm_scope, reuse=tf.AUTO_REUSE):
        # Pre-LN normalises before the transform, every other setting after.
        if ln_type == 'preln':
            input_tensor = bert_modules.layer_norm(input_tensor)

        if config.get("embedding", "none_factorized") == "none_factorized":
            projection_width = config.hidden_size
            tf.logging.info("==not using embedding factorized==")
        else:
            projection_width = config.get('embedding_size', config.hidden_size)
            tf.logging.info("==using embedding factorized: embedding size: %s==",
                            str(projection_width))

        if energy_pooling == "mi":
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=projection_width,
                    activation=bert_modules.get_activation(config.hidden_act),
                    kernel_initializer=bert_modules.create_initializer(
                        config.initializer_range))
                if ln_type != 'preln':
                    input_tensor = bert_modules.layer_norm(input_tensor)
            output_bias = tf.get_variable(
                "output_bias",
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer())
            tf.logging.info("****** mi using mlm transform *******")
        elif energy_pooling == "cls":
            with tf.variable_scope("transform_ebm"):
                # "Pool" the model by taking the hidden state of the first
                # token. We assume that this has been pre-trained.
                first_token_tensor = tf.squeeze(input_tensor[:, 0:1, :], axis=1)
                input_tensor = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=bert_modules.create_initializer(
                        config.initializer_range))
            tf.logging.info("****** using cls pooling *******")
        else:
            with tf.variable_scope("transform_ebm"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=projection_width,
                    activation=tf.tanh,
                    kernel_initializer=bert_modules.create_initializer(
                        config.initializer_range))
            tf.logging.info("****** using other pooling transform *******")

    if scope:
        ebm_scope = scope + '/' + 'ebm/predictions'
    else:
        ebm_scope = 'ebm/predictions'

    tf.logging.info("**** ebm generator scope **** %s", str(ebm_scope))
    print(input_tensor.get_shape(), "==input_tensor shape==")

    with tf.variable_scope(ebm_scope, reuse=tf.AUTO_REUSE):
        # Assume the whole model is self-normalizing: log Z is a variable.
        input_normalized_constant = _build_log_normalizer(
            config, input_mask, normalized_constant_mode)

        if energy_pooling == "mean_pooling":
            tf.logging.info("==apply mean pooling to get hidden states projections==")
            # For the token sequence "<start> a b c" the energy is computed on
            # a, b, c only; <start> cannot contribute to the final energy.
            tail_mask = tf.cast(input_mask[:, 1:], tf.float32)
            pool_features = tf.einsum("abc,ab->ac", input_tensor[:, 1:], tail_mask)
            pool_features /= (1e-10+tf.reduce_sum(
                tf.cast(input_mask[:, 1:], tf.float32), axis=1, keepdims=True))
            print(pool_features.get_shape(), "===pool_features shape===")
        elif energy_pooling == "mi":
            tf.logging.info("==apply mi to get hidden states projections==")
            # Vocabulary logits for every position.
            logits = tf.einsum("abc,dc->abd", input_tensor, output_weights)
            logits = tf.nn.bias_add(logits, output_bias)

            input_id_shape = bert_utils.get_shape_list(input_ids, [2, 3])
            if len(input_id_shape) == 2:
                onehot_input_ids = tf.cast(
                    tf.one_hot(tf.cast(input_ids, tf.int32), config.vocab_size),
                    tf.float32)
                input_ori_ids = tf.cast(onehot_input_ids, tf.float32)
                print("==input ori ids shape== 2-dim", input_ori_ids.get_shape())
            else:
                input_ori_ids = tf.cast(input_ids, tf.float32)
                print("==input ori ids shape== 3-dim", input_ori_ids.get_shape())

            # Keep only the logit of the observed token at each position.
            logits = tf.einsum("abd,abd->ab", logits, input_ori_ids)
            print(logits.get_shape(), "==pooled logits shape==")
            pool_features = tf.reduce_sum(
                logits[:, 1:]*tf.cast(input_mask[:, 1:], tf.float32), axis=1)
            pool_features = tf.expand_dims(pool_features, axis=-1)
            print(pool_features.get_shape(), "==pooled feature shape==")

            if kargs.get("softplus_features", False):
                # softplus(-x) tends to 0 as x grows and to +inf as x tends
                # to -inf.
                pool_features = tf.nn.softplus(-pool_features)
                tf.logging.info("****** apply softplus transformation for pooled_features *******")
        elif energy_pooling == "cls":
            with tf.variable_scope("transform"):
                pool_features = tf.layers.dense(
                    input_tensor,
                    units=1,
                    use_bias=False,
                    activation=None
                )
            tf.logging.info("****** apply linear transformation for pooled_features *******")
        else:
            # No pooling rule exists for this value; fail with a clear message
            # instead of an unbound-local error further down.
            raise ValueError(
                "unsupported energy_pooling: %r" % (energy_pooling,))

        if kargs.get('transform', True):
            transformer_activation = kargs.get("transformer_activation", "none")
            if transformer_activation == 'softplus':
                with tf.variable_scope("transform"):
                    # softplus keeps the scalar in [0, inf).
                    ebm_scalar = tf.layers.dense(
                        pool_features,
                        units=1,
                        use_bias=True,
                        activation=tf.nn.softplus
                    )
                tf.logging.info("****** apply softplus *******")
            elif transformer_activation == 'linear':
                tf.logging.info("****** apply linear projection *******")
                with tf.variable_scope("transform"):
                    ebm_scalar = tf.layers.dense(
                        pool_features,
                        units=1,
                        use_bias=True,
                        activation=None
                    )
            else:
                with tf.variable_scope("transform"):
                    feature_shape = bert_utils.get_shape_list(
                        pool_features, expected_rank=[1, 2])
                    pool_features = tf.layers.dense(
                        pool_features,
                        units=feature_shape[-1],
                        activation=tf.nn.relu,
                    )
                    # These intentionally rebind the names: from here on they
                    # are the per-length head, not the vocabulary projection.
                    output_weights = tf.get_variable(
                        "output_weights",
                        [config.max_position_embeddings, feature_shape[-1]],
                        initializer=tf.truncated_normal_initializer(stddev=0.02))
                    output_bias = tf.get_variable(
                        "output_bias",
                        [config.max_position_embeddings],
                        initializer=tf.constant_initializer(
                            -np.log(np.arange(config.max_position_embeddings).astype(np.float32)+1.0),
                            dtype=tf.float32)
                    )
                    # One score per candidate length, then select the entry of
                    # the actual length.
                    ebm_scalar_pos = tf.nn.relu(
                        tf.matmul(pool_features, output_weights, transpose_b=True)) + output_bias
                    pos_tensor = tf.cast(
                        tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1),
                        tf.int32)
                    onehot_pos = tf.cast(
                        tf.one_hot(tf.cast(pos_tensor, tf.int32),
                                   config.max_position_embeddings),
                        tf.float32)
                    ebm_scalar = tf.einsum("ab,ab->a", ebm_scalar_pos, onehot_pos)
                    ebm_scalar = tf.expand_dims(ebm_scalar, axis=-1)
                tf.logging.info("****** apply linear projection *******")
            print("===ebm_scalar====", ebm_scalar.get_shape())

            ebm_scalar = tf.squeeze(ebm_scalar, axis=-1)
            print("===ebm_scalar====", ebm_scalar.get_shape())
            print("===ebm_scalar====", ebm_scalar.get_shape())
            print("===input_normalized_constant====", input_normalized_constant.get_shape())
        else:
            ebm_scalar = tf.squeeze(pool_features, axis=-1)
            print("===ebm_scalar====", ebm_scalar.get_shape())
            print("===input_normalized_constant====", input_normalized_constant.get_shape())

        if not kargs.get("prob_ln", False):
            tf.logging.info("****** sum of plogprob as sentence probability *******")
        else:
            ebm_scalar /= (1e-10+tf.reduce_sum(
                tf.cast(input_mask[:, 1:], tf.float32), axis=-1))
            tf.logging.info("****** sum of plogprob with length normalization as sentence probability *******")
        print("===ebm_scalar====", ebm_scalar.get_shape())
        print("===input_normalized_constant====", input_normalized_constant.get_shape())

        # Original EBM log-likelihood: log(exp(-E(x)) / Z) = -E(x) - log(Z).
        if not kargs.get("use_tpu", False):
            tf.summary.scalar('ebm_scalar', tf.reduce_mean(ebm_scalar))

        logits = _combine_energy_and_logz(
            ebm_scalar, input_normalized_constant, input_mask, logz_mode)

    print("=ebm logits shape==", logits.get_shape())
    return logits


def _length_one_hot(input_mask, depth):
    """Return a float32 one-hot encoding (width `depth`) of each row's mask sum."""
    # NOTE(review): tf.one_hot yields an all-zero row for an index >= depth,
    # so a row whose mask sum equals `depth` selects nothing -- confirm that
    # sequences never reach max_position_embeddings unmasked tokens.
    valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32)
    return tf.cast(tf.one_hot(valid_seq_length, depth), tf.float32)


def _per_length_initializer(config, mode):
    """Return the initializer of the constant per-length log-Z table, or None."""
    table_len = config.max_position_embeddings
    if mode == 'zero_constant':
        return tf.zeros_initializer()
    if mode == 'one_constant':
        return tf.ones_initializer()
    if mode == 'constant_constant':
        return tf.constant_initializer(np.ones((table_len))*200.0, tf.float32)
    if mode == 'log9_constant':
        return tf.constant_initializer(np.ones((table_len))*np.log(9.0), tf.float32)
    if mode == 'logv_constant':
        return tf.constant_initializer(
            np.ones((table_len))*np.log(config.vocab_size), tf.float32)
    return None


def _build_log_normalizer(config, input_mask, mode):
    """Create the log-Z variables for `mode` and return the per-example value."""
    table_len = config.max_position_embeddings

    initializer = _per_length_initializer(config, mode)
    if initializer is not None:
        normalized_constant = tf.get_variable(
            "ebm_normalized_constant",
            shape=[table_len],
            initializer=initializer)
        tf.logging.info("****** %s logz *******", mode)
        return tf.einsum("ab,b->a",
                         _length_one_hot(input_mask, table_len),
                         normalized_constant)

    if mode == 'logv_constant_ln':
        # A single scalar shared by every sequence length.
        return tf.get_variable(
            "ebm_normalized_constant",
            shape=[],
            initializer=tf.constant_initializer(np.log(config.vocab_size), tf.float32))

    if mode == 'length_linear':
        # Fixed table 1..N, scaled and shifted by trainable per-length terms.
        normalized_constant = tf.get_variable(
            "ebm_normalized_constant",
            shape=[table_len],
            initializer=tf.constant_initializer(np.arange((table_len))+1, tf.float32),
            trainable=False)
        scale_weights = tf.get_variable(
            "ebm_normalized_constant_scale",
            shape=[table_len],
            initializer=tf.constant_initializer(
                np.log(config.vocab_size)*np.ones((table_len)), dtype=tf.float32),
            trainable=True)
        scale_bias = tf.get_variable(
            "ebm_normalized_constant_bias",
            shape=[table_len],
            initializer=tf.zeros_initializer(),
            trainable=True)
        tf.logging.info("****** length linear logz *******")

        onehot_length_ids = _length_one_hot(input_mask, table_len)
        length_part = tf.einsum("ab,b->a", onehot_length_ids, normalized_constant)
        length_scale_part = tf.einsum("ab,b->a", onehot_length_ids, scale_weights)
        length_bias_part = tf.einsum("ab,b->a", onehot_length_ids, scale_bias)
        return length_part*length_scale_part + length_bias_part

    raise ValueError("unsupported normalized_constant: %r" % (mode,))


def _combine_energy_and_logz(ebm_scalar, input_normalized_constant, input_mask, mode):
    """Combine the energy and log-Z terms into logits according to `mode`."""
    if mode == 'default':
        tf.logging.info("****** default logz *******")
        return (-ebm_scalar - input_normalized_constant
                - tf.log(1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1)))
    if mode == 'standard':
        tf.logging.info("****** standard logz *******")
        return ebm_scalar - input_normalized_constant
    if mode == 'standard_minus':
        tf.logging.info("****** minus standard logz *******")
        return -ebm_scalar - input_normalized_constant
    if mode == 'constant':
        tf.logging.info("****** constant logz *******")
        return -ebm_scalar - tf.log(
            1e-10+tf.reduce_sum(tf.cast(input_mask, tf.float32), axis=-1))
    if mode == 'self_normalizing':
        tf.logging.info("****** self_normalizing *******")
        return -ebm_scalar
    if mode == 'none':
        tf.logging.info("****** none logz *******")
        return ebm_scalar

    # Any other value: log Z scaled by the number of unmasked tokens.
    tf.logging.info("****** linear logz *******")
    return ebm_scalar - input_normalized_constant * tf.reduce_sum(
        tf.cast(input_mask, tf.float32), axis=-1)
def BatchNorm3d(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
                center=True, scale=True,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                virtual_batch_size=None,
                data_format='channels_last',
                internal_update=False,
                sync_statistics=None):
    """Batch normalization that also handles 5-D inputs.

    Args:
        inputs: tensor with a static rank. When `axis` is None the channel
            axis is 1 for rank 2; for ranks 4 and 5 it is 1 for 'NCHW' and
            the last axis otherwise.
        axis: explicit channel axis; overrides `data_format` when given.
        training: defaults to ``ctx.is_training`` of the current tower
            context.
        momentum, epsilon, center, scale, beta_initializer,
        gamma_initializer, virtual_batch_size: forwarded to
            ``tf.layers.BatchNormalization`` on the non-synchronised path.
        data_format: normalised with ``get_data_format(..., tfmode=False)``.
        internal_update: if True the moving-average updates run as control
            dependencies of the output instead of being left in
            ``tf.GraphKeys.UPDATE_OPS``.
        sync_statistics: None, 'nccl' or 'horovod' (case-insensitive). Only
            used while ``training and ctx.is_training``; the batch E[x] and
            E[x^2] are then averaged across devices.

    Returns:
        The normalized tensor (named 'output'), with a ``variables``
        attribute holding the moving statistics and, when enabled,
        gamma/beta.
    """
    data_format = get_data_format(data_format, tfmode=False)
    shape = inputs.get_shape().as_list()
    ndims = len(shape)
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is None:
        if ndims == 2:
            data_format = 'NHWC'
            axis = 1
        elif ndims == 5:
            axis = 1 if data_format == 'NCHW' else 4
        else:
            axis = 1 if data_format == 'NCHW' else 3
    else:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    num_chan = shape[axis]

    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)

    # The helper's name says it returns a tuple, so compare with tuples.
    # The previous float comparisons (e.g. ``>= 1.5``) raise TypeError for a
    # tuple on Python 3.
    TF_version = get_tf_version_tuple()
    if not training and ctx.is_training:
        assert TF_version >= (1, 4)
        if ctx.is_main_training_tower:
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")

    if sync_statistics is None or not (training and ctx.is_training):
        # Snapshot UPDATE_OPS so that the updates added by the layer can be
        # dropped again below when they must not be collected.
        coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS])
        with rename_get_variable(
                {'moving_mean': 'mean/EMA',
                 'moving_variance': 'variance/EMA'}):
            tf_args = dict(
                axis=axis,
                momentum=momentum, epsilon=epsilon,
                center=center, scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                # Fused batch norm is a 4-D kernel; request it only for 4-D
                # inputs, matching the sibling BatchNorm layer.
                fused=(ndims == 4 and axis in [1, 3]),
                _reuse=tf.get_variable_scope().reuse)
            if TF_version >= (1, 5):
                tf_args['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None
            layer = tf.layers.BatchNormalization(**tf_args)
            xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope())

        # Maintain EMA only on one GPU.
        if ctx.is_main_training_tower:
            for v in layer.non_trainable_variables:
                add_model_variable(v)
        if not ctx.is_main_training_tower or internal_update:
            restore_collection(coll_bk)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(xn, name='output')
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # backward-compatibility
        if scale:
            vh.gamma = layer.gamma
        if center:
            vh.beta = layer.beta
    else:
        # Reduce over every axis except the channel axis.
        red_axis = [0] if ndims == 2 else ([0, 2, 3] if axis == 1 else [0, 1, 2])
        if ndims == 5:
            red_axis = [0, 2, 3, 4] if axis == 1 else [0, 1, 2, 3]

        # Channel-first statistics must be reshaped to broadcast correctly.
        new_shape = None
        if ndims == 4 and axis == 1:
            new_shape = [1, num_chan, 1, 1]
        if ndims == 5 and axis == 1:
            new_shape = [1, num_chan, 1, 1, 1]

        batch_mean = tf.reduce_mean(inputs, axis=red_axis)
        batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis)

        if sync_statistics == 'nccl':
            if six.PY3 and TF_version <= (1, 8) and ctx.is_main_training_tower:
                logger.warn("A TensorFlow bug cusing cross-GPU BatchNorm to fail")

            from tensorflow.contrib.nccl.ops import gen_nccl_ops
            shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
            num_dev = ctx.total
            batch_mean = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev)
            batch_mean_square = gen_nccl_ops.nccl_all_reduce(
                input=batch_mean_square,
                reduction='sum',
                num_devices=num_dev,
                shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev)
        elif sync_statistics == 'horovod':
            import horovod.tensorflow as hvd
            batch_mean = hvd.allreduce(batch_mean, average=True)
            batch_mean_square = hvd.allreduce(batch_mean_square, average=True)

        # Var[x] = E[x^2] - E[x]^2
        batch_var = batch_mean_square - tf.square(batch_mean)
        # Keep the un-reshaped vectors for the moving-average update.
        batch_mean_vec = batch_mean
        batch_var_vec = batch_var

        beta, gamma, moving_mean, moving_var = get_bn_variables(
            num_chan, scale, center, beta_initializer, gamma_initializer)
        if new_shape is not None:
            batch_mean = tf.reshape(batch_mean, new_shape)
            batch_var = tf.reshape(batch_var, new_shape)
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                tf.reshape(beta, new_shape),
                tf.reshape(gamma, new_shape), epsilon)
        else:
            xn = tf.nn.batch_normalization(
                inputs, batch_mean, batch_var,
                beta, gamma, epsilon)

        if ctx.is_main_training_tower:
            ret = update_bn_ema(
                xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
                momentum, internal_update)
        else:
            ret = tf.identity(xn, name='output')

        vh = ret.variables = VariableHolder(
            moving_mean=moving_mean,
            mean=moving_mean,
            moving_variance=moving_var,
            variance=moving_var)
        if scale:
            vh.gamma = gamma
        if center:
            vh.beta = beta
    return ret
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) report = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1) """## ONES""" i_layer = Input(shape=input_shape) h_layer = Conv2D(64, (3, 3), strides=2, activation='relu', kernel_initializer=tf.ones_initializer())(i_layer) h_layer = Flatten()(h_layer) h_layer = Dropout(0.4)(h_layer) h_layer = Dense(128, activation='relu', kernel_initializer=tf.ones_initializer())(h_layer) h_layer = Dropout(0.4)(h_layer) o_layer = Dense(classes, activation='softmax')(h_layer) model = Model(i_layer, o_layer) model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) report = model.fit(X_train,
def add_logits_op(self, data_batch, size_batch, reuse=False):
    """Build the stacked-LSTM language model and return its vocabulary logits.

    Args:
        data_batch: token batch handed to ``self.add_embeddings_op``; its
            first two dynamic dimensions are used as batch size and maximum
            length.
        size_batch: per-example sequence lengths passed to
            ``tf.nn.dynamic_rnn``.
        reuse: forwarded to the enclosing ``'logits'`` variable scope.

    Returns:
        A pair ``(logits, final_state)``: logits are reshaped to
        ``[batch, max_len, vocab_size]`` and ``final_state`` is the RNN state
        returned by ``tf.nn.dynamic_rnn``.

    Side effects:
        Sets ``self.cell`` and ``self.initial_state``.
    """
    with tf.variable_scope('logits', reuse=reuse):
        data_embeddings = self.add_embeddings_op(data_batch)

        with tf.name_scope('recurrent_layer'):
            def make_cell(input_size):
                lstm_cell = tf.nn.rnn_cell.LSTMCell(self.config.num_units)
                drop_cell = tf.nn.rnn_cell.DropoutWrapper(
                    lstm_cell,
                    state_keep_prob=self.lstm_state_dropout_placeholder,
                    output_keep_prob=self.lstm_output_dropout_placeholder,
                    variational_recurrent=True,
                    input_size=input_size,
                    dtype=tf.float32)
                return drop_cell

            # The first layer consumes embeddings and every later layer
            # consumes the previous layer's output. Deriving the size per
            # layer supports any depth; the former fixed three-element list
            # raised IndexError for num_layers > 3.
            self.cell = tf.nn.rnn_cell.MultiRNNCell([
                make_cell(self.config.embedding_size if layer == 0
                          else self.config.num_units)
                for layer in range(self.config.num_layers)
            ])

            self.initial_state = self.cell.zero_state(
                tf.shape(data_batch)[0], tf.float32)

            outputs, final_state = tf.nn.dynamic_rnn(
                self.cell,
                data_embeddings,
                sequence_length=size_batch,
                initial_state=self.initial_state,
                dtype=tf.float32)

        with tf.name_scope('logits'):
            flat_outputs = tf.reshape(outputs, [-1, self.config.num_units])

            # Project the RNN outputs back into embedding space ...
            weights = tf.get_variable(
                'weights',
                initializer=tf.contrib.layers.xavier_initializer(),
                shape=[self.config.num_units, self.config.embedding_size],
                dtype=tf.float32)
            bias = tf.get_variable(
                'bias',
                initializer=tf.ones_initializer(),
                shape=[self.config.embedding_size],
                dtype=tf.float32)
            flat_inputs = tf.matmul(flat_outputs, weights) + bias

            # ... and score them against the transposed embedding matrix.
            bias_logits = tf.get_variable(
                'bias_logits',
                initializer=tf.ones_initializer(),
                shape=[self.config.vocab_size],
                dtype=tf.float32)
            flat_logits = tf.matmul(
                flat_inputs, tf.transpose(self.embeddings)) + bias_logits

            batch_size = tf.shape(data_batch)[0]
            max_len = tf.shape(data_batch)[1]
            logits = tf.reshape(
                flat_logits, [batch_size, max_len, self.config.vocab_size])

    return logits, final_state
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5,
              center=True, scale=True,
              beta_initializer=tf.zeros_initializer(),
              gamma_initializer=tf.ones_initializer(),
              virtual_batch_size=None,
              data_format='channels_last',
              internal_update=False,
              sync_statistics=None):
    """
    Batch normalization for 2D and 4D inputs, close to
    `tf.layers.batch_normalization` with these differences:

    1. `data_format` selects the channel axis when `axis` is None
       (ignored for 2D input).
    2. `momentum` and `epsilon` have different defaults.
    3. `training` defaults to the value in tensorpack's `TowerContext`.
    4. `internal_update` allows the layer to be used inside conditionals.
    5. `sync_statistics` enables cross-device statistics, which is useful
       for small-batch models.

    Args:
        internal_update (bool): if False, the EMA update ops are added to
            `tf.GraphKeys.UPDATE_OPS`. If True, the EMA is updated inside the
            layer through control dependencies. Speed is similar, but True
            can be used with conditionals or when several networks are
            trained. TF issue:
            https://github.com/tensorflow/tensorflow/issues/14699
        sync_statistics: None, "nccl" or "horovod" (case-insensitive).
            With None the statistics of the input tensor are used. With
            "nccl" the layer must run under a tensorpack multi-GPU trainer
            and uses per-machine (multiple GPU) statistics. The per-tower
            E[x] and E[x^2] are averaged, which equals the global
            mean/variance only if every tower has the same batch size.
            "horovod" averages with `hvd.allreduce` and is marked
            proof-of-concept in the code. The option has no effect when not
            training. Also known as "Cross-GPU BatchNorm"
            (https://arxiv.org/abs/1711.07240). TF issue:
            https://github.com/tensorflow/tensorflow/issues/18222

    Variable Names:

    * ``beta``: the bias term, zero-initialized by default.
    * ``gamma``: the scale term, one-initialized by default.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.

    Note:
        Combinations of ``training`` and ``ctx.is_training``:

        * ``training == ctx.is_training``: standard BN; EMA is maintained in
          training and used in inference. This is the default.
        * ``training and not ctx.is_training``: batch statistics are still
          used in inference.
        * ``not training and ctx.is_training``: EMA is used to normalize in
          training, e.g. to keep the statistics of a pre-trained BN fixed.
          EMA is not updated in this case.
    """
    # ---- shapes and channel axis ----
    data_format = get_data_format(data_format, tfmode=False)
    static_shape = inputs.get_shape().as_list()
    ndims = len(static_shape)
    assert ndims in [2, 4], ndims
    if sync_statistics is not None:
        sync_statistics = sync_statistics.lower()
    assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics

    if axis is not None:
        data_format = 'NCHW' if axis == 1 else 'NHWC'
    elif ndims == 2:
        data_format = 'NHWC'
        axis = 1
    else:
        axis = 1 if data_format == 'NCHW' else 3
    num_chan = static_shape[axis]

    # ---- training flag and tower context ----
    ctx = get_current_tower_context()
    if training is None:
        training = ctx.is_training
    training = bool(training)
    TF_version = get_tf_version_number()
    if ctx.is_training and not training:
        assert TF_version >= 1.4, \
            "Fine tuning a BatchNorm model with fixed statistics is only " \
            "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
        if ctx.is_main_training_tower:
            # Warn once (first tower only): the moving statistics are frozen
            # and only the affine part is being fine-tuned.
            logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")

    use_local_statistics = (
        sync_statistics is None or not training or not ctx.is_training)

    if use_local_statistics:
        saved_update_ops = backup_collection([tf.GraphKeys.UPDATE_OPS])
        ema_names = {'moving_mean': 'mean/EMA',
                     'moving_variance': 'variance/EMA'}
        with rename_get_variable(ema_names):
            layer_kwargs = {
                'axis': axis,
                'momentum': momentum,
                'epsilon': epsilon,
                'center': center,
                'scale': scale,
                'beta_initializer': beta_initializer,
                'gamma_initializer': gamma_initializer,
                'fused': (ndims == 4 and axis in [1, 3]),
                '_reuse': tf.get_variable_scope().reuse,
            }
            if TF_version >= 1.5:
                layer_kwargs['virtual_batch_size'] = virtual_batch_size
            else:
                assert virtual_batch_size is None, "Feature not supported in this version of TF!"
            layer = tf.layers.BatchNormalization(**layer_kwargs)
            normalized = layer.apply(
                inputs, training=training, scope=tf.get_variable_scope())

        # Maintaining the EMA on one GPU is enough, even in replicated mode,
        # because the EMA is not used during training.
        if ctx.is_main_training_tower:
            for ema_var in layer.non_trainable_variables:
                add_model_variable(ema_var)
        if internal_update or not ctx.is_main_training_tower:
            restore_collection(saved_update_ops)

        if training and internal_update:
            assert layer.updates
            with tf.control_dependencies(layer.updates):
                ret = tf.identity(normalized, name='output')
        else:
            ret = tf.identity(normalized, name='output')

        holder = VariableHolder(
            moving_mean=layer.moving_mean,
            mean=layer.moving_mean,  # for backward-compatibility
            moving_variance=layer.moving_variance,
            variance=layer.moving_variance)  # for backward-compatibility
        ret.variables = holder
        if scale:
            holder.gamma = layer.gamma
        if center:
            holder.beta = layer.beta
        return ret

    # ---- synchronised statistics ----
    if ndims == 2:
        reduce_axes = [0]
    elif axis == 1:
        reduce_axes = [0, 2, 3]
    else:
        reduce_axes = [0, 1, 2]

    # A reshape is only needed for channel-first 4D input.
    broadcast_shape = [1, num_chan, 1, 1] if (ndims == 4 and axis == 1) else None

    batch_mean = tf.reduce_mean(inputs, axis=reduce_axes)
    batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=reduce_axes)

    if sync_statistics == 'nccl':
        if six.PY3 and TF_version <= 1.8 and ctx.is_main_training_tower:
            logger.warn("A TensorFlow bug will cause cross-GPU BatchNorm to fail. "
                        "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360")

        from tensorflow.contrib.nccl.ops import gen_nccl_ops
        shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name)
        num_dev = ctx.total
        inv_num_dev = 1.0 / num_dev
        batch_mean = gen_nccl_ops.nccl_all_reduce(
            input=batch_mean,
            reduction='sum',
            num_devices=num_dev,
            shared_name=shared_name + '_NCCL_mean') * inv_num_dev
        batch_mean_square = gen_nccl_ops.nccl_all_reduce(
            input=batch_mean_square,
            reduction='sum',
            num_devices=num_dev,
            shared_name=shared_name + '_NCCL_mean_square') * inv_num_dev
    elif sync_statistics == 'horovod':
        # Requires https://github.com/uber/horovod/pull/331
        # Proof-of-concept, not ready yet.
        import horovod.tensorflow as hvd
        batch_mean = hvd.allreduce(batch_mean, average=True)
        batch_mean_square = hvd.allreduce(batch_mean_square, average=True)

    batch_var = batch_mean_square - tf.square(batch_mean)
    # Un-reshaped copies are what the EMA update consumes.
    batch_mean_vec, batch_var_vec = batch_mean, batch_var

    beta, gamma, moving_mean, moving_var = get_bn_variables(
        num_chan, scale, center, beta_initializer, gamma_initializer)

    if broadcast_shape is None:
        normalized = tf.nn.batch_normalization(
            inputs, batch_mean, batch_var,
            beta, gamma, epsilon)
    else:
        batch_mean = tf.reshape(batch_mean, broadcast_shape)
        batch_var = tf.reshape(batch_var, broadcast_shape)
        # fused_batch_norm(is_training=False) is slightly faster, but
        # hopefully this call will be JITed in the future.
        normalized = tf.nn.batch_normalization(
            inputs, batch_mean, batch_var,
            tf.reshape(beta, broadcast_shape),
            tf.reshape(gamma, broadcast_shape), epsilon)

    if ctx.is_main_training_tower:
        ret = update_bn_ema(
            normalized, batch_mean_vec, batch_var_vec, moving_mean, moving_var,
            momentum, internal_update)
    else:
        ret = tf.identity(normalized, name='output')

    holder = VariableHolder(
        moving_mean=moving_mean,
        mean=moving_mean,  # for backward-compatibility
        moving_variance=moving_var,
        variance=moving_var)  # for backward-compatibility
    ret.variables = holder
    if scale:
        holder.gamma = gamma
    if center:
        holder.beta = beta
    return ret
def layer_norm(inputs,
               center=True,
               scale=True,
               activation_fn=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               begin_norm_axis=1,
               begin_params_axis=-1,
               scope=None):
    """Layer normalization whose epsilon is taken from ``FLAGS.ln_eps``.

    The moments are computed over axes ``begin_norm_axis .. rank-1``. The
    optional ``beta`` (offset) and ``gamma`` (scale) variables have shape
    ``inputs.shape[begin_params_axis:]``. ``variables_collections`` and
    ``outputs_collections`` are accepted but not used by this implementation.

    Raises:
        ValueError: if the rank of `inputs` is unknown, if an axis argument
            is not smaller than the rank, or if the parameter shape is not
            fully defined.
    """
    # References:
    # https://github.com/pytorch/fairseq/blob/5d543f9b19e76772386903d4eeebdceaeb3d1b69/fairseq/modules/layer_norm.py#L9
    # https://github.com/NVIDIA/apex/blob/3ef01faef2492b3e650f44ecc510f3a8f2426783/csrc/layer_norm_cuda_kernel.cu#L303
    # https://github.com/tensorflow/tensorflow/blob/r1.14/tensorflow/python/ops/nn_impl.py#L1240
    with tf.variable_scope(scope, 'LayerNorm', [inputs], reuse=reuse):
        static_shape = inputs.shape
        rank = static_shape.ndims
        if rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)

        base_dtype = inputs.dtype.base_dtype

        # Only begin_norm_axis is converted to a non-negative index.
        if begin_norm_axis < 0:
            begin_norm_axis += rank
        if begin_params_axis >= rank or begin_norm_axis >= rank:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, rank))

        params_shape = static_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, static_shape))

        # Offset and scale parameters of the normalization.
        beta = None
        gamma = None
        if center:
            beta = tf.get_variable(
                'beta',
                shape=params_shape,
                dtype=base_dtype,
                initializer=tf.zeros_initializer(),
                trainable=trainable)
        if scale:
            gamma = tf.get_variable(
                'gamma',
                shape=params_shape,
                dtype=base_dtype,
                initializer=tf.ones_initializer(),
                trainable=trainable)

        # Moments over every axis from begin_norm_axis onwards.
        moment_axes = list(range(begin_norm_axis, rank))
        mean, variance = tf.nn.moments(inputs, moment_axes, keep_dims=True)

        # float16 needs a larger epsilon because of its limited
        # representable range.
        if base_dtype != tf.float16:
            variance_epsilon = FLAGS.ln_eps
        else:
            variance_epsilon = max(FLAGS.ln_eps, 1e-3)

        outputs = tf.nn.batch_normalization(
            inputs,
            mean,
            variance,
            offset=beta,
            scale=gamma,
            variance_epsilon=variance_epsilon)
        outputs.set_shape(static_shape)

        if activation_fn is None:
            return outputs
        return activation_fn(outputs)
def get_coordinate(i):
    """Return the scalar float32 variable named ``x_<i>``.

    The variable is requested through ``tf.get_variable`` with an empty
    shape and a ones initializer.

    Args:
        i: value formatted into the variable name.

    Returns:
        The result of ``tf.get_variable`` for that name.
    """
    var_name = "x_{}".format(i)
    ones_init = tf.ones_initializer()
    return tf.get_variable(var_name,
                           shape=[],
                           dtype=tf.float32,
                           initializer=ones_init)
def _layer_stack(self,
                 x,
                 num_layers,
                 encoder_output=None,
                 self_attention_mask=None,
                 encdec_attention_mask=None,
                 losses=None):
    """Build an encoder or decoder stack of ``num_layers`` layers.

    Each layer adds a self-attention sublayer, an encoder-decoder attention
    sublayer (only when ``encoder_output`` is given) and a feed-forward
    sublayer to ``x``. Every sublayer input is normalized first and every
    sublayer output goes through dropout before the residual add. A final
    normalization plus dropout is applied after the last layer.

    Args:
        x: a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]
        num_layers: an integer
        encoder_output: an optional mtf.Tensor with shape
            [<batch_dims>, encoder_length_dim, model_dim]
        self_attention_mask: an optional mtf.Tensor with shape
            [batch, length_dim, memory_length_dim] containing values 0 or -inf.
        encdec_attention_mask: an optional mtf.Tensor with shape
            [batch, length_dim, encoder_length_dim] containing values 0 or
            -inf.
        losses: a list to be appended-to

    Returns:
        a mtf.Tensor with shape [<batch_dims>, length_dim, model_dim]

    Raises:
        AssertionError: if some normalization scales are left unused once
            the stack has been built.
    """
    hparams = self._hparams
    has_encdec = encoder_output is not None

    # One normalization per sublayer (2 without encoder-decoder attention,
    # 3 with it) plus one final normalization after the last layer.
    norms_per_layer = 3 if has_encdec else 2
    num_layer_norms = num_layers * norms_per_layer + 1

    # All scales live in a single combined variable that is unstacked into
    # one slice per normalization call.
    layer_norms_dim = mtf.Dimension("layer_norms", num_layer_norms)
    combined_scales = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([layer_norms_dim, self.model_dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    layer_norm_vars = mtf.unstack(combined_scales, layer_norms_dim)

    def _dropout(tensor):
        # The noise shape omits the length dimension.
        return mtf.dropout(
            tensor,
            keep_prob=1.0 - hparams.layer_prepostprocess_dropout,
            noise_shape=mtf.Shape(self.batch_dims + [self.model_dim]))

    def _normalize(tensor):
        # Consume the next unused scale; call order therefore decides which
        # slice each normalization gets.
        scale = layer_norm_vars.pop(0)
        mean_square = mtf.reduce_mean(
            mtf.square(tensor), reduced_dim=self.model_dim)
        inv_norm = mtf.rsqrt(mean_square + hparams.norm_epsilon)
        return tensor * inv_norm * scale

    for layer in range(num_layers):
        with tf.variable_scope("layer_%d" % layer):
            # Self attention layer
            self_att = mtf_layers.multihead_attention(
                _normalize(x),
                None,
                self_attention_mask,
                self.kv_dim,
                self.heads_dim,
                dropout=hparams.attention_dropout,
                dropout_broadcast_dims=[self.length_dim],
                name="self_attention")
            x += _dropout(self_att)

            if has_encdec:
                # Encoder-Decoder attention layer
                encdec_att = mtf_layers.multihead_attention(
                    _normalize(x),
                    encoder_output,
                    encdec_attention_mask,
                    self.kv_dim,
                    self.heads_dim,
                    dropout=hparams.attention_dropout,
                    dropout_broadcast_dims=[self.length_dim],
                    name="encdec_attention")
                x += _dropout(encdec_att)

            # ffn layer
            ffn_out = self._feedforward_layer(_normalize(x), losses=losses)
            x += _dropout(ffn_out)

    x = _dropout(_normalize(x))
    # Every scale slice must have been consumed exactly once.
    assert not layer_norm_vars
    return x