def logistic_regression(X, y, class_weight=None, init_mean=None,
                        init_stddev=1.0):
    """Creates logistic regression TensorFlow subgraph.

    Args:
      X: tensor or placeholder for input features, shape should be
        [batch_size, n_features].
      y: tensor or placeholder for target, shape should be
        [batch_size, n_classes].
      class_weight: tensor, [n_classes], where for each class it has the
        weight of the class. If not provided, the default graph is checked
        for a tensor named `class_weight:0`. If that does not exist either,
        `None` is forwarded to the loss (the original documentation states
        that all ones are then used).
      init_mean: the mean value to use for initialization.
      init_stddev: the standard deviation to use for initialization.

    Returns:
      Predictions and loss tensors.

    Side effects:
      The variables `weights` and `bias` are created in the
      `logistic_regression` variable scope. If init_mean is not None, both
      are initialized with a random normal initializer using the given
      init_mean and init_stddev. (These may be set to 0.0 each if a zero
      initialization is desirable for convex use cases.) If init_mean is
      None, no initializer is passed to `get_variable` (the original
      documentation states that `uniform_unit_scaling_initializer` is then
      used).
    """
    with vs.variable_scope('logistic_regression'):
        logging_ops.histogram_summary('logistic_regression.X', X)
        logging_ops.histogram_summary('logistic_regression.y', y)
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable(
                'weights', [X.get_shape()[1], y.get_shape()[-1]])
            bias = vs.get_variable('bias', [y.get_shape()[-1]])
        else:
            weights = vs.get_variable(
                'weights', [X.get_shape()[1], y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
            bias = vs.get_variable(
                'bias', [y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
        logging_ops.histogram_summary('logistic_regression.weights', weights)
        logging_ops.histogram_summary('logistic_regression.bias', bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph. The check must be `is None`: taking the
        # truth value of a tensor (as `not class_weight` did) raises instead
        # of answering "was an argument supplied?".
        if class_weight is None:
            try:
                class_weight = ops.get_default_graph().get_tensor_by_name(
                    'class_weight:0')
            except KeyError:
                # No such tensor in the graph: fall through with None.
                pass

        return losses_ops.softmax_classifier(
            X, y, weights, bias, class_weight=class_weight)
def _BuildSmallModel(self):
    """Builds two stacked stride-2 'SAME' convolutions over a zero image.

    Returns:
      The output tensor of the second convolution.
    """
    zero_image = array_ops.zeros([2, 6, 6, 3])

    # First layer: 3x3 filter, 3 input channels, 6 output channels.
    first_filter = variable_scope.get_variable(
        'DW', [3, 3, 3, 6], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    first_out = nn_ops.conv2d(
        zero_image, first_filter, [1, 2, 2, 1], padding='SAME')

    # Second layer: 2x2 filter, 6 input channels, 12 output channels.
    second_filter = variable_scope.get_variable(
        'DW2', [2, 2, 6, 12], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    return nn_ops.conv2d(
        first_out, second_filter, [1, 2, 2, 1], padding='SAME')
def linear_regression(x, y, init_mean=None, init_stddev=1.0):
    """Creates linear regression TensorFlow subgraph.

    Args:
      x: tensor or placeholder for input features.
      y: tensor or placeholder for labels.
      init_mean: the mean value to use for initialization.
      init_stddev: the standard deviation to use for initialization.

    Returns:
      Predictions and loss tensors.

    Side effects:
      The variables `weights` and `bias` are created in the
      `linear_regression` variable scope with the base dtype of `x`. If
      init_mean is not None, both use a random normal initializer with the
      given init_mean and init_stddev. (These may be set to 0.0 each if a
      zero initialization is desirable for convex use cases.) If init_mean
      is None, no explicit initializer is supplied (the original
      documentation states that `uniform_unit_scaling_initializer` is then
      used).
    """

    with vs.variable_scope('linear_regression'):
        scope_name = vs.get_variable_scope().name
        summary.histogram('%s.x' % scope_name, x)
        summary.histogram('%s.y' % scope_name, y)
        dtype = x.dtype.base_dtype

        # Rank-1 labels mean a single output column.
        label_shape = y.get_shape()
        n_outputs = 1 if len(label_shape) == 1 else label_shape[1]

        def _new_initializer():
            # `None` is get_variable's own default for `initializer`, so
            # this matches leaving the argument out.
            if init_mean is None:
                return None
            return init_ops.random_normal_initializer(
                init_mean, init_stddev, dtype=dtype)

        weights = vs.get_variable(
            'weights', [x.get_shape()[1], n_outputs],
            initializer=_new_initializer(), dtype=dtype)
        bias = vs.get_variable(
            'bias', [n_outputs],
            initializer=_new_initializer(), dtype=dtype)

        summary.histogram('%s.weights' % scope_name, weights)
        summary.histogram('%s.bias' % scope_name, bias)
        return losses_ops.mean_squared_error_regressor(x, y, weights, bias)
def batch_normalize(tensor_in,
                    epsilon=1e-5,
                    convnet=False,
                    decay=0.9,
                    scale_after_normalization=True):
    """Batch normalization driven by an exponential moving average.

    Args:
      tensor_in: input Tensor, 4D shape:
        [batch, in_height, in_width, in_depth].
      epsilon: A float number to avoid being divided by 0.
      convnet: Whether this is for convolutional net use. If this is True,
        moments will sum across axis [0, 1, 2]. Otherwise, only [0].
      decay: decay rate for exponential moving average.
      scale_after_normalization: Whether to scale after normalization.

    Returns:
      The result of `nn.batch_norm_with_global_normalization` applied to
      `tensor_in`.
    """
    static_shape = tensor_in.get_shape().as_list()
    with vs.variable_scope("batch_norm"):
        channels = [static_shape[-1]]
        gamma = vs.get_variable(
            "gamma", channels,
            initializer=init_ops.random_normal_initializer(1.0, 0.02))
        beta = vs.get_variable(
            "beta", channels,
            initializer=init_ops.constant_initializer(0.0))

        averager = moving_averages.ExponentialMovingAverage(decay=decay)
        reduce_axes = [0, 1, 2] if convnet else [0]
        batch_mean, batch_var = nn.moments(tensor_in, reduce_axes)
        update_averages = averager.apply([batch_mean, batch_var])
        averaged_mean = averager.average(batch_mean)
        averaged_var = averager.average(batch_var)

        def _batch_statistics():
            """Returns batch moments after refreshing the moving averages."""
            with ops.control_dependencies([update_averages]):
                return (array_ops_.identity(batch_mean),
                        array_ops_.identity(batch_var))

        def _averaged_statistics():
            """Returns the moving-average moments."""
            return (averaged_mean, averaged_var)

        # The training flag is read from the "IS_TRAINING" graph collection.
        is_training = array_ops_.squeeze(ops.get_collection("IS_TRAINING"))
        mean, variance = control_flow_ops.cond(
            is_training, _batch_statistics, _averaged_statistics)
        return nn.batch_norm_with_global_normalization(
            tensor_in, mean, variance, beta, gamma, epsilon,
            scale_after_normalization=scale_after_normalization)
def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    """Compares embedding gradients of a cond-in-while graph to an unrolled one.

    The loop runs five steps; step 3 squares the running cost and every
    other step adds the sum of embedding row 0. The unrolled expression
    below is the same computation written without control flow.
    """
    with ops.Graph().as_default():
        embedding_matrix = variable_scope.get_variable(
            "embedding_matrix", [5, 5],
            initializer=init_ops.random_normal_initializer(),
            use_resource=use_resource)

        def _continue(step, _):
            return step < 5

        def _step(step, cost):
            row = embedding_ops.embedding_lookup(embedding_matrix, [0])
            cost = control_flow_ops.cond(
                math_ops.equal(step, 3),
                lambda: math_ops.square(cost),
                lambda: cost + math_ops.reduce_sum(row))
            return step + 1, cost

        loop_vars = [constant_op.constant(0), constant_op.constant(0.0)]
        _, loop_cost = control_flow_ops.while_loop(
            _continue, _step, loop_vars)

        loop_grad = gradients_impl.gradients(
            loop_cost, [embedding_matrix])[0]
        dynamic_grads = math_ops.segment_sum(
            loop_grad.values, loop_grad.indices)

        # Unrolled reference: square(3 * sum) + sum, with every sum kept as
        # its own op.
        row = embedding_ops.embedding_lookup(embedding_matrix, [0])
        static = math_ops.square(
            math_ops.reduce_sum(row) + math_ops.reduce_sum(row) +
            math_ops.reduce_sum(row)) + math_ops.reduce_sum(row)
        unrolled_grad = gradients_impl.gradients(
            static, [embedding_matrix])[0]
        static_grads = math_ops.segment_sum(
            unrolled_grad.values, unrolled_grad.indices)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            static_value, dynamic_value = sess.run(
                [static_grads, dynamic_grads])
            self.assertAllEqual(static_value, dynamic_value)
def batch_normalize(tensor_in,
                    epsilon=1e-5,
                    convnet=False,
                    decay=0.9,
                    scale_after_normalization=True):
    """Batch normalization with explicit moving mean/variance variables.

    Args:
      tensor_in: input `Tensor`, 4D shape:
        [batch, in_height, in_width, in_depth].
      epsilon: A float number to avoid being divided by 0.
      convnet: Whether this is for convolutional net use. If `True`,
        moments will sum across axis `[0, 1, 2]`. Otherwise, only `[0]`.
      decay: Decay rate for exponential moving average.
      scale_after_normalization: Whether to scale after normalization.

    Returns:
      A batch-normalized `Tensor`.
    """
    static_shape = tensor_in.get_shape().as_list()
    with vs.variable_scope("batch_norm"):
        gamma = vs.get_variable(
            "gamma", [static_shape[-1]],
            initializer=init_ops.random_normal_initializer(1., 0.02))
        beta = vs.get_variable(
            "beta", [static_shape[-1]],
            initializer=init_ops.constant_initializer(0.))
        # Non-trainable running statistics, updated only on the training
        # branch of the cond below.
        moving_mean = vs.get_variable(
            'moving_mean', shape=[static_shape[-1]],
            initializer=init_ops.zeros_initializer, trainable=False)
        moving_var = vs.get_variable(
            'moving_var', shape=[static_shape[-1]],
            initializer=init_ops.ones_initializer, trainable=False)

        def _update_mean_var():
            """Returns batch moments after updating the moving averages."""
            if convnet:
                reduce_axes = [0, 1, 2]
            else:
                reduce_axes = [0]
            batch_mean, batch_var = nn.moments(tensor_in, reduce_axes)
            mean_update = moving_averages.assign_moving_average(
                moving_mean, batch_mean, decay)
            var_update = moving_averages.assign_moving_average(
                moving_var, batch_var, decay)
            with ops.control_dependencies([mean_update, var_update]):
                return (array_ops_.identity(batch_mean),
                        array_ops_.identity(batch_var))

        def _use_moving_mean_var():
            """Returns the stored moving statistics."""
            return (moving_mean, moving_var)

        # The training flag is read from the "IS_TRAINING" graph collection.
        is_training = array_ops_.squeeze(ops.get_collection("IS_TRAINING"))
        mean, variance = control_flow_ops.cond(
            is_training, _update_mean_var, _use_moving_mean_var)
        return nn.batch_norm_with_global_normalization(
            tensor_in, mean, variance, beta, gamma, epsilon,
            scale_after_normalization=scale_after_normalization)
def linear_regression(X, y, init_mean=None, init_stddev=1.0):
    """Creates linear regression TensorFlow subgraph.

    Args:
      X: tensor or placeholder for input features.
      y: tensor or placeholder for target.
      init_mean: the mean value to use for initialization.
      init_stddev: the standard deviation to use for initialization.

    Returns:
      Predictions and loss tensors.

    Side effects:
      The variables `weights` and `bias` are created in the
      `linear_regression` variable scope. If init_mean is not None, both
      use a random normal initializer with the given init_mean and
      init_stddev. (These may be set to 0.0 each if a zero initialization
      is desirable for convex use cases.) If init_mean is None, no explicit
      initializer is supplied (the original documentation states that
      `uniform_unit_scaling_initializer` is then used).
    """

    with vs.variable_scope('linear_regression'):
        logging_ops.histogram_summary('linear_regression.X', X)
        logging_ops.histogram_summary('linear_regression.y', y)

        # Rank-1 targets mean a single output column.
        target_shape = y.get_shape()
        n_outputs = 1 if len(target_shape) == 1 else target_shape[1]

        def _new_initializer():
            # `None` is get_variable's own default for `initializer`, so
            # this matches leaving the argument out.
            if init_mean is None:
                return None
            return init_ops.random_normal_initializer(init_mean, init_stddev)

        weights = vs.get_variable(
            'weights', [X.get_shape()[1], n_outputs],
            initializer=_new_initializer())
        bias = vs.get_variable(
            'bias', [n_outputs], initializer=_new_initializer())

        logging_ops.histogram_summary('linear_regression.weights', weights)
        logging_ops.histogram_summary('linear_regression.bias', bias)
        return losses_ops.mean_squared_error_regressor(X, y, weights, bias)
def __init__(self,
             W_in=init_ops.random_normal_initializer(stddev=0.1),
             W_hid=init_ops.random_normal_initializer(stddev=0.1),
             W_cell=init_ops.random_normal_initializer(stddev=0.1),
             b=init_ops.constant_initializer(0.),
             activation=None):
    """Records the initializers and activation for this object.

    Args:
      W_in: initializer stored as `self.W_in`.
      W_hid: initializer stored as `self.W_hid`.
      W_cell: initializer stored as `self.W_cell`; when None the attribute
        is not created at all.
      b: initializer stored as `self.b`; when None the attribute is not
        created at all.
      activation: callable stored as `self.activation`; None selects
        `control_flow_ops.identity`.
    """
    self.W_in = W_in
    self.W_hid = W_hid

    # Optional pieces are only attached when supplied, so the attribute
    # stays absent (not None) when the caller passes None.
    for attr_name, value in (('W_cell', W_cell), ('b', b)):
        if value is not None:
            setattr(self, attr_name, value)

    # Fall back to the identity when no activation is given.
    self.activation = (
        control_flow_ops.identity if activation is None else activation)
def BuildSmallModel():
    """Build a small forward conv model.

    Creates an unused scalar variable 'ScalarW' and two stride-2 'SAME'
    convolutions ('DW', then 'DW2') over a zero image.

    Returns:
      The output tensor of the second convolution.
    """
    zero_image = array_ops.zeros([2, 6, 6, 3])

    # Scalar variable that is created but never wired into the model.
    variable_scope.get_variable(
        'ScalarW', [], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))

    first_filter = variable_scope.get_variable(
        'DW', [3, 3, 3, 6], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    first_out = nn_ops.conv2d(
        zero_image, first_filter, [1, 2, 2, 1], padding='SAME')

    second_filter = variable_scope.get_variable(
        'DW2', [2, 2, 6, 12], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    return nn_ops.conv2d(
        first_out, second_filter, [1, 2, 2, 1], padding='SAME')
def BuildSplitableModel():
    """Build a small model that can be run partially in each step.

    Two independent stride-2 'SAME' convolutions read the same zero image;
    their outputs are also summed.

    Returns:
      A tuple (first branch, second branch, sum of both branches).
    """
    zero_image = array_ops.zeros([2, 6, 6, 3])

    filter_a = variable_scope.get_variable(
        'DW', [3, 3, 3, 6], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    branch_a = nn_ops.conv2d(
        zero_image, filter_a, [1, 2, 2, 1], padding='SAME')

    filter_b = variable_scope.get_variable(
        'DW2', [2, 3, 3, 6], dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    branch_b = nn_ops.conv2d(
        zero_image, filter_b, [1, 2, 2, 1], padding='SAME')

    combined = branch_a + branch_b
    return branch_a, branch_b, combined
def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                           batch_size, seq_length, dir_count, dropout, dtype,
                           delta, tolerance):
    """Runs a gradient check on one CudnnTestModel configuration.

    Args:
      rnn_mode: RNN mode passed to `CudnnTestModel`; `CUDNN_LSTM` adds a
        cell-state input.
      num_layers: number of layers.
      num_units: number of units per layer.
      input_size: size of the input features.
      batch_size: batch size of the random inputs.
      seq_length: sequence length of the random inputs.
      dir_count: 1 selects the unidirectional mode, anything else the
        bidirectional mode.
      dropout: dropout value forwarded to the model.
      dtype: dtype of the model and of all inputs.
      delta: forwarded to `self._GradientCheck`.
      tolerance: forwarded to `self._GradientCheck`.
    """
    # Gradient checking runs two forward ops with almost the same input. Need
    # to make sure the drop patterns across the two runs are the same.
    logging.info("Training test with config: %s", locals())
    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
    try:
        random_seed.set_random_seed(5678)
        has_input_c = (rnn_mode == CUDNN_LSTM)
        direction = (CUDNN_RNN_UNIDIRECTION
                     if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
        model = CudnnTestModel(
            rnn_mode,
            num_layers,
            num_units,
            input_size,
            direction=direction,
            dropout=dropout,
            dtype=dtype,
            training=True,
            bias_initializer=init_ops.random_normal_initializer(
                mean=1., dtype=dtype))
        rnn = model.rnn
        params = rnn.trainable_variables[0]

        inputs = variables.Variable(
            random_ops.random_uniform(
                [seq_length, batch_size, input_size], dtype=dtype),
            dtype=dtype)
        input_h = variables.Variable(
            random_ops.random_uniform(
                [num_layers * dir_count, batch_size, num_units],
                dtype=dtype),
            dtype=dtype)
        if has_input_c:
            input_c = variables.Variable(
                random_ops.random_uniform(
                    [num_layers * dir_count, batch_size, num_units],
                    dtype=dtype),
                dtype=dtype)
            initial_state = (input_h, input_c)
        else:
            initial_state = (input_h,)
        total_sum = model.FProp(inputs, initial_state, training=True)

        with self.test_session(
                use_gpu=True, graph=ops.get_default_graph()) as sess:
            sess.run(variables.global_variables_initializer())
            all_inputs = [inputs, params] + list(initial_state)
            self._GradientCheck(
                sess, total_sum, all_inputs,
                tolerance=tolerance, delta=delta)
    finally:
        # Restore the flag even when the check above fails, so a failing
        # configuration cannot leak the override into later tests.
        os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
    """Estimates the largest singular value in the weight tensor.

    Args:
      w_tensor: The weight matrix whose spectral norm should be computed.
      power_iteration_rounds: The number of iterations of the power method
        to perform. A higher number yields a better approximation.
      name: An optional scope name.

    Returns:
      The largest singular value (the spectral norm) of w.
    """
    with variable_scope.variable_scope(name, 'spectral_norm'):
        # The paper says to flatten convnet kernel weights from
        # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's
        # Conv2D kernel weight shape is (KH, KW, C_in, C_out), so it should
        # be reshaped to (KH * KW * C_in, C_out), and similarly for other
        # layers that put output channels as last dimension.
        # n.b. this means that w here is equivalent to w.T in the paper.
        last_dim = w_tensor.get_shape()[-1]
        w = array_ops.reshape(w_tensor, (-1, last_dim))

        # Persisted approximation of first left singular vector of `w`.
        u_var = variable_scope.get_variable(
            _PERSISTED_U_VARIABLE_SUFFIX,
            shape=(w.shape[0], 1),
            dtype=w.dtype,
            initializer=init_ops.random_normal_initializer(),
            trainable=False)
        u = u_var

        # Power iteration: alternate between the right (`v`) and left (`u`)
        # singular vector estimates.
        for _ in range(power_iteration_rounds):
            projected = math_ops.matmul(array_ops.transpose(w), u)
            v = nn.l2_normalize(projected)
            u = nn.l2_normalize(math_ops.matmul(w, v))

        # Write the refined `u` back before it is consumed below.
        persist_u = u_var.assign(u, name='update_u')
        with ops.control_dependencies([persist_u]):
            u = array_ops.identity(u)

        # The singular vector estimates are treated as constants.
        u = array_ops.stop_gradient(u)
        v = array_ops.stop_gradient(v)

        # Largest singular value of `w`: u^T w v.
        u_t_w = math_ops.matmul(array_ops.transpose(u), w)
        spectral_norm = math_ops.matmul(u_t_w, v)
        spectral_norm.shape.assert_is_fully_defined()
        spectral_norm.shape.assert_is_compatible_with([1, 1])

        return spectral_norm[0][0]
def __call__(self, inputs, state, scope=None):
    """Computes one step of the cell.

    The new state is
      (1 - dt_tau) * state
        + dt_tau * (activation(state) @ A + inputs @ W + b + noise)
    where W is `B`, or `B` stacked on top of `self._O` when `self._O` is
    set, and noise is drawn with stddev `self._sigma`.

    Args:
      inputs: 2-D input tensor; its static shape is unpacked as
        (batch_size, input_size).
      state: previous state tensor.
      scope: optional variable scope; defaults to the class name.

    Returns:
      A pair (output, output): the new state doubles as the output.
    """
    dtype = inputs.dtype
    # as_list() is used so both sizes are plain values usable in the
    # arithmetic and math.sqrt below.
    batch_size, input_size = inputs.get_shape().as_list()
    has_extra_rows = self._O is not None
    if has_extra_rows:
        # Only the leading columns of `inputs` feed the trainable `B`.
        input_size = input_size - self._O.get_shape().as_list()[0]

    with vs.variable_scope(scope or type(self).__name__):
        A = vs.get_variable(
            'A', [self._num_units, self._num_units], dtype=dtype,
            initializer=init_ops.random_normal_initializer(
                stddev=1/math.sqrt(self._num_units)))
        B = vs.get_variable(
            'B', [input_size, self._num_units], dtype=dtype,
            initializer=init_ops.random_normal_initializer(
                stddev=1/math.sqrt(input_size)))
        b = vs.get_variable(
            'b', [self._num_units],
            initializer=init_ops.random_normal_initializer(stddev=0.01))

        retained = (1 - self._dt_tau) * state
        recurrent = math_ops.matmul(self._activation(state), A)
        if has_extra_rows:
            input_weights = array_ops.concat(0, [B, self._O])
        else:
            input_weights = B
        drive = recurrent + math_ops.matmul(inputs, input_weights) + b
        noise = random_ops.random_normal(
            [batch_size, self._num_units], stddev=self._sigma)
        output = retained + self._dt_tau * (drive + noise)

    return output, output
def build(self, inputs_shape):
    """Creates the input kernel, recurrent kernel and bias variables.

    Args:
      inputs_shape: shape of the inputs; dimension 1 must be known.

    Raises:
      ValueError: if dimension 1 of `inputs_shape` is unknown.
    """
    input_depth = inputs_shape[1].value
    if input_depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    # Default initializers are filled in lazily and remembered on self.
    if self._input_initializer is None:
        self._input_initializer = init_ops.random_normal_initializer(
            mean=0.0, stddev=0.001)
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[input_depth, self._num_units],
        initializer=self._input_initializer)

    if self._recurrent_initializer is None:
        self._recurrent_initializer = init_ops.constant_initializer(1.)
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum, keeping each weight's sign.
    if self._recurrent_min_abs:
        raw_kernel = self._recurrent_kernel
        floored_abs = math_ops.maximum(
            math_ops.abs(raw_kernel), self._recurrent_min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(raw_kernel), floored_abs)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum.
    if self._recurrent_max_abs:
        limit = self._recurrent_max_abs
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -limit, limit)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def _get_random_features_initializer(initializer, shape):
    """Returns Initializer object for random features.

    Args:
      initializer: either a kernel-type string ('gaussian' or 'laplacian',
        case-insensitive) or any other object, which is returned as is.
      shape: shape of the samples drawn for the 'laplacian' case.

    Returns:
      A random normal initializer for 'gaussian', a constant initializer
      holding Cauchy samples for 'laplacian', or `initializer` itself when
      it is not a string.

    Raises:
      ValueError: if `initializer` is a string naming any other kernel type.
    """

    def _get_cauchy_samples(loc, scale, shape):
        # Inverse-CDF sampling: map uniform draws through the Cauchy
        # quantile function.
        uniform_draws = np.random.uniform(low=0., high=1., size=shape)
        return loc + scale * np.tan(np.pi * (uniform_draws - 0.5))

    # Anything that is not a string is assumed to already be an initializer.
    if not isinstance(initializer, six.string_types):
        return initializer

    kernel_type = initializer.lower()
    if kernel_type == 'gaussian':
        return init_ops.random_normal_initializer(stddev=1.0)
    if kernel_type == 'laplacian':
        return init_ops.constant_initializer(
            _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape))

    raise ValueError(
        'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'.format(
            initializer, _SUPPORTED_RBF_KERNEL_TYPES))
def testIndexedSlicesGradient(self):
    """Trains through a while loop that looks up an embedding row each step.

    The test passes when ten optimizer steps run without error.
    """
    with ops.Graph().as_default():
        embedding_matrix = variable_scope.get_variable(
            "embedding_matrix", [5, 5],
            initializer=init_ops.random_normal_initializer())

        def _keep_looping(step, _):
            return step < 5

        def _add_row_sum(step, cost):
            # `+ 0.0` makes the lookup read a tensor derived from the
            # variable rather than the variable itself.
            row = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
            cost += math_ops.reduce_sum(row)
            return step + 1, cost

        initial_values = [constant_op.constant(0), constant_op.constant(0.0)]
        _, cost = control_flow_ops.while_loop(
            _keep_looping, _add_row_sum, initial_values)

        optimizer = momentum.MomentumOptimizer(0.1, 0.9)
        train_op = optimizer.minimize(cost)
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            for _ in range(10):
                sess.run([train_op])
def testIndexedSlicesGradient(self):
    """Runs momentum training on a cost built inside a while loop.

    Each of the five loop steps adds the sum of embedding row 0 to the
    cost; the test passes when ten optimizer steps run without error.
    """
    with ops.Graph().as_default():
        embedding_matrix = variable_scope.get_variable(
            "embedding_matrix", [5, 5],
            initializer=init_ops.random_normal_initializer())

        def _loop_cond(counter, _):
            return counter < 5

        def _loop_body(counter, running_cost):
            # `+ 0.0` makes the lookup read a tensor derived from the
            # variable rather than the variable itself.
            looked_up = embedding_ops.embedding_lookup(
                embedding_matrix + 0.0, [0])
            running_cost += math_ops.reduce_sum(looked_up)
            return counter + 1, running_cost

        _, cost = control_flow_ops.while_loop(
            _loop_cond, _loop_body,
            [constant_op.constant(0), constant_op.constant(0.0)])

        train_op = momentum.MomentumOptimizer(0.1, 0.9).minimize(cost)
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            for _ in range(10):
                sess.run([train_op])
def attention(query, hidden, hidden_features, v, encoder_mask, attn_length,
              scope=None):
    """Put attention masks on hidden using hidden_features and query.

    Args:
      query: 2-D tensor, or a (possibly nested) sequence of 2-D tensors
        that is flattened and concatenated along axis 1.
      hidden: tensor the attention weights are applied to.
      hidden_features: per-head feature tensors, indexed by head.
      v: per-head attention vectors, indexed by head.
      encoder_mask: mask multiplied into the exponentiated scores.
      attn_length: attention length used to reshape the weights.
      scope: optional variable scope name; defaults to "attention".

    Returns:
      A pair (ds, aa): per-head attention reads and per-head attention
      weights.
    """
    with variable_scope.variable_scope(scope or "attention"):
        reads = []    # Results of attention reads will be stored here.
        weights_per_head = []
        if nest.is_sequence(query):
            # If the query is a tuple, flatten it.
            flat_queries = nest.flatten(query)
            for q in flat_queries:
                # Check that ndims == 2 if specified.
                rank = q.get_shape().ndims
                if rank:
                    assert rank == 2
            query = array_ops.concat(1, flat_queries)
        for head in xrange(num_heads):
            with variable_scope.variable_scope("AttnU_%d" % head):
                y = linear(
                    query, attention_vec_size, False,
                    weight_initializer=init_ops.random_normal_initializer(
                        0, 0.001, seed=SEED))
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                scores = math_ops.reduce_sum(
                    v[head] * math_ops.tanh(hidden_features[head] + y),
                    [2, 3])
                # Masked softmax by hand: subtract the row maximum,
                # exponentiate, apply the mask, then normalize each row.
                scores_t = array_ops.transpose(scores)
                shifted = array_ops.transpose(
                    scores_t - math_ops.reduce_max(scores, [1]))
                exp_scores = math_ops.exp(shifted)
                masked = math_ops.to_float(encoder_mask) * exp_scores
                masked_t = array_ops.transpose(masked)
                weights = array_ops.transpose(
                    masked_t / math_ops.reduce_sum(masked, [1]))
                weights_per_head.append(weights)
                # Attention read: weighted sum of `hidden`.
                read = math_ops.reduce_sum(
                    array_ops.reshape(weights, [-1, attn_length, 1, 1])
                    * hidden,
                    [1, 2])
                reads.append(array_ops.reshape(read, [-1, attn_size]))
        return reads, weights_per_head
def __init__(self,
             filters,
             kernel_size,
             strides=1,
             padding="valid",
             data_format="channels_last",
             dilation_rate=1,
             activation=None,
             use_bias=True,
             dropout_rate=0.5,
             temperature=0.6,
             gamma=-0.1,
             zeta=1.1,
             kernel_initializer=init.random_normal_initializer(0., 1e-2),
             bias_initializer=init.zeros_initializer(),
             trainable=True,
             name=None,
             **kwargs):
    """Constructs the layer by delegating to the parent class with rank 2.

    Every argument is forwarded unchanged to the parent constructor; this
    method itself stores nothing. The meaning of each argument is defined
    by the parent class, which is not visible here.

    Args:
      filters: forwarded as `filters`.
      kernel_size: forwarded as `kernel_size`.
      strides: forwarded as `strides`.
      padding: forwarded as `padding`.
      data_format: forwarded as `data_format`.
      dilation_rate: forwarded as `dilation_rate`.
      activation: forwarded as `activation`.
      use_bias: forwarded as `use_bias`.
      dropout_rate: forwarded as `dropout_rate`.
      temperature: forwarded as `temperature`.
      gamma: forwarded as `gamma`.
      zeta: forwarded as `zeta`.
      kernel_initializer: forwarded as `kernel_initializer`; the default is
        a random normal initializer built once, at definition time.
      bias_initializer: forwarded as `bias_initializer`; the default is a
        zeros initializer built once, at definition time.
      trainable: forwarded as `trainable`.
      name: forwarded as `name`.
      **kwargs: any further keyword arguments, forwarded as is.
    """
    # NOTE(review): temperature/gamma/zeta look like the parameters of the
    # L0-regularization gate suggested by the class name; confirm against
    # the parent class.
    super(L0NormConv2D, self).__init__(rank=2, filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, data_format=data_format, dilation_rate=dilation_rate, activation=activation, use_bias=use_bias, dropout_rate=dropout_rate, temperature=temperature, gamma=gamma, zeta=zeta, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, trainable=trainable, name=name, **kwargs)
def build(self, inputs_shape):
    """Creates the variables in the 'autoconceptor_vars' scope.

    Args:
      inputs_shape: shape of the inputs; dimension 1 must be known.

    Raises:
      ValueError: if dimension 1 of `inputs_shape` is unknown.
    """
    input_dim = inputs_shape[1].value
    if input_dim is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    with tf.variable_scope('autoconceptor_vars'):
        # Input projection: random normal weights, zero bias.
        self.W_in = tf.get_variable(
            "W_in",
            shape=[input_dim, self.num_units],
            initializer=init_ops.random_normal_initializer(),
            dtype=tf.float32)
        self.b_in = tf.get_variable(
            "b_in",
            shape=[self.num_units],
            initializer=init_ops.zeros_initializer(),
            dtype=tf.float32)

        # Square matrix W starts as a scaled identity.
        scaled_identity = 0.05 * np.identity(self.num_units)
        self.W = tf.get_variable(
            "W",
            shape=[self.num_units, self.num_units],
            initializer=init_ops.constant_initializer(scaled_identity),
            dtype=tf.float32)

        # Layer-norm gain starts at one, bias at zero.
        initial_gain = np.ones([self.num_units])
        self.gain = tf.get_variable(
            'layer-norm-gain',
            shape=[self.num_units],
            initializer=init_ops.constant_initializer(initial_gain),
            dtype=tf.float32)
        initial_bias = np.zeros([self.num_units])
        self.bias = tf.get_variable(
            'layer-norm-bias',
            shape=[self.num_units],
            initializer=init_ops.constant_initializer(initial_bias),
            dtype=tf.float32)
def attention(query, scope=None):
    """Put attention masks on hidden using hidden_features and query.

    Args:
      query: 2-D tensor, or a (possibly nested) sequence of 2-D tensors
        that is flattened and concatenated along axis 1.
      scope: optional variable scope name; defaults to "attention".

    Returns:
      A one-element list holding the attention read.
    """
    with variable_scope.variable_scope(scope or "attention"):
        reads = []  # Results of attention reads will be stored here.
        if nest.is_sequence(query):
            # If the query is a tuple, flatten it.
            flat_queries = nest.flatten(query)
            for q in flat_queries:
                # Check that ndims == 2 if specified.
                rank = q.get_shape().ndims
                if rank:
                    assert rank == 2
            query = array_ops.concat(flat_queries, 1)
        with variable_scope.variable_scope("AttnU"):
            y = linear(
                query, attention_vec_size, False,
                weight_initializer=init_ops.random_normal_initializer(
                    0, 0.001, seed=SEED))
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # The additive attention is computed by v^T * tanh(...).
            scores = math_ops.reduce_sum(
                v * math_ops.tanh(hidden_features + y), [2, 3])
            # Softmax with mask: subtract the row maximum, exponentiate,
            # apply the mask, then normalize each row.
            scores_t = array_ops.transpose(scores)
            shifted = array_ops.transpose(
                scores_t - math_ops.reduce_max(scores, [1]))
            exp_scores = math_ops.exp(shifted)
            masked = math_ops.to_float(encoder_mask) * exp_scores
            masked_t = array_ops.transpose(masked)
            weights = array_ops.transpose(
                masked_t / math_ops.reduce_sum(masked, [1]))
            # Attention read: weighted sum of `hidden`.
            read = math_ops.reduce_sum(
                array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            reads.append(array_ops.reshape(read, [-1, attn_size]))
        return reads
def build(self, inputs_shape):
    """Builds the layer's input kernel, recurrent kernel and bias.

    Args:
      inputs_shape: shape of the inputs; dimension 1 must be known.

    Raises:
      ValueError: if dimension 1 of `inputs_shape` is unknown.
    """
    depth = inputs_shape[1].value
    if depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    # Fill in the default input initializer on first use.
    if self._input_initializer is None:
        self._input_initializer = init_ops.random_normal_initializer(
            mean=0.0, stddev=0.001)
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[depth, self._num_units],
        initializer=self._input_initializer)

    # Fill in the default recurrent initializer on first use.
    if self._recurrent_initializer is None:
        self._recurrent_initializer = init_ops.constant_initializer(1.)
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum: push magnitudes up, then restore the sign.
    if self._recurrent_min_abs:
        unclipped = self._recurrent_kernel
        magnitude = math_ops.abs(unclipped)
        raised = math_ops.maximum(magnitude, self._recurrent_min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(unclipped), raised)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum.
    if self._recurrent_max_abs:
        upper = self._recurrent_max_abs
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -upper, upper)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def body(i, loss):
    """Loop body: builds three stacked 1x1 convs and applies one SGD step.

    Args:
      i: loop counter.
      loss: incoming loss value; it is replaced, not accumulated.

    Returns:
      A tuple (i + 1, loss), both gated on the training op.
    """
    i = i + 1
    init = init_ops.random_normal_initializer(
        0.0, 1.0, seed=1, dtype=np.float32)
    x = variable_scope.get_variable(
        "v2", dtype=np.float32, shape=[1, 4, 4, 2], initializer=init)

    with variable_scope.variable_scope("vs", use_resource=True):
        # Three identical layers chained one after another.
        y = x
        for layer_name in ('conv1', 'conv2', 'conv3'):
            y = layers.Conv2D(
                2, 1, use_bias=True,
                kernel_initializer=init_ops.ones_initializer(),
                name=layer_name)(y)

    loss = math_ops.reduce_sum(y)
    train = gradient_descent.GradientDescentOptimizer(0.1).minimize(loss)

    # Tie both outputs to the training op so each iteration applies it.
    with ops.control_dependencies([train]):
        i = array_ops.identity(i)
        loss = array_ops.identity(loss)
    return (i, loss)
def test_variable_initializer(self):
    """Checks mean and stddev of looked-up values for two initializers.

    One case is a constant (-1.0), the other a seeded random normal
    initializer; each gets its own table named "t1<n>".
    """
    cases = [
        (-1.0, -1.0, 0.0),
        (init_ops.random_normal_initializer(0.0, 0.01, seed=2), 0.0, 0.01),
    ]
    for case_number, case in enumerate(cases, 1):
        initializer, target_mean, target_stddev = case
        with self.session(config=default_config,
                          use_gpu=test_util.is_gpu_available()):
            keys = constant_op.constant(list(range(2**17)), dtypes.int64)
            table = de.get_variable(
                "t1" + str(case_number),
                key_dtype=dtypes.int64,
                value_dtype=dtypes.float32,
                initializer=initializer,
                dim=10,
            )
            # Looking up keys applies the initializer to each of them.
            vals_op = table.lookup(keys)
            mean = self.evaluate(math_ops.reduce_mean(vals_op))
            stddev = self.evaluate(math_ops.reduce_std(vals_op))
            rtol = 2e-5
            atol = rtol
            self.assertAllClose(target_mean, mean, rtol, atol)
            self.assertAllClose(target_stddev, stddev, rtol, atol)
def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    """Checks cond-in-while embedding gradients against an unrolled graph.

    The loop runs five steps; on step 3 the cost is squared, on every
    other step the sum of embedding row 0 is added. The `static`
    expression spells out the same result without control flow.
    """
    with ops.Graph().as_default():
        embedding_matrix = variable_scope.get_variable(
            "embedding_matrix", [5, 5],
            initializer=init_ops.random_normal_initializer(),
            use_resource=use_resource)

        def _loop_cond(counter, _):
            return counter < 5

        def _loop_body(counter, cost):
            looked_up = embedding_ops.embedding_lookup(embedding_matrix, [0])
            cost = control_flow_ops.cond(
                math_ops.equal(counter, 3),
                lambda: math_ops.square(cost),
                lambda: cost + math_ops.reduce_sum(looked_up))
            return counter + 1, cost

        _, cost = control_flow_ops.while_loop(
            _loop_cond, _loop_body,
            [constant_op.constant(0), constant_op.constant(0.0)])

        grad_from_loop = gradients_impl.gradients(
            cost, [embedding_matrix])[0]
        dynamic_grads = math_ops.segment_sum(
            grad_from_loop.values, grad_from_loop.indices)

        # Reference computation; each reduce_sum stays a separate op.
        looked_up = embedding_ops.embedding_lookup(embedding_matrix, [0])
        static = math_ops.square(
            math_ops.reduce_sum(looked_up) +
            math_ops.reduce_sum(looked_up) +
            math_ops.reduce_sum(looked_up)) + math_ops.reduce_sum(looked_up)
        grad_from_static = gradients_impl.gradients(
            static, [embedding_matrix])[0]
        static_grads = math_ops.segment_sum(
            grad_from_static.values, grad_from_static.indices)

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            expected, actual = sess.run([static_grads, dynamic_grads])
            self.assertAllEqual(expected, actual)
import tensorflow as tf from tensorflow.python.ops.init_ops import random_normal_initializer weights_init = random_normal_initializer(mean=0.0, stddev=0.1) def block_5x5(inputs, filters=32): branch_5x5 = tf.layers.conv2d(inputs, kernel_size=(1, 1), strides=1, filters=filters, activation=tf.nn.relu, kernel_initializer=weights_init) branch_5x5 = tf.layers.conv2d(branch_5x5, kernel_size=(5, 5), strides=1, filters=filters, padding='same', kernel_initializer=weights_init, activation=tf.nn.relu) branch_3x3 = tf.layers.conv2d(inputs, kernel_size=(1, 1), strides=1, filters=filters, activation=tf.nn.relu) branch_3x3 = tf.layers.conv2d(branch_3x3, kernel_size=(3, 3), strides=1, filters=filters, padding='same', kernel_initializer=weights_init,
def testInitializerDifferent(self):
    """Random normal initializers with different seeds must not match."""
    for dtype in (dtypes.float32, dtypes.float64):
        # Same mean and stddev; only the seed differs.
        seeded_one = init_ops.random_normal_initializer(
            0.0, 1.0, seed=1, dtype=dtype)
        seeded_two = init_ops.random_normal_initializer(
            0.0, 1.0, seed=2, dtype=dtype)
        are_identical = identicaltest(self, seeded_one, seeded_two)
        self.assertFalse(are_identical)
def _testWithAttention(self,
                       create_attention_mechanism,
                       expected_final_output,
                       expected_final_state,
                       attention_mechanism_depth=3,
                       alignment_history=False,
                       expected_final_alignment_history=None,
                       attention_layer_size=6,
                       name=''):
    """Decode with an `AttentionWrapper`-wrapped LSTM and check the results.

    Builds random encoder/decoder inputs, wraps an LSTM cell with the
    attention mechanism produced by `create_attention_mechanism`, runs
    `dynamic_decode` with a `TrainingHelper`, and compares summaries of the
    fetched values (via `get_result_summary`) against the expected structures.

    Args:
      create_attention_mechanism: callable invoked with `num_units`, `memory`
        and `memory_sequence_length` keyword arguments.
      expected_final_output: structure compared against the summarized
        decoder outputs.
      expected_final_state: structure compared against the summarized final
        state.
      attention_mechanism_depth: passed as `num_units` to the mechanism.
      alignment_history: forwarded to `AttentionWrapper`; when truthy the
        stacked alignment history is also fetched and compared.
      expected_final_alignment_history: expected summary of the alignment
        history; only used when `alignment_history` is truthy.
      attention_layer_size: forwarded to `AttentionWrapper`. When `None` the
        expected attention depth falls back to the encoder output depth.
      name: label printed before the copy/paste summary.
    """
    encoder_sequence_length = [3, 2, 3, 1, 1]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size, encoder_max_time, decoder_max_time = 5, 8, 4
    input_depth, encoder_output_depth, cell_depth = 7, 10, 9

    attention_depth = (encoder_output_depth if attention_layer_size is None
                       else attention_layer_size)

    # Decoder values are drawn before encoder values so the order of draws
    # from NumPy's global RNG stays the same.
    decoder_values = np.random.randn(
        batch_size, decoder_max_time, input_depth).astype(np.float32)
    decoder_inputs = array_ops.placeholder_with_default(
        decoder_values, shape=(None, None, input_depth))
    encoder_values = np.random.randn(
        batch_size, encoder_max_time, encoder_output_depth).astype(np.float32)
    encoder_outputs = array_ops.placeholder_with_default(
        encoder_values, shape=(None, None, encoder_output_depth))

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session(use_gpu=True) as sess:
        seeded_init = init_ops.random_normal_initializer(stddev=0.01, seed=3)
        with vs.variable_scope('root', initializer=seeded_init):
            lstm = rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.AttentionWrapper(
                lstm,
                attention_mechanism,
                attention_layer_size=attention_layer_size,
                alignment_history=alignment_history)
            training_helper = helper_py.TrainingHelper(
                decoder_inputs, decoder_sequence_length)
            zero_state = cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size)
            training_decoder = basic_decoder.BasicDecoder(
                cell=cell, helper=training_helper, initial_state=zero_state)

            final_outputs, final_state, _ = decoder.dynamic_decode(
                training_decoder)

        # Structural checks on the returned objects.
        self.assertTrue(
            isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
        self.assertTrue(
            isinstance(final_state, wrapper.AttentionWrapperState))
        self.assertTrue(
            isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))

        # Static shape checks: (expected shape, tensor) pairs.
        static_shape_checks = (
            ((batch_size, None, attention_depth), final_outputs.rnn_output),
            ((batch_size, None), final_outputs.sample_id),
            ((batch_size, attention_depth), final_state.attention),
            ((batch_size, cell_depth), final_state.cell_state.c),
            ((batch_size, cell_depth), final_state.cell_state.h),
        )
        for expected_shape, tensor in static_shape_checks:
            self.assertEqual(
                expected_shape, tuple(tensor.get_shape().as_list()))

        if not alignment_history:
            state_alignment_history = ()
        else:
            state_alignment_history = final_state.alignment_history.stack()
            # Remove the history from final_state for purposes of the
            # remainder of the tests.
            final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
            self.assertEqual(
                (None, batch_size, None),
                tuple(state_alignment_history.get_shape().as_list()))

        sess.run(variables.global_variables_initializer())

        fetched = sess.run({
            'final_outputs': final_outputs,
            'final_state': final_state,
            'state_alignment_history': state_alignment_history,
        })

        final_output_info = nest.map_structure(
            get_result_summary, fetched['final_outputs'])
        final_state_info = nest.map_structure(
            get_result_summary, fetched['final_state'])

        # Printed so the expected values can be regenerated by copy/paste.
        print(name)
        print('Copy/paste:\nexpected_final_output = %s' %
              str(final_output_info))
        print('expected_final_state = %s' % str(final_state_info))

        nest.map_structure(self.assertAllCloseOrEqual,
                           expected_final_output, final_output_info)
        nest.map_structure(self.assertAllCloseOrEqual,
                           expected_final_state, final_state_info)

        if alignment_history:
            # by default, the wrapper emits attention as output
            final_alignment_history_info = nest.map_structure(
                get_result_summary, fetched['state_alignment_history'])
            print('expected_final_alignment_history = %s' %
                  str(final_alignment_history_info))
            # outputs are batch major but the stacked TensorArray is time major
            nest.map_structure(self.assertAllCloseOrEqual,
                               expected_final_alignment_history,
                               final_alignment_history_info)
def default_loc_scale_fn(
        is_singular=False,
        loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
        untransformed_scale_initializer=init_ops.random_normal_initializer(
            mean=-3., stddev=0.1),
        loc_regularizer=None,
        untransformed_scale_regularizer=None,
        loc_constraint=None,
        untransformed_scale_constraint=None):
    """Builds a closure that creates `loc` and `scale` parameters.

    The returned closure creates its variables through a caller-supplied
    `tf.get_variable`-like function and takes these arguments:

      dtype: Type of the parameter's event.
      shape: Python `list`-like representing the parameter's event shape.
      name: Python `str` prefix for the created (or existing) variables; the
        suffixes `"_loc"` and `"_untransformed_scale"` are appended.
      trainable: Python `bool` passed through to `add_variable_fn`.
      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
        access existing) `tf.Variable`s.

    Args:
      is_singular: Python `bool`; when true the closure returns `None` for
        `scale` and creates no scale variable. Default: `False`.
      loc_initializer: Initializer for the `loc` parameters. Default:
        `tf.random_normal_initializer(mean=0., stddev=0.1)`.
      untransformed_scale_initializer: Initializer for the pre-softplus
        `scale` parameters. Default:
        `tf.random_normal_initializer(mean=-3., stddev=0.1)`, which implies
        the softplus-transformed result has mean approximately `0.05` and
        std. deviation approximately `0.005`.
      loc_regularizer: Regularizer for the `loc` parameters, passed to
        `add_variable_fn` as `regularizer`. Default: `None`.
      untransformed_scale_regularizer: Regularizer for the pre-softplus
        `scale` parameters. Default: `None`.
      loc_constraint: Optional projection function passed to
        `add_variable_fn` as `constraint` for `loc`. Default: `None`.
      untransformed_scale_constraint: Optional projection function passed to
        `add_variable_fn` as `constraint` for the pre-softplus `scale`.
        Default: `None`.

    Returns:
      default_loc_scale_fn: Python `callable` that instantiates `loc`, `scale`
      from args `dtype, shape, name, trainable, add_variable_fn`.
    """

    def _fn(dtype, shape, name, trainable, add_variable_fn):
        """Creates `loc`, `scale` parameters."""
        # Arguments shared by both variable creations.
        shared = dict(shape=shape, dtype=dtype, trainable=trainable)

        loc = add_variable_fn(
            name=name + "_loc",
            initializer=loc_initializer,
            regularizer=loc_regularizer,
            constraint=loc_constraint,
            **shared)
        if is_singular:
            return loc, None

        raw_scale = add_variable_fn(
            name=name + "_untransformed_scale",
            initializer=untransformed_scale_initializer,
            regularizer=untransformed_scale_regularizer,
            constraint=untransformed_scale_constraint,
            **shared)
        # Machine epsilon of the dtype is added to softplus(raw_scale).
        epsilon = np.finfo(dtype.as_numpy_dtype).eps
        scale = epsilon + nn_ops.softplus(raw_scale)
        return loc, scale

    return _fn
def _fc(self, bottom, out_size, name):
    """Build a fully connected layer: `matmul(bottom, weights) + biases`.

    Args:
      bottom: input tensor; its static shape must unpack into exactly two
        dimensions, the second of which is used as the input size.
      out_size: number of output units.
      name: variable-scope name; also used as the prefix of the variable
        names (`<name>_weights`, `<name>_biases`).

    Returns:
      The tensor produced by `tf.nn.bias_add(tf.matmul(bottom, weights),
      biases)`.
    """
    with tf.variable_scope(name):
        _, size = bottom.get_shape().as_list()
        weights = tf.get_variable(
            name=name + "_weights",
            shape=[size, out_size],
            initializer=init_ops.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            name=name + "_biases",
            shape=[out_size],
            initializer=init_ops.random_normal_initializer(stddev=0.01))
        # Function-call form: valid on Python 3 and prints the same single
        # value on Python 2 (the original `print weights` statement is a
        # SyntaxError on Python 3).
        print(weights)
        fc = tf.nn.bias_add(tf.matmul(bottom, weights), biases)
        return fc
def _testWithAttention(self,
                       create_attention_mechanism,
                       expected_final_outputs,
                       expected_final_state,
                       attention_mechanism_depth=3):
    """Decode with a `DynamicAttentionWrapper` LSTM and check the results.

    Random NumPy arrays serve as encoder memory and decoder inputs. An LSTM
    cell wrapped with the attention mechanism is decoded through a
    `BasicDecoder` driven by a `TrainingHelper`; static shapes are checked
    and the fetched values are compared with `assertAllClose`.

    Args:
      create_attention_mechanism: callable invoked with `num_units`, `memory`
        and `memory_sequence_length` keyword arguments.
      expected_final_outputs: structure compared against the fetched decoder
        outputs.
      expected_final_state: structure compared against the fetched final
        state.
      attention_mechanism_depth: passed as `num_units` to the mechanism.
    """
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size, encoder_max_time, decoder_max_time = 5, 8, 4
    input_depth, encoder_output_depth = 7, 10
    cell_depth, attention_depth = 9, 6

    # Decoder inputs are drawn before encoder outputs so the order of draws
    # from NumPy's global RNG stays the same.
    decoder_shape = (batch_size, decoder_max_time, input_depth)
    decoder_inputs = np.random.randn(*decoder_shape).astype(np.float32)
    encoder_shape = (batch_size, encoder_max_time, encoder_output_depth)
    encoder_outputs = np.random.randn(*encoder_shape).astype(np.float32)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session() as sess:
        seeded_init = init_ops.random_normal_initializer(stddev=0.01, seed=3)
        with vs.variable_scope("root", initializer=seeded_init):
            lstm = core_rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.DynamicAttentionWrapper(
                lstm, attention_mechanism, attention_size=attention_depth)
            training_helper = helper_py.TrainingHelper(
                decoder_inputs, decoder_sequence_length)
            zero_state = cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size)
            training_decoder = basic_decoder.BasicDecoder(
                cell=cell, helper=training_helper, initial_state=zero_state)

            final_outputs, final_state = decoder.dynamic_decode(
                training_decoder)

        # Structural checks on the returned objects.
        self.assertTrue(
            isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
        self.assertTrue(
            isinstance(final_state, wrapper.DynamicAttentionWrapperState))
        self.assertTrue(
            isinstance(final_state.cell_state, core_rnn_cell.LSTMStateTuple))

        # Static shape checks: (expected shape, tensor) pairs.
        static_shape_checks = (
            ((batch_size, None, attention_depth), final_outputs.rnn_output),
            ((batch_size, None), final_outputs.sample_id),
            ((batch_size, attention_depth), final_state.attention),
            ((batch_size, cell_depth), final_state.cell_state.c),
            ((batch_size, cell_depth), final_state.cell_state.h),
        )
        for expected_shape, tensor in static_shape_checks:
            self.assertEqual(
                expected_shape, tuple(tensor.get_shape().as_list()))

        sess.run(variables.global_variables_initializer())

        fetched = sess.run({
            "final_outputs": final_outputs,
            "final_state": final_state
        })

        nest.map_structure(self.assertAllClose,
                           expected_final_outputs, fetched["final_outputs"])
        nest.map_structure(self.assertAllClose,
                           expected_final_state, fetched["final_state"])
def __init__(self,
             filters,
             dau_units,
             max_kernel_size,
             strides=1,
             data_format='channels_first',
             activation=None,
             use_bias=True,
             weight_initializer=init_ops.random_normal_initializer(stddev=0.1),
             mu1_initializer=None,
             mu2_initializer=None,
             sigma_initializer=None,
             bias_initializer=init_ops.zeros_initializer(),
             weight_regularizer=None,
             mu1_regularizer=None,
             mu2_regularizer=None,
             sigma_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             weight_constraint=None,
             mu1_constraint=None,
             mu2_constraint=None,
             sigma_constraint=None,
             bias_constraint=None,
             trainable=True,
             mu_learning_rate_factor=500,
             dau_unit_border_bound=0.01,
             dau_unit_single_dim=False,
             dau_aggregation_forbid_positive_dim1=False,
             unit_testing=False,  # for compatibility between CPU and GPU version (where gradients of last edge need to be ignored) during unit testing
             name=None,
             **kwargs):
    """Store the DAU convolution configuration and derive the unit counts.

    Args:
      filters: stored as `self.filters`.
      dau_units: normalized to a 2-tuple via `utils.normalize_tuple`.
      max_kernel_size: stored; `floor(max_kernel_size / 2)` becomes
        `self.padding` and bounds the default mu initializers.
      strides: stored; a warning is logged when `strides > 1`.
      data_format: normalized via `utils.normalize_data_format`.
      activation, use_bias: stored unchanged.
      weight_initializer, bias_initializer: stored; the weight initializer is
        wrapped in `ZeroNLast` when dummy units have to be added.
      mu1_initializer, mu2_initializer: default to `DAUGridMean` over the
        (unpadded) `dau_units` on axis 2 and axis 1 respectively.
      sigma_initializer: defaults to `constant_initializer(0.5)`.
      *_regularizer, *_constraint: stored unchanged.
      activity_regularizer, trainable, name, **kwargs: forwarded to the
        parent constructor.
      mu_learning_rate_factor, dau_unit_border_bound, dau_unit_single_dim,
        dau_aggregation_forbid_positive_dim1, unit_testing: stored unchanged.
    """
    super(DAUConv2d, self).__init__(
        trainable=trainable,
        name=name,
        activity_regularizer=activity_regularizer,
        **kwargs)

    self.rank = 2
    self.filters = filters
    self.dau_units = utils.normalize_tuple(
        dau_units, self.rank, 'dau_components')
    self.max_kernel_size = max_kernel_size
    half_kernel = np.floor(self.max_kernel_size / 2.0)
    self.padding = half_kernel
    self.strides = strides
    self.data_format = utils.normalize_data_format(data_format)
    self.activation = activation
    self.use_bias = use_bias

    self.bias_initializer = bias_initializer
    self.bias_regularizer = bias_regularizer
    self.bias_constraint = bias_constraint

    self.weight_initializer = weight_initializer
    self.weight_regularizer = weight_regularizer
    self.weight_constraint = weight_constraint

    # Defaults for the mu/sigma initializers. These are built BEFORE the
    # unit count is padded below, so they see the caller's dau_units.
    self.mu1_initializer = (
        mu1_initializer if mu1_initializer is not None else DAUGridMean(
            dau_units=self.dau_units,
            max_value=half_kernel - 1,
            dau_unit_axis=2))
    self.mu1_regularizer = mu1_regularizer
    self.mu1_constraint = mu1_constraint

    self.mu2_initializer = (
        mu2_initializer if mu2_initializer is not None else DAUGridMean(
            dau_units=self.dau_units,
            max_value=half_kernel - 1,
            dau_unit_axis=1))
    self.mu2_regularizer = mu2_regularizer
    self.mu2_constraint = mu2_constraint

    self.sigma_initializer = (
        sigma_initializer if sigma_initializer is not None else
        init_ops.constant_initializer(0.5))
    self.sigma_regularizer = sigma_regularizer
    self.sigma_constraint = sigma_constraint

    self.mu_learning_rate_factor = mu_learning_rate_factor
    self.unit_testing = unit_testing
    self.input_spec = base.InputSpec(ndim=self.rank + 2)
    self.dau_unit_border_bound = dau_unit_border_bound

    self.num_dau_units_all = np.int32(np.prod(self.dau_units))
    self.num_dau_units_ignore = 0
    self.dau_unit_single_dim = dau_unit_single_dim
    self.dau_aggregation_forbid_positive_dim1 = (
        dau_aggregation_forbid_positive_dim1)

    # If the number of units per channel is not a multiple of DAU_UNITS_GROUP,
    # add dummy units until it is: computation is always done on groups of
    # units at the same time (the fast version can only handle a factor of 2).
    # The dummy units effectively get weight=0 through ZeroNLast.
    group_size = self.DAU_UNITS_GROUP
    if self.num_dau_units_all % group_size != 0:
        padded_total = np.int32(
            np.ceil(self.num_dau_units_all / float(group_size)) * group_size)
        self.num_dau_units_ignore = padded_total - self.num_dau_units_all

        # Grow the smaller dimension; grow the second one on ties.
        dim0, dim1 = self.dau_units[0], self.dau_units[1]
        if dim0 < dim1:
            self.dau_units = (dim0 + self.num_dau_units_ignore, dim1)
        else:
            self.dau_units = (dim0, dim1 + self.num_dau_units_ignore)

        self.num_dau_units_all = padded_total
        self.weight_initializer = ZeroNLast(
            self.weight_initializer,
            last_num_to_zero=self.num_dau_units_ignore,
            axis=2)

    # Variables are not created here; the slots start out empty.
    self.dau_weights = None
    self.dau_mu1 = None
    self.dau_mu2 = None
    self.dau_sigma = None

    # Show a notice when using stride>1: this is not implemented by the CUDA
    # code and is only emulated (it has the same computational requirements
    # as stride=1).
    if self.strides > 1:
        tf.logging.warning(
            'NOTICE: using stride>=2 in DAU convolution uses the same computational resources as with '
            'stride=1 (current implementation only emulates stride>=2 using tensor slicing).'
        )
def test_dynamic_rnn_decoder_time_major(self):
    """Run `dynamic_rnn_decoder` in time-major mode for training and inference.

    A GRU encoder is built with `rnn.dynamic_rnn`, then a training decoder and
    an inference decoder are built in the same "decoder" variable scope (the
    second after `scope.reuse_variables()`). Both are run and the shapes of
    the fetched outputs, states and context states are asserted.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)) as varscope:
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            # Constant inputs, laid out [time, batch, depth] (time_major=True
            # is passed to every RNN call below).
            inputs = constant_op.constant(
                0.5,
                shape=[input_sequence_length, batch_size, encoder_embedding_size])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[decoder_sequence_length, batch_size, decoder_embedding_size])
            decoder_length = constant_op.constant(
                decoder_sequence_length, dtype=dtypes.int32, shape=[batch_size,])
            with variable_scope.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                # NOTE: `scope` is looked up when output_fn is CALLED, not
                # here. Both call sites are inside the "decoder" block below,
                # where `scope` has been rebound, so the linear layer is
                # created under that scope.
                output_fn = lambda x: layers.linear(x, num_decoder_symbols, scope=scope)

                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with variable_scope.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))
                (decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train) = (seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_train,
                     inputs=decoder_inputs,
                     sequence_length=decoder_length,
                     time_major=True,
                     scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                # Everything built after this point in `scope` reuses the
                # variables created for the training decoder above.
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        # TODO: find out why it goes to +1
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference) = (seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_inference,
                     time_major=True,
                     scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res,
             decoder_context_state_train_res) = sess.run([
                 decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train
             ])
            (decoder_outputs_inference_res, decoder_state_inference_res,
             decoder_context_state_inference_res) = sess.run([
                 decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference
             ])

            # Assert outputs
            self.assertEqual((decoder_sequence_length, batch_size,
                              num_decoder_symbols),
                             decoder_outputs_train_res.shape)
            # Only the trailing (batch, symbols) dims are pinned for
            # inference; the leading time dim may vary.
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_inference_res)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_train_res)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference
            # NOTE(review): this checks shape[0] of the *state*, which the
            # assertion above pins to batch_size; the time dimension lives in
            # decoder_outputs_inference_res.shape[0]. Confirm which was
            # intended.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_state_inference_res.shape[0])
def test_attention(self):
    """Run the attention decoder functions for training and inference.

    A GRU encoder is built with `rnn.dynamic_rnn` (time-major), its outputs
    are transposed into attention states, and `prepare_attention` is called
    with the "luong" option. A training decoder and an inference decoder are
    then built in the same "decoder" scope (the second after
    `scope.reuse_variables()`), run, and their output/state shapes asserted.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            # Constant inputs, laid out [time, batch, depth] (time_major=True
            # is passed to every RNN call below).
            inputs = constant_op.constant(
                0.5,
                shape=[input_sequence_length, batch_size, encoder_embedding_size])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[decoder_sequence_length, batch_size, decoder_embedding_size])
            decoder_length = constant_op.constant(
                decoder_sequence_length, dtype=dtypes.int32, shape=[batch_size,])

            # attention
            attention_option = "luong"  # can be "bahdanau"

            with variable_scope.variable_scope("rnn") as scope:
                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

                # attention_states: size [batch_size, max_time, num_units]
                # (the transpose swaps the first two axes of the time-major
                # encoder outputs).
                attention_states = array_ops.transpose(encoder_outputs, [1, 0, 2])

            with variable_scope.variable_scope("decoder") as scope:
                # Prepare attention
                (attention_keys, attention_values, attention_score_fn,
                 attention_construct_fn) = (attention_decoder_fn.prepare_attention(
                     attention_states, attention_option, decoder_hidden_size))
                decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                    encoder_state=encoder_state,
                    attention_keys=attention_keys,
                    attention_values=attention_values,
                    attention_score_fn=attention_score_fn,
                    attention_construct_fn=attention_construct_fn)

                # setting up weights for computing the final output
                def create_output_fn():

                    def output_fn(x):
                        # `scope` is the enclosing "decoder" scope variable,
                        # resolved when output_fn is called.
                        return layers.linear(x, num_decoder_symbols, scope=scope)

                    return output_fn

                output_fn = create_output_fn()

                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                (decoder_outputs_train, decoder_state_train, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_train,
                        inputs=decoder_inputs,
                        sequence_length=decoder_length,
                        time_major=True,
                        scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                # Everything built after this point in `scope` reuses the
                # variables created for the training decoder above.
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = (
                    attention_decoder_fn.attention_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference, decoder_state_inference, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_inference,
                        time_major=True,
                        scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res) = sess.run(
                [decoder_outputs_train, decoder_state_train])
            (decoder_outputs_inference_res, decoder_state_inference_res) = sess.run(
                [decoder_outputs_inference, decoder_state_inference])

            # Assert outputs
            self.assertEqual((decoder_sequence_length, batch_size,
                              num_decoder_symbols),
                             decoder_outputs_train_res.shape)
            # Only the trailing (batch, symbols) dims are pinned for
            # inference; the leading time dim may vary.
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference
            # NOTE(review): this checks shape[0] of the *state*, which the
            # assertion above pins to batch_size; the time dimension lives in
            # decoder_outputs_inference_res.shape[0]. Confirm which was
            # intended.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_state_inference_res.shape[0])
def embedding_attention_decoder(encoder_mask,
                                decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                embedding_size,
                                beam_size,
                                output_size=None,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=False):
    """RNN decoder with embedding and attention and a pure-decoding option.

    Creates the target-side embedding, embeds every decoder input and hands
    the result to `attention_decoder`.

    Args:
      encoder_mask: mask of the encoder inputs; forwarded unchanged to
        `attention_decoder`.
      decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: Integer, how many symbols come into the embedding.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      beam_size: the beam size of beam search; forwarded to
        `attention_decoder`.
      output_size: Size of the output vectors; if None, `cell.output_size` is
        used.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_symbols] and B has shape
        [num_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and added B.
      num_layers: forwarded to `attention_decoder` as `num_layers`.
      feed_previous: Boolean; if True, a loop function built by
        `_extract_argmax_and_embed` is passed to `attention_decoder`, so that
        only the first of decoder_inputs (the "GO" symbol) is used and later
        inputs are generated from previous outputs. If False, no loop
        function is used and decoder_inputs are used as given.
      update_embedding_for_previous: Boolean; forwarded to
        `_extract_argmax_and_embed`. Has no effect if feed_previous=False.
      dtype: dtype of the embedding variable and of the converted projection
        biases (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_decoder".
      initial_state_attention: forwarded to `attention_decoder`. If False
        (default), initial attentions are zero. If True, initialize the
        attentions from the initial state.

    Returns:
      Whatever `attention_decoder` returns. Per the original documentation
      this is a tuple (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          with shape [batch_size x output_size] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step, a 2D
          Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best
          translation generated by beam search.

    Raises:
      ValueError: When output_projection has the wrong shape.
    """
    if output_size is None:
        output_size = cell.output_size

    if output_projection is not None:
        # Only validates the bias shape; the tensor itself is not used here.
        projection_bias = ops.convert_to_tensor(
            output_projection[1], dtype=dtype)
        projection_bias.get_shape().assert_is_compatible_with([num_symbols])

    scope_name = scope or "embedding_attention_decoder"
    with variable_scope.variable_scope(scope_name):
        # word embeddings of target words
        embedding = variable_scope.get_variable(
            "embedding", [num_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(
                0, 0.01, seed=SEED))

        # loop function for generating
        if feed_previous:
            loop_function = _extract_argmax_and_embed(
                embedding, num_symbols, output_projection,
                update_embedding_for_previous)
        else:
            loop_function = None

        embedded_inputs = []
        for symbol_ids in decoder_inputs:
            embedded_inputs.append(
                embedding_ops.embedding_lookup(embedding, symbol_ids))

        return attention_decoder(
            encoder_mask,
            embedded_inputs,
            initial_state,
            attention_states,
            cell,
            beam_size,
            output_size=output_size,
            num_layers=num_layers,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
def embedding_attention_seq2seq(encoder_inputs,
                                encoder_mask,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                beam_size,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of
    shape [num_encoder_symbols x embedding_size]). Then it runs a
    bidirectional RNN to encode the embedded encoder_inputs, keeping the
    outputs of every step to use for attention later. Next, it calls
    `embedding_attention_decoder`, which embeds decoder_inputs with its own
    embedding and runs the attention decoder, initialized with the encoder
    state, attending to the encoder outputs.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      encoder_mask: the mask of encoder inputs that labels where the PADs
        are. It is summed over axis 1 to obtain the source sentence lengths
        and is also forwarded to the decoder.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each
        symbol.
      beam_size: the beam size of beam search; forwarded to the decoder.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B. When
        None, the cell is wrapped in an `OutputProjectionWrapper` projecting
        to num_decoder_symbols.
      num_layers: forwarded to the decoder as `num_layers`.
      feed_previous: Boolean; if True, only the first of decoder_inputs will
        be used (the "GO" symbol), and all other decoder inputs will be taken
        from previous outputs. If False, decoder_inputs are used as given
        (the standard decoder case).
      dtype: The dtype of the embeddings and of the encoder RNN (default:
        tf.float32). It is also forwarded to the decoder so that both
        embeddings share one dtype.
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If True (default), initialize the attentions
        from the initial state and attention states. If False, initial
        attentions are zero.

    Returns:
      Whatever `embedding_attention_decoder` returns. Per the original
      documentation this is a tuple (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors
          with shape [batch_size x num_decoder_symbols] containing the
          generated outputs.
        state: The state of each decoder cell at the final time-step, a 2D
          Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best
          translation generated by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # word embeddings of source words
        embedding = variable_scope.get_variable(
            "embedding", [num_encoder_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(
                0, 0.01, seed=SEED))
        # wrap encoder cell with embedding
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell,
            embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size,
            embedding=embedding)
        # get the sentence lengths of source sentences
        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])
        # encode source sentences with a bidirectional_rnn encoder
        # (the same wrapped cell is used for both directions)
        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
            encoder_cell,
            encoder_cell,
            encoder_inputs,
            sequence_length=encoder_lens,
            dtype=dtype)

        # First calculate a concatenation of encoder outputs. Each step's
        # output is reshaped to [-1, 1, 2 * cell.output_size] (forward and
        # backward halves) and the steps are joined along axis 1.
        top_states = [
            array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        return embedding_attention_decoder(
            encoder_mask,
            decoder_inputs,
            encoder_state,
            attention_states,
            cell,
            num_decoder_symbols,
            embedding_size,
            beam_size=beam_size,
            output_size=output_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=feed_previous,
            # Forward the requested dtype; previously the decoder silently
            # fell back to its own float32 default.
            dtype=dtype,
            initial_state_attention=initial_state_attention)
def _testWithAttention(self,
                       create_attention_mechanism,
                       expected_final_outputs,
                       expected_final_state,
                       attention_mechanism_depth=3):
    """Run a `DynamicAttentionWrapper` decoder and compare with expectations.

    Encoder memory and decoder inputs are random NumPy arrays. The wrapped
    LSTM cell is decoded through `BasicDecoder` with a `TrainingHelper`;
    object types and static shapes are asserted before the graph is run, and
    the fetched outputs and state are then compared with `assertAllClose`.

    Args:
      create_attention_mechanism: callable invoked with `num_units`, `memory`
        and `memory_sequence_length` keyword arguments.
      expected_final_outputs: structure compared against the fetched decoder
        outputs.
      expected_final_state: structure compared against the fetched final
        state.
      attention_mechanism_depth: passed as `num_units` to the mechanism.
    """
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time, decoder_max_time = 8, 4
    input_depth, encoder_output_depth = 7, 10
    cell_depth, attention_depth = 9, 6

    def random_float32(*dims):
        # Standard-normal sample from NumPy's global RNG, cast to float32.
        return np.random.randn(*dims).astype(np.float32)

    def static_shape(tensor):
        return tuple(tensor.get_shape().as_list())

    # Decoder inputs are drawn first so the RNG draw order is unchanged.
    decoder_inputs = random_float32(batch_size, decoder_max_time, input_depth)
    encoder_outputs = random_float32(
        batch_size, encoder_max_time, encoder_output_depth)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session() as sess:
        root_initializer = init_ops.random_normal_initializer(
            stddev=0.01, seed=3)
        with vs.variable_scope("root", initializer=root_initializer):
            base_cell = core_rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.DynamicAttentionWrapper(
                base_cell, attention_mechanism, attention_size=attention_depth)
            helper = helper_py.TrainingHelper(
                decoder_inputs, decoder_sequence_length)
            start_state = cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size)
            basic = basic_decoder.BasicDecoder(
                cell=cell, helper=helper, initial_state=start_state)

            final_outputs, final_state = decoder.dynamic_decode(basic)

        # Types of the returned structures.
        self.assertTrue(
            isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
        self.assertTrue(
            isinstance(final_state, wrapper.DynamicAttentionWrapperState))
        self.assertTrue(
            isinstance(final_state.cell_state, core_rnn_cell.LSTMStateTuple))

        # Static shapes known at graph-construction time.
        self.assertEqual((batch_size, None, attention_depth),
                         static_shape(final_outputs.rnn_output))
        self.assertEqual((batch_size, None),
                         static_shape(final_outputs.sample_id))
        self.assertEqual((batch_size, attention_depth),
                         static_shape(final_state.attention))
        self.assertEqual((batch_size, cell_depth),
                         static_shape(final_state.cell_state.c))
        self.assertEqual((batch_size, cell_depth),
                         static_shape(final_state.cell_state.h))

        sess.run(variables.global_variables_initializer())

        to_fetch = {
            "final_outputs": final_outputs,
            "final_state": final_state
        }
        results = sess.run(to_fetch)

        nest.map_structure(self.assertAllClose,
                           expected_final_outputs, results["final_outputs"])
        nest.map_structure(self.assertAllClose,
                           expected_final_state, results["final_state"])
def attention_decoder(encoder_mask_1, encoder_mask_2, decoder_inputs,
                      initial_state, attention_states_1, attention_states_2,
                      cell, beam_size, output_size=None, num_heads=1,
                      loop_function=None, dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
    """RNN decoder attending over two attention tensors, with beam search.

    At every step one attention read per head is computed over each of
    `attention_states_1` and `attention_states_2`; the two reads are mixed as
    `alpha * read_1 + beta * read_2` (`alpha` and `beta` are resolved from the
    enclosing module). The first head's mixed read is handed to `cell` as its
    third argument, and the step output is a maxout over pairs of
    `linear([state, inp] + attns)`.

    Args:
      encoder_mask_1: mask multiplied (after `to_float`) into the
        exponentiated attention scores of `attention_states_1` before
        normalisation -- presumably [batch_size x attn_length_1]; confirm
        against callers.
      encoder_mask_2: same, for `attention_states_2`.
      decoder_inputs: A non-empty list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor; it is projected with `linear` + `tanh` to
        `cell.output_size` before decoding starts.
      attention_states_1: 3D Tensor [batch_size x attn_length_1 x attn_size].
      attention_states_2: 3D Tensor [batch_size x attn_length_2 x attn_size].
        `attn_size` is read from `attention_states_1` only and used to
        reshape both tensors.
      cell: called as `cell(inp, state, attn)`; the first element of its
        result is used as the new state.
      beam_size: forwarded unchanged to `loop_function`.
      output_size: Size passed to the output projection; if None,
        `cell.output_size` is used. The emitted outputs have
        `output_size // 2` features because of the maxout.
      num_heads: Number of attention heads; must be >= 1.
      loop_function: If not None, called from the second step on as
        `loop_function(prev, prev_probs, beam_size, i)` and expected to
        return `(inp, prev_probs, index, prev_symbol)`. `index` is used to
        `gather` (reorder) the state, previous outputs and symbols. It is
        called once more after the last step.
      dtype: dtype of the zero initial attention vectors.
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".
      initial_state_attention: If False (default), the attentions fed to the
        first step are zero. If True, they are computed from the initial
        state and the attention states.

    Returns:
      A tuple `(outputs, state, symbols)`:
        outputs: list with one 2D Tensor per decoder input. When
          `loop_function` is set, only row 0 of each is kept (leading
          dimension 1).
        state: the final state; row 0 only when `loop_function` is set.
        symbols: `[]` when `loop_function` is None, otherwise row 0 of every
          symbol tensor returned by `loop_function`.

    Raises:
      ValueError: when there are no inputs, num_heads is not positive, dims 1
        and 2 of either attention tensor are not statically known, or the
        input size cannot be inferred from an input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    # Both dim 1 (length) and dim 2 (depth) are read statically below, so the
    # slice has to cover [1:3]; [1:2] would only validate dim 1.
    for attention_states in (attention_states_1, attention_states_2):
        if not attention_states.get_shape()[1:3].is_fully_defined():
            raise ValueError(
                "Shape[1] and [2] of attention_states must be known: %s"
                % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length_1 = attention_states_1.get_shape()[1].value
        attn_length_2 = attention_states_2.get_shape()[1].value
        attn_size = attention_states_1.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
        # before.
        hidden_1 = array_ops.reshape(
            attention_states_1, [-1, attn_length_1, 1, attn_size])
        hidden_2 = array_ops.reshape(
            attention_states_2, [-1, attn_length_2, 1, attn_size])
        attention_vec_size = cell.output_size  # Size of query vectors for attention.

        # Project the supplied state to the decoder size before decoding.
        initial_state = math_ops.tanh(
            linear(initial_state, attention_vec_size, False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        hidden_features_1, v_1 = [], []
        hidden_features_2, v_2 = [], []
        with variable_scope.variable_scope("attention_1"):
            for head in xrange(num_heads):
                k_1 = variable_scope.get_variable(
                    "AttnW_%d" % head,
                    [1, 1, attn_size, attention_vec_size],
                    initializer=init_ops.random_normal_initializer(
                        0, 0.001, seed=SEED))
                hidden_features_1.append(
                    nn_ops.conv2d(hidden_1, k_1, [1, 1, 1, 1], "SAME"))
                v_1.append(variable_scope.get_variable(
                    "AttnV_%d" % head, [attention_vec_size],
                    initializer=init_ops.constant_initializer(0.0)))
        with variable_scope.variable_scope("attention_2"):
            for head in xrange(num_heads):
                k_2 = variable_scope.get_variable(
                    "AttnW_%d" % head,
                    [1, 1, attn_size, attention_vec_size],
                    initializer=init_ops.random_normal_initializer(
                        0, 0.001, seed=SEED))
                hidden_features_2.append(
                    nn_ops.conv2d(hidden_2, k_2, [1, 1, 1, 1], "SAME"))
                v_2.append(variable_scope.get_variable(
                    "AttnV_%d" % head, [attention_vec_size],
                    initializer=init_ops.constant_initializer(0.0)))

        def attention(query, hidden, hidden_features, v, encoder_mask,
                      attn_length, scope=None):
            """Return per-head (attention reads, attention weights)."""
            with variable_scope.variable_scope(scope or "attention"):
                reads = []  # Results of attention reads will be stored here.
                weights = []
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)
                for head in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % head):
                        y = linear(
                            query, attention_vec_size, False,
                            weight_initializer=(
                                init_ops.random_normal_initializer(
                                    0, 0.001, seed=SEED)))
                        y = array_ops.reshape(
                            y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[head] * math_ops.tanh(hidden_features[head] + y),
                            [2, 3])
                        # Masked softmax written out by hand: shift by the row
                        # max, exponentiate, multiply by the encoder mask,
                        # then renormalise each row.
                        s = array_ops.transpose(
                            array_ops.transpose(s)
                            - math_ops.reduce_max(s, [1]))
                        s = math_ops.exp(s)
                        s = math_ops.to_float(encoder_mask) * s
                        a = array_ops.transpose(
                            array_ops.transpose(s)
                            / math_ops.reduce_sum(s, [1]))
                        weights.append(a)
                        # Attention-weighted sum over the attended positions.
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1])
                            * hidden, [1, 2])
                        reads.append(array_ops.reshape(d, [-1, attn_size]))
            return reads, weights

        outputs = []
        state = initial_state
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = []

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    # Reorder everything carried over from earlier steps by
                    # the index returned from loop_function.
                    state = array_ops.gather(state, index)
                    attns = [array_ops.gather(attn, index) for attn in attns]
                    outputs = [array_ops.gather(prev_output, index)
                               for prev_output in outputs]
                    symbols = [array_ops.gather(symbol, index)
                               for symbol in symbols]
                    symbols.append(prev_symbol)

            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)

            # Run the attention mechanism.
            if i > 0 or initial_state_attention:
                attns_1, _ = attention(
                    state, hidden_1, hidden_features_1, v_1,
                    encoder_mask_1, attn_length_1, scope="attention_1")
                attns_2, _ = attention(
                    state, hidden_2, hidden_features_2, v_2,
                    encoder_mask_2, attn_length_2, scope="attention_2")
                attns = [alpha * attn_1 + beta * attn_2
                         for attn_1, attn_2 in zip(attns_1, attns_2)]
            else:
                # First step without initial_state_attention: feed zero
                # attentions, as documented. Previously `attns` was left
                # empty here and `attns[0]` below raised IndexError.
                attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                         for _ in xrange(num_heads)]
                for attn in attns:
                    # Ensure the second shape of attention vectors is set.
                    attn.set_shape([None, attn_size])
            # NOTE(review): for steps > 0 attention() runs after
            # reuse_variables(); whether `linear` can create its weights
            # there depends on its implementation -- confirm.

            # Run the RNN.
            state, _ = cell(inp, state, attns[0])
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([state] + [inp] + attns, output_size, False)
                # Maxout over adjacent pairs of projection units.
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # Process the last symbol.
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            state = array_ops.gather(state, index)
            outputs = [array_ops.gather(prev_output, index)
                       for prev_output in outputs]
            symbols = [array_ops.gather(symbol, index) for symbol in symbols]
            symbols.append(prev_symbol)

            # Output the final best result of beam search: keep row 0 only.
            symbols = [array_ops.gather(symbol, 0) for symbol in symbols]
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            outputs = [array_ops.expand_dims(array_ops.gather(out, 0), 0)
                       for out in outputs]

    return outputs, state, symbols
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look
    up information in the additional tensor attention_states, and it does this
    by focusing on a few entries from the tensor. This implementation is
    based on http://arxiv.org/abs/1409.0473.

    Args:
      encoder_mask: the mask of encoder inputs [batch_size x attn_length].
      decoder_inputs: A non-empty list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size]; it is passed
        through `linear` + `tanh` (same width) before decoding starts.
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: called as `cell(concat([inp, context], 1), state)` and expected to
        return `(out_state, state)`.
      beam_size: the beam size of beam search; forwarded to `loop_function`.
      output_size: Size passed to the output projection; if None, we use
        cell.output_size. Because of the maxout the emitted outputs have
        `output_size // 2` features.
      num_layers: the initial state is split into this many equal pieces
        along axis 1 and the last piece is used as the first attention query.
      loop_function: When decoding, called from the second step on as
        `loop_function(prev, prev_probs, beam_size, i)` and expected to
        return `(inp, prev_probs, index, prev_symbol)`; `index` reorders the
        carried state, outputs and symbols. It is called once more after the
        last step.
      dtype: dtype of the zero initial attention vector.
      scope: VariableScope for the created subgraph; default:
        "attention_decoder".
      initial_state_attention: If False (default), initial attentions are
        zero. If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors.
          When `loop_function` is set only row 0 of each is kept.
        state: The final state; row 0 only when `loop_function` is set.
        symbols: When training (no `loop_function`), it is []; when decoding,
          it is row 0 of every symbol tensor returned by `loop_function`.

    Raises:
      ValueError: when there are no inputs or dims 1 and 2 of
        attention_states are not statically known.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Both dim 1 and dim 2 are read statically below, so the slice has to
    # cover [1:3]; [1:2] would only validate dim 1.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s"
            % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        # Reshaped so the key projection below can be a 1-by-1 convolution.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # Compute the initial hidden state of the decoder.
        initial_state = math_ops.tanh(
            linear(initial_state, state_size, False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(
                    0, 0.001, seed=SEED))
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                reads = []  # Results of attention reads will be stored here.
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)
                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query, attention_vec_size, False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # The additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    # Softmax with mask: shift by the row max, exponentiate,
                    # multiply by the encoder mask, renormalise each row.
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    weights = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    context = math_ops.reduce_sum(
                        array_ops.reshape(weights, [-1, attn_length, 1, 1])
                        * hidden, [1, 2])
                    reads.append(array_ops.reshape(context, [-1, attn_size]))
            return reads

        outputs = []
        state = initial_state
        out_state = array_ops.split(state, num_layers, 1)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for attn in attns:
            # Ensure the second shape of attention vectors is set.
            attn.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    # Reorder everything carried over from earlier steps by
                    # the index returned from loop_function.
                    out_state = array_ops.gather(out_state, index)
                    state = array_ops.gather(state, index)
                    attns = [array_ops.gather(attn, index) for attn in attns]
                    outputs = [array_ops.gather(prev_output, index)
                               for prev_output in outputs]
                    symbols = [array_ops.gather(symbol, index)
                               for symbol in symbols]
                    symbols.append(prev_symbol)

            # Run the attention mechanism.
            if i > 0 or initial_state_attention:
                attns = attention(out_state, scope="attention")

            # Run the RNN on the next input concatenated with the context
            # vector.
            cinp = array_ops.concat([inp, attns[0]], 1)
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # Handle the last symbol.
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)
            state = array_ops.gather(state, index)
            outputs = [array_ops.gather(prev_output, index)
                       for prev_output in outputs]
            symbols = [array_ops.gather(symbol, index) for symbol in symbols]
            symbols.append(prev_symbol)

            # Output the best result of beam search: keep row 0 only.
            symbols = [array_ops.gather(symbol, 0) for symbol in symbols]
            out_state = array_ops.expand_dims(
                array_ops.gather(out_state, 0), 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            outputs = [array_ops.expand_dims(array_ops.gather(out, 0), 0)
                       for out in outputs]

    return outputs, state, symbols
def dau_conv1d(inputs,
               filters,
               dau_units,
               max_kernel_size,
               stride=1,
               mu_learning_rate_factor=500,
               data_format=None,
               activation_fn=nn.relu,
               normalizer_fn=None,
               normalizer_params=None,
               weights_initializer=init_ops.random_normal_initializer(
                   stddev=0.1),
               weights_regularizer=None,
               weights_constraint=None,
               mu1_initializer=None,
               mu1_regularizer=None,
               mu1_constraint=None,
               sigma_initializer=None,
               sigma_regularizer=None,
               sigma_constraint=None,
               biases_initializer=init_ops.zeros_initializer(),
               biases_regularizer=None,
               dau_unit_border_bound=0.01,
               dau_aggregation_forbid_positive_dim1=False,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None):
    """Functional wrapper that builds and applies a `DAUConv1d` layer.

    Creates the layer inside a variable scope (default name 'DAUConv'),
    optionally wraps its weight / mu1 / sigma variables with the given
    constraint callables, applies it to `inputs`, registers the layer's
    variables in `variables_collections`, and finally applies the optional
    normalizer and activation.

    Args:
      inputs: value convertible to a rank-4 tensor.
      filters, dau_units, max_kernel_size: forwarded positionally to
        `DAUConv1d`.
      stride: forwarded as `strides`.
      mu_learning_rate_factor: forwarded to `DAUConv1d`.
      data_format: None or 'NCHW'. 'NCHW' selects 'channels_first', None
        selects 'channels_last'.
      activation_fn: applied to the result last, unless None.
      normalizer_fn: if given, applied to the layer output with
        `normalizer_params` as keyword arguments; it also disables the bias.
      normalizer_params: dict of keyword arguments for `normalizer_fn`.
      weights_initializer, mu1_initializer, sigma_initializer,
        biases_initializer: forwarded to `DAUConv1d`. A falsy
        `biases_initializer` disables the bias.
      weights_regularizer, mu1_regularizer, sigma_regularizer,
        biases_regularizer: forwarded to `DAUConv1d`.
      weights_constraint, mu1_constraint, sigma_constraint: optional
        callables; when given, each is applied to the corresponding variable
        created via `layer.add_dau_*_var(inputs.shape)` and the result is
        installed with `set_dau_variables_manually`.
      dau_unit_border_bound, dau_aggregation_forbid_positive_dim1: forwarded
        to `DAUConv1d`.
      reuse: forwarded to the variable scope and the layer.
      variables_collections: collections the layer variables are added to.
      outputs_collections: collections the output is added to.
      trainable: forwarded to `DAUConv1d`.
      scope: optional variable scope name.

    Returns:
      The result of `utils_contrib.collect_named_outputs` for the final
      output tensor.

    Raises:
      ValueError: if `data_format` is neither None nor 'NCHW', or `inputs`
        is not rank 4.
    """
    if data_format not in [None, 'NCHW']:
        raise ValueError('Invalid data_format: %r' % (data_format, ))

    # Maps the layer's internal variable names to the contrib-style names.
    layer_variable_getter = layers_contrib._build_variable_getter({
        'bias': 'biases',
        'weight': 'weights',
        'mu1': 'mu1',
        'sigma': 'sigma'
    })

    with variable_scope.variable_scope(
            scope, 'DAUConv', [inputs], reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)
        input_rank = inputs.get_shape().ndims

        if input_rank != 4:
            # Format the rank into the message; passing it as a second
            # positional argument made str(exc) render as a tuple.
            raise ValueError(
                'DAU convolution not supported for input with rank %r'
                % (input_rank,))

        df = ('channels_first'
              if data_format and data_format.startswith('NC')
              else 'channels_last')

        layer = DAUConv1d(
            filters,
            dau_units,
            max_kernel_size,
            strides=stride,
            data_format=df,
            activation=None,
            use_bias=not normalizer_fn and biases_initializer,
            mu_learning_rate_factor=mu_learning_rate_factor,
            weight_initializer=weights_initializer,
            mu1_initializer=mu1_initializer,
            sigma_initializer=sigma_initializer,
            bias_initializer=biases_initializer,
            weight_regularizer=weights_regularizer,
            mu1_regularizer=mu1_regularizer,
            sigma_regularizer=sigma_regularizer,
            bias_regularizer=biases_regularizer,
            activity_regularizer=None,
            dau_unit_border_bound=dau_unit_border_bound,
            dau_aggregation_forbid_positive_dim1=(
                dau_aggregation_forbid_positive_dim1),
            trainable=trainable,
            unit_testing=False,
            name=sc.name,
            _scope=sc,
            _reuse=reuse)

        # A variable is only created here when a constraint wraps it;
        # otherwise None is handed to the layer.
        dau_weights = weights_constraint(
            layer.add_dau_weights_var(
                inputs.shape)) if weights_constraint is not None else None
        dau_mu1 = mu1_constraint(layer.add_dau_mu1_var(
            inputs.shape)) if mu1_constraint is not None else None
        dau_sigma = sigma_constraint(layer.add_dau_sigma_var(
            inputs.shape)) if sigma_constraint is not None else None

        layer.set_dau_variables_manually(dau_weights, dau_mu1, None,
                                         dau_sigma)

        outputs = layer.apply(inputs)

        # Add variables to collections.
        layers_contrib._add_variable_to_collections(
            layer.dau_weights, variables_collections, 'weights')
        layers_contrib._add_variable_to_collections(
            layer.dau_mu1, variables_collections, 'mu1')
        layers_contrib._add_variable_to_collections(
            layer.dau_sigma, variables_collections, 'sigma')

        if layer.use_bias:
            layers_contrib._add_variable_to_collections(
                layer.bias, variables_collections, 'biases')

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils_contrib.collect_named_outputs(outputs_collections,
                                                   sc.name, outputs)
def fc_layer(self, bottom, out_size, name):
    """Flatten a rank-4 tensor and apply a fully connected layer.

    Args:
      bottom: tensor whose static shape has exactly four entries; the last
        three must be known because their product sizes the weight matrix.
      out_size: number of output units.
      name: variable scope name; also the prefix of the variable names
        (`<name>_weights`, `<name>_biases`).

    Returns:
      `matmul(reshape(bottom, [-1, size]), weights) + biases`.
    """
    with tf.variable_scope(name):
        _, _height, _width, _channel = bottom.get_shape().as_list()
        size = _height * _width * _channel
        weights = tf.get_variable(
            name=name + "_weights",
            shape=[size, out_size],
            initializer=init_ops.random_normal_initializer(stddev=0.01))
        biases = tf.get_variable(
            name=name + "_biases",
            shape=[out_size],
            initializer=init_ops.random_normal_initializer(stddev=0.01))
        # Call form instead of the Python 2 print statement: same output on
        # Python 2, and no longer a SyntaxError on Python 3.
        print(weights)
        x = tf.reshape(bottom, [-1, size])
        fc = tf.nn.bias_add(tf.matmul(x, weights), biases)
        return fc
def call(self, inputs, state):
    """Run one FastGRNN step and return `(new_h, new_h)`.

    Computes `pre = inputs.W + state.U` (each factor either a full matrix or
    a two-matrix product when `_wRank` / `_uRank` is set), a gate
    `z = gate_nl(pre + B_g)`, a candidate `c = update_nl(pre + B_h)` and
    `new_h = z * state + (sigmoid(zeta) * (1 - z) + sigmoid(nu)) * c`.
    All variables are created under `<name>/FastGRNNcell` and stored on
    `self`.
    """
    def _normal():
        # Every weight matrix uses the same N(0, 0.1) initializer settings.
        return init_ops.random_normal_initializer(
            mean=0.0, stddev=0.1, dtype=tf.float32)

    def _constant(value):
        return init_ops.constant_initializer(value, dtype=tf.float32)

    with vs.variable_scope(self._name + "/FastGRNNcell", reuse=self._reuse):
        # Input projection: full rank, or factored as W1 * W2.
        if self._wRank is not None:
            self.W1 = vs.get_variable(
                "W1", [inputs.get_shape()[-1], self._wRank],
                initializer=_normal())
            self.W2 = vs.get_variable(
                "W2", [self._wRank, self._hidden_size],
                initializer=_normal())
            low_rank_in = math_ops.matmul(inputs, self.W1)
            wComp = math_ops.matmul(low_rank_in, self.W2)
        else:
            self.W = vs.get_variable(
                "W", [inputs.get_shape()[-1], self._hidden_size],
                initializer=_normal())
            wComp = math_ops.matmul(inputs, self.W)

        # State projection: full rank, or factored as U1 * U2.
        if self._uRank is not None:
            self.U1 = vs.get_variable(
                "U1", [self._hidden_size, self._uRank],
                initializer=_normal())
            self.U2 = vs.get_variable(
                "U2", [self._uRank, self._hidden_size],
                initializer=_normal())
            low_rank_state = math_ops.matmul(state, self.U1)
            uComp = math_ops.matmul(low_rank_state, self.U2)
        else:
            self.U = vs.get_variable(
                "U", [self._hidden_size, self._hidden_size],
                initializer=_normal())
            uComp = math_ops.matmul(state, self.U)

        # Init zeta to 6.0 and nu to -6.0 if this doesn't give good
        # results. The inits are hyper-params.
        self.zeta = vs.get_variable(
            "zeta", [1, 1], initializer=_constant(self._zetaInit))
        self.nu = vs.get_variable(
            "nu", [1, 1], initializer=_constant(self._nuInit))

        pre_comp = wComp + uComp

        self.bias_gate = vs.get_variable(
            "B_g", [1, self._hidden_size], initializer=_constant(1.0))
        z = gen_non_linearity(pre_comp + self.bias_gate,
                              self._gate_non_linearity)

        self.bias_update = vs.get_variable(
            "B_h", [1, self._hidden_size], initializer=_constant(1.0))
        c = gen_non_linearity(pre_comp + self.bias_update,
                              self._update_non_linearity)

        retained = z * state
        candidate_scale = (math_ops.sigmoid(self.zeta) * (1.0 - z)
                           + math_ops.sigmoid(self.nu))
        new_h = retained + candidate_scale * c
    return new_h, new_h
def logistic_regression(X,
                        y,
                        class_weight=None,
                        init_mean=None,
                        init_stddev=1.0):
    """Creates logistic regression TensorFlow subgraph.

    Args:
      X: tensor or placeholder for input features,
         shape should be [batch_size, n_features].
      y: tensor or placeholder for target,
         shape should be [batch_size, n_classes].
      class_weight: tensor, [n_classes], where for each class
                    it has weight of the class. If not provided
                    will check if graph contains tensor `class_weight:0`.
                    If that is not found either, None is passed on to
                    `softmax_classifier`.
      init_mean: the mean value to use for initialization.
      init_stddev: the standard deviation to use for initialization.

    Returns:
      The result of `softmax_classifier` (predictions and loss tensors).

    Side effects:
      The variables `weights` and `bias` are created in the
      `logistic_regression` variable scope. If init_mean is not None, both
      are initialized with a random normal initializer using the given
      init_mean and init_stddev. (These may be set to 0.0 each if a zero
      initialization is desirable for convex use cases.) If init_mean is
      None, no initializer is passed and the variable scope's default
      applies. Histogram summaries are added for X, y, weights and bias.
    """
    with vs.variable_scope('logistic_regression'):
        logging_ops.histogram_summary('logistic_regression.X', X)
        logging_ops.histogram_summary('logistic_regression.y', y)
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable(
                'weights', [X.get_shape()[1], y.get_shape()[-1]])
            bias = vs.get_variable('bias', [y.get_shape()[-1]])
        else:
            weights = vs.get_variable(
                'weights', [X.get_shape()[1], y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
            bias = vs.get_variable(
                'bias', [y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
        logging_ops.histogram_summary('logistic_regression.weights', weights)
        logging_ops.histogram_summary('logistic_regression.bias', bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph. The test must be `is None`: taking the
        # truth value of a tensor (or multi-element array) raises instead of
        # answering "was one supplied?".
        if class_weight is None:
            try:
                class_weight = ops.get_default_graph().get_tensor_by_name(
                    'class_weight:0')
            except KeyError:
                # Best effort: the graph has no such tensor, keep None.
                pass

        return softmax_classifier(
            X, y, weights, bias, class_weight=class_weight)
def _testWithMaybeMultiAttention(self,
                                 is_multi,
                                 create_attention_mechanisms,
                                 expected_final_output,
                                 expected_final_state,
                                 attention_mechanism_depths,
                                 alignment_history=False,
                                 expected_final_alignment_history=None,
                                 attention_layer_sizes=None,
                                 attention_layers=None,
                                 name=''):
    """Decode through an `AttentionWrapper` with one or several mechanisms.

    Builds an `LSTMCell` wrapped in `AttentionWrapper`, decodes with
    `BasicDecoder` driven by a `TrainingHelper`, checks result types and
    static shapes, evaluates the results, prints their summaries (for
    copy/paste into expectations) and compares the summaries against the
    expected structures with `assertAllCloseOrEqual`.

    Args:
      is_multi: if True the mechanisms (and layer sizes / layers) are passed
        to the wrapper as lists; otherwise only the first of each is passed.
      create_attention_mechanisms: list of callables, each invoked with
        `num_units`, `memory` and `memory_sequence_length`.
      expected_final_output: expected summary structure of the outputs.
      expected_final_state: expected summary structure of the final state
        (with the alignment history removed).
      attention_mechanism_depths: `num_units` for each mechanism, zipped with
        `create_attention_mechanisms`.
      alignment_history: forwarded to the wrapper; when truthy the stacked
        history is also evaluated and checked.
      expected_final_alignment_history: expected summary structure of the
        alignment history; used only when `alignment_history` is truthy.
      attention_layer_sizes: optional list of attention layer sizes; a falsy
        entry counts as `encoder_output_depth` when computing the expected
        attention depth.
      attention_layers: optional list of layers; their
        `compute_output_shape` determines the expected attention depth.
      name: label printed before the copy/paste output.
    """
    # Allow is_multi to be True with a single mechanism to enable test for
    # passing in a single mechanism in a list. (assertTrue rather than a
    # bare assert so the check is not stripped under -O.)
    self.assertTrue(len(create_attention_mechanisms) == 1 or is_multi)
    encoder_sequence_length = [3, 2, 3, 1, 1]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_sizes is not None:
        # Compute sum of attention_layer_sizes. Use encoder_output_depth if
        # None.
        attention_depth = sum(
            attention_layer_size or encoder_output_depth
            for attention_layer_size in attention_layer_sizes)
    elif attention_layers is not None:
        # Compute sum of attention_layers output depth.
        attention_depth = sum(
            attention_layer.compute_output_shape(
                [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
            for attention_layer in attention_layers)
    else:
        attention_depth = (
            encoder_output_depth * len(create_attention_mechanisms))

    decoder_inputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, decoder_max_time,
                        input_depth).astype(np.float32),
        shape=(None, None, input_depth))
    encoder_outputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, encoder_max_time,
                        encoder_output_depth).astype(np.float32),
        shape=(None, None, encoder_output_depth))

    attention_mechanisms = [
        creator(num_units=depth,
                memory=encoder_outputs,
                memory_sequence_length=encoder_sequence_length)
        for creator, depth in zip(create_attention_mechanisms,
                                  attention_mechanism_depths)]

    with self.session(use_gpu=True) as sess:
        with vs.variable_scope(
                'root',
                initializer=init_ops.random_normal_initializer(
                    stddev=0.01, seed=3)):
            attention_layer_size = attention_layer_sizes
            attention_layer = attention_layers
            if not is_multi:
                # Single-mechanism mode: unwrap the one-element lists.
                if attention_layer_size is not None:
                    attention_layer_size = attention_layer_size[0]
                if attention_layer is not None:
                    attention_layer = attention_layer[0]
            cell = rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.AttentionWrapper(
                cell,
                attention_mechanisms if is_multi else attention_mechanisms[0],
                attention_layer_size=attention_layer_size,
                alignment_history=alignment_history,
                attention_layer=attention_layer)
            helper = helper_py.TrainingHelper(decoder_inputs,
                                              decoder_sequence_length)
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(
                    dtype=dtypes.float32, batch_size=batch_size))

            final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
        self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
        self.assertIsInstance(final_state.cell_state,
                              rnn_cell.LSTMStateTuple)

        self.assertEqual(
            (batch_size, None, attention_depth),
            tuple(final_outputs.rnn_output.get_shape().as_list()))
        self.assertEqual(
            (batch_size, None),
            tuple(final_outputs.sample_id.get_shape().as_list()))

        self.assertEqual(
            (batch_size, attention_depth),
            tuple(final_state.attention.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.c.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.h.get_shape().as_list()))

        if alignment_history:
            if is_multi:
                state_alignment_history = []
                for history_array in final_state.alignment_history:
                    history = history_array.stack()
                    self.assertEqual(
                        (None, batch_size, None),
                        tuple(history.get_shape().as_list()))
                    state_alignment_history.append(history)
                state_alignment_history = tuple(state_alignment_history)
            else:
                state_alignment_history = (
                    final_state.alignment_history.stack())
                self.assertEqual(
                    (None, batch_size, None),
                    tuple(state_alignment_history.get_shape().as_list()))
            nest.assert_same_structure(
                cell.state_size,
                cell.zero_state(batch_size, dtypes.float32))
            # Remove the history from final_state for purposes of the
            # remainder of the tests.
            final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
        else:
            state_alignment_history = ()

        sess.run(variables.global_variables_initializer())
        sess_results = sess.run({
            'final_outputs': final_outputs,
            'final_state': final_state,
            'state_alignment_history': state_alignment_history,
        })

        final_output_info = nest.map_structure(
            get_result_summary, sess_results['final_outputs'])
        final_state_info = nest.map_structure(
            get_result_summary, sess_results['final_state'])
        print(name)
        print('Copy/paste:\nexpected_final_output = %s' %
              str(final_output_info))
        print('expected_final_state = %s' % str(final_state_info))
        nest.map_structure(self.assertAllCloseOrEqual,
                           expected_final_output, final_output_info)
        nest.map_structure(self.assertAllCloseOrEqual,
                           expected_final_state, final_state_info)
        if alignment_history:  # by default, the wrapper emits attention as output
            final_alignment_history_info = nest.map_structure(
                get_result_summary, sess_results['state_alignment_history'])
            print('expected_final_alignment_history = %s' %
                  str(final_alignment_history_info))
            nest.map_structure(
                self.assertAllCloseOrEqual,
                # outputs are batch major but the stacked TensorArray is time
                # major
                expected_final_alignment_history,
                final_alignment_history_info)
def test_attention(self):
    """Smoke-test the attention decoder functions in train and inference mode.

    Builds a time-major GRU encoder, prepares attention over its outputs and
    runs `seq2seq.dynamic_rnn_decoder` twice on the same GRU decoder cell:
    once with the training decoder function and once, after enabling
    variable reuse, with the inference decoder function. Only the shapes of
    the evaluated outputs and states are asserted.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            inputs = constant_op.constant(
                0.5,
                shape=[
                    input_sequence_length, batch_size, encoder_embedding_size
                ])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[
                    decoder_sequence_length, batch_size,
                    decoder_embedding_size
                ])
            decoder_length = constant_op.constant(
                decoder_sequence_length,
                dtype=dtypes.int32,
                shape=[
                    batch_size,
                ])

            # attention
            attention_option = "luong"  # can be "bahdanau"

            with variable_scope.variable_scope("rnn") as scope:
                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

                # attention_states: size [batch_size, max_time, num_units]
                attention_states = array_ops.transpose(
                    encoder_outputs, [1, 0, 2])

            with variable_scope.variable_scope("decoder") as scope:
                # Prepare attention
                (attention_keys, attention_values, attention_score_fn,
                 attention_construct_fn) = (
                     attention_decoder_fn.prepare_attention(
                         attention_states, attention_option,
                         decoder_hidden_size))
                decoder_fn_train = (
                    attention_decoder_fn.attention_decoder_fn_train(
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn))

                # setting up weights for computing the final output
                def output_fn(x):
                    return layers.linear(x, num_decoder_symbols, scope=scope)

                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                (decoder_outputs_train, decoder_state_train, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_train,
                        inputs=decoder_inputs,
                        sequence_length=decoder_length,
                        time_major=True,
                        scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = (
                    attention_decoder_fn.attention_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference, decoder_state_inference, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_inference,
                        time_major=True,
                        scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res) = sess.run(
                [decoder_outputs_train, decoder_state_train])
            (decoder_outputs_inference_res,
             decoder_state_inference_res) = sess.run(
                 [decoder_outputs_inference, decoder_state_inference])

            # Assert outputs
            self.assertEqual(
                (decoder_sequence_length, batch_size, num_decoder_symbols),
                decoder_outputs_train_res.shape)
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference. The outputs are time major, so the number of
            # decoded steps is their leading dimension; the state's leading
            # dimension is the batch size and would make this check vacuous.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_outputs_inference_res.shape[0])
def _testWithAttention(self,
                       create_attention_mechanism,
                       expected_final_output,
                       expected_final_state,
                       attention_mechanism_depth=3,
                       alignment_history=False,
                       expected_final_alignment_history=None,
                       attention_layer_size=6,
                       name=""):
    """Decode with an `AttentionWrapper`-wrapped LSTM and compare results.

    Args:
      create_attention_mechanism: callable invoked with the keyword
        arguments `num_units`, `memory` and `memory_sequence_length`; its
        result is handed to `wrapper.AttentionWrapper`.
      expected_final_output: structure compared (via `assertAllClose`)
        with the evaluated decoder outputs.
      expected_final_state: structure compared (via `assertAllClose`) with
        the evaluated final state, alignment history removed.
      attention_mechanism_depth: value passed as `num_units`.
      alignment_history: forwarded to the wrapper; when truthy the stacked
        alignment history is also evaluated and compared.
      expected_final_alignment_history: expected stacked alignment history;
        only read when `alignment_history` is truthy.
      attention_layer_size: forwarded to the wrapper. When None, the
        expected attention depth falls back to the encoder output depth.
      name: label interpolated into the printed "Copy/paste" headers.
    """
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_size is not None:
        attention_depth = attention_layer_size
    else:
        attention_depth = encoder_output_depth

    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                     input_depth).astype(np.float32)
    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                      encoder_output_depth).astype(np.float32)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session(use_gpu=True) as sess:
        with vs.variable_scope(
                "root",
                initializer=init_ops.random_normal_initializer(
                    stddev=0.01, seed=3)):
            cell = core_rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.AttentionWrapper(
                cell,
                attention_mechanism,
                attention_layer_size=attention_layer_size,
                alignment_history=alignment_history)
            helper = helper_py.TrainingHelper(decoder_inputs,
                                              decoder_sequence_length)
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(
                    dtype=dtypes.float32, batch_size=batch_size))

            final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
        self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
        self.assertIsInstance(final_state.cell_state,
                              core_rnn_cell.LSTMStateTuple)

        self.assertEqual(
            (batch_size, None, attention_depth),
            tuple(final_outputs.rnn_output.get_shape().as_list()))
        self.assertEqual(
            (batch_size, None),
            tuple(final_outputs.sample_id.get_shape().as_list()))

        self.assertEqual(
            (batch_size, attention_depth),
            tuple(final_state.attention.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.c.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.h.get_shape().as_list()))

        if alignment_history:
            state_alignment_history = final_state.alignment_history.stack()
            # Remove the history from final_state for purposes of the
            # remainder of the tests.
            final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
            self.assertEqual(
                (None, batch_size, encoder_max_time),
                tuple(state_alignment_history.get_shape().as_list()))
        else:
            state_alignment_history = ()

        sess.run(variables.global_variables_initializer())

        sess_results = sess.run({
            "final_outputs": final_outputs,
            "final_state": final_state,
            "state_alignment_history": state_alignment_history,
        })

        print("Copy/paste (%s)\nexpected_final_output = " % name,
              sess_results["final_outputs"])
        sys.stdout.flush()
        print("Copy/paste (%s)\nexpected_final_state = " % name,
              sess_results["final_state"])
        sys.stdout.flush()
        print("Copy/paste (%s)\nexpected_final_alignment_history = " % name,
              np.asarray(sess_results["state_alignment_history"]))
        sys.stdout.flush()

        nest.map_structure(self.assertAllClose, expected_final_output,
                           sess_results["final_outputs"])
        nest.map_structure(self.assertAllClose, expected_final_state,
                           sess_results["final_state"])
        if alignment_history:  # by default, the wrapper emits attention as output
            self.assertAllClose(
                # outputs are batch major but the stacked TensorArray is time major
                sess_results["state_alignment_history"],
                expected_final_alignment_history)
def testDuplicatedInitializer(self):
    """Expect `duplicated_initializer` to be falsy for a normal(0.0, 1.0) init."""
    normal_init = init_ops.random_normal_initializer(0.0, 1.0)
    is_duplicated = duplicated_initializer(self, normal_init, 1)
    self.assertFalse(is_duplicated)
def _attention_with_coverage(state, context, last_coverage, encoded_fertility):
    """Compute a coverage-aware attention context and the updated coverage.

    The unnormalized score is a single-unit dense projection of
    `tanh(pstate + pctx + pcoverage)`, where the three terms are learned
    projections of the previous state, the context and the previous
    coverage. The scores are soft-maxed and used to take a weighted sum of
    `context` over axis 1.

    `state_size` and `att_sequence_length` are not parameters: they are
    resolved from the enclosing scope, which is outside this block.

    Args:
      state: previous cell state. When `state_size` is a tuple, `state` is
        unpacked as a pair and its second element is projected; otherwise
        `state` itself is projected.
      context: tensor attended over; presumably [batch, time, dim] -- TODO
        confirm against the caller.
      last_coverage: coverage carried over from the previous step; it is
        expanded on its last axis before being projected.
      encoded_fertility: tensor whose doubled value divides the attention
        weights when accumulating coverage.

    Returns:
      A tuple `(ctx, alpha_normalized, new_coverage)`.
    """
    with vs.variable_scope("attention"):
        ctx_shape = context.get_shape().as_list()
        dim_ctx = ctx_shape[-1]
        if isinstance(state_size, tuple):
            _, m_prev = state
            _, m_size = state_size
        else:
            m_prev, m_size = state, state_size

        init_std = 1. / math.sqrt(m_size)
        cov_initializer = init_ops.random_normal_initializer(
            mean=0, stddev=1.)
        initializer = init_ops.random_normal_initializer(
            mean=0, stddev=init_std)

        with vs.variable_scope("ctx_proj"):
            pcoverage = dense(
                array_ops.expand_dims(last_coverage, -1),
                units=dim_ctx,
                kernel_initializer=cov_initializer,
                use_bias=False)
            pctx = dense(
                context,
                units=dim_ctx,
                kernel_initializer=initializer,
                use_bias=True)

        with vs.variable_scope("state_proj"):
            pstate = array_ops.expand_dims(
                _linear(
                    m_prev,
                    dim_ctx,
                    kernel_initializer=initializer,
                    bias=False),
                axis=1)

        with vs.variable_scope("cell_proj"):
            alpha = dense(
                math_ops.tanh(pstate + pctx + pcoverage),
                units=1,
                kernel_initializer=initializer,
                use_bias=False)
            alpha = math_ops.reduce_sum(alpha, [2])

        # The mask only exists when sequence lengths are available; keep an
        # explicit None so the coverage update below can tell the two cases
        # apart instead of hitting an unbound name.
        alpha_mask = None
        if att_sequence_length is not None:
            alpha_mask = array_ops.sequence_mask(
                lengths=att_sequence_length,
                maxlen=ctx_shape[1],
                dtype=dtypes.float32)
            # Push masked positions to the most negative float32 so they
            # receive (near) zero weight after the softmax.
            alpha = alpha * alpha_mask + (
                (1.0 - alpha_mask) * dtypes.float32.min)

        alpha_normalized = nn_ops.softmax(alpha)
        ctx = math_ops.reduce_sum(
            context * array_ops.expand_dims(alpha_normalized, axis=2),
            axis=1)

        encoded_fertility = array_ops.identity(
            encoded_fertility, name="encoded_fertility")
        new_coverage = last_coverage + alpha_normalized * math_ops.pow(
            2 * encoded_fertility, -1)
        if alpha_mask is not None:
            # Masked positions keep their previous coverage unchanged.
            new_coverage = new_coverage * alpha_mask + (
                (1.0 - alpha_mask) * last_coverage)
        return ctx, alpha_normalized, new_coverage
def _testWithAttention(self,
                       create_attention_mechanism,
                       expected_final_output,
                       expected_final_state,
                       attention_mechanism_depth=3,
                       alignment_history=False,
                       expected_final_alignment_history=None,
                       name=""):
    """Decode with an `AttentionWrapper`-wrapped LSTM and compare results.

    Args:
      create_attention_mechanism: callable invoked with the keyword
        arguments `num_units`, `memory` and `memory_sequence_length`; its
        result is handed to `wrapper.AttentionWrapper`.
      expected_final_output: structure compared (via `assertAllClose`)
        with the evaluated decoder outputs.
      expected_final_state: structure compared (via `assertAllClose`) with
        the evaluated final state, alignment history removed.
      attention_mechanism_depth: value passed as `num_units`.
      alignment_history: forwarded to the wrapper; when truthy the stacked
        alignment history is also evaluated and compared.
      expected_final_alignment_history: expected stacked alignment history;
        only read when `alignment_history` is truthy.
      name: label interpolated into the printed "Copy/paste" headers.
    """
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9
    attention_depth = 6

    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                     input_depth).astype(np.float32)
    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                      encoder_output_depth).astype(np.float32)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session(use_gpu=True) as sess:
        with vs.variable_scope(
                "root",
                initializer=init_ops.random_normal_initializer(
                    stddev=0.01, seed=3)):
            cell = core_rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.AttentionWrapper(
                cell,
                attention_mechanism,
                attention_size=attention_depth,
                alignment_history=alignment_history)
            helper = helper_py.TrainingHelper(decoder_inputs,
                                              decoder_sequence_length)
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(
                    dtype=dtypes.float32, batch_size=batch_size))

            final_outputs, final_state = decoder.dynamic_decode(my_decoder)

        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
        self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
        self.assertIsInstance(final_state.cell_state,
                              core_rnn_cell.LSTMStateTuple)

        self.assertEqual(
            (batch_size, None, attention_depth),
            tuple(final_outputs.rnn_output.get_shape().as_list()))
        self.assertEqual(
            (batch_size, None),
            tuple(final_outputs.sample_id.get_shape().as_list()))

        self.assertEqual(
            (batch_size, attention_depth),
            tuple(final_state.attention.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.c.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.h.get_shape().as_list()))

        if alignment_history:
            state_alignment_history = final_state.alignment_history.stack()
            # Remove the history from final_state for purposes of the
            # remainder of the tests.
            final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
            self.assertEqual(
                (None, batch_size, encoder_max_time),
                tuple(state_alignment_history.get_shape().as_list()))
        else:
            state_alignment_history = ()

        sess.run(variables.global_variables_initializer())

        sess_results = sess.run({
            "final_outputs": final_outputs,
            "final_state": final_state,
            "state_alignment_history": state_alignment_history,
        })

        print("Copy/paste (%s)\nexpected_final_output = " % name,
              sess_results["final_outputs"])
        sys.stdout.flush()
        print("Copy/paste (%s)\nexpected_final_state = " % name,
              sess_results["final_state"])
        sys.stdout.flush()
        print(
            "Copy/paste (%s)\nexpected_final_alignment_history = " % name,
            sess_results["state_alignment_history"])
        sys.stdout.flush()

        nest.map_structure(self.assertAllClose, expected_final_output,
                           sess_results["final_outputs"])
        nest.map_structure(self.assertAllClose, expected_final_state,
                           sess_results["final_state"])
        if alignment_history:  # by default, the wrapper emits attention as output
            self.assertAllClose(
                # outputs are batch major but the stacked TensorArray is time major
                sess_results["state_alignment_history"],
                expected_final_alignment_history)
def _testWithMaybeMultiAttention(self,
                                 is_multi,
                                 create_attention_mechanisms,
                                 expected_final_output,
                                 expected_final_state,
                                 attention_mechanism_depths,
                                 alignment_history=False,
                                 expected_final_alignment_history=None,
                                 attention_layer_sizes=None,
                                 attention_layers=None,
                                 name=''):
    """Decode with one or several attention mechanisms and compare summaries.

    Args:
      is_multi: when truthy the wrapper receives the list of mechanisms (and
        the list-valued layer arguments); otherwise it receives the first
        element of each.
      create_attention_mechanisms: callables, each invoked with the keyword
        arguments `num_units`, `memory` and `memory_sequence_length`.
      expected_final_output: structure compared, via
        `self.assertAllCloseOrEqual`, with the summarized decoder outputs.
      expected_final_state: structure compared the same way with the
        summarized final state, alignment history removed.
      attention_mechanism_depths: `num_units` values, zipped with
        `create_attention_mechanisms`.
      alignment_history: forwarded to the wrapper; when truthy the stacked
        alignment histories are also evaluated and compared.
      expected_final_alignment_history: expected summaries of the stacked
        alignment histories; only read when `alignment_history` is truthy.
      attention_layer_sizes: optional sizes forwarded as
        `attention_layer_size`; falsy entries count as the encoder output
        depth when computing the expected attention depth.
      attention_layers: optional layers forwarded as `attention_layer`;
        only consulted for the expected depth when `attention_layer_sizes`
        is None.
      name: label printed before the "Copy/paste" output.
    """
    # Allow is_multi to be True with a single mechanism to enable test for
    # passing in a single mechanism in a list.
    assert len(create_attention_mechanisms) == 1 or is_multi
    encoder_sequence_length = [3, 2, 3, 1, 1]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_sizes is not None:
        # Compute sum of attention_layer_sizes. Use encoder_output_depth if
        # None.
        attention_depth = sum(
            attention_layer_size or encoder_output_depth
            for attention_layer_size in attention_layer_sizes)
    elif attention_layers is not None:
        # Compute sum of attention_layers output depth.
        attention_depth = sum(
            attention_layer.compute_output_shape(
                [batch_size, cell_depth + encoder_output_depth])[-1].value
            for attention_layer in attention_layers)
    else:
        attention_depth = encoder_output_depth * len(
            create_attention_mechanisms)

    decoder_inputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, decoder_max_time,
                        input_depth).astype(np.float32),
        shape=(None, None, input_depth))
    encoder_outputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, encoder_max_time,
                        encoder_output_depth).astype(np.float32),
        shape=(None, None, encoder_output_depth))

    attention_mechanisms = [
        creator(num_units=depth,
                memory=encoder_outputs,
                memory_sequence_length=encoder_sequence_length)
        for creator, depth in zip(create_attention_mechanisms,
                                  attention_mechanism_depths)]

    with self.test_session(use_gpu=True) as sess:
        with vs.variable_scope(
                'root',
                initializer=init_ops.random_normal_initializer(
                    stddev=0.01, seed=3)):
            attention_layer_size = attention_layer_sizes
            attention_layer = attention_layers
            if not is_multi:
                if attention_layer_size is not None:
                    attention_layer_size = attention_layer_size[0]
                if attention_layer is not None:
                    attention_layer = attention_layer[0]
            cell = rnn_cell.LSTMCell(cell_depth)
            cell = wrapper.AttentionWrapper(
                cell,
                attention_mechanisms if is_multi else attention_mechanisms[0],
                attention_layer_size=attention_layer_size,
                alignment_history=alignment_history,
                attention_layer=attention_layer)
            helper = helper_py.TrainingHelper(decoder_inputs,
                                              decoder_sequence_length)
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=cell.zero_state(
                    dtype=dtypes.float32, batch_size=batch_size))

            final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
        self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
        self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)

        self.assertEqual(
            (batch_size, None, attention_depth),
            tuple(final_outputs.rnn_output.get_shape().as_list()))
        self.assertEqual(
            (batch_size, None),
            tuple(final_outputs.sample_id.get_shape().as_list()))

        self.assertEqual(
            (batch_size, attention_depth),
            tuple(final_state.attention.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.c.get_shape().as_list()))
        self.assertEqual(
            (batch_size, cell_depth),
            tuple(final_state.cell_state.h.get_shape().as_list()))

        if alignment_history:
            if is_multi:
                state_alignment_history = []
                for history_array in final_state.alignment_history:
                    history = history_array.stack()
                    self.assertEqual(
                        (None, batch_size, None),
                        tuple(history.get_shape().as_list()))
                    state_alignment_history.append(history)
                state_alignment_history = tuple(state_alignment_history)
            else:
                state_alignment_history = (
                    final_state.alignment_history.stack())
                self.assertEqual(
                    (None, batch_size, None),
                    tuple(state_alignment_history.get_shape().as_list()))
            nest.assert_same_structure(
                cell.state_size,
                cell.zero_state(batch_size, dtypes.float32))
            # Remove the history from final_state for purposes of the
            # remainder of the tests.
            final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
        else:
            state_alignment_history = ()

        sess.run(variables.global_variables_initializer())

        sess_results = sess.run({
            'final_outputs': final_outputs,
            'final_state': final_state,
            'state_alignment_history': state_alignment_history,
        })

        final_output_info = nest.map_structure(
            get_result_summary, sess_results['final_outputs'])
        final_state_info = nest.map_structure(
            get_result_summary, sess_results['final_state'])
        print(name)
        print('Copy/paste:\nexpected_final_output = %s' %
              str(final_output_info))
        print('expected_final_state = %s' % str(final_state_info))
        nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
                           final_output_info)
        nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
                           final_state_info)
        if alignment_history:  # by default, the wrapper emits attention as output
            final_alignment_history_info = nest.map_structure(
                get_result_summary, sess_results['state_alignment_history'])
            print('expected_final_alignment_history = %s' %
                  str(final_alignment_history_info))
            nest.map_structure(
                self.assertAllCloseOrEqual,
                # outputs are batch major but the stacked TensorArray is time major
                expected_final_alignment_history,
                final_alignment_history_info)
def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                           batch_size, seq_length, dir_count, dropout, dtype,
                           delta, tolerance):
    """Run a gradient check on one `CudnnTestModel` training configuration.

    Args:
      rnn_mode: RNN mode passed to `CudnnTestModel`; a cell-state input is
        created only when it equals `CUDNN_LSTM`.
      num_layers: number of layers passed to the model.
      num_units: number of units passed to the model.
      input_size: innermost size of the generated inputs.
      batch_size: batch size of the generated inputs and states.
      seq_length: leading dimension of the generated inputs.
      dir_count: 1 selects `CUDNN_RNN_UNIDIRECTION`; any other value
        selects `CUDNN_RNN_BIDIRECTION`.
      dropout: dropout value passed to the model.
      dtype: dtype of the model, inputs and states; `dtypes.float16` is
        routed to `_GradientCheckFp16`.
      delta: forwarded to the gradient checker.
      tolerance: forwarded to the gradient checker.
    """
    # Gradient checking runs two forward ops with almost the same input. Need to
    # make sure the drop patterns across the two runs are the same.
    logging.info("Training test with config: %s", locals())
    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
    # Restore the variable in `finally` so that a failing gradient check does
    # not leak the override into tests that run afterwards.
    try:
        np.random.seed(1234)
        random_seed.set_random_seed(5678)
        has_input_c = (rnn_mode == CUDNN_LSTM)
        direction = (CUDNN_RNN_UNIDIRECTION
                     if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
        model = CudnnTestModel(
            rnn_mode,
            num_layers,
            num_units,
            input_size,
            direction=direction,
            dropout=dropout,
            dtype=dtype,
            training=True,
            bias_initializer=init_ops.random_normal_initializer(
                mean=1., dtype=dtype))
        rnn = model.rnn
        params = rnn.trainable_variables[0]

        inputs = variables.Variable(
            random_ops.random_uniform(
                [seq_length, batch_size, input_size], dtype=dtype),
            dtype=dtype)
        input_h = variables.Variable(
            random_ops.random_uniform(
                [num_layers * dir_count, batch_size, num_units], dtype=dtype),
            dtype=dtype)
        if has_input_c:
            input_c = variables.Variable(
                random_ops.random_uniform(
                    [num_layers * dir_count, batch_size, num_units],
                    dtype=dtype),
                dtype=dtype)
            initial_state = (input_h, input_c)
        else:
            initial_state = (input_h,)
        total_sum = model.FProp(inputs, initial_state, training=True)

        with self.test_session(
                use_gpu=True, graph=ops.get_default_graph()) as sess:
            sess.run(variables.global_variables_initializer())
            all_inputs = [inputs, params] + list(initial_state)
            if dtype == dtypes.float16:
                self._GradientCheckFp16(
                    sess,
                    total_sum,
                    all_inputs,
                    num_samples=FLAGS.grad_check_num_samples,
                    tolerance=tolerance,
                    delta=delta)
            else:
                for _ in range(FLAGS.grad_check_num_samples):
                    # Each time choose a different set of inputs.
                    sess.run(variables.global_variables_initializer())
                    self._GradientCheck(
                        sess,
                        total_sum,
                        all_inputs,
                        tolerance=tolerance,
                        delta=delta)
    finally:
        os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
def test_dynamic_rnn_decoder_time_major(self):
    """Smoke-test `dynamic_rnn_decoder` in time-major train/inference mode.

    Builds a time-major GRU encoder and runs `seq2seq.dynamic_rnn_decoder`
    twice on the same GRU decoder cell: once with the simple training
    decoder function and once, after enabling variable reuse, with the
    simple inference decoder function. Both decoder functions are wrapped
    by `Seq2SeqTest._decoder_fn_with_context_state`, and the evaluated
    context states are asserted alongside the output and state shapes.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            inputs = constant_op.constant(
                0.5,
                shape=[
                    input_sequence_length, batch_size, encoder_embedding_size
                ])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[
                    decoder_sequence_length, batch_size,
                    decoder_embedding_size
                ])
            decoder_length = constant_op.constant(
                decoder_sequence_length,
                dtype=dtypes.int32,
                shape=[
                    batch_size,
                ])

            with variable_scope.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                # NOTE: `scope` is looked up when output_fn is called, so it
                # resolves to whatever `scope` is bound to at that point.
                def output_fn(x):
                    return layers.linear(x, num_decoder_symbols, scope=scope)

                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with variable_scope.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))
                (decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_train,
                         inputs=decoder_inputs,
                         sequence_length=decoder_length,
                         time_major=True,
                         scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = (
                    Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            #TODO: find out why it goes to +1
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32)))
                (decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res,
             decoder_context_state_train_res) = sess.run([
                 decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train
             ])
            (decoder_outputs_inference_res, decoder_state_inference_res,
             decoder_context_state_inference_res) = sess.run([
                 decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference
             ])

            # Assert outputs
            self.assertEqual(
                (decoder_sequence_length, batch_size, num_decoder_symbols),
                decoder_outputs_train_res.shape)
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_inference_res)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_train_res)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference. The outputs are time major, so the number of
            # decoded steps is their leading dimension; the state's leading
            # dimension is the batch size and would make this check vacuous.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_outputs_inference_res.shape[0])