Example #1
def logistic_regression(X, y, class_weight=None, init_mean=None,
                        init_stddev=1.0):
    """Creates logistic regression TensorFlow subgraph.

    Args:
        X: tensor or placeholder for input features,
           shape should be [batch_size, n_features].
        y: tensor or placeholder for target,
           shape should be [batch_size, n_classes].
        class_weight: tensor, shape [n_classes], holding a weight for each
                      class. If not provided, the graph is checked for a
                      tensor named `class_weight:0`; if that is absent too,
                      all classes are weighted equally.
        init_mean: the mean value to use for initialization.
        init_stddev: the standard deviation to use for initialization.

    Returns:
        Predictions and loss tensors.

    Side effects:
        The variables logistic_regression.weights and logistic_regression.bias
        are initialized as follows.  If init_mean is not None, then
        initialization will be done using a random normal initializer with the
        given init_mean and init_stddev.  (These may be set to 0.0 each if a
        zero initialization is desirable for convex use cases.)  If init_mean
        is None, then the uniform_unit_scaling_initializer will be used.
    """
    with vs.variable_scope('logistic_regression'):
        logging_ops.histogram_summary('logistic_regression.X', X)
        logging_ops.histogram_summary('logistic_regression.y', y)
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable('weights',
                                      [X.get_shape()[1], y.get_shape()[-1]])
            bias = vs.get_variable('bias',
                                   [y.get_shape()[-1]])
        else:
            weights = vs.get_variable('weights',
                                      [X.get_shape()[1], y.get_shape()[-1]],
                                      initializer=init_ops.random_normal_initializer(
                                          init_mean, init_stddev))
            bias = vs.get_variable('bias',
                                   [y.get_shape()[-1]],
                                   initializer=init_ops.random_normal_initializer(
                                       init_mean, init_stddev))
        logging_ops.histogram_summary('logistic_regression.weights', weights)
        logging_ops.histogram_summary('logistic_regression.bias', bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph.
        if class_weight is None:
            try:
                class_weight = ops.get_default_graph().get_tensor_by_name('class_weight:0')
            except KeyError:
                pass

        return losses_ops.softmax_classifier(X, y, weights, bias,
                                             class_weight=class_weight)
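
For orientation, here is a minimal sketch of how this subgraph might be wired up, assuming the skflow-era module aliases used above (`vs`, `logging_ops`, `losses_ops`) are importable and a TF 1.x-style graph is being built; the feature and class counts are hypothetical:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
X = tf.placeholder(tf.float32, [None, 10])  # hypothetical: 10 input features
y = tf.placeholder(tf.float32, [None, 3])   # hypothetical: 3 one-hot classes
# init_mean=0.0, init_stddev=0.0 gives the zero initialization the docstring
# recommends for convex use cases.
predictions, loss = logistic_regression(X, y, init_mean=0.0, init_stddev=0.0)
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
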
Example #2
 def _BuildSmallModel(self):
   image = array_ops.zeros([2, 6, 6, 3])
   kernel = variable_scope.get_variable(
       'DW', [3, 3, 3, 6],
       dtypes.float32,
       initializer=init_ops.random_normal_initializer(stddev=0.001))
   x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
   kernel = variable_scope.get_variable(
       'DW2', [2, 2, 6, 12],
       dtypes.float32,
       initializer=init_ops.random_normal_initializer(stddev=0.001))
   x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
   return x
Example #3
def linear_regression(x, y, init_mean=None, init_stddev=1.0):
  """Creates linear regression TensorFlow subgraph.

  Args:
    x: tensor or placeholder for input features.
    y: tensor or placeholder for labels.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

  Side effects:
    The variables linear_regression.weights and linear_regression.bias are
    initialized as follows.  If init_mean is not None, then initialization
    will be done using a random normal initializer with the given init_mean
    and init_stddev.  (These may be set to 0.0 each if a zero initialization
    is desirable for convex use cases.)  If init_mean is None, then the
    uniform_unit_scaling_initializer will be used.
  """
  with vs.variable_scope('linear_regression'):
    scope_name = vs.get_variable_scope().name
    summary.histogram('%s.x' % scope_name, x)
    summary.histogram('%s.y' % scope_name, y)
    dtype = x.dtype.base_dtype
    y_shape = y.get_shape()
    if len(y_shape) == 1:
      output_shape = 1
    else:
      output_shape = y_shape[1]
    # Set up the requested initialization.
    if init_mean is None:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], output_shape], dtype=dtype)
      bias = vs.get_variable('bias', [output_shape], dtype=dtype)
    else:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], output_shape],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
      bias = vs.get_variable(
          'bias', [output_shape],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
    summary.histogram('%s.weights' % scope_name, weights)
    summary.histogram('%s.bias' % scope_name, bias)
    return losses_ops.mean_squared_error_regressor(x, y, weights, bias)
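
A quick shape note on the `output_shape` branch above: with a rank-1 `y`, the weights are created as `[n_features, 1]`. A hedged usage sketch (shapes are hypothetical, and the module aliases from the surrounding file are assumed to be in scope):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
x = tf.placeholder(tf.float32, [None, 4])  # hypothetical: 4 features
y = tf.placeholder(tf.float32, [None])     # rank-1 labels -> output_shape == 1
predictions, loss = linear_regression(x, y)
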
Example #4
def batch_normalize(tensor_in,
                    epsilon=1e-5,
                    convnet=False,
                    decay=0.9,
                    scale_after_normalization=True):
    """Batch normalization.

  Args:
    tensor_in: input Tensor, 4D shape: [batch, in_height, in_width, in_depth].
    epsilon: A small float added to the variance to avoid dividing by zero.
    convnet: Whether this is for convolutional net use. If this is True,
      moments will sum across axis [0, 1, 2]. Otherwise, only [0].
    decay: decay rate for the exponential moving average.
    scale_after_normalization: Whether to scale after normalization.

  Returns:
    A batch-normalized `Tensor`.
  """
    shape = tensor_in.get_shape().as_list()

    with vs.variable_scope("batch_norm"):
        gamma = vs.get_variable(
            "gamma", [shape[-1]],
            initializer=init_ops.random_normal_initializer(1.0, 0.02))
        beta = vs.get_variable(
            "beta", [shape[-1]],
            initializer=init_ops.constant_initializer(0.0))
        ema = moving_averages.ExponentialMovingAverage(decay=decay)
        if convnet:
            assign_mean, assign_var = nn.moments(tensor_in, [0, 1, 2])
        else:
            assign_mean, assign_var = nn.moments(tensor_in, [0])
        ema_assign_op = ema.apply([assign_mean, assign_var])
        ema_mean, ema_var = ema.average(assign_mean), ema.average(assign_var)

        def update_mean_var():
            """Internal function that updates mean and variance during training."""
            with ops.control_dependencies([ema_assign_op]):
                return (array_ops_.identity(assign_mean),
                        array_ops_.identity(assign_var))

        is_training = array_ops_.squeeze(ops.get_collection("IS_TRAINING"))
        mean, variance = control_flow_ops.cond(
            is_training, update_mean_var, lambda: (ema_mean, ema_var))
        return nn.batch_norm_with_global_normalization(
            tensor_in, mean, variance, beta, gamma, epsilon, scale_after_normalization=scale_after_normalization
        )
Example #5
  def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    with ops.Graph().as_default():
      embedding_matrix = variable_scope.get_variable(
          "embedding_matrix", [5, 5],
          initializer=init_ops.random_normal_initializer(),
          use_resource=use_resource)

      def Cond(it, _):
        return it < 5

      def Body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
        cost = control_flow_ops.cond(
            math_ops.equal(it, 3), lambda: math_ops.square(cost),
            lambda: cost + math_ops.reduce_sum(embedding))
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])

      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                           dynamic_grads.indices)

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,
                                          static_grads.indices)

      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
Example #6
def batch_normalize(tensor_in,
                    epsilon=1e-5,
                    convnet=False,
                    decay=0.9,
                    scale_after_normalization=True):
  """Batch normalization.

  Args:
    tensor_in: input `Tensor`, 4D shape: [batch, in_height, in_width, in_depth].
    epsilon: A small float added to the variance to avoid dividing by zero.
    convnet: Whether this is for convolutional net use. If `True`, moments
        will sum across axis `[0, 1, 2]`. Otherwise, only `[0]`.
    decay: Decay rate for exponential moving average.
    scale_after_normalization: Whether to scale after normalization.

  Returns:
    A batch-normalized `Tensor`.
  """
  shape = tensor_in.get_shape().as_list()

  with vs.variable_scope("batch_norm"):
    gamma = vs.get_variable(
        "gamma", [shape[-1]],
        initializer=init_ops.random_normal_initializer(1., 0.02))
    beta = vs.get_variable("beta", [shape[-1]],
                           initializer=init_ops.constant_initializer(0.))
    moving_mean = vs.get_variable(
        'moving_mean',
        shape=[shape[-1]],
        initializer=init_ops.zeros_initializer,
        trainable=False)
    moving_var = vs.get_variable(
        'moving_var',
        shape=[shape[-1]],
        initializer=init_ops.ones_initializer,
        trainable=False)

    def _update_mean_var():
      """Internal function that updates mean and variance during training."""
      axis = [0, 1, 2] if convnet else [0]
      mean, var = nn.moments(tensor_in, axis)
      update_moving_mean = moving_averages.assign_moving_average(
          moving_mean, mean, decay)
      update_moving_var = moving_averages.assign_moving_average(
          moving_var, var, decay)
      with ops.control_dependencies([update_moving_mean, update_moving_var]):
        return array_ops_.identity(mean), array_ops_.identity(var)

    is_training = array_ops_.squeeze(ops.get_collection("IS_TRAINING"))
    mean, variance = control_flow_ops.cond(is_training, _update_mean_var,
                                           lambda: (moving_mean, moving_var))
    return nn.batch_norm_with_global_normalization(
        tensor_in,
        mean,
        variance,
        beta,
        gamma,
        epsilon,
        scale_after_normalization=scale_after_normalization)
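
Note that both `batch_normalize` variants above read the training flag from the graph collection `"IS_TRAINING"` rather than taking it as an argument, so the caller is expected to register that tensor before building the op. A minimal sketch, with a hypothetical placeholder name and input shape:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
is_training = tf.placeholder(tf.bool, [], name="is_training")
tf.add_to_collection("IS_TRAINING", is_training)

images = tf.placeholder(tf.float32, [None, 28, 28, 16])
normalized = batch_normalize(images, convnet=True)  # moments over [0, 1, 2]
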
Example #7
def linear_regression(X, y, init_mean=None, init_stddev=1.0):
    """Creates linear regression TensorFlow subgraph.

    Args:
        X: tensor or placeholder for input features.
        y: tensor or placeholder for target.
        init_mean: the mean value to use for initialization.
        init_stddev: the standard deviation to use for initialization.

    Returns:
        Predictions and loss tensors.

    Side effects:
        The variables linear_regression.weights and linear_regression.bias are
        initialized as follows.  If init_mean is not None, then initialization
        will be done using a random normal initializer with the given init_mean
        and init_stddev.  (These may be set to 0.0 each if a zero initialization
        is desirable for convex use cases.)  If init_mean is None, then the
        uniform_unit_scaling_initializer will be used.
    """
    with vs.variable_scope('linear_regression'):
        logging_ops.histogram_summary('linear_regression.X', X)
        logging_ops.histogram_summary('linear_regression.y', y)
        y_shape = y.get_shape()
        if len(y_shape) == 1:
            output_shape = 1
        else:
            output_shape = y_shape[1]
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable('weights',
                                      [X.get_shape()[1], output_shape])
            bias = vs.get_variable('bias',
                                   [output_shape])
        else:
            weights = vs.get_variable('weights',
                                      [X.get_shape()[1], output_shape],
                                      initializer=init_ops.random_normal_initializer(
                                          init_mean, init_stddev))
            bias = vs.get_variable('bias',
                                   [output_shape],
                                   initializer=init_ops.random_normal_initializer(
                                       init_mean, init_stddev))
        logging_ops.histogram_summary('linear_regression.weights', weights)
        logging_ops.histogram_summary('linear_regression.bias', bias)
        return losses_ops.mean_squared_error_regressor(X, y, weights, bias)
Example #8
 def __init__(self, W_in=init_ops.random_normal_initializer(stddev=0.1),
              W_hid=init_ops.random_normal_initializer(stddev=0.1),
              W_cell=init_ops.random_normal_initializer(stddev=0.1),
              b=init_ops.constant_initializer(0.),
              activation=None):
   self.W_in = W_in
   self.W_hid = W_hid
   # Don't store a cell weight vector when W_cell is None.
   if W_cell is not None:
     self.W_cell = W_cell
   if b is not None:
     self.b = b
   # If no activation is supplied, fall back to the identity function.
   if activation is None:
     self.activation = control_flow_ops.identity
   else:
     self.activation = activation
Example #9
def BuildSmallModel():
  """Build a small forward conv model."""
  image = array_ops.zeros([2, 6, 6, 3])
  _ = variable_scope.get_variable(
      'ScalarW', [],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  kernel = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
  kernel = variable_scope.get_variable(
      'DW2', [2, 2, 6, 12],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
  return x
Example #10
def BuildSplitableModel():
  """Build a small model that can be run partially in each step."""
  image = array_ops.zeros([2, 6, 6, 3])

  kernel1 = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r1 = nn_ops.conv2d(image, kernel1, [1, 2, 2, 1], padding='SAME')

  kernel2 = variable_scope.get_variable(
      'DW2', [2, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r2 = nn_ops.conv2d(image, kernel2, [1, 2, 2, 1], padding='SAME')

  r3 = r1 + r2
  return r1, r2, r3
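
Because the three returned tensors only partially overlap in the graph, each can be evaluated on its own, which is what "run partially in each step" refers to. A hedged sketch of that, assuming the internal module imports used above are available:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
r1, r2, r3 = BuildSplitableModel()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(r1)  # executes only the first conv branch
    sess.run(r3)  # executes both branches plus the add
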
Example #11
  def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                             batch_size, seq_length, dir_count, dropout, dtype,
                             delta, tolerance):
    # Gradient checking runs two forward ops with almost the same input. Need to
    # make sure the drop patterns across the two runs are the same.
    logging.info("Training test with config: %s", locals())
    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
    random_seed.set_random_seed(5678)
    has_input_c = (rnn_mode == CUDNN_LSTM)
    direction = (CUDNN_RNN_UNIDIRECTION
                 if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
    model = CudnnTestModel(
        rnn_mode,
        num_layers,
        num_units,
        input_size,
        direction=direction,
        dropout=dropout,
        dtype=dtype,
        training=True,
        bias_initializer=init_ops.random_normal_initializer(
            mean=1., dtype=dtype))
    rnn = model.rnn
    params = rnn.trainable_variables[0]

    inputs = variables.Variable(
        random_ops.random_uniform(
            [seq_length, batch_size, input_size], dtype=dtype),
        dtype=dtype)
    input_h = variables.Variable(
        random_ops.random_uniform(
            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
        dtype=dtype)
    if has_input_c:
      input_c = variables.Variable(
          random_ops.random_uniform(
              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
          dtype=dtype)
      initial_state = (input_h, input_c)
    else:
      initial_state = (input_h,)
    total_sum = model.FProp(inputs, initial_state, training=True)

    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
      sess.run(variables.global_variables_initializer())
      all_inputs = [inputs, params]
      for s in initial_state:
        all_inputs.append(s)
      self._GradientCheck(
          sess, total_sum, all_inputs, tolerance=tolerance, delta=delta)
      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
Example #12
def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
  """Estimates the largest singular value in the weight tensor.

  Args:
    w_tensor: The weight matrix whose spectral norm should be computed.
    power_iteration_rounds: The number of iterations of the power method to
      perform. A higher number yields a better approximation.
    name: An optional scope name.

  Returns:
    The largest singular value (the spectral norm) of w.
  """
  with variable_scope.variable_scope(name, 'spectral_norm'):
    # The paper says to flatten convnet kernel weights from
    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
    # (KH * KW * C_in, C_out), and similarly for other layers that put output
    # channels as last dimension.
    # n.b. this means that w here is equivalent to w.T in the paper.
    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))

    # Persisted approximation of first left singular vector of matrix `w`.
    u_var = variable_scope.get_variable(
        _PERSISTED_U_VARIABLE_SUFFIX,
        shape=(w.shape[0], 1),
        dtype=w.dtype,
        initializer=init_ops.random_normal_initializer(),
        trainable=False)
    u = u_var

    # Use power iteration method to approximate spectral norm.
    for _ in range(power_iteration_rounds):
      # `v` approximates the first right singular vector of matrix `w`.
      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
      u = nn.l2_normalize(math_ops.matmul(w, v))

    # Update persisted approximation.
    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
      u = array_ops.identity(u)

    u = array_ops.stop_gradient(u)
    v = array_ops.stop_gradient(v)

    # Largest singular value of `w`.
    spectral_norm = math_ops.matmul(
        math_ops.matmul(array_ops.transpose(u), w), v)
    spectral_norm.shape.assert_is_fully_defined()
    spectral_norm.shape.assert_is_compatible_with([1, 1])

    return spectral_norm[0][0]
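
A hedged usage sketch for spectral normalization of a conv kernel (`_PERSISTED_U_VARIABLE_SUFFIX` above is a module-level constant in the original file; the kernel shape here is hypothetical):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
w = tf.get_variable('kernel', [3, 3, 16, 32],
                    initializer=tf.random_normal_initializer(stddev=0.02))
sigma = compute_spectral_norm(w, power_iteration_rounds=3)
w_sn = w / sigma  # weight with its largest singular value scaled to ~1
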
Example #13
 def __call__(self, inputs, state, scope=None):
   dtype = inputs.dtype
   batch_size, input_size = inputs.get_shape().as_list()  # as_list() yields Python ints, not Dimension objects
   if self._O is not None:
     input_size = input_size - self._O.get_shape().as_list()[0]
   with vs.variable_scope(scope or type(self).__name__):
     A = vs.get_variable(
         'A', [self._num_units, self._num_units], dtype=dtype,
         initializer=init_ops.random_normal_initializer(
             stddev=1 / math.sqrt(self._num_units)))
     B = vs.get_variable(
         'B', [input_size, self._num_units], dtype=dtype,
         initializer=init_ops.random_normal_initializer(
             stddev=1 / math.sqrt(input_size)))
     b = vs.get_variable(
         'b', [self._num_units],
         initializer=init_ops.random_normal_initializer(stddev=0.01))
     
     if self._O is not None:
       output = ((1 - self._dt_tau) * state +
                 self._dt_tau * (math_ops.matmul(self._activation(state), A) +
                                 math_ops.matmul(
                                     inputs, array_ops.concat(0, [B, self._O])) +
                                 b +
                                 random_ops.random_normal(
                                     [batch_size, self._num_units],
                                     stddev=self._sigma)))
     else:
       output = ((1 - self._dt_tau) * state +
                 self._dt_tau * (math_ops.matmul(self._activation(state), A) +
                                 math_ops.matmul(inputs, B) +
                                 b +
                                 random_ops.random_normal(
                                     [batch_size, self._num_units],
                                     stddev=self._sigma)))
   return output, output
Example #14
  def build(self, inputs_shape):
    if inputs_shape[1].value is None:
      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                       % inputs_shape)

    input_depth = inputs_shape[1].value
    if self._input_initializer is None:
      self._input_initializer = init_ops.random_normal_initializer(mean=0.0,
                                                                   stddev=0.001)
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[input_depth, self._num_units],
        initializer=self._input_initializer)

    if self._recurrent_initializer is None:
      self._recurrent_initializer = init_ops.constant_initializer(1.)
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified minimum
    if self._recurrent_min_abs:
      abs_kernel = math_ops.abs(self._recurrent_kernel)
      min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs)
      self._recurrent_kernel = math_ops.multiply(
          math_ops.sign(self._recurrent_kernel),
          min_abs_kernel
      )

    # Clip the absolute values of the recurrent weights to the specified maximum
    if self._recurrent_max_abs:
      self._recurrent_kernel = clip_ops.clip_by_value(self._recurrent_kernel,
                                                      -self._recurrent_max_abs,
                                                      self._recurrent_max_abs)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
Example #15
def _get_random_features_initializer(initializer, shape):
  """Returns Initializer object for random features."""

  def _get_cauchy_samples(loc, scale, shape):
    probs = np.random.uniform(low=0., high=1., size=shape)
    return loc + scale * np.tan(np.pi * (probs - 0.5))

  random_features_initializer = initializer
  if isinstance(initializer, six.string_types):
    if initializer.lower() == 'gaussian':
      random_features_initializer = init_ops.random_normal_initializer(
          stddev=1.0)
    elif initializer.lower() == 'laplacian':
      random_features_initializer = init_ops.constant_initializer(
          _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape))

    else:
      raise ValueError(
          'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'.format(
              random_features_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
  return random_features_initializer
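
The `'laplacian'` branch samples frequencies from a standard Cauchy distribution by inverse-CDF sampling (the tangent of a scaled uniform), which is the spectral distribution matching the Laplacian kernel. A self-contained NumPy check of that helper:

import numpy as np

def get_cauchy_samples(loc, scale, shape):
    probs = np.random.uniform(low=0., high=1., size=shape)
    return loc + scale * np.tan(np.pi * (probs - 0.5))

samples = get_cauchy_samples(loc=0.0, scale=1.0, shape=(100000,))
# A Cauchy distribution has no finite mean; check the median instead.
print(np.median(samples))  # close to loc, i.e. ~0.0
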
Example #16
    def testIndexedSlicesGradient(self):
        with ops.Graph().as_default():
            embedding_matrix = variable_scope.get_variable(
                "embedding_matrix", [5, 5], initializer=init_ops.random_normal_initializer()
            )

            def Cond(it, _):
                return it < 5

            def Body(it, cost):
                embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
                cost += math_ops.reduce_sum(embedding)
                return it + 1, cost

            _, cost = control_flow_ops.while_loop(Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
            optimizer = momentum.MomentumOptimizer(0.1, 0.9)
            train_op = optimizer.minimize(cost)
            with self.test_session() as sess:
                sess.run(variables.global_variables_initializer())
                for _ in range(10):
                    sess.run([train_op])
Example #17
  def testIndexedSlicesGradient(self):
    with ops.Graph().as_default():
      embedding_matrix = variable_scope.get_variable(
          "embedding_matrix", [5, 5],
          initializer=init_ops.random_normal_initializer())

      def cond(it, _):
        return it < 5

      def body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
        cost += math_ops.reduce_sum(embedding)
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          cond, body, [constant_op.constant(0),
                       constant_op.constant(0.0)])
      optimizer = momentum.MomentumOptimizer(0.1, 0.9)
      train_op = optimizer.minimize(cost)
      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        for _ in range(10):
          sess.run([train_op])
Example #18
        def attention(query, hidden, hidden_features, v, encoder_mask, attn_length, scope=None): # added by al
            with variable_scope.variable_scope(scope or "attention"):
                # Put attention masks on hidden using hidden_features and query.
                ds = []  # Results of attention reads will be stored here.
                aa = []
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in range(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y = linear(query, attention_vec_size, False,
                                   weight_initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                                v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        # a = nn_ops.softmax(s)
                        s = array_ops.transpose(array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                        s = math_ops.exp(s)
                        s = math_ops.to_float(encoder_mask) * s
                        # s_s = math_ops.reduce_sum(s, [1])
                        # a = array_ops.transpose(array_ops.transpose(s) / (s_s + (1.0 - math_ops.sign(s_s))))
                        a = array_ops.transpose(array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                        # complete softmax, added by al
                        aa.append(a)
                        d = math_ops.reduce_sum(
                                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                                [1, 2])
                        # complete attention calculation
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, aa
Example #19
 def __init__(self,
              filters,
              kernel_size,
              strides=1,
              padding="valid",
              data_format="channels_last",
              dilation_rate=1,
              activation=None,
              use_bias=True,
              dropout_rate=0.5,
              temperature=0.6,
              gamma=-0.1,
              zeta=1.1,
              kernel_initializer=init.random_normal_initializer(0., 1e-2),
              bias_initializer=init.zeros_initializer(),
              trainable=True,
              name=None,
              **kwargs):
     super(L0NormConv2D,
           self).__init__(rank=2,
                          filters=filters,
                          kernel_size=kernel_size,
                          strides=strides,
                          padding=padding,
                          data_format=data_format,
                          dilation_rate=dilation_rate,
                          activation=activation,
                          use_bias=use_bias,
                          dropout_rate=dropout_rate,
                          temperature=temperature,
                          gamma=gamma,
                          zeta=zeta,
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer,
                          trainable=trainable,
                          name=name,
                          **kwargs)
Example #20
    def build(self, inputs_shape):
        if inputs_shape[1].value is None:
            raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                             % inputs_shape)
        input_dim = inputs_shape[1].value
        with tf.variable_scope('autoconceptor_vars'):

            self.W_in = tf.get_variable(
                "W_in",
                shape=[input_dim, self.num_units],
                initializer=init_ops.random_normal_initializer(),
                dtype=tf.float32)

            self.b_in = tf.get_variable(
                "b_in",
                shape=[self.num_units],
                initializer=init_ops.zeros_initializer(),
                dtype=tf.float32)

            self.W = tf.get_variable(
                "W",
                shape=[self.num_units, self.num_units],
                initializer=init_ops.constant_initializer(0.05 * np.identity(self.num_units)),
                dtype=tf.float32)


            self.gain = tf.get_variable(
                'layer-norm-gain',
                shape=[self.num_units],
                initializer=init_ops.constant_initializer(np.ones([self.num_units])),
                dtype=tf.float32)

            self.bias = tf.get_variable(
                'layer-norm-bias',
                shape=[self.num_units],
                initializer=init_ops.constant_initializer(np.zeros([self.num_units])),
                dtype=tf.float32)
Example #21
        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(
                        query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)

                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query,
                        attention_vec_size,
                        False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # the additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    # sofxmax with mask
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    a = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds
Example #22
    def build(self, inputs_shape):
        if inputs_shape[1].value is None:
            raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                             % inputs_shape)

        input_depth = inputs_shape[1].value
        if self._input_initializer is None:
            self._input_initializer = init_ops.random_normal_initializer(
                mean=0.0, stddev=0.001)
        self._input_kernel = self.add_variable(
            "input_kernel",
            shape=[input_depth, self._num_units],
            initializer=self._input_initializer)

        if self._recurrent_initializer is None:
            self._recurrent_initializer = init_ops.constant_initializer(1.)
        self._recurrent_kernel = self.add_variable(
            "recurrent_kernel",
            shape=[self._num_units],
            initializer=self._recurrent_initializer)

        # Clip the absolute values of the recurrent weights to the specified minimum
        if self._recurrent_min_abs:
            abs_kernel = math_ops.abs(self._recurrent_kernel)
            min_abs_kernel = math_ops.maximum(abs_kernel, self._recurrent_min_abs)
            self._recurrent_kernel = math_ops.multiply(
                math_ops.sign(self._recurrent_kernel),
                min_abs_kernel
            )

        # Clip the absolute values of the recurrent weights to the specified maximum
        if self._recurrent_max_abs:
            self._recurrent_kernel = clip_ops.clip_by_value(self._recurrent_kernel,
                                                            -self._recurrent_max_abs,
                                                            self._recurrent_max_abs)

        self._bias = self.add_variable(
            "bias",
            shape=[self._num_units],
            initializer=init_ops.zeros_initializer(dtype=self.dtype))

        self.built = True
Example #23
 def body(i, loss):
     i = i + 1
     init = init_ops.random_normal_initializer(0.0,
                                               1.0,
                                               seed=1,
                                               dtype=np.float32)
     x = variable_scope.get_variable("v2",
                                     dtype=np.float32,
                                     shape=[1, 4, 4, 2],
                                     initializer=init)
     with variable_scope.variable_scope("vs", use_resource=True):
         y = layers.Conv2D(
             2,
             1,
             use_bias=True,
             kernel_initializer=init_ops.ones_initializer(),
             name='conv1')(x)
         y = layers.Conv2D(
             2,
             1,
             use_bias=True,
             kernel_initializer=init_ops.ones_initializer(),
             name='conv2')(y)
         y = layers.Conv2D(
             2,
             1,
             use_bias=True,
             kernel_initializer=init_ops.ones_initializer(),
             name='conv3')(y)
     loss = math_ops.reduce_sum(y)
     optimizer = gradient_descent.GradientDescentOptimizer(0.1)
     train = optimizer.minimize(loss)
     with ops.control_dependencies([train]):
         i = array_ops.identity(i)
         loss = array_ops.identity(loss)
         return (i, loss)
Example #24
 def test_variable_initializer(self):
     id = 0
     for initializer, target_mean, target_stddev in [
         (-1.0, -1.0, 0.0),
         (init_ops.random_normal_initializer(0.0, 0.01, seed=2), 0.0, 0.01),
     ]:
         with self.session(config=default_config,
                           use_gpu=test_util.is_gpu_available()):
             id += 1
             keys = constant_op.constant(list(range(2**17)), dtypes.int64)
             table = de.get_variable(
                 "t1" + str(id),
                 key_dtype=dtypes.int64,
                 value_dtype=dtypes.float32,
                 initializer=initializer,
                 dim=10,
             )
             vals_op = table.lookup(keys)
             mean = self.evaluate(math_ops.reduce_mean(vals_op))
             stddev = self.evaluate(math_ops.reduce_std(vals_op))
             rtol = 2e-5
             atol = rtol
             self.assertAllClose(target_mean, mean, rtol, atol)
             self.assertAllClose(target_stddev, stddev, rtol, atol)
Example #25
  def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    with ops.Graph().as_default():
      embedding_matrix = variable_scope.get_variable(
          "embedding_matrix", [5, 5],
          initializer=init_ops.random_normal_initializer(),
          use_resource=use_resource)

      def cond(it, _):
        return it < 5

      def body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
        cost = control_flow_ops.cond(
            math_ops.equal(it, 3), lambda: math_ops.square(cost),
            lambda: cost + math_ops.reduce_sum(embedding))
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          cond, body, [constant_op.constant(0),
                       constant_op.constant(0.0)])

      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                           dynamic_grads.indices)

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,
                                          static_grads.indices)

      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
Example #26
import tensorflow as tf
from tensorflow.python.ops.init_ops import random_normal_initializer

weights_init = random_normal_initializer(mean=0.0, stddev=0.1)


def block_5x5(inputs, filters=32):
    branch_5x5 = tf.layers.conv2d(inputs,
                                  kernel_size=(1, 1),
                                  strides=1,
                                  filters=filters,
                                  activation=tf.nn.relu,
                                  kernel_initializer=weights_init)
    branch_5x5 = tf.layers.conv2d(branch_5x5,
                                  kernel_size=(5, 5),
                                  strides=1,
                                  filters=filters,
                                  padding='same',
                                  kernel_initializer=weights_init,
                                  activation=tf.nn.relu)
    branch_3x3 = tf.layers.conv2d(inputs,
                                  kernel_size=(1, 1),
                                  strides=1,
                                  filters=filters,
                                  activation=tf.nn.relu)
    branch_3x3 = tf.layers.conv2d(branch_3x3,
                                  kernel_size=(3, 3),
                                  strides=1,
                                  filters=filters,
                                  padding='same',
                                  kernel_initializer=weights_init,
                                  activation=tf.nn.relu)
    # The original snippet was truncated here; a plausible completion
    # concatenates the two branches along the channel axis.
    return tf.concat([branch_5x5, branch_3x3], axis=-1)
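
A brief usage sketch for the block above, reusing its `tf` import (the input shape is hypothetical, and the output channel count assumes the reconstructed concat tail):

inputs = tf.placeholder(tf.float32, [None, 32, 32, 3])
features = block_5x5(inputs, filters=32)  # -> [None, 32, 32, 64] after concat
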
Example #27
 def testInitializerDifferent(self):
   for dtype in [dtypes.float32, dtypes.float64]:
     init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
     init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=2, dtype=dtype)
     self.assertFalse(identicaltest(self, init1, init2))
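
The complementary property also holds: initializers built with the same op-level seed produce identical draws even across different variables. A minimal sketch, assuming TF 1.x graph semantics:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
init = tf.random_normal_initializer(0.0, 1.0, seed=1)
a = tf.get_variable('a', [5], initializer=init)
with tf.variable_scope('other'):
    b = tf.get_variable('a', [5], initializer=init)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    va, vb = sess.run([a, b])
    print((va == vb).all())  # True: same seed, same sequence
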
Example #28
    def _testWithAttention(self,
                           create_attention_mechanism,
                           expected_final_output,
                           expected_final_state,
                           attention_mechanism_depth=3,
                           alignment_history=False,
                           expected_final_alignment_history=None,
                           attention_layer_size=6,
                           name=''):
        encoder_sequence_length = [3, 2, 3, 1, 1]
        decoder_sequence_length = [2, 0, 1, 2, 3]
        batch_size = 5
        encoder_max_time = 8
        decoder_max_time = 4
        input_depth = 7
        encoder_output_depth = 10
        cell_depth = 9

        if attention_layer_size is not None:
            attention_depth = attention_layer_size
        else:
            attention_depth = encoder_output_depth

        decoder_inputs = array_ops.placeholder_with_default(
            np.random.randn(batch_size, decoder_max_time,
                            input_depth).astype(np.float32),
            shape=(None, None, input_depth))
        encoder_outputs = array_ops.placeholder_with_default(
            np.random.randn(batch_size, encoder_max_time,
                            encoder_output_depth).astype(np.float32),
            shape=(None, None, encoder_output_depth))

        attention_mechanism = create_attention_mechanism(
            num_units=attention_mechanism_depth,
            memory=encoder_outputs,
            memory_sequence_length=encoder_sequence_length)

        with self.test_session(use_gpu=True) as sess:
            with vs.variable_scope(
                    'root',
                    initializer=init_ops.random_normal_initializer(stddev=0.01,
                                                                   seed=3)):
                cell = rnn_cell.LSTMCell(cell_depth)
                cell = wrapper.AttentionWrapper(
                    cell,
                    attention_mechanism,
                    attention_layer_size=attention_layer_size,
                    alignment_history=alignment_history)
                helper = helper_py.TrainingHelper(decoder_inputs,
                                                  decoder_sequence_length)
                my_decoder = basic_decoder.BasicDecoder(
                    cell=cell,
                    helper=helper,
                    initial_state=cell.zero_state(dtype=dtypes.float32,
                                                  batch_size=batch_size))

                final_outputs, final_state, _ = decoder.dynamic_decode(
                    my_decoder)

            self.assertTrue(
                isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
            self.assertTrue(
                isinstance(final_state, wrapper.AttentionWrapperState))
            self.assertTrue(
                isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))

            self.assertEqual(
                (batch_size, None, attention_depth),
                tuple(final_outputs.rnn_output.get_shape().as_list()))
            self.assertEqual(
                (batch_size, None),
                tuple(final_outputs.sample_id.get_shape().as_list()))

            self.assertEqual(
                (batch_size, attention_depth),
                tuple(final_state.attention.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.c.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.h.get_shape().as_list()))

            if alignment_history:
                state_alignment_history = final_state.alignment_history.stack()
                # Remove the history from final_state for purposes of the
                # remainder of the tests.
                final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
                self.assertEqual(
                    (None, batch_size, None),
                    tuple(state_alignment_history.get_shape().as_list()))
            else:
                state_alignment_history = ()

            sess.run(variables.global_variables_initializer())
            sess_results = sess.run({
                'final_outputs':
                final_outputs,
                'final_state':
                final_state,
                'state_alignment_history':
                state_alignment_history,
            })

            final_output_info = nest.map_structure(
                get_result_summary, sess_results['final_outputs'])
            final_state_info = nest.map_structure(get_result_summary,
                                                  sess_results['final_state'])
            print(name)
            print('Copy/paste:\nexpected_final_output = %s' %
                  str(final_output_info))
            print('expected_final_state = %s' % str(final_state_info))
            nest.map_structure(self.assertAllCloseOrEqual,
                               expected_final_output, final_output_info)
            nest.map_structure(self.assertAllCloseOrEqual,
                               expected_final_state, final_state_info)
            if alignment_history:  # by default, the wrapper emits attention as output
                final_alignment_history_info = nest.map_structure(
                    get_result_summary,
                    sess_results['state_alignment_history'])
                print('expected_final_alignment_history = %s' %
                      str(final_alignment_history_info))
                nest.map_structure(
                    self.assertAllCloseOrEqual,
                    # outputs are batch major but the stacked TensorArray is time major
                    expected_final_alignment_history,
                    final_alignment_history_info)
Example #29
def default_loc_scale_fn(
    is_singular=False,
    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
    untransformed_scale_initializer=init_ops.random_normal_initializer(
        mean=-3., stddev=0.1),
    loc_regularizer=None,
    untransformed_scale_regularizer=None,
    loc_constraint=None,
    untransformed_scale_constraint=None):
  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.

  This function produces a closure which produces `loc`, `scale` using
  `tf.get_variable`. The closure accepts the following arguments:

    dtype: Type of parameter's event.
    shape: Python `list`-like representing the parameter's event shape.
    name: Python `str` name prepended to any created (or existing)
      `tf.Variable`s.
    trainable: Python `bool` indicating all created `tf.Variable`s should be
      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
      access existing) `tf.Variable`s.

  Args:
    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
    loc_initializer: Initializer function for the `loc` parameters.
      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
    untransformed_scale_initializer: Initializer function for the `scale`
      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
      stddev=0.1)`. This implies the softplus transformed result has mean
      approximately `0.05` and std. deviation approximately `0.005`.
    loc_regularizer: Regularizer function for the `loc` parameters.
      The default (`None`) is to use the `tf.get_variable` default.
    untransformed_scale_regularizer: Regularizer function for the `scale`
      parameters. The default (`None`) is to use the `tf.get_variable` default.
    loc_constraint: An optional projection function to be applied to the
      loc after being updated by an `Optimizer`. The function must take as input
      the unprojected variable and must return the projected variable (which
      must have the same shape). Constraints are not safe to use when doing
      asynchronous distributed training.
      The default (`None`) is to use the `tf.get_variable` default.
    untransformed_scale_constraint: An optional projection function to be
      applied to the `scale` parameters after being updated by an `Optimizer`
      (e.g. used to implement norm constraints or value constraints). The
      function must take as input the unprojected variable and must return the
      projected variable (which must have the same shape). Constraints are not
      safe to use when doing asynchronous distributed training. The default
      (`None`) is to use the `tf.get_variable` default.

  Returns:
    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
  """
  def _fn(dtype, shape, name, trainable, add_variable_fn):
    """Creates `loc`, `scale` parameters."""
    loc = add_variable_fn(
        name=name + "_loc",
        shape=shape,
        initializer=loc_initializer,
        regularizer=loc_regularizer,
        constraint=loc_constraint,
        dtype=dtype,
        trainable=trainable)
    if is_singular:
      return loc, None
    untransformed_scale = add_variable_fn(
        name=name + "_untransformed_scale",
        shape=shape,
        initializer=untransformed_scale_initializer,
        regularizer=untransformed_scale_regularizer,
        constraint=untransformed_scale_constraint,
        dtype=dtype,
        trainable=trainable)
    scale = (np.finfo(dtype.as_numpy_dtype).eps +
             nn_ops.softplus(untransformed_scale))
    return loc, scale
  return _fn
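
The docstring's claim about the default `untransformed_scale_initializer` checks out numerically: softplus(-3.0) is about 0.0486, so the transformed scale indeed starts near 0.05. A NumPy sketch:

import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

print(softplus(-3.0))  # ~0.0486, i.e. the scale starts near 0.05
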
Example #30
 def _fc(self, bottom, out_size, name):
     with tf.variable_scope(name):
         _, size = bottom.get_shape().as_list()
         weights = tf.get_variable(
             name=name + "_weights", shape=[size, out_size],
             initializer=init_ops.random_normal_initializer(stddev=0.01))
         biases = tf.get_variable(
             name=name + "_biases", shape=[out_size],
             initializer=init_ops.random_normal_initializer(stddev=0.01))
         print(weights)
         fc = tf.nn.bias_add(tf.matmul(bottom, weights), biases)
         return fc
Example #31
  def _testWithAttention(self,
                         create_attention_mechanism,
                         expected_final_outputs,
                         expected_final_state,
                         attention_mechanism_depth=3):
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9
    attention_depth = 6

    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                     input_depth).astype(np.float32)
    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                      encoder_output_depth).astype(np.float32)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session() as sess:
      with vs.variable_scope(
          "root",
          initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
        cell = core_rnn_cell.LSTMCell(cell_depth)
        cell = wrapper.DynamicAttentionWrapper(
            cell, attention_mechanism, attention_size=attention_depth)
        helper = helper_py.TrainingHelper(decoder_inputs,
                                          decoder_sequence_length)
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size))

        final_outputs, final_state = decoder.dynamic_decode(my_decoder)

      self.assertTrue(
          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
      self.assertTrue(
          isinstance(final_state, wrapper.DynamicAttentionWrapperState))
      self.assertTrue(
          isinstance(final_state.cell_state, core_rnn_cell.LSTMStateTuple))

      self.assertEqual((batch_size, None, attention_depth),
                       tuple(final_outputs.rnn_output.get_shape().as_list()))
      self.assertEqual((batch_size, None),
                       tuple(final_outputs.sample_id.get_shape().as_list()))

      self.assertEqual((batch_size, attention_depth),
                       tuple(final_state.attention.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.c.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.h.get_shape().as_list()))

      sess.run(variables.global_variables_initializer())
      sess_results = sess.run({
          "final_outputs": final_outputs,
          "final_state": final_state
      })

      nest.map_structure(self.assertAllClose, expected_final_outputs,
                         sess_results["final_outputs"])
      nest.map_structure(self.assertAllClose, expected_final_state,
                         sess_results["final_state"])
Example #32
    def __init__(
            self,
            filters,
            dau_units,
            max_kernel_size,
            strides=1,
            data_format='channels_first',
            activation=None,
            use_bias=True,
            weight_initializer=init_ops.random_normal_initializer(stddev=0.1),
            mu1_initializer=None,
            mu2_initializer=None,
            sigma_initializer=None,
            bias_initializer=init_ops.zeros_initializer(),
            weight_regularizer=None,
            mu1_regularizer=None,
            mu2_regularizer=None,
            sigma_regularizer=None,
            bias_regularizer=None,
            activity_regularizer=None,
            weight_constraint=None,
            mu1_constraint=None,
            mu2_constraint=None,
            sigma_constraint=None,
            bias_constraint=None,
            trainable=True,
            mu_learning_rate_factor=500,
            dau_unit_border_bound=0.01,
            dau_unit_single_dim=False,
            dau_aggregation_forbid_positive_dim1=False,
            unit_testing=False,  # for compatibility between the CPU and GPU versions (where gradients of the last edge need to be ignored) during unit testing
            name=None,
            **kwargs):
        super(DAUConv2d,
              self).__init__(trainable=trainable,
                             name=name,
                             activity_regularizer=activity_regularizer,
                             **kwargs)
        self.rank = 2
        self.filters = filters
        self.dau_units = utils.normalize_tuple(dau_units, self.rank,
                                               'dau_components')
        self.max_kernel_size = max_kernel_size
        self.padding = np.floor(self.max_kernel_size / 2.0)
        self.strides = strides
        self.data_format = utils.normalize_data_format(data_format)
        self.activation = activation
        self.use_bias = use_bias
        self.bias_initializer = bias_initializer
        self.bias_regularizer = bias_regularizer
        self.bias_constraint = bias_constraint

        self.weight_initializer = weight_initializer
        self.weight_regularizer = weight_regularizer
        self.weight_constraint = weight_constraint

        self.mu1_initializer = mu1_initializer
        self.mu1_regularizer = mu1_regularizer
        self.mu1_constraint = mu1_constraint

        self.mu2_initializer = mu2_initializer
        self.mu2_regularizer = mu2_regularizer
        self.mu2_constraint = mu2_constraint

        self.sigma_initializer = sigma_initializer
        self.sigma_regularizer = sigma_regularizer
        self.sigma_constraint = sigma_constraint

        if self.mu1_initializer is None:
            self.mu1_initializer = DAUGridMean(
                dau_units=self.dau_units,
                max_value=np.floor(self.max_kernel_size / 2.0) - 1,
                dau_unit_axis=2)
        if self.mu2_initializer is None:
            self.mu2_initializer = DAUGridMean(
                dau_units=self.dau_units,
                max_value=np.floor(self.max_kernel_size / 2.0) - 1,
                dau_unit_axis=1)

        if self.sigma_initializer is None:
            self.sigma_initializer = init_ops.constant_initializer(0.5)

        self.mu_learning_rate_factor = mu_learning_rate_factor

        self.unit_testing = unit_testing

        self.input_spec = base.InputSpec(ndim=self.rank + 2)

        self.dau_unit_border_bound = dau_unit_border_bound
        self.num_dau_units_all = np.int32(np.prod(self.dau_units))
        self.num_dau_units_ignore = 0

        self.dau_unit_single_dim = dau_unit_single_dim
        self.dau_aggregation_forbid_positive_dim1 = dau_aggregation_forbid_positive_dim1
        # If we have fewer than 2 units per channel, or an odd number of them, add dummy units,
        # since computation is always done with 2 units at a time (the dummy units effectively get weight=0).

        # Make sure the unit count is a multiple of DAU_UNITS_GROUP (this is required by the fast version,
        # which can only handle multiples of 2).
        if self.num_dau_units_all % self.DAU_UNITS_GROUP != 0:
            new_num_units = np.int32(
                np.ceil(self.num_dau_units_all / float(self.DAU_UNITS_GROUP)) *
                self.DAU_UNITS_GROUP)

            self.num_dau_units_ignore = new_num_units - self.num_dau_units_all

            if self.dau_units[0] < self.dau_units[1]:
                self.dau_units = (self.dau_units[0] +
                                  self.num_dau_units_ignore, self.dau_units[1])
            else:
                self.dau_units = (self.dau_units[0], self.dau_units[1] +
                                  self.num_dau_units_ignore)

            self.num_dau_units_all = new_num_units

            self.weight_initializer = ZeroNLast(
                self.weight_initializer,
                last_num_to_zero=self.num_dau_units_ignore,
                axis=2)

        self.dau_weights = None
        self.dau_mu1 = None
        self.dau_mu2 = None
        self.dau_sigma = None

        # Warn when using stride>1: this is not implemented by the CUDA code and is only emulated (it has the same computational requirements as stride=1).
        if self.strides > 1:
            tf.logging.warning(
                'NOTICE: using stride>=2 in DAU convolution uses the same computational resources as with '
                +
                'stride=1 (current implementation only emulates stride>=2 using tensor slicing).'
            )
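The dummy-unit padding in the constructor above simply rounds the total unit count up to the nearest multiple of DAU_UNITS_GROUP and widens the smaller grid dimension, zeroing the weights of the extra units. A minimal NumPy sketch of that arithmetic (DAU_UNITS_GROUP of 2 is an assumption here; the real value comes from the layer class):

import numpy as np

DAU_UNITS_GROUP = 2  # assumption: the fast kernels process units in pairs

def pad_dau_units(dau_units):
    # Mirror DAUConv2d.__init__: round the unit count up to a multiple of
    # DAU_UNITS_GROUP and grow the smaller grid dimension by the difference.
    num_all = int(np.prod(dau_units))
    if num_all % DAU_UNITS_GROUP == 0:
        return dau_units, 0
    new_num = int(np.ceil(num_all / float(DAU_UNITS_GROUP)) * DAU_UNITS_GROUP)
    num_ignore = new_num - num_all
    if dau_units[0] < dau_units[1]:
        dau_units = (dau_units[0] + num_ignore, dau_units[1])
    else:
        dau_units = (dau_units[0], dau_units[1] + num_ignore)
    return dau_units, num_ignore

print(pad_dau_units((1, 3)))  # ((2, 3), 1): one dummy unit, weight fixed to 0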
Example #33
0
  def test_dynamic_rnn_decoder_time_major(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)) as varscope:
        # Define inputs/outputs to model
        batch_size = 2
        encoder_embedding_size = 3
        decoder_embedding_size = 4
        encoder_hidden_size = 5
        decoder_hidden_size = encoder_hidden_size
        input_sequence_length = 6
        decoder_sequence_length = 7
        num_decoder_symbols = 20
        start_of_sequence_id = end_of_sequence_id = 1
        decoder_embeddings = variable_scope.get_variable(
            "decoder_embeddings", [num_decoder_symbols, decoder_embedding_size],
            initializer=init_ops.random_normal_initializer(stddev=0.1))
        inputs = constant_op.constant(
            0.5,
            shape=[input_sequence_length, batch_size, encoder_embedding_size])
        decoder_inputs = constant_op.constant(
            0.4,
            shape=[decoder_sequence_length, batch_size, decoder_embedding_size])
        decoder_length = constant_op.constant(
            decoder_sequence_length, dtype=dtypes.int32, shape=[batch_size,])
        with variable_scope.variable_scope("rnn") as scope:
          # setting up weights for computing the final output
          output_fn = lambda x: layers.linear(x, num_decoder_symbols,
                                              scope=scope)

          # Define model
          encoder_outputs, encoder_state = rnn.dynamic_rnn(
              cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
              inputs=inputs,
              dtype=dtypes.float32,
              time_major=True,
              scope=scope)

        with variable_scope.variable_scope("decoder") as scope:
          # Train decoder
          decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
          decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
              decoder_fn_lib.simple_decoder_fn_train(
                  encoder_state=encoder_state))
          (decoder_outputs_train, decoder_state_train,
           decoder_context_state_train) = (seq2seq.dynamic_rnn_decoder(
               cell=decoder_cell,
               decoder_fn=decoder_fn_train,
               inputs=decoder_inputs,
               sequence_length=decoder_length,
               time_major=True,
               scope=scope))
          decoder_outputs_train = output_fn(decoder_outputs_train)

          # Setup variable reuse
          scope.reuse_variables()

          # Inference decoder
          decoder_fn_inference = Seq2SeqTest._decoder_fn_with_context_state(
              decoder_fn_lib.simple_decoder_fn_inference(
                  output_fn=output_fn,
                  encoder_state=encoder_state,
                  embeddings=decoder_embeddings,
                  start_of_sequence_id=start_of_sequence_id,
                  end_of_sequence_id=end_of_sequence_id,
                  #TODO: find out why it goes to +1
                  maximum_length=decoder_sequence_length - 1,
                  num_decoder_symbols=num_decoder_symbols,
                  dtype=dtypes.int32))
          (decoder_outputs_inference, decoder_state_inference,
           decoder_context_state_inference) = (seq2seq.dynamic_rnn_decoder(
               cell=decoder_cell,
               decoder_fn=decoder_fn_inference,
               time_major=True,
               scope=scope))

        # Run model
        variables.global_variables_initializer().run()
        (decoder_outputs_train_res, decoder_state_train_res,
         decoder_context_state_train_res) = sess.run([
             decoder_outputs_train, decoder_state_train,
             decoder_context_state_train
         ])
        (decoder_outputs_inference_res, decoder_state_inference_res,
         decoder_context_state_inference_res) = sess.run([
             decoder_outputs_inference, decoder_state_inference,
             decoder_context_state_inference
         ])

        # Assert outputs
        self.assertEqual((decoder_sequence_length, batch_size,
                          num_decoder_symbols), decoder_outputs_train_res.shape)
        self.assertEqual((batch_size, num_decoder_symbols),
                         decoder_outputs_inference_res.shape[1:3])
        self.assertEqual(decoder_sequence_length,
                         decoder_context_state_inference_res)
        self.assertEqual((batch_size, decoder_hidden_size),
                         decoder_state_train_res.shape)
        self.assertEqual((batch_size, decoder_hidden_size),
                         decoder_state_inference_res.shape)
        self.assertEqual(decoder_sequence_length,
                         decoder_context_state_train_res)
        # The dynamic decoder might end earlier than `maximum_length`
        # under inference
        self.assertGreaterEqual(decoder_sequence_length,
                                decoder_state_inference_res.shape[0])
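Everything in the test above is laid out time-major ([time, batch, depth]); batch-major data only needs its leading two axes swapped. A minimal NumPy sketch (names are illustrative, not from the test):

import numpy as np

batch_major = np.random.randn(2, 7, 4).astype(np.float32)  # [batch, time, depth]
time_major = np.transpose(batch_major, (1, 0, 2))          # [time, batch, depth]
assert time_major.shape == (7, 2, 4)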
Example #34
0
  def test_attention(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        # Define inputs/outputs to model
        batch_size = 2
        encoder_embedding_size = 3
        decoder_embedding_size = 4
        encoder_hidden_size = 5
        decoder_hidden_size = encoder_hidden_size
        input_sequence_length = 6
        decoder_sequence_length = 7
        num_decoder_symbols = 20
        start_of_sequence_id = end_of_sequence_id = 1
        decoder_embeddings = variable_scope.get_variable(
            "decoder_embeddings", [num_decoder_symbols, decoder_embedding_size],
            initializer=init_ops.random_normal_initializer(stddev=0.1))
        inputs = constant_op.constant(
            0.5,
            shape=[input_sequence_length, batch_size, encoder_embedding_size])
        decoder_inputs = constant_op.constant(
            0.4,
            shape=[decoder_sequence_length, batch_size, decoder_embedding_size])
        decoder_length = constant_op.constant(
            decoder_sequence_length, dtype=dtypes.int32, shape=[batch_size,])

        # attention
        attention_option = "luong"  # can be "bahdanau"

        with variable_scope.variable_scope("rnn") as scope:
          # Define model
          encoder_outputs, encoder_state = rnn.dynamic_rnn(
              cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
              inputs=inputs,
              dtype=dtypes.float32,
              time_major=True,
              scope=scope)

          # attention_states: size [batch_size, max_time, num_units]
          attention_states = array_ops.transpose(encoder_outputs, [1, 0, 2])

        with variable_scope.variable_scope("decoder") as scope:
          # Prepare attention
          (attention_keys, attention_values, attention_score_fn,
           attention_construct_fn) = (attention_decoder_fn.prepare_attention(
               attention_states, attention_option, decoder_hidden_size))
          decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
              encoder_state=encoder_state,
              attention_keys=attention_keys,
              attention_values=attention_values,
              attention_score_fn=attention_score_fn,
              attention_construct_fn=attention_construct_fn)

          # setting up weights for computing the final output
          def create_output_fn():

            def output_fn(x):
              return layers.linear(x, num_decoder_symbols, scope=scope)

            return output_fn

          output_fn = create_output_fn()

          # Train decoder
          decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
          (decoder_outputs_train, decoder_state_train, _) = (
              seq2seq.dynamic_rnn_decoder(
                  cell=decoder_cell,
                  decoder_fn=decoder_fn_train,
                  inputs=decoder_inputs,
                  sequence_length=decoder_length,
                  time_major=True,
                  scope=scope))
          decoder_outputs_train = output_fn(decoder_outputs_train)
          # Setup variable reuse
          scope.reuse_variables()

          # Inference decoder
          decoder_fn_inference = (
              attention_decoder_fn.attention_decoder_fn_inference(
                  output_fn=output_fn,
                  encoder_state=encoder_state,
                  attention_keys=attention_keys,
                  attention_values=attention_values,
                  attention_score_fn=attention_score_fn,
                  attention_construct_fn=attention_construct_fn,
                  embeddings=decoder_embeddings,
                  start_of_sequence_id=start_of_sequence_id,
                  end_of_sequence_id=end_of_sequence_id,
                  maximum_length=decoder_sequence_length - 1,
                  num_decoder_symbols=num_decoder_symbols,
                  dtype=dtypes.int32))
          (decoder_outputs_inference, decoder_state_inference, _) = (
              seq2seq.dynamic_rnn_decoder(
                  cell=decoder_cell,
                  decoder_fn=decoder_fn_inference,
                  time_major=True,
                  scope=scope))

        # Run model
        variables.global_variables_initializer().run()
        (decoder_outputs_train_res, decoder_state_train_res) = sess.run(
            [decoder_outputs_train, decoder_state_train])
        (decoder_outputs_inference_res, decoder_state_inference_res) = sess.run(
            [decoder_outputs_inference, decoder_state_inference])

        # Assert outputs
        self.assertEqual((decoder_sequence_length, batch_size,
                          num_decoder_symbols), decoder_outputs_train_res.shape)
        self.assertEqual((batch_size, num_decoder_symbols),
                         decoder_outputs_inference_res.shape[1:3])
        self.assertEqual((batch_size, decoder_hidden_size),
                         decoder_state_train_res.shape)
        self.assertEqual((batch_size, decoder_hidden_size),
                         decoder_state_inference_res.shape)
        # The dynamic decoder might end earlier than `maximum_length`
        # under inference
        self.assertGreaterEqual(decoder_sequence_length,
                                decoder_state_inference_res.shape[0])
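The `attention_option` switch above selects how the raw attention scores are computed inside prepare_attention. For reference only, the two standard score functions look roughly like this in NumPy (all names here are illustrative, not the library's API):

import numpy as np

def luong_score(query, keys):
    # multiplicative attention: score_t = query . key_t
    return keys @ query                                    # [max_time]

def bahdanau_score(query, keys, v, W_q, W_k):
    # additive attention: score_t = v . tanh(W_q query + W_k key_t)
    return np.tanh(keys @ W_k.T + query @ W_q.T) @ v       # [max_time]

num_units, max_time = 5, 6
query = np.random.randn(num_units)
keys = np.random.randn(max_time, num_units)
v = np.random.randn(num_units)
W_q = np.random.randn(num_units, num_units)
W_k = np.random.randn(num_units, num_units)
print(luong_score(query, keys).shape)                      # (6,)
print(bahdanau_score(query, keys, v, W_q, W_k).shape)      # (6,)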
Example #35
0
def embedding_attention_decoder(encoder_mask,
                                decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                embedding_size,
                                beam_size,
                                output_size=None,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=False):
    """RNN decoder with embedding and attention and a pure-decoding option.

    Args:
      decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: Integer, how many symbols come into the embedding.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      beam_size: the beam size of beam search
      output_size: Size of the output vectors; if None, use cell.output_size.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_symbols] and B has shape
        [num_symbols]; if provided and feed_previous=True, each fed previous
        output will first be multiplied by W and have B added.
      feed_previous: Boolean; if True, only the first of decoder_inputs will be
        used (the "GO" symbol), and all other decoder inputs will be generated by:
          next = embedding_lookup(embedding, top_k(previous_output)),
        In effect, this implements a beam search decoder.
        If False, decoder_inputs are used as given (the standard decoder case).
      update_embedding_for_previous: Boolean; if False and feed_previous=True,
        only the embedding for the first symbol of decoder_inputs (the "GO"
        symbol) will be updated by back propagation. Embeddings for the symbols
        generated from the decoder itself remain unchanged. This parameter has
        no effect if feed_previous=False.
      dtype: The dtype to use for the RNN initial states (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x output_size] containing the generated outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.

    Raises:
      ValueError: When output_projection has the wrong shape.
    """
    if output_size is None:
        output_size = cell.output_size
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(scope or "embedding_attention_decoder"):
        # word embeddings of target words
        embedding = variable_scope.get_variable(
            "embedding", [num_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))

        # loop function for generating
        loop_function = _extract_argmax_and_embed(
            embedding, num_symbols, output_projection,
            update_embedding_for_previous) if feed_previous else None
        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]
        return attention_decoder(
            encoder_mask,
            emb_inp,
            initial_state,
            attention_states,
            cell,
            beam_size,
            output_size=output_size,
            num_layers=num_layers,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
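For beam size 1, the docstring's feed step `next = embedding_lookup(embedding, top_k(previous_output))` reduces to argmax-and-embed. A minimal NumPy sketch of that single-step feed (shapes and names are illustrative):

import numpy as np

num_symbols, embedding_size, batch = 20, 4, 2
embedding = np.random.randn(num_symbols, embedding_size)
prev_logits = np.random.randn(batch, num_symbols)  # previous decoder output

prev_symbol = np.argmax(prev_logits, axis=1)       # top_k with k=1
next_input = embedding[prev_symbol]                # embedding_lookup
assert next_input.shape == (batch, embedding_size)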
Example #36
0
def embedding_attention_seq2seq(encoder_inputs,
                                encoder_mask,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                beam_size,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x input_size]). Then it runs a bidirectional RNN to encode
    the embedded encoder_inputs into a state vector. It keeps the outputs of this
    bidirectional RNN at every step to use for attention later. Next, it embeds decoder_inputs
    by another newly created embedding (of shape [num_decoder_symbols x
    input_size]). Then it runs an attention decoder, initialized with the last
    encoder state, on embedded decoder_inputs and attending to encoder outputs.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      encoder_mask: the mask of encoder inputs that labels where the PADs are.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and have B added.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # word embeddings of source words
        embedding = variable_scope.get_variable(
            "embedding", [num_encoder_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        # wrap encoder cell with embedding
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell,
            embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size,
            embedding=embedding)

        # get the sentence lengths of source sentences
        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

        # encode source sentences with a bidirectional_rnn encoder
        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
            encoder_cell,
            encoder_cell,
            encoder_inputs,
            sequence_length=encoder_lens,
            dtype=dtype)
        # First calculate a concatenation of encoder outputs.
        top_states = [
            array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        return embedding_attention_decoder(
            encoder_mask,
            decoder_inputs,
            encoder_state,
            attention_states,
            cell,
            num_decoder_symbols,
            embedding_size,
            beam_size=beam_size,
            output_size=output_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=feed_previous,
            initial_state_attention=initial_state_attention)
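The reshape/concat step that builds attention_states above turns a length-T list of [batch, 2*output_size] bidirectional outputs into a single [batch, T, 2*output_size] tensor. In NumPy terms:

import numpy as np

T, batch, output_size = 6, 2, 5
encoder_outputs = [np.random.randn(batch, 2 * output_size) for _ in range(T)]

top_states = [e.reshape(-1, 1, 2 * output_size) for e in encoder_outputs]
attention_states = np.concatenate(top_states, axis=1)
assert attention_states.shape == (batch, T, 2 * output_size)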
Example #37
0
    def _testWithAttention(self,
                           create_attention_mechanism,
                           expected_final_outputs,
                           expected_final_state,
                           attention_mechanism_depth=3):
        encoder_sequence_length = [3, 2, 3, 1, 0]
        decoder_sequence_length = [2, 0, 1, 2, 3]
        batch_size = 5
        encoder_max_time = 8
        decoder_max_time = 4
        input_depth = 7
        encoder_output_depth = 10
        cell_depth = 9
        attention_depth = 6

        decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                         input_depth).astype(np.float32)
        encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                          encoder_output_depth).astype(
                                              np.float32)

        attention_mechanism = create_attention_mechanism(
            num_units=attention_mechanism_depth,
            memory=encoder_outputs,
            memory_sequence_length=encoder_sequence_length)

        with self.test_session() as sess:
            with vs.variable_scope(
                    "root",
                    initializer=init_ops.random_normal_initializer(stddev=0.01,
                                                                   seed=3)):
                cell = core_rnn_cell.LSTMCell(cell_depth)
                cell = wrapper.DynamicAttentionWrapper(
                    cell, attention_mechanism, attention_size=attention_depth)
                helper = helper_py.TrainingHelper(decoder_inputs,
                                                  decoder_sequence_length)
                my_decoder = basic_decoder.BasicDecoder(
                    cell=cell,
                    helper=helper,
                    initial_state=cell.zero_state(dtype=dtypes.float32,
                                                  batch_size=batch_size))

                final_outputs, final_state = decoder.dynamic_decode(my_decoder)

            self.assertTrue(
                isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
            self.assertTrue(
                isinstance(final_state, wrapper.DynamicAttentionWrapperState))
            self.assertTrue(
                isinstance(final_state.cell_state,
                           core_rnn_cell.LSTMStateTuple))

            self.assertEqual(
                (batch_size, None, attention_depth),
                tuple(final_outputs.rnn_output.get_shape().as_list()))
            self.assertEqual(
                (batch_size, None),
                tuple(final_outputs.sample_id.get_shape().as_list()))

            self.assertEqual(
                (batch_size, attention_depth),
                tuple(final_state.attention.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.c.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.h.get_shape().as_list()))

            sess.run(variables.global_variables_initializer())
            sess_results = sess.run({
                "final_outputs": final_outputs,
                "final_state": final_state
            })

            nest.map_structure(self.assertAllClose, expected_final_outputs,
                               sess_results["final_outputs"])
            nest.map_structure(self.assertAllClose, expected_final_state,
                               sess_results["final_state"])
Example #38
0
def attention_decoder(encoder_mask_1, encoder_mask_2, decoder_inputs, initial_state, 
                      attention_states_1, attention_states_2, cell,
                      beam_size,  # added by shiyue
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False
                      ):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1412.7449 (see below for
    details). It is recommended for complex sequence-to-sequence tasks.

    Args:
      encoder_mask_1, encoder_mask_2: masks of the two encoder inputs
        [batch_size x attn_length].
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states_1, attention_states_2: 3D Tensors
        [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      beam_size: the beam size of beam search.
      output_size: Size of the output vectors; if None, we use cell.output_size.
      num_heads: Number of attention heads that read from attention_states.
      loop_function: If not None, this function will be applied to i-th output
        in order to generate i+1-th input, and decoder_inputs will be ignored,
        except for the first element ("GO" symbol). This can be used for decoding,
        but also for training to emulate http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x input_size].
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states -- useful when we wish to resume decoding from a previously
        stored decoder state and attention states.

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x output_size]. These represent the generated outputs.
          Output i is computed from input i (which is either the i-th element
          of decoder_inputs or loop_function(output {i-1}, i)) as follows.
          First, we run the cell on a combination of the input and previous
          attention masks:
            cell_output, new_state = cell(linear(input, prev_attn), prev_state).
          Then, we calculate new attention masks:
            new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
          and then we calculate the output:
            output = linear(cell_output, new_attn).
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].

    Raises:
      ValueError: when num_heads is not positive, there are no inputs, shapes
        of attention_states are not set, or input size cannot be inferred
        from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if not attention_states_1.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states_1.get_shape())
    if not attention_states_2.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states_2.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length_1 = attention_states_1.get_shape()[1].value
        attn_length_2 = attention_states_2.get_shape()[1].value
        attn_size = attention_states_1.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution; the states need to be reshaped first.

        hidden_1 = array_ops.reshape(
                attention_states_1, [-1, attn_length_1, 1, attn_size])
        hidden_2 = array_ops.reshape(
                attention_states_2, [-1, attn_length_2, 1, attn_size])

        attention_vec_size = cell.output_size  # Size of query vectors for attention.

        initial_state = math_ops.tanh(linear(initial_state, attention_vec_size, False,
                                             weight_initializer=init_ops.random_normal_initializer(0,
                                                                                                   0.01,
                                                                                                   seed=SEED)))  # special initial state

        # with variable_scope.variable_scope(scope or "attention"):

        hidden_features_1, v_1 = [], []
        hidden_features_2, v_2 = [], []

        with variable_scope.variable_scope("attention_1"):
            for a in xrange(num_heads):
                k_1 = variable_scope.get_variable("AttnW_%d" % a,
                                                [1, 1, attn_size, attention_vec_size],
                                                initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                hidden_features_1.append(nn_ops.conv2d(hidden_1, k_1, [1, 1, 1, 1], "SAME"))
                v_1.append(variable_scope.get_variable("AttnV_%d" % a,
                                                     [attention_vec_size],
                                                     initializer=init_ops.constant_initializer(0.0)))
        with variable_scope.variable_scope("attention_2"):
            for a in xrange(num_heads):
                k_2 = variable_scope.get_variable("AttnW_%d" % a,
                                                [1, 1, attn_size, attention_vec_size],
                                                initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                hidden_features_2.append(nn_ops.conv2d(hidden_2, k_2, [1, 1, 1, 1], "SAME"))
                v_2.append(variable_scope.get_variable("AttnV_%d" % a,
                                                     [attention_vec_size],
                                                     initializer=init_ops.constant_initializer(0.0)))

        def attention(query, hidden, hidden_features, v, encoder_mask, attn_length, scope=None):  # added by al
            with variable_scope.variable_scope(scope or "attention"):
                # Put attention masks on hidden using hidden_features and query.

                ds = []  # Results of attention reads will be stored here.
                aa = []
                if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y = linear(query, attention_vec_size, False,
                                   weight_initializer=init_ops.random_normal_initializer(0, 0.001, seed=SEED))
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                                v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                        # a = nn_ops.softmax(s)
                        s = array_ops.transpose(array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                        s = math_ops.exp(s)
                        s = math_ops.to_float(encoder_mask) * s
                        # s_s = math_ops.reduce_sum(s, [1])
                        # a = array_ops.transpose(array_ops.transpose(s) / (s_s + (1.0 - math_ops.sign(s_s))))
                        a = array_ops.transpose(array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                        # complete softmax, added by al
                        aa.append(a)
                        d = math_ops.reduce_sum(
                                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                                [1, 2])
                        # complete attention calculation
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, aa

        outputs = []
        output = None
        state = initial_state
        prev = None
        # added by shiyue
        symbols = []
        aligns_1, aligns_2 = [], []
        prev_probs = [0]
        # ended by shiyue
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [] # added by al
        
        # annotated by al
        # attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
        #          for _ in xrange(num_heads)]
        # for a in attns:  # Ensure the second shape of attention vectors is set.
        #     a.set_shape([None, attn_size])
        # end by al

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    # inp = loop_function(prev, i) # annotated by shiyue
                    # added by shiyue
                    inp, prev_probs, index, prev_symbol = loop_function(prev, prev_probs, beam_size, i)
                    state = array_ops.gather(state, index)  # update prev state
                    attns = [array_ops.gather(attn, index) for attn in attns]  # update prev attens
                    for j, output in enumerate(outputs):
                        outputs[j] = array_ops.gather(output, index)  # update prev outputs
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
                    symbols.append(prev_symbol)
                    # ended by shiyue

            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)

            # Run the attention mechanism.
            attns = []
            if i > 0 or (i == 0 and initial_state_attention):
                attns_1, aa_1 = attention(state, hidden_1, hidden_features_1, v_1, encoder_mask_1, attn_length_1, scope="attention_1")
                attns_2, aa_2 = attention(state, hidden_2, hidden_features_2, v_2, encoder_mask_2, attn_length_2, scope="attention_2")
                for id_head in xrange(num_heads): # added by al
                    attns.append(alpha * attns_1[id_head] + beta * attns_2[id_head])
                '''
                for a1, a2 in zip(attns_1, attns_2): # added by al
                    attns.append(alpha * a1 + a2)
                '''
                aligns_1.append(aa_1)
                aligns_2.append(aa_2)

            # x = linear([inp] + attns, input_size, False,
            #            scope="cell_input")  # added by yfeng
            # Run the RNN.
            state, _ = cell(inp, state, attns[0])

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([state] + [inp] + attns, output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)

            if loop_function is not None:
                prev = output
            outputs.append(output)

        # added by shiyue
        if loop_function is not None:
            # process the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(prev, prev_probs, beam_size, i + 1)
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output, index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
            symbols.append(prev_symbol)

            # output the final best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = array_ops.gather(symbol, 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(array_ops.gather(output, 0), 0)  # update prev outputs
                # ended by shiyue
    return outputs, state, symbols  # modified by shiyue
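The inner `attention` helper above implements a numerically stabilized softmax with a PAD mask: subtract the row max, exponentiate, zero out masked positions, then renormalize. A standalone NumPy sketch of just that step:

import numpy as np

def masked_softmax(s, mask):
    # s: [batch, attn_length] raw scores; mask: 1 for real tokens, 0 for PAD
    s = s - s.max(axis=1, keepdims=True)  # stabilize before exp
    s = np.exp(s) * mask                  # drop PAD positions
    return s / s.sum(axis=1, keepdims=True)

scores = np.array([[1.0, 2.0, 3.0]])
mask = np.array([[1.0, 1.0, 0.0]])
print(masked_softmax(scores, mask))       # last position gets exactly 0 weight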
Example #39
0
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1409.0473 (see below for
    details).

    Args:
      encoder_mask: the mask of encoder inputs [batch_size x attn_length].
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      beam_size: the beam size of beam search
      output_size: Size of the output vectors; if None, we use cell.output_size.
      loop_function: When decoding, this function will be applied to i-th output
        in order to generate i+1-th input. The generation is by beam search.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x output_size]. These represent the generated outputs.
          Output i is computed from input i (which is either the i-th element
          of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.

    Raises:
      ValueError: when shapes of attention_states are not set,
        or input size cannot be inferred from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # compute the initial hidden state of decoder
        initial_state = math_ops.tanh(
            linear(initial_state,
                   state_size,
                   False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(0,
                                                               0.001,
                                                               seed=SEED))
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(
                        query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)

                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query,
                        attention_vec_size,
                        False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # the additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    # softmax with mask
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    a = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        output = None
        state = initial_state
        out_state = array_ops.split(state, num_layers, 1)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    out_state = array_ops.gather(out_state,
                                                 index)  # update prev state
                    state = array_ops.gather(state, index)  # update prev state
                    attns = [array_ops.gather(attn, index)
                             for attn in attns]  # update prev attens
                    for j, output in enumerate(outputs):
                        outputs[j] = array_ops.gather(
                            output, index)  # update prev outputs
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(
                            symbol, index)  # update prev symbols
                    symbols.append(prev_symbol)

            # Run the attention mechanism.
            if i > 0 or (i == 0 and initial_state_attention):
                attns = attention(out_state, scope="attention")

            # Run the RNN.
            cinp = array_ops.concat(
                [inp, attns[0]],
                1)  # concatenate next input and the context vector
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # handle the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output,
                                              index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol,
                                              index)  # update prev symbols
            symbols.append(prev_symbol)

            # output the best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0),
                                              0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(array_ops.gather(output, 0),
                                                   0)  # update prev outputs
    return outputs, state, symbols
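The `AttnOutputProjection` block above is a maxout layer: the linear output of size output_size is viewed as output_size // 2 pairs, and each pair is collapsed with a max. In NumPy:

import numpy as np

batch, output_size = 2, 8
linear_out = np.random.randn(batch, output_size)

pairs = linear_out.reshape(-1, output_size // 2, 2)
maxout = pairs.max(axis=2)                # [batch, output_size // 2]
assert maxout.shape == (batch, output_size // 2)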
Example #40
0
def dau_conv1d(
        inputs,
        filters,
        dau_units,
        max_kernel_size,
        stride=1,
        mu_learning_rate_factor=500,
        data_format=None,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=init_ops.random_normal_initializer(
            stddev=0.1),  #init_ops.glorot_uniform_initializer(),
        weights_regularizer=None,
        weights_constraint=None,
        mu1_initializer=None,
        mu1_regularizer=None,
        mu1_constraint=None,
        sigma_initializer=None,
        sigma_regularizer=None,
        sigma_constraint=None,
        biases_initializer=init_ops.zeros_initializer(),
        biases_regularizer=None,
        dau_unit_border_bound=0.01,
        dau_aggregation_forbid_positive_dim1=False,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None):

    if data_format not in [None, 'NCHW']:
        raise ValueError('Invalid data_format: %r' % (data_format, ))

    layer_variable_getter = layers_contrib._build_variable_getter({
        'bias':
        'biases',
        'weight':
        'weights',
        'mu1':
        'mu1',
        'sigma':
        'sigma'
    })

    with variable_scope.variable_scope(
            scope,
            'DAUConv', [inputs],
            reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)
        input_rank = inputs.get_shape().ndims

        if input_rank != 4:
            raise ValueError(
                'DAU convolution not supported for input with rank',
                input_rank)

        df = ('channels_first' if data_format and data_format.startswith('NC')
              else 'channels_last')

        layer = DAUConv1d(filters,
                          dau_units,
                          max_kernel_size,
                          strides=stride,
                          data_format=df,
                          activation=None,
                          use_bias=not normalizer_fn and biases_initializer,
                          mu_learning_rate_factor=mu_learning_rate_factor,
                          weight_initializer=weights_initializer,
                          mu1_initializer=mu1_initializer,
                          sigma_initializer=sigma_initializer,
                          bias_initializer=biases_initializer,
                          weight_regularizer=weights_regularizer,
                          mu1_regularizer=mu1_regularizer,
                          sigma_regularizer=sigma_regularizer,
                          bias_regularizer=biases_regularizer,
                          activity_regularizer=None,
                          dau_unit_border_bound=dau_unit_border_bound,
                          dau_aggregation_forbid_positive_dim1=
                          dau_aggregation_forbid_positive_dim1,
                          trainable=trainable,
                          unit_testing=False,
                          name=sc.name,
                          _scope=sc,
                          _reuse=reuse)

        dau_weights = weights_constraint(
            layer.add_dau_weights_var(
                inputs.shape)) if weights_constraint is not None else None
        dau_mu1 = mu1_constraint(layer.add_dau_mu1_var(
            inputs.shape)) if mu1_constraint is not None else None
        dau_sigma = sigma_constraint(layer.add_dau_sigma_var(
            inputs.shape)) if sigma_constraint is not None else None

        layer.set_dau_variables_manually(dau_weights, dau_mu1, None, dau_sigma)

        outputs = layer.apply(inputs)

        # Add variables to collections.
        layers_contrib._add_variable_to_collections(layer.dau_weights,
                                                    variables_collections,
                                                    'weights')
        layers_contrib._add_variable_to_collections(layer.dau_mu1,
                                                    variables_collections,
                                                    'mu1')
        layers_contrib._add_variable_to_collections(layer.dau_sigma,
                                                    variables_collections,
                                                    'sigma')

        if layer.use_bias:
            layers_contrib._add_variable_to_collections(
                layer.bias, variables_collections, 'biases')

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils_contrib.collect_named_outputs(outputs_collections,
                                                   sc.name, outputs)
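The wrapper above follows the usual tf.contrib.layers pattern: the layer's own bias is skipped whenever a normalizer_fn is supplied, normalization runs on the raw outputs, and the activation is applied last. A schematic in plain Python (layer_fn and layer_apply are names of mine; layer_apply stands in for the actual DAU op, which is not reproduced here):

import numpy as np

def layer_fn(x, layer_apply, normalizer_fn=None, normalizer_params=None,
             activation_fn=None):
    # bias handling lives inside layer_apply (use_bias=not normalizer_fn and ...)
    outputs = layer_apply(x)
    if normalizer_fn is not None:
        outputs = normalizer_fn(outputs, **(normalizer_params or {}))
    if activation_fn is not None:
        outputs = activation_fn(outputs)
    return outputs

x = np.random.randn(2, 3)
print(layer_fn(x, lambda t: t + 1.0,
               activation_fn=lambda t: np.maximum(t, 0.0)).shape)  # (2, 3)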
Example #41
0
 def fc_layer(self, bottom, out_size, name):
     with tf.variable_scope(name):
         _, _height, _width, _channel = bottom.get_shape().as_list()
         size = _height * _width * _channel
         weights = tf.get_variable(name=name + "_weights", shape=[size, out_size],
                                   initializer=init_ops.random_normal_initializer(stddev=0.01))
         biases = tf.get_variable(name=name + "_biases", shape=[out_size],
                                  initializer=init_ops.random_normal_initializer(stddev=0.01))
         print(weights)
         x = tf.reshape(bottom, [-1, size])
         fc = tf.nn.bias_add(tf.matmul(x, weights), biases)
         return fc
Example #42
0
 def testInitializerDifferent(self):
   for dtype in [dtypes.float32, dtypes.float64]:
     init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
     init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=2, dtype=dtype)
     self.assertFalse(identicaltest(self, init1, init2))
Example #43
0
    def call(self, inputs, state):
        with vs.variable_scope(self._name + "/FastGRNNcell",
                               reuse=self._reuse):

            if self._wRank is None:
                W_matrix_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.W = vs.get_variable(
                    "W", [inputs.get_shape()[-1], self._hidden_size],
                    initializer=W_matrix_init)
                wComp = math_ops.matmul(inputs, self.W)
            else:
                W_matrix_1_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.W1 = vs.get_variable(
                    "W1", [inputs.get_shape()[-1], self._wRank],
                    initializer=W_matrix_1_init)
                W_matrix_2_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.W2 = vs.get_variable("W2",
                                          [self._wRank, self._hidden_size],
                                          initializer=W_matrix_2_init)
                wComp = math_ops.matmul(math_ops.matmul(inputs, self.W1),
                                        self.W2)

            if self._uRank is None:
                U_matrix_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.U = vs.get_variable(
                    "U", [self._hidden_size, self._hidden_size],
                    initializer=U_matrix_init)
                uComp = math_ops.matmul(state, self.U)
            else:
                U_matrix_1_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.U1 = vs.get_variable("U1",
                                          [self._hidden_size, self._uRank],
                                          initializer=U_matrix_1_init)
                U_matrix_2_init = init_ops.random_normal_initializer(
                    mean=0.0, stddev=0.1, dtype=tf.float32)
                self.U2 = vs.get_variable("U2",
                                          [self._uRank, self._hidden_size],
                                          initializer=U_matrix_2_init)
                uComp = math_ops.matmul(math_ops.matmul(state, self.U1),
                                        self.U2)
            # If these defaults don't give good results, try initializing
            # zeta to 6.0 and nu to -6.0; both inits are hyper-parameters.
            zeta_init = init_ops.constant_initializer(self._zetaInit,
                                                      dtype=tf.float32)
            self.zeta = vs.get_variable("zeta", [1, 1], initializer=zeta_init)

            nu_init = init_ops.constant_initializer(self._nuInit,
                                                    dtype=tf.float32)
            self.nu = vs.get_variable("nu", [1, 1], initializer=nu_init)

            pre_comp = wComp + uComp

            bias_gate_init = init_ops.constant_initializer(1.0,
                                                           dtype=tf.float32)
            self.bias_gate = vs.get_variable("B_g", [1, self._hidden_size],
                                             initializer=bias_gate_init)
            z = gen_non_linearity(pre_comp + self.bias_gate,
                                  self._gate_non_linearity)

            bias_update_init = init_ops.constant_initializer(1.0,
                                                             dtype=tf.float32)
            self.bias_update = vs.get_variable("B_h", [1, self._hidden_size],
                                               initializer=bias_update_init)
            c = gen_non_linearity(pre_comp + self.bias_update,
                                  self._update_non_linearity)
            new_h = z * state + (math_ops.sigmoid(self.zeta) *
                                 (1.0 - z) + math_ops.sigmoid(self.nu)) * c
        return new_h, new_h
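
When `_wRank` (or `_uRank`) is set, the cell factorizes the corresponding matrix into two low-rank factors, trading a little expressiveness for a large parameter saving. A back-of-the-envelope count with illustrative sizes:

input_dim, hidden, w_rank = 300, 128, 32         # illustrative sizes
full = input_dim * hidden                        # dense W: 38,400 params
low_rank = input_dim * w_rank + w_rank * hidden  # W1, W2: 13,696 params
# The factorization pays off whenever
# w_rank < (input_dim * hidden) / (input_dim + hidden).
print(full, low_rank)
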
Example #44
0
def logistic_regression(X,
                        y,
                        class_weight=None,
                        init_mean=None,
                        init_stddev=1.0):
    """Creates logistic regression TensorFlow subgraph.

    Args:
        X: tensor or placeholder for input features,
           shape should be [batch_size, n_features].
        y: tensor or placeholder for target,
           shape should be [batch_size, n_classes].
        class_weight: tensor of shape [n_classes] holding a weight for
                      each class. If not provided, the graph is checked
                      for a tensor named `class_weight:0`; if that is
                      absent too, all ones are used.
        init_mean: the mean value to use for initialization.
        init_stddev: the standard deviation to use for initialization.

    Returns:
        Predictions and loss tensors.

    Side effects:
        The variables logistic_regression.weights and logistic_regression.bias
        are initialized as follows.  If init_mean is not None, initialization
        uses a random normal initializer with the given init_mean and
        init_stddev.  (These may be set to 0.0 each if a zero initialization
        is desirable for convex use cases.)  If init_mean is None, the
        uniform_unit_scaling_initializer is used.
    """
    with vs.variable_scope('logistic_regression'):
        logging_ops.histogram_summary('logistic_regression.X', X)
        logging_ops.histogram_summary('logistic_regression.y', y)
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable(
                'weights',
                [X.get_shape()[1], y.get_shape()[-1]])
            bias = vs.get_variable('bias', [y.get_shape()[-1]])
        else:
            weights = vs.get_variable(
                'weights',
                [X.get_shape()[1], y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
            bias = vs.get_variable(
                'bias', [y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(
                    init_mean, init_stddev))
        logging_ops.histogram_summary('logistic_regression.weights', weights)
        logging_ops.histogram_summary('logistic_regression.bias', bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph.
        if not class_weight:
            try:
                class_weight = ops.get_default_graph().get_tensor_by_name(
                    'class_weight:0')
            except KeyError:
                pass

        return softmax_classifier(X,
                                  y,
                                  weights,
                                  bias,
                                  class_weight=class_weight)
Example #45
0
  def _testWithMaybeMultiAttention(self,
                                   is_multi,
                                   create_attention_mechanisms,
                                   expected_final_output,
                                   expected_final_state,
                                   attention_mechanism_depths,
                                   alignment_history=False,
                                   expected_final_alignment_history=None,
                                   attention_layer_sizes=None,
                                   attention_layers=None,
                                   name=''):
    # Allow is_multi to be True with a single mechanism to enable test for
    # passing in a single mechanism in a list.
    assert len(create_attention_mechanisms) == 1 or is_multi
    encoder_sequence_length = [3, 2, 3, 1, 1]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_sizes is not None:
      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
      attention_depth = sum(attention_layer_size or encoder_output_depth
                            for attention_layer_size in attention_layer_sizes)
    elif attention_layers is not None:
      # Compute sum of attention_layers output depth.
      attention_depth = sum(
          attention_layer.compute_output_shape(
              [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
          for attention_layer in attention_layers)
    else:
      attention_depth = encoder_output_depth * len(create_attention_mechanisms)

    decoder_inputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, decoder_max_time,
                        input_depth).astype(np.float32),
        shape=(None, None, input_depth))
    encoder_outputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, encoder_max_time,
                        encoder_output_depth).astype(np.float32),
        shape=(None, None, encoder_output_depth))

    attention_mechanisms = [
        creator(num_units=depth,
                memory=encoder_outputs,
                memory_sequence_length=encoder_sequence_length)
        for creator, depth in zip(create_attention_mechanisms,
                                  attention_mechanism_depths)]

    with self.session(use_gpu=True) as sess:
      with vs.variable_scope(
          'root',
          initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
        attention_layer_size = attention_layer_sizes
        attention_layer = attention_layers
        if not is_multi:
          if attention_layer_size is not None:
            attention_layer_size = attention_layer_size[0]
          if attention_layer is not None:
            attention_layer = attention_layer[0]
        cell = rnn_cell.LSTMCell(cell_depth)
        cell = wrapper.AttentionWrapper(
            cell,
            attention_mechanisms if is_multi else attention_mechanisms[0],
            attention_layer_size=attention_layer_size,
            alignment_history=alignment_history,
            attention_layer=attention_layer)
        helper = helper_py.TrainingHelper(decoder_inputs,
                                          decoder_sequence_length)
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size))

        final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

      self.assertTrue(
          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
      self.assertTrue(
          isinstance(final_state, wrapper.AttentionWrapperState))
      self.assertTrue(
          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))

      self.assertEqual((batch_size, None, attention_depth),
                       tuple(final_outputs.rnn_output.get_shape().as_list()))
      self.assertEqual((batch_size, None),
                       tuple(final_outputs.sample_id.get_shape().as_list()))

      self.assertEqual((batch_size, attention_depth),
                       tuple(final_state.attention.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.c.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.h.get_shape().as_list()))

      if alignment_history:
        if is_multi:
          state_alignment_history = []
          for history_array in final_state.alignment_history:
            history = history_array.stack()
            self.assertEqual(
                (None, batch_size, None),
                tuple(history.get_shape().as_list()))
            state_alignment_history.append(history)
          state_alignment_history = tuple(state_alignment_history)
        else:
          state_alignment_history = final_state.alignment_history.stack()
          self.assertEqual(
              (None, batch_size, None),
              tuple(state_alignment_history.get_shape().as_list()))
        nest.assert_same_structure(
            cell.state_size,
            cell.zero_state(batch_size, dtypes.float32))
        # Remove the history from final_state for purposes of the
        # remainder of the tests.
        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
      else:
        state_alignment_history = ()

      sess.run(variables.global_variables_initializer())
      sess_results = sess.run({
          'final_outputs': final_outputs,
          'final_state': final_state,
          'state_alignment_history': state_alignment_history,
      })

      final_output_info = nest.map_structure(get_result_summary,
                                             sess_results['final_outputs'])
      final_state_info = nest.map_structure(get_result_summary,
                                            sess_results['final_state'])
      print(name)
      print('Copy/paste:\nexpected_final_output = %s' % str(final_output_info))
      print('expected_final_state = %s' % str(final_state_info))
      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
                         final_output_info)
      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
                         final_state_info)
      if alignment_history:  # by default, the wrapper emits attention as output
        final_alignment_history_info = nest.map_structure(
            get_result_summary, sess_results['state_alignment_history'])
        print('expected_final_alignment_history = %s' %
              str(final_alignment_history_info))
        nest.map_structure(
            self.assertAllCloseOrEqual,
            # outputs are batch major but the stacked TensorArray is time major
            expected_final_alignment_history,
            final_alignment_history_info)
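
The `attention_depth` bookkeeping at the top of this test is worth spelling out: when per-mechanism layer sizes are given, a `None` entry falls back to the encoder output depth. A minimal sketch with illustrative values:

encoder_output_depth = 10
attention_layer_sizes = [6, None]  # None falls back to encoder depth
attention_depth = sum(s or encoder_output_depth
                      for s in attention_layer_sizes)
assert attention_depth == 16
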
    def test_attention(self):
        with self.test_session() as sess:
            with variable_scope.variable_scope(
                    "root", initializer=init_ops.constant_initializer(0.5)):
                # Define inputs/outputs to model
                batch_size = 2
                encoder_embedding_size = 3
                decoder_embedding_size = 4
                encoder_hidden_size = 5
                decoder_hidden_size = encoder_hidden_size
                input_sequence_length = 6
                decoder_sequence_length = 7
                num_decoder_symbols = 20
                start_of_sequence_id = end_of_sequence_id = 1
                decoder_embeddings = variable_scope.get_variable(
                    "decoder_embeddings",
                    [num_decoder_symbols, decoder_embedding_size],
                    initializer=init_ops.random_normal_initializer(stddev=0.1))
                inputs = constant_op.constant(0.5,
                                              shape=[
                                                  input_sequence_length,
                                                  batch_size,
                                                  encoder_embedding_size
                                              ])
                decoder_inputs = constant_op.constant(
                    0.4,
                    shape=[
                        decoder_sequence_length, batch_size,
                        decoder_embedding_size
                    ])
                decoder_length = constant_op.constant(decoder_sequence_length,
                                                      dtype=dtypes.int32,
                                                      shape=[
                                                          batch_size,
                                                      ])

                # attention
                attention_option = "luong"  # can be "bahdanau"

                with variable_scope.variable_scope("rnn") as scope:
                    # Define model
                    encoder_outputs, encoder_state = rnn.dynamic_rnn(
                        cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                        inputs=inputs,
                        dtype=dtypes.float32,
                        time_major=True,
                        scope=scope)

                    # attention_states: size [batch_size, max_time, num_units]
                    attention_states = array_ops.transpose(
                        encoder_outputs, [1, 0, 2])

                with variable_scope.variable_scope("decoder") as scope:
                    # Prepare attention
                    (attention_keys, attention_values, attention_score_fn,
                     attention_construct_fn) = (
                         attention_decoder_fn.prepare_attention(
                             attention_states, attention_option,
                             decoder_hidden_size))
                    decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn)

                    # setting up weights for computing the final output
                    def create_output_fn():
                        def output_fn(x):
                            return layers.linear(x,
                                                 num_decoder_symbols,
                                                 scope=scope)

                        return output_fn

                    output_fn = create_output_fn()

                    # Train decoder
                    decoder_cell = core_rnn_cell_impl.GRUCell(
                        decoder_hidden_size)
                    (decoder_outputs_train, decoder_state_train,
                     _) = (seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_train,
                         inputs=decoder_inputs,
                         sequence_length=decoder_length,
                         time_major=True,
                         scope=scope))
                    decoder_outputs_train = output_fn(decoder_outputs_train)
                    # Setup variable reuse
                    scope.reuse_variables()

                    # Inference decoder
                    decoder_fn_inference = (
                        attention_decoder_fn.attention_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            attention_keys=attention_keys,
                            attention_values=attention_values,
                            attention_score_fn=attention_score_fn,
                            attention_construct_fn=attention_construct_fn,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32))
                    (decoder_outputs_inference, decoder_state_inference,
                     _) = (seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

                # Run model
                variables.global_variables_initializer().run()
                (decoder_outputs_train_res,
                 decoder_state_train_res) = sess.run(
                     [decoder_outputs_train, decoder_state_train])
                (decoder_outputs_inference_res,
                 decoder_state_inference_res) = sess.run(
                     [decoder_outputs_inference, decoder_state_inference])

                # Assert outputs
                self.assertEqual(
                    (decoder_sequence_length, batch_size, num_decoder_symbols),
                    decoder_outputs_train_res.shape)
                self.assertEqual((batch_size, num_decoder_symbols),
                                 decoder_outputs_inference_res.shape[1:3])
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_train_res.shape)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_inference_res.shape)
                # The dynamic decoder might end earlier than `maximum_length`
                # under inference
                self.assertGreaterEqual(decoder_sequence_length,
                                        decoder_state_inference_res.shape[0])
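
The `attention_option` switch above selects between the two classic score functions. In miniature (a numpy sketch; `W1`, `W2`, and `v` are illustrative parameters, not taken from the test):

import numpy as np

h_t = np.random.randn(5)        # decoder state
h_s = np.random.randn(8, 5)     # 8 encoder states
luong = h_s @ h_t               # multiplicative (dot) score per position
W1, W2 = np.random.randn(5, 5), np.random.randn(5, 5)
v = np.random.randn(5)
bahdanau = np.tanh(h_s @ W1 + h_t @ W2) @ v  # additive score per position
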
  def _testWithAttention(self,
                         create_attention_mechanism,
                         expected_final_output,
                         expected_final_state,
                         attention_mechanism_depth=3,
                         alignment_history=False,
                         expected_final_alignment_history=None,
                         attention_layer_size=6,
                         name=""):
    encoder_sequence_length = [3, 2, 3, 1, 0]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_size is not None:
      attention_depth = attention_layer_size
    else:
      attention_depth = encoder_output_depth

    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                     input_depth).astype(np.float32)
    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                      encoder_output_depth).astype(np.float32)

    attention_mechanism = create_attention_mechanism(
        num_units=attention_mechanism_depth,
        memory=encoder_outputs,
        memory_sequence_length=encoder_sequence_length)

    with self.test_session(use_gpu=True) as sess:
      with vs.variable_scope(
          "root",
          initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
        cell = core_rnn_cell.LSTMCell(cell_depth)
        cell = wrapper.AttentionWrapper(
            cell,
            attention_mechanism,
            attention_layer_size=attention_layer_size,
            alignment_history=alignment_history)
        helper = helper_py.TrainingHelper(decoder_inputs,
                                          decoder_sequence_length)
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size))

        final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

      self.assertTrue(
          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
      self.assertTrue(
          isinstance(final_state, wrapper.AttentionWrapperState))
      self.assertTrue(
          isinstance(final_state.cell_state, core_rnn_cell.LSTMStateTuple))

      self.assertEqual((batch_size, None, attention_depth),
                       tuple(final_outputs.rnn_output.get_shape().as_list()))
      self.assertEqual((batch_size, None),
                       tuple(final_outputs.sample_id.get_shape().as_list()))

      self.assertEqual((batch_size, attention_depth),
                       tuple(final_state.attention.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.c.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.h.get_shape().as_list()))

      if alignment_history:
        state_alignment_history = final_state.alignment_history.stack()
        # Remove the history from final_state for purposes of the
        # remainder of the tests.
        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
        self.assertEqual((None, batch_size, encoder_max_time),
                         tuple(state_alignment_history.get_shape().as_list()))
      else:
        state_alignment_history = ()

      sess.run(variables.global_variables_initializer())
      sess_results = sess.run({
          "final_outputs": final_outputs,
          "final_state": final_state,
          "state_alignment_history": state_alignment_history,
      })

      print("Copy/paste (%s)\nexpected_final_output = " % name,
            sess_results["final_outputs"])
      sys.stdout.flush()
      print("Copy/paste (%s)\nexpected_final_state = " % name,
            sess_results["final_state"])
      sys.stdout.flush()
      print("Copy/paste (%s)\nexpected_final_alignment_history = " % name,
            np.asarray(sess_results["state_alignment_history"]))
      sys.stdout.flush()
      nest.map_structure(self.assertAllClose, expected_final_output,
                         sess_results["final_outputs"])
      nest.map_structure(self.assertAllClose, expected_final_state,
                         sess_results["final_state"])
      if alignment_history:  # by default, the wrapper emits attention as output
        self.assertAllClose(
            # outputs are batch major but the stacked TensorArray is time major
            sess_results["state_alignment_history"],
            expected_final_alignment_history)
        def _attention_with_coverage(state, context, last_coverage,
                                     encoded_fertility):
            with vs.variable_scope("attention"):
                ctx_shape = context.get_shape().as_list()
                dim_ctx = ctx_shape[-1]
                if isinstance(state_size, tuple):
                    _, m_prev = state
                    _, m_size = state_size
                else:
                    m_prev, m_size = state, state_size
                # print (last_coverage.get_shape().as_list())

                init_std = 1. / math.sqrt(m_size)
                cov_initializer = init_ops.random_normal_initializer(mean=0,
                                                                     stddev=1.)
                initializer = init_ops.random_normal_initializer(
                    mean=0, stddev=init_std)
                with vs.variable_scope("ctx_proj"):
                    pcoverage = dense(array_ops.expand_dims(last_coverage, -1),
                                      units=dim_ctx,
                                      kernel_initializer=cov_initializer,
                                      use_bias=False)
                    pctx = dense(context,
                                 units=dim_ctx,
                                 kernel_initializer=initializer,
                                 use_bias=True)
                    # pctx = _linear(array_ops.reshape(pctx, [-1, dim_ctx + 1]), dim_ctx, bias=True)
                    # pctx = array_ops.reshape(pctx, [-1, ctx_shape[1], dim_ctx])
                    # pctx = array_ops.reshape(context, [-1, dim_ctx])
                    # pctx = array_ops.reshape(_linear(pctx, dim_ctx, bias=True), [-1, ctx_shape[1], dim_ctx])

                with vs.variable_scope("state_proj"):
                    pstate = array_ops.expand_dims(_linear(
                        m_prev,
                        dim_ctx,
                        kernel_initializer=initializer,
                        bias=False),
                                                   axis=1)

                with vs.variable_scope("cell_proj") as cell_proj_scope:
                    # alpha = math_ops.reduce_sum(math_ops.tanh(pstate + pctx + pcoverage), [2])
                    alpha = dense(math_ops.tanh(pstate + pctx + pcoverage),
                                  units=1,
                                  kernel_initializer=initializer,
                                  use_bias=False)
                    alpha = math_ops.reduce_sum(alpha, [2])
                    # pctx = math_ops.tanh(array_ops.reshape((pctx + pstate), [-1, dim_ctx]))
                    # alpha = array_ops.reshape(_linear(pctx, 1, bias=True), [-1, ctx_shape[1]])

                if att_sequence_length is not None:
                    alpha_mask = array_ops.sequence_mask(
                        lengths=att_sequence_length,
                        maxlen=ctx_shape[1],
                        dtype=dtypes.float32)
                    alpha = alpha * alpha_mask + (
                        (1.0 - alpha_mask) * dtypes.float32.min)
                alpha_normalized = nn_ops.softmax(alpha)
                ctx = math_ops.reduce_sum(
                    context * array_ops.expand_dims(alpha_normalized, axis=2),
                    axis=1)
                # print (alpha_normalized, last_coverage, encoded_fertility)

                encoded_fertility = array_ops.identity(
                    encoded_fertility, name="encoded_fertility")
                new_coverage = last_coverage + alpha_normalized * math_ops.pow(
                    2 * encoded_fertility, -1)
                if att_sequence_length is not None:
                    # Only blend with the mask when it exists; alpha_mask is
                    # undefined when no sequence lengths are given.
                    new_coverage = new_coverage * alpha_mask + (
                        (1.0 - alpha_mask) * last_coverage)
                return ctx, alpha_normalized, new_coverage
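
The coverage update at the end mirrors the fertility-based coverage model: each decoding step adds the new attention mass at every source position, scaled down by that position's predicted fertility, so heavily attended positions saturate. A numpy sketch with illustrative values:

import numpy as np

alpha = np.array([0.7, 0.2, 0.1])      # normalized attention weights
fertility = np.array([1.0, 2.0, 1.5])  # predicted fertility per position
last_coverage = np.zeros(3)
new_coverage = last_coverage + alpha / (2.0 * fertility)
print(new_coverage)  # [0.35, 0.05, 0.0333...]: heavily attended,
                     # low-fertility positions accumulate coverage fastest
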
    def _testWithAttention(self,
                           create_attention_mechanism,
                           expected_final_output,
                           expected_final_state,
                           attention_mechanism_depth=3,
                           alignment_history=False,
                           expected_final_alignment_history=None,
                           name=""):
        encoder_sequence_length = [3, 2, 3, 1, 0]
        decoder_sequence_length = [2, 0, 1, 2, 3]
        batch_size = 5
        encoder_max_time = 8
        decoder_max_time = 4
        input_depth = 7
        encoder_output_depth = 10
        cell_depth = 9
        attention_depth = 6

        decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                         input_depth).astype(np.float32)
        encoder_outputs = np.random.randn(batch_size, encoder_max_time,
                                          encoder_output_depth).astype(
                                              np.float32)

        attention_mechanism = create_attention_mechanism(
            num_units=attention_mechanism_depth,
            memory=encoder_outputs,
            memory_sequence_length=encoder_sequence_length)

        with self.test_session(use_gpu=True) as sess:
            with vs.variable_scope(
                    "root",
                    initializer=init_ops.random_normal_initializer(stddev=0.01,
                                                                   seed=3)):
                cell = core_rnn_cell.LSTMCell(cell_depth)
                cell = wrapper.AttentionWrapper(
                    cell,
                    attention_mechanism,
                    attention_size=attention_depth,
                    alignment_history=alignment_history)
                helper = helper_py.TrainingHelper(decoder_inputs,
                                                  decoder_sequence_length)
                my_decoder = basic_decoder.BasicDecoder(
                    cell=cell,
                    helper=helper,
                    initial_state=cell.zero_state(dtype=dtypes.float32,
                                                  batch_size=batch_size))

                final_outputs, final_state = decoder.dynamic_decode(my_decoder)

            self.assertTrue(
                isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
            self.assertTrue(
                isinstance(final_state, wrapper.AttentionWrapperState))
            self.assertTrue(
                isinstance(final_state.cell_state,
                           core_rnn_cell.LSTMStateTuple))

            self.assertEqual(
                (batch_size, None, attention_depth),
                tuple(final_outputs.rnn_output.get_shape().as_list()))
            self.assertEqual(
                (batch_size, None),
                tuple(final_outputs.sample_id.get_shape().as_list()))

            self.assertEqual(
                (batch_size, attention_depth),
                tuple(final_state.attention.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.c.get_shape().as_list()))
            self.assertEqual(
                (batch_size, cell_depth),
                tuple(final_state.cell_state.h.get_shape().as_list()))

            if alignment_history:
                state_alignment_history = final_state.alignment_history.stack()
                # Remove the history from final_state for purposes of the
                # remainder of the tests.
                final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
                self.assertEqual(
                    (None, batch_size, encoder_max_time),
                    tuple(state_alignment_history.get_shape().as_list()))
            else:
                state_alignment_history = ()

            sess.run(variables.global_variables_initializer())
            sess_results = sess.run({
                "final_outputs":
                final_outputs,
                "final_state":
                final_state,
                "state_alignment_history":
                state_alignment_history,
            })

            print("Copy/paste (%s)\nexpected_final_output = " % name,
                  sess_results["final_outputs"])
            sys.stdout.flush()
            print("Copy/paste (%s)\nexpected_final_state = " % name,
                  sess_results["final_state"])
            sys.stdout.flush()
            print(
                "Copy/paste (%s)\nexpected_final_alignment_history = " % name,
                sess_results["state_alignment_history"])
            sys.stdout.flush()
            nest.map_structure(self.assertAllClose, expected_final_output,
                               sess_results["final_outputs"])
            nest.map_structure(self.assertAllClose, expected_final_state,
                               sess_results["final_state"])
            if alignment_history:  # by default, the wrapper emits attention as output
                self.assertAllClose(
                    # outputs are batch major but the stacked TensorArray is time major
                    sess_results["state_alignment_history"],
                    expected_final_alignment_history)
Example #51
0
 def testDuplicatedInitializer(self):
   init = init_ops.random_normal_initializer(0.0, 1.0)
   self.assertFalse(duplicated_initializer(self, init, 1))
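
Conversely to the seeded case, an initializer constructed without a seed is expected to yield distinct values each time it is applied, which is what the `duplicated_initializer` helper asserts. A numpy analogue:

import numpy as np

init = lambda shape: np.random.normal(0.0, 1.0, size=shape)  # unseeded
assert not np.allclose(init((2, 2)), init((2, 2)))  # two draws differ
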
  def _testWithMaybeMultiAttention(self,
                                   is_multi,
                                   create_attention_mechanisms,
                                   expected_final_output,
                                   expected_final_state,
                                   attention_mechanism_depths,
                                   alignment_history=False,
                                   expected_final_alignment_history=None,
                                   attention_layer_sizes=None,
                                   attention_layers=None,
                                   name=''):
    # Allow is_multi to be True with a single mechanism to enable test for
    # passing in a single mechanism in a list.
    assert len(create_attention_mechanisms) == 1 or is_multi
    encoder_sequence_length = [3, 2, 3, 1, 1]
    decoder_sequence_length = [2, 0, 1, 2, 3]
    batch_size = 5
    encoder_max_time = 8
    decoder_max_time = 4
    input_depth = 7
    encoder_output_depth = 10
    cell_depth = 9

    if attention_layer_sizes is not None:
      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
      attention_depth = sum([attention_layer_size or encoder_output_depth
                             for attention_layer_size in attention_layer_sizes])
    elif attention_layers is not None:
      # Compute sum of attention_layers output depth.
      attention_depth = sum(
          attention_layer.compute_output_shape(
              [batch_size, cell_depth + encoder_output_depth])[-1].value
          for attention_layer in attention_layers)
    else:
      attention_depth = encoder_output_depth * len(create_attention_mechanisms)

    decoder_inputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, decoder_max_time,
                        input_depth).astype(np.float32),
        shape=(None, None, input_depth))
    encoder_outputs = array_ops.placeholder_with_default(
        np.random.randn(batch_size, encoder_max_time,
                        encoder_output_depth).astype(np.float32),
        shape=(None, None, encoder_output_depth))

    attention_mechanisms = [
        creator(num_units=depth,
                memory=encoder_outputs,
                memory_sequence_length=encoder_sequence_length)
        for creator, depth in zip(create_attention_mechanisms,
                                  attention_mechanism_depths)]

    with self.test_session(use_gpu=True) as sess:
      with vs.variable_scope(
          'root',
          initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
        attention_layer_size = attention_layer_sizes
        attention_layer = attention_layers
        if not is_multi:
          if attention_layer_size is not None:
            attention_layer_size = attention_layer_size[0]
          if attention_layer is not None:
            attention_layer = attention_layer[0]
        cell = rnn_cell.LSTMCell(cell_depth)
        cell = wrapper.AttentionWrapper(
            cell,
            attention_mechanisms if is_multi else attention_mechanisms[0],
            attention_layer_size=attention_layer_size,
            alignment_history=alignment_history,
            attention_layer=attention_layer)
        helper = helper_py.TrainingHelper(decoder_inputs,
                                          decoder_sequence_length)
        my_decoder = basic_decoder.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=cell.zero_state(
                dtype=dtypes.float32, batch_size=batch_size))

        final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)

      self.assertTrue(
          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
      self.assertTrue(
          isinstance(final_state, wrapper.AttentionWrapperState))
      self.assertTrue(
          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))

      self.assertEqual((batch_size, None, attention_depth),
                       tuple(final_outputs.rnn_output.get_shape().as_list()))
      self.assertEqual((batch_size, None),
                       tuple(final_outputs.sample_id.get_shape().as_list()))

      self.assertEqual((batch_size, attention_depth),
                       tuple(final_state.attention.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.c.get_shape().as_list()))
      self.assertEqual((batch_size, cell_depth),
                       tuple(final_state.cell_state.h.get_shape().as_list()))

      if alignment_history:
        if is_multi:
          state_alignment_history = []
          for history_array in final_state.alignment_history:
            history = history_array.stack()
            self.assertEqual(
                (None, batch_size, None),
                tuple(history.get_shape().as_list()))
            state_alignment_history.append(history)
          state_alignment_history = tuple(state_alignment_history)
        else:
          state_alignment_history = final_state.alignment_history.stack()
          self.assertEqual(
              (None, batch_size, None),
              tuple(state_alignment_history.get_shape().as_list()))
        nest.assert_same_structure(
            cell.state_size,
            cell.zero_state(batch_size, dtypes.float32))
        # Remove the history from final_state for purposes of the
        # remainder of the tests.
        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
      else:
        state_alignment_history = ()

      sess.run(variables.global_variables_initializer())
      sess_results = sess.run({
          'final_outputs': final_outputs,
          'final_state': final_state,
          'state_alignment_history': state_alignment_history,
      })

      final_output_info = nest.map_structure(get_result_summary,
                                             sess_results['final_outputs'])
      final_state_info = nest.map_structure(get_result_summary,
                                            sess_results['final_state'])
      print(name)
      print('Copy/paste:\nexpected_final_output = %s' % str(final_output_info))
      print('expected_final_state = %s' % str(final_state_info))
      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
                         final_output_info)
      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
                         final_state_info)
      if alignment_history:  # by default, the wrapper emits attention as output
        final_alignment_history_info = nest.map_structure(
            get_result_summary, sess_results['state_alignment_history'])
        print('expected_final_alignment_history = %s' %
              str(final_alignment_history_info))
        nest.map_structure(
            self.assertAllCloseOrEqual,
            # outputs are batch major but the stacked TensorArray is time major
            expected_final_alignment_history,
            final_alignment_history_info)
Example #53
0
  def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                             batch_size, seq_length, dir_count, dropout, dtype,
                             delta, tolerance):
    # Gradient checking runs two forward ops with almost the same input. Need to
    # make sure the drop patterns across the two runs are the same.
    logging.info("Training test with config: %s", locals())
    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)

    np.random.seed(1234)
    random_seed.set_random_seed(5678)
    has_input_c = (rnn_mode == CUDNN_LSTM)
    direction = (CUDNN_RNN_UNIDIRECTION
                 if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
    model = CudnnTestModel(
        rnn_mode,
        num_layers,
        num_units,
        input_size,
        direction=direction,
        dropout=dropout,
        dtype=dtype,
        training=True,
        bias_initializer=init_ops.random_normal_initializer(
            mean=1., dtype=dtype))
    rnn = model.rnn
    params = rnn.trainable_variables[0]

    inputs = variables.Variable(
        random_ops.random_uniform(
            [seq_length, batch_size, input_size], dtype=dtype),
        dtype=dtype)
    input_h = variables.Variable(
        random_ops.random_uniform(
            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
        dtype=dtype)
    if has_input_c:
      input_c = variables.Variable(
          random_ops.random_uniform(
              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
          dtype=dtype)
      initial_state = (input_h, input_c)
    else:
      initial_state = (input_h,)
    total_sum = model.FProp(inputs, initial_state, training=True)

    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
      sess.run(variables.global_variables_initializer())
      all_inputs = [inputs, params]
      for s in initial_state:
        all_inputs.append(s)
      if dtype == dtypes.float16:
        self._GradientCheckFp16(
            sess, total_sum, all_inputs,
            num_samples=FLAGS.grad_check_num_samples,
            tolerance=tolerance, delta=delta)
      else:
        for _ in range(FLAGS.grad_check_num_samples):
          # Each time choose a different set of inputs.
          sess.run(variables.global_variables_initializer())
          self._GradientCheck(
              sess, total_sum, all_inputs,
              tolerance=tolerance, delta=delta)
      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
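
The `_GradientCheck` helpers compare analytic gradients against finite differences. The core idea, as a generic central-difference check in numpy (a sketch, not the TF utility):

import numpy as np

def numeric_grad(f, x, delta=1e-4):
    """Central-difference gradient of scalar-valued f at x."""
    g = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e.flat[i] = delta
        g.flat[i] = (f(x + e) - f(x - e)) / (2.0 * delta)
    return g

f = lambda x: np.sum(x ** 2)  # analytic gradient is 2x
x = np.random.randn(4)
assert np.allclose(numeric_grad(f, x), 2.0 * x, atol=1e-4)
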
    def test_dynamic_rnn_decoder_time_major(self):
        with self.test_session() as sess:
            with variable_scope.variable_scope(
                    "root", initializer=init_ops.constant_initializer(
                        0.5)) as varscope:
                # Define inputs/outputs to model
                batch_size = 2
                encoder_embedding_size = 3
                decoder_embedding_size = 4
                encoder_hidden_size = 5
                decoder_hidden_size = encoder_hidden_size
                input_sequence_length = 6
                decoder_sequence_length = 7
                num_decoder_symbols = 20
                start_of_sequence_id = end_of_sequence_id = 1
                decoder_embeddings = variable_scope.get_variable(
                    "decoder_embeddings",
                    [num_decoder_symbols, decoder_embedding_size],
                    initializer=init_ops.random_normal_initializer(stddev=0.1))
                inputs = constant_op.constant(0.5,
                                              shape=[
                                                  input_sequence_length,
                                                  batch_size,
                                                  encoder_embedding_size
                                              ])
                decoder_inputs = constant_op.constant(
                    0.4,
                    shape=[
                        decoder_sequence_length, batch_size,
                        decoder_embedding_size
                    ])
                decoder_length = constant_op.constant(decoder_sequence_length,
                                                      dtype=dtypes.int32,
                                                      shape=[
                                                          batch_size,
                                                      ])
                with variable_scope.variable_scope("rnn") as scope:
                    # setting up weights for computing the final output
                    output_fn = lambda x: layers.linear(
                        x, num_decoder_symbols, scope=scope)

                    # Define model
                    encoder_outputs, encoder_state = rnn.dynamic_rnn(
                        cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                        inputs=inputs,
                        dtype=dtypes.float32,
                        time_major=True,
                        scope=scope)

                with variable_scope.variable_scope("decoder") as scope:
                    # Train decoder
                    decoder_cell = core_rnn_cell_impl.GRUCell(
                        decoder_hidden_size)
                    decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_train(
                            encoder_state=encoder_state))
                    (decoder_outputs_train, decoder_state_train,
                     decoder_context_state_train) = (
                         seq2seq.dynamic_rnn_decoder(
                             cell=decoder_cell,
                             decoder_fn=decoder_fn_train,
                             inputs=decoder_inputs,
                             sequence_length=decoder_length,
                             time_major=True,
                             scope=scope))
                    decoder_outputs_train = output_fn(decoder_outputs_train)

                    # Setup variable reuse
                    scope.reuse_variables()

                    # Inference decoder
                    decoder_fn_inference = Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            #TODO: find out why it goes to +1
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32))
                    (decoder_outputs_inference, decoder_state_inference,
                     decoder_context_state_inference) = (
                         seq2seq.dynamic_rnn_decoder(
                             cell=decoder_cell,
                             decoder_fn=decoder_fn_inference,
                             time_major=True,
                             scope=scope))

                # Run model
                variables.global_variables_initializer().run()
                (decoder_outputs_train_res, decoder_state_train_res,
                 decoder_context_state_train_res) = sess.run([
                     decoder_outputs_train, decoder_state_train,
                     decoder_context_state_train
                 ])
                (decoder_outputs_inference_res, decoder_state_inference_res,
                 decoder_context_state_inference_res) = sess.run([
                     decoder_outputs_inference, decoder_state_inference,
                     decoder_context_state_inference
                 ])

                # Assert outputs
                self.assertEqual(
                    (decoder_sequence_length, batch_size, num_decoder_symbols),
                    decoder_outputs_train_res.shape)
                self.assertEqual((batch_size, num_decoder_symbols),
                                 decoder_outputs_inference_res.shape[1:3])
                self.assertEqual(decoder_sequence_length,
                                 decoder_context_state_inference_res)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_train_res.shape)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_inference_res.shape)
                self.assertEqual(decoder_sequence_length,
                                 decoder_context_state_train_res)
                # The dynamic decoder might end earlier than `maximum_length`
                # under inference
                self.assertGreaterEqual(decoder_sequence_length,
                                        decoder_state_inference_res.shape[0])
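
Both decoders in this test run with `time_major=True`, i.e. their input tensors are laid out as [time, batch, depth] rather than the batch-major [batch, time, depth]; a transpose converts between the two layouts. In numpy terms:

import numpy as np

batch_major = np.zeros((2, 7, 4))            # [batch, time, depth]
time_major = batch_major.transpose(1, 0, 2)  # [time, batch, depth]
assert time_major.shape == (7, 2, 4)
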