def testFuseResizeAndConv(self):
    with self.cached_session() as sess:
      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
      input_op = constant_op.constant(
          np.array(inputs), shape=[1, 2, 3, 2], dtype=dtypes.float32)
      resize_op = image_ops.resize_bilinear(
          input_op, [12, 4], align_corners=False)
      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
      weights_op = constant_op.constant(
          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
      nn_ops.conv2d(
          resize_op, weights_op, [1, 1, 1, 1], padding="VALID", name="output")
      original_graph_def = sess.graph_def
      original_result = sess.run(["output:0"])
    optimized_graph_def = optimize_for_inference_lib.fuse_resize_and_conv(
        original_graph_def, ["output"])

    with self.cached_session() as sess:
      _ = importer.import_graph_def(
          optimized_graph_def, input_map={}, name="optimized")
      optimized_result = sess.run(["optimized/output:0"])

    self.assertAllClose(original_result, optimized_result)

    for node in optimized_graph_def.node:
      self.assertNotEqual("Conv2D", node.op)
      self.assertNotEqual("MirrorPad", node.op)
def _test_convolution(tensor_in_sizes, filter_in_sizes,
                      dilations, strides, padding, data_format):
    """ One iteration of convolution with given shapes and attributes """

    total_size_1 = 1
    total_size_2 = 1
    for s in tensor_in_sizes:
        total_size_1 *= s
    for s in filter_in_sizes:
        total_size_2 *= s
    # Initializes the input tensor with array containing incrementing
    # numbers from 1.
    data_array = [f * 1.0 for f in range(1, total_size_1 + 1)]
    filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)]

    with tf.Graph().as_default():
        in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
        in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32')
        strides = [1] + strides + [1]
        dilations = [1] + dilations + [1]

        nn_ops.conv2d(in_data,
                      in_filter,
                      strides=strides,
                      padding=padding,
                      data_format=data_format)

        compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'),
                            'Placeholder:0', 'Conv2D:0')
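# A usage sketch for the helper above (illustrative shapes, mirroring the kind
# of NHWC cases the TVM front-end tests exercise; not an exhaustive list):
_test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1],
                  'SAME', 'NHWC')
_test_convolution([4, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2],
                  'VALID', 'NHWC')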
  def testFusePadAndConv(self):
    with self.cached_session() as sess:
      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
      input_op = constant_op.constant(
          np.array(inputs), shape=[1, 2, 3, 2], dtype=dtypes.float32)
      pad_op = array_ops.pad(input_op, [[0, 0], [1, 1], [2, 2], [0, 0]],
                             mode="REFLECT")
      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
      weights_op = constant_op.constant(
          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
      nn_ops.conv2d(
          pad_op, weights_op, [1, 1, 1, 1], padding="VALID", name="output")
      original_graph_def = sess.graph_def
      original_result = sess.run(["output:0"])
    optimized_graph_def = optimize_for_inference_lib.fuse_resize_and_conv(
        original_graph_def, ["output"])

    with self.cached_session() as sess:
      _ = importer.import_graph_def(
          optimized_graph_def, input_map={}, name="optimized")
      optimized_result = sess.run(["optimized/output:0"])

    self.assertAllClose(original_result, optimized_result)

    for node in optimized_graph_def.node:
      self.assertNotEqual("Conv2D", node.op)
      self.assertNotEqual("ResizeBilinear", node.op)
def build_graph(device, input_shape, filter_shape, strides, padding, num_iters):
  """builds a graph containing a sequence of conv2d operations.

  Args:
    device: String, the device to run on.
    input_shape: Shape of the input tensor.
    filter_shape: Shape of the filter tensor.
    strides: A list of ints. 1-D of length 4. The stride of sliding
             window for each dimension of input.
    padding: A string from: "SAME", "VALID". The type of padding
             algorithm to use.
    num_iters: number of iterations to run conv2d.

  Returns:
    An array of tensors to run()
  """
  with ops.device("/%s:0" % device):
    inp = variables.Variable(random_ops.truncated_normal(input_shape))
    filt = variables.Variable(random_ops.truncated_normal(filter_shape))

    outputs = []
    conv2d_op = nn_ops.conv2d(inp, filt, strides, padding, data_format="NHWC")
    outputs.append(conv2d_op)
    for _ in range(1, num_iters):
      with ops.control_dependencies([conv2d_op]):
        conv2d_op = nn_ops.conv2d(
            inp, filt, strides, padding, data_format="NHWC")
        outputs.append(conv2d_op)
    return control_flow_ops.group(*outputs)
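# A minimal driver sketch (not part of the original benchmark file): it builds
# the grouped op returned above and times `num_iters` chained conv2d runs.
# The session import and the shapes below are assumptions made for this sketch.
import time

from tensorflow.python.client import session as session_lib


def time_conv2d(device="cpu", num_iters=10):
  with ops.Graph().as_default():
    bench_op = build_graph(device, [32, 64, 64, 3], [3, 3, 3, 16],
                           [1, 1, 1, 1], "SAME", num_iters)
    with session_lib.Session() as sess:
      sess.run(variables.global_variables_initializer())
      start = time.time()
      sess.run(bench_op)
      return time.time() - start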
  def testAtrousSequence(self):
    """Tests optimization of sequence of atrous convolutions.

    Verifies that a sequence of `atrous_conv2d` operations with identical `rate`
    parameters, 'SAME' `padding`, and `filters` with odd heights/widths:

        net = atrous_conv2d(net, filters1, rate, padding="SAME")
        net = atrous_conv2d(net, filters2, rate, padding="SAME")
        ...
        net = atrous_conv2d(net, filtersK, rate, padding="SAME")

    is equivalent to:

        pad = ...  # padding so that the input dims are multiples of rate
        net = space_to_batch(net, paddings=pad, block_size=rate)
        net = conv2d(net, filters1, strides=[1, 1, 1, 1], padding="SAME")
        net = conv2d(net, filters2, strides=[1, 1, 1, 1], padding="SAME")
        ...
        net = conv2d(net, filtersK, strides=[1, 1, 1, 1], padding="SAME")
        net = batch_to_space(net, crops=pad, block_size=rate)
    """
    padding = "SAME"  # The padding needs to be "SAME"
    np.random.seed(1)  # Make it reproducible.

    with self.session(use_gpu=True):
      # Input: [batch, height, width, input_depth]
      for height in range(15, 17):
        for width in range(15, 17):
          x_shape = [3, height, width, 2]
          x = np.random.random_sample(x_shape).astype(np.float32)

          for kernel in [1, 3, 5]:  # The kernel size needs to be odd.
            # Filter: [kernel_height, kernel_width, input_depth, output_depth]
            f_shape = [kernel, kernel, 2, 2]
            f = 1e-2 * np.random.random_sample(f_shape).astype(np.float32)

            for rate in range(2, 4):
              # y1: three atrous_conv2d in a row.
              y1 = nn_ops.atrous_conv2d(x, f, rate, padding=padding)
              y1 = nn_ops.atrous_conv2d(y1, f, rate, padding=padding)
              y1 = nn_ops.atrous_conv2d(y1, f, rate, padding=padding)
              # y2: space_to_batch, three conv2d in a row, batch_to_space
              pad_bottom = 0 if height % rate == 0 else rate - height % rate
              pad_right = 0 if width % rate == 0 else rate - width % rate
              pad = [[0, pad_bottom], [0, pad_right]]
              y2 = array_ops.space_to_batch(x, paddings=pad, block_size=rate)
              y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
              y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
              y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
              y2 = array_ops.batch_to_space(y2, crops=pad, block_size=rate)
              self.assertAllClose(
                  y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
  def _BuildSmallModel(self):
    image = array_ops.zeros([2, 6, 6, 3])
    kernel = variable_scope.get_variable(
        'DW', [3, 3, 3, 6],
        dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
    kernel = variable_scope.get_variable(
        'DW2', [2, 2, 6, 12],
        dtypes.float32,
        initializer=init_ops.random_normal_initializer(stddev=0.001))
    x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
    return x
  def testExtractPointwiseConv2dPatches(self):
    with ops.Graph().as_default(), self.test_session() as sess:
      batch_size = 10
      image_height = image_width = 8
      in_channels = out_channels = 3
      kernel_height = kernel_width = 1
      strides = [1, 1, 1, 1]
      padding = 'VALID'

      images = random_ops.random_uniform(
          [batch_size, image_height, image_width, in_channels], seed=0)
      kernel_shape = [kernel_height, kernel_width, in_channels, out_channels]
      kernel = random_ops.random_uniform(kernel_shape, seed=1)

      # Ensure shape matches expectation.
      patches = utils.extract_pointwise_conv2d_patches(images, kernel_shape)
      self.assertEqual(patches.shape.as_list(), [
          batch_size, image_height, image_width, kernel_height, kernel_width,
          in_channels
      ])

      # Ensure extract...patches() + matmul() and conv2d() implementation
      # give the same answer.
      outputs = nn_ops.conv2d(images, kernel, strides, padding)

      patches_flat = array_ops.reshape(
          patches, [-1, kernel_height * kernel_width * in_channels])
      kernel_flat = array_ops.reshape(kernel, [-1, out_channels])
      outputs_flat = math_ops.matmul(patches_flat, kernel_flat)

      outputs_, outputs_flat_ = sess.run([outputs, outputs_flat])
      self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
  def _tf_enc_attention_decoder(self, attention_states, last_enc_state, cell,
                                num_heads=1, dtype=dtypes.float32, scope=None):
    """RNN decoder with attention for the sequence-to-sequence model.

    Returns the last encoder state, the reshaped attention states (`hidden`),
    and the per-head attention projections (`hidden_features`) and query
    vectors (`v`).
    """
    if num_heads < 1:
      raise ValueError("With fewer than 1 head, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
      raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                       % attention_states.get_shape())

    with variable_scope.variable_scope(scope or "attention_decoder"):
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value

      # To calculate W1 * h_t we use a 1-by-1 convolution, so reshape first.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        # Hidden states multiplied with W1.
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a,
                                             [attention_vec_size]))

      return [last_enc_state] + [hidden] + hidden_features + v
def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
                                          padding, strides, side_input_scale,
                                          side_input, biases):
  """Simulates the int8 fused 2-D convolution op using separate float ops.

    The arguments and return values have the same format, meanings and
    restrictions as the actual op.
  Args:
    conv_input_scale: A scalar 'float'.
    conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
    kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
    padding: A `string` from: `"SAME", "VALID"`.
    strides: A list of `ints`.
    side_input_scale: A scalar 'float'.
    side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
    biases: A `Tensor` of type `float32` in NCHW layout.
  Returns:
    A `Tensor` of type `qint8` in NCHW_VECT_C layout.
  """
  conv_result = nn_ops.conv2d(
      NchwVectCToNchw(gen_array_ops.dequantize(conv_input, -128, 127)),
      OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127)),
      strides=strides,
      padding=padding,
      data_format="NCHW") * conv_input_scale

  conv_and_side_inputs = conv_result + side_input_scale * NchwVectCToNchw(
      gen_array_ops.dequantize(side_input, -128, 127))

  logit = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")

  result, _, _ = gen_array_ops.quantize_v2(
      NchwToNchwVectC(nn_ops.relu(logit)), -128, 127, dtypes.qint8)
  return result
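# The NCHW_VECT_C layout helpers used above (NchwVectCToNchw, OihwVectIToHwio,
# NchwToNchwVectC) are defined elsewhere in the test file. A plausible sketch
# of the first one, assuming a static [N, C/4, H, W, 4] input shape:
def NchwVectCToNchwSketch(in_tensor):
  # [N, C/4, H, W, 4] -> [N, C/4, 4, H, W] -> [N, C, H, W]
  t = array_ops.transpose(in_tensor, [0, 1, 4, 2, 3])
  n, c4, h, w, v = [d.value for d in in_tensor.shape.dims]
  return array_ops.reshape(t, [n, c4 * v, h, w])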
def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor):
  """Clones layer_op with input_tensor and weight_tensor as new inputs."""
  new_layer_name = layer_op.name.split('/')[-1] + '_Fold'
  if layer_op.type == 'Conv2D':
    return nn_ops.conv2d(
        input_tensor,
        weight_tensor,
        strides=layer_op.get_attr('strides'),
        padding=layer_op.get_attr('padding'),
        use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
        data_format=layer_op.get_attr('data_format'),
        name=new_layer_name)
  elif layer_op.type == 'MatMul':
    return math_ops.matmul(
        input_tensor,
        weight_tensor,
        transpose_a=layer_op.get_attr('transpose_a'),
        transpose_b=layer_op.get_attr('transpose_b'),
        name=new_layer_name)
  elif layer_op.type == 'DepthwiseConv2dNative':
    return nn.depthwise_conv2d(
        input_tensor,
        weight_tensor,
        strides=layer_op.get_attr('strides'),
        padding=layer_op.get_attr('padding'),
        name=new_layer_name)
  else:
    raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
def _strict_conv1d(x, h):
  """Return x * h for rank 1 tensors x and h."""
  with ops.op_scope([x, h], 'strict_conv1d'):
    x = array_ops.reshape(x, (1, -1, 1, 1))
    h = array_ops.reshape(h, (-1, 1, 1, 1))
    result = nn_ops.conv2d(x, h, [1, 1, 1, 1], 'SAME')
    return array_ops.reshape(result, [-1])
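# A small usage sketch (illustrative, reusing the constant_op import assumed by
# the snippets above): correlating x = [1, 2, 3, 4] with h = [1, 1, 1] under
# 'SAME' padding adds each element to its immediate neighbours.
x = constant_op.constant([1., 2., 3., 4.])
h = constant_op.constant([1., 1., 1.])
y = _strict_conv1d(x, h)  # expected to evaluate to [3., 6., 9., 7.]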
  def _VerifyValues(self,
                    input_sizes=None,
                    filter_sizes=None,
                    strides=None,
                    dilations=None,
                    padding=None,
                    data_format_src="NHWC",
                    data_format_dst="NHWC",
                    expected=None):
    """Tests that tf.nn.conv2d produces the expected value.

    Args:
      input_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_sizes: Filter tensor dimensions in
        [kernel_rows, kernel_cols, input_depth, output_depth].
      strides: Strides.
      dilations: RHS dilations.
      padding: Padding type.
      data_format_src: Data format input is in.
      data_format_dst: Data format verification will run and input is converted
        to.
      expected: Expected output.
    """

    total_size_1 = np.prod(input_sizes)
    total_size_2 = np.prod(filter_sizes)
    x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes)
    x2 = np.arange(1, total_size_2 + 1, dtype=np.float32).reshape(filter_sizes)
    strides = [1] + strides + [1]
    if dilations is None:
      dilations = [1, 1]
    dilations = [1] + dilations + [1]

    # Convert between data formats.
    expected = test_utils.ConvertBetweenDataFormats(expected, data_format_src,
                                                    data_format_dst)
    x1 = test_utils.ConvertBetweenDataFormats(x1, data_format_src,
                                              data_format_dst)
    input_sizes = test_utils.PermuteDimsBetweenDataFormats(
        input_sizes, data_format_src, data_format_dst)
    strides = test_utils.PermuteDimsBetweenDataFormats(strides, data_format_src,
                                                       data_format_dst)
    dilations = test_utils.PermuteDimsBetweenDataFormats(
        dilations, data_format_src, data_format_dst)

    with self.test_session() as sess:
      t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
      t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
      with self.test_scope():
        out = nn_ops.conv2d(
            t1,
            t2,
            strides=strides,
            padding=padding,
            data_format=data_format_dst,
            dilations=dilations)

      value = sess.run(out, {t1: x1, t2: x2})
      self.assertAllClose(expected, value, 1e-3)
  def testSmallNetwork(self):
    image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
    label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
    w = variables.Variable(
        random_ops.truncated_normal([5, 5, 1, 32], stddev=0.1))
    b = variables.Variable(random_ops.truncated_normal([32], stddev=0.1))
    conv = nn_ops.conv2d(image, w, strides=[1, 1, 1, 1], padding="SAME")
    h_conv = nn_ops.relu(conv + b)
    h_conv_flat = array_ops.reshape(h_conv, [1, -1])

    w_fc = variables.Variable(
        random_ops.truncated_normal([25088, 10], stddev=0.1))
    b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
    y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)

    cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
        label * math_ops.log(y_conv), reduction_indices=[1]))
    _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)

    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    report = cost_analyzer.GenerateCostReport(mg)

    self.assertTrue(b"MatMul" in report)
    self.assertTrue(b"ApplyAdam" in report)
    self.assertTrue(b"Conv2D" in report)
    self.assertTrue(b"Conv2DBackpropInput" in report)
    self.assertTrue(b"Conv2DBackpropFilter" in report)
    self.assertTrue(b"Softmax" in report)

    # Also print the report to make it easier to debug
    print("{}".format(report))
def ReferenceDepthwiseConv2D(input_tensor, filter_tensor, strides, padding,
                             data_format=None):
  # Reference implementation of depthwise convolution that uses regular
  # convolution.
  convs = []
  in_channels = filter_tensor.shape[2]
  # Use a custom implementation of depthwise conv2d using slicing.
  for channel in xrange(in_channels):
    # Slice the input along channel
    if data_format == "NCHW":
      input_slice = input_tensor[:, channel:channel+1, :, :]
    else:
      input_slice = input_tensor[:, :, :, channel:channel+1]

    # Slice the filters.  Filters are  H, W, InC, DepthMultiplier
    filter_slice = filter_tensor[:, :, channel:channel+1, :]
    # Do conv
    convs.append(nn_ops.conv2d(input_slice, filter_slice,
                               strides, padding,
                               data_format=data_format,
                               name="depthwise_slice_%d" % channel))

  # Concat along dimension.
  if data_format == "NCHW":
    return array_ops.concat(convs, 1)
  else:
    return array_ops.concat(convs, 3)
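# Sketch of how the reference might be checked against the fused kernel
# (illustrative shapes; both paths should agree to float tolerance):
inp = random_ops.truncated_normal([2, 8, 8, 3], seed=0)
filt = random_ops.truncated_normal([3, 3, 3, 2], seed=1)
ref = ReferenceDepthwiseConv2D(inp, filt, [1, 1, 1, 1], "SAME")
fused = nn_ops.depthwise_conv2d_native(inp, filt, [1, 1, 1, 1], "SAME")
# A test would then assert that ref and fused are element-wise close.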
  def _VerifyValues(self, input_sizes, filter_sizes, stride, padding, expected):
    """Tests that tf.nn.conv2d produces the expected value.

    Args:
      input_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_sizes: Filter tensor dimensions in
        [kernel_rows, kernel_cols, input_depth, output_depth].
      stride: Stride.
      padding: Padding type.
      expected: Expected output.
    """
    total_size_1 = np.prod(input_sizes)
    total_size_2 = np.prod(filter_sizes)
    x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes)
    x2 = np.arange(1, total_size_2 + 1, dtype=np.float32).reshape(filter_sizes)
    strides = [1, stride, stride, 1]

    with self.test_session() as sess:
      with self.test_scope():
        t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
        t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
        out = nn_ops.conv2d(
            t1, t2, strides=strides, padding=padding, data_format="NHWC")
      value = sess.run(out, {t1: x1, t2: x2})
      self.assertArrayNear(expected, np.ravel(value), 1e-3)
  def testGradientDilatedConv(self):
    if test.is_gpu_available(cuda_only=True):
      with self.test_session(use_gpu=True):
        for padding in ["SAME", "VALID"]:
          for stride in [1, 2]:
            np.random.seed(1)
            in_shape = [5, 8, 6, 4]
            in_val = constant_op.constant(
                2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32)
            filter_shape = [3, 3, 4, 6]
            # Make a convolution op with the current settings,
            # just to easily get the shape of the output.
            conv_out = nn_ops.conv2d(
                in_val,
                array_ops.zeros(filter_shape),
                dilations=[1, 2, 2, 1],
                strides=[1, stride, stride, 1],
                padding=padding)
            out_backprop_shape = conv_out.get_shape().as_list()
            out_backprop_val = constant_op.constant(
                2 * np.random.random_sample(out_backprop_shape) - 1,
                dtype=dtypes.float32)
            output = nn_ops.conv2d_backprop_filter(
                in_val,
                filter_shape,
                out_backprop_val,
                dilations=[1, 2, 2, 1],
                strides=[1, stride, stride, 1],
                padding=padding)
            err = gradient_checker.compute_gradient_error(
                [in_val, out_backprop_val], [in_shape, out_backprop_shape],
                output, filter_shape)
            print("conv2d_backprop_filter gradient err = %g " % err)
            err_tolerance = 2e-3
            self.assertLess(err, err_tolerance)
def _Conv2DBackpropInputGrad(op, grad):
  """The derivatives for deconvolution.

  Args:
    op: the Deconvolution op.
    grad: the tensor representing the gradient w.r.t. the output

  Returns:
    the gradients w.r.t. the input and the filter
  """
  return [
      None,
      nn_ops.conv2d_backprop_filter(
          grad,
          array_ops.shape(op.inputs[1]),
          op.inputs[2],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode()),
      nn_ops.conv2d(
          grad,
          op.inputs[1],
          dilations=op.get_attr("dilations"),
          strides=op.get_attr("strides"),
          padding=op.get_attr("padding"),
          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
          data_format=op.get_attr("data_format").decode())
  ]
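# A quick way to exercise this gradient (illustrative; `gradients_impl` is an
# assumed import): differentiating through conv2d_backprop_input, i.e. a
# transposed convolution, invokes the registered Conv2DBackpropInput gradient.
from tensorflow.python.ops import gradients_impl

out_grad = random_ops.truncated_normal([1, 4, 4, 8])  # gradient w.r.t. output
filt = random_ops.truncated_normal([3, 3, 2, 8])      # HWIO filter
deconv = nn_ops.conv2d_backprop_input(
    [1, 8, 8, 2], filt, out_grad, strides=[1, 2, 2, 1], padding="SAME")
d_filt, d_out_grad = gradients_impl.gradients(deconv, [filt, out_grad])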
def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, padding, name=None):
    """2-D convolution with separable filters.

  Performs a depthwise convolution that acts separately on channels followed by
  a pointwise convolution that mixes channels.  Note that this is separability
  between dimensions `[1, 2]` and `3`, not spatial separability between
  dimensions `1` and `2`.

  In detail,

      output[b, i, j, k] = sum_{di, dj, q, r}
          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
          depthwise_filter[di, dj, q, r] *
          pointwise_filter[0, 0, q * channel_multiplier + r, k]

  `strides` controls the strides for the depthwise convolution only, since
  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
  `strides[0] = strides[3] = 1`.  For the most common case of the same
  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.

  Args:
    input: 4-D `Tensor` with shape `[batch, in_height, in_width, in_channels]`.
    depthwise_filter: 4-D `Tensor` with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
      Contains `in_channels` convolutional filters of depth 1.
    pointwise_filter: 4-D `Tensor` with shape
      `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
      filter to mix channels after `depthwise_filter` has convolved spatially.
    strides: 1-D of size 4.  The strides for the depthwise convolution for
      each dimension of `input`.
    padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm.
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` of shape `[batch, out_height, out_width, out_channels]`.
  """
    with ops.op_scope([input, depthwise_filter, pointwise_filter], name, "separable_conv2d") as name:
        input = ops.convert_to_tensor(input, name="tensor_in")
        depthwise_filter = ops.convert_to_tensor(depthwise_filter, name="depthwise_filter")
        pointwise_filter = ops.convert_to_tensor(pointwise_filter, name="pointwise_filter")

        if pointwise_filter.get_shape().ndims is not None:
            assert len(pointwise_filter.get_shape()) == 4
            assert pointwise_filter.get_shape()[0] == 1
            assert pointwise_filter.get_shape()[1] == 1
            if depthwise_filter.get_shape().ndims and input.get_shape().ndims:
                channel_multiplier = depthwise_filter.get_shape()[3]
                in_channels = input.get_shape()[3]
                out_channels = pointwise_filter.get_shape()[3]
                # This would mean the separable convolution is over-parameterized.
                assert channel_multiplier * in_channels < out_channels
        # The layout of the ops in the graph are expected to be as follows:
        # separable_conv2d  // Conv2D op corresponding to the pointwise conv.
        # separable_conv2d/depthwise  // Concat op for the depthwise outputs.
        # separable_conv2d/depthwise/depth0  // Conv2D op for depth 0
        # separable_conv2d/depthwise/depth1  // Conv2D op for depth 1
        # separable_conv2d/depthwise/depth2  // Conv2D op for depth 2
        depthwise = depthwise_conv2d(input, depthwise_filter, strides, padding, name="depthwise")
        return nn_ops.conv2d(depthwise, pointwise_filter, [1, 1, 1, 1], padding="VALID", name=name)
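# Illustrative call (shapes chosen so in_channels * channel_multiplier stays
# below out_channels, satisfying the over-parameterization assert above):
images = random_ops.truncated_normal([8, 32, 32, 3])
depthwise_w = random_ops.truncated_normal([3, 3, 3, 2])   # channel_multiplier = 2
pointwise_w = random_ops.truncated_normal([1, 1, 6, 16])  # mixes 3*2 -> 16 channels
out = separable_conv2d(images, depthwise_w, pointwise_w,
                       strides=[1, 1, 1, 1], padding="SAME")  # [8, 32, 32, 16]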
def func(inp):
  conv = nn_ops.conv2d(
      inp,
      filter=array_ops.ones([3, 3, 3, 16]),
      strides=[1, 1, 1, 1],
      padding='SAME')
  output = nn_ops.relu(conv, name='output')
  return output
def depthwise_conv2d(input, filter, strides, padding, name=None):
  """Depthwise 2-D convolution.

  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
  and a filter tensor of shape
  `[filter_height, filter_width, in_channels, channel_multiplier]`
  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
  applies a different filter to each input channel (expanding from 1 channel
  to `channel_multiplier` channels for each), then concatenates the results
  together.  The output has `in_channels * channel_multiplier` channels.

  In detail,

      output[b, i, j, k * channel_multiplier + q] =
          sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
                       filter[di, dj, k, q]

  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.

  Args:
    input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
    filter: 4-D with shape
      `[filter_height, filter_width, in_channels, channel_multiplier]`.
    strides: 1-D of size 4.  The stride of the sliding window for each
      dimension of `input`.
    padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm.
      See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution)
    name: A name for this operation (optional).

  Returns:
    A 4-D `Tensor` of shape
    `[batch, out_height, out_width, in_channels * channel_multiplier].`
  """
  with ops.op_scope([input, filter], name, "depthwise") as name:
    input = ops.convert_to_tensor(input, name="tensor_in")
    filter = ops.convert_to_tensor(filter, name="filter_in")
    # A shape is required to statically compute the number of separable filters.
    if filter.get_shape().ndims is not None:
      assert len(filter.get_shape()) == 4
      in_channels = filter.get_shape()[2]
      # Sanity checks, if shape information is available for the inputs.
      if input.get_shape().ndims is not None:
        assert len(input.get_shape()) == 4
        assert input.get_shape()[3] == in_channels, (
            "Mismatched input depth %d and number of depthwise filters %d." % (
                input.get_shape()[3].value, in_channels))
    else:
      assert input.get_shape().ndims is not None, (
          "Either tensor must provide static shape information.")
      assert input.get_shape().ndims == 4
      in_channels = input.get_shape()[3]

    if in_channels == 1:
      return nn_ops.conv2d(input, filter, strides, padding, name=name)
    else:
      return nn_ops.depthwise_conv2d_native(input, filter, strides, padding,
                                            name=name)
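# Illustrative call: each of the 3 input channels gets its own 2 filters, so
# the result carries 3 * 2 = 6 output channels.
x = random_ops.truncated_normal([1, 16, 16, 3])
w = random_ops.truncated_normal([3, 3, 3, 2])
y = depthwise_conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")  # [1, 16, 16, 6]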
def spatial_conv(batch, gain):
  s = array_ops.shape(batch)
  padded = array_ops.pad(batch, [[0, 0], [2, 2], [2, 2], [0, 0]], 'REFLECT')
  xt = array_ops.transpose(padded, [0, 3, 1, 2])
  xt = array_ops.reshape(xt, [s[0] * s[3], s[1] + 4, s[2] + 4, 1])
  conv_out = nn_ops.conv2d(xt, gaussian_filter * gain, [1] * 4, 'VALID')
  conv_xt = array_ops.reshape(conv_out, [s[0], s[3], s[1], s[2]])
  conv_xt = array_ops.transpose(conv_xt, [0, 2, 3, 1])
  return conv_xt
  def testConv2dGradWRTFilter(self):
    x = constant_op.constant([0.5],
                             dtype=dtypes.float32,
                             shape=[1, 4, 4, 3],
                             name='input')
    f = array_ops.placeholder(
        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
    self.run_test(f, y)
def BuildSplitableModel():
  """Build a small model that can be run partially in each step."""
  image = array_ops.zeros([2, 6, 6, 3])

  kernel1 = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r1 = nn_ops.conv2d(image, kernel1, [1, 2, 2, 1], padding='SAME')

  kernel2 = variable_scope.get_variable(
      'DW2', [2, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  r2 = nn_ops.conv2d(image, kernel2, [1, 2, 2, 1], padding='SAME')

  r3 = r1 + r2
  return r1, r2, r3
def BuildSmallModel():
  """Build a small forward conv model."""
  image = array_ops.zeros([2, 6, 6, 3])
  _ = variable_scope.get_variable(
      'ScalarW', [],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  kernel = variable_scope.get_variable(
      'DW', [3, 3, 3, 6],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
  kernel = variable_scope.get_variable(
      'DW2', [2, 2, 6, 12],
      dtypes.float32,
      initializer=init_ops.random_normal_initializer(stddev=0.001))
  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
  return x
  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
                            strides, padding, activation_mode, data_format,
                            dtype):
    """Verifies the output values of the convolution function.

    Args:
      tensor_in_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_in_sizes: Filter tensor dimensions in
        [kernel_rows, kernel_cols, input_depth, output_depth].
      bias: 1-D bias tensor of length output_depth.
      strides: Stride: [col_stride, row_stride]
      padding: Padding type.
      activation_mode: Activation mode.
      data_format: Format of the data tensors.
      dtype: Data type for inputs and outputs.
    Returns:
      Symbolic tensor value and reference value that can be used to
      execute the computation and verify the results.
    """
    input_size = np.prod(tensor_in_sizes)
    filter_size = np.prod(filter_in_sizes)
    bias_size = filter_in_sizes[-1]  # equals the output depth
    # Initializes the input tensor with array containing incrementing
    # numbers from 1.
    x1 = [f * 1.0 for f in range(1, input_size + 1)]
    x2 = [f * 1.0 for f in range(1, filter_size + 1)]
    # This is to guarantee that there are always negative values after
    # bias add so that we can test whether relu works correctly.
    x3 = bias
    with self.test_session(use_gpu=True):
      t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
      t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
      t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype)
      strides = [1] + strides + [1]
      if data_format == "NCHW":
        t1 = test_util.NHWCToNCHW(t1)
        strides = test_util.NHWCToNCHW(strides)
      output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
          t1,
          t2,
          t3,
          strides=strides,
          padding=padding,
          data_format=data_format,
          activation_mode=activation_mode)
      ref_conv_output = nn_ops.conv2d(
          t1, t2, strides=strides, padding=padding, data_format=data_format)
      ref_bias_output = nn_ops.bias_add(
          ref_conv_output, t3, data_format=data_format)
      ref_output = nn_ops.relu(ref_bias_output)
      if data_format == "NCHW":
        output = test_util.NCHWToNHWC(output)
        ref_output = test_util.NCHWToNHWC(ref_output)

      return output, ref_output
def build_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
                               padding, num_iters, data_format):
  """builds a graph containing a sequence of conv2d operations.

  Args:
    device: String, the device to run on.
    input_shape: Shape of the input tensor.
    filter_shape: Shape of the filter tensor.
    strides: A list of ints. 1-D of length 4. The stride of sliding
             window for each dimension of input.
    padding: A string from: "SAME", "VALID". The type of padding
             algorithm to use.
    num_iters: number of iterations to run conv2d.
    data_format: data format string of input, 'NHWC' and 'NCHW' are
                 supported.

  Returns:
    An array of tensors to run()
  """
  if data_format == "NCHW":
    input_shape = [
        input_shape[0], input_shape[3], input_shape[1], input_shape[2]
    ]
  with ops.device("/%s:0" % device):
    inp = variables.Variable(random_ops.truncated_normal(input_shape))
    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
    bias_shape = [filter_shape[-1]]
    bias = variables.Variable(random_ops.truncated_normal(bias_shape))

    outputs = []
    conv2d_out = nn_ops.conv2d(
        inp, filt, strides, padding, data_format=data_format)
    bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format)
    relu_out = nn_ops.relu(bias_out)
    outputs.append(relu_out)
    for _ in range(1, num_iters):
      with ops.control_dependencies([relu_out]):
        conv2d_out = nn_ops.conv2d(
            inp, filt, strides, padding, data_format=data_format)
        bias_out = nn_ops.bias_add(conv2d_out, bias, data_format=data_format)
        relu_out = nn_ops.relu(bias_out)
        outputs.append(relu_out)
    return control_flow_ops.group(*outputs)
  def _random_out_op(self, in_shape, filter_shape):
    # Choosing not to use array_op.zeros() to prevent possible removal by
    # optimization
    in_op = self._random_data_op(in_shape)
    filter_op = self._random_data_op(filter_shape)
    # Use the forward op's shape-inference
    conv_op = nn_ops.conv2d(
        in_op, filter_op, strides=_STRIDES, padding=_PADDING)
    out_shape = conv_op.get_shape()
    out_op = self._random_data_op(out_shape)
    return out_op
def decoder_type_1(decoder_hidden, attn_size, initializer=None):

    with vs.variable_scope("decoder_type_1", initializer=initializer):

        k = vs.get_variable("AttnDecW_%d" % 0, [1, 1, attn_size, 1], initializer=initializer)
        hidden_features = nn_ops.conv2d(decoder_hidden, k, [1, 1, 1, 1], "SAME")

        # s will be (?, timesteps)
        s = math_ops.reduce_sum(math_ops.tanh(hidden_features), [2, 3])

    return s
  def _CloneConv2d(self, op, inputs, new_name):
    input_tensor = inputs[0]
    weights = inputs[1]
    self._AssertConvShapes(op.name, input_tensor, weights)
    return nn_ops.conv2d(
        input_tensor,
        weights,
        strides=op.get_attr('strides'),
        padding=op.get_attr('padding'),
        use_cudnn_on_gpu=op.get_attr('use_cudnn_on_gpu'),
        data_format=op.get_attr('data_format'),
        name=new_name).op
def luong_general(hidden, decoder_hidden_state, initializer=None):

    # size of decoder layers
    attention_vec_size = hidden.get_shape()[3].value

    with vs.variable_scope("luong_general", initializer=initializer):

        # here we calculate the W_a * s_i-1 (W1 * h_1) part of the attention alignment
        k = vs.get_variable("AttnW_%d" % 0, [1, 1, attention_vec_size, attention_vec_size], initializer=initializer)
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        s = math_ops.reduce_sum((hidden_features * decoder_hidden_state), [2, 3])

    return s
	def rnn_decoder(self,encode_embed, attention_states, initial_state, cell, 
					 num_heads=1, loop_function=None, dtype=dtypes.float32, scope=None,
					 initial_state_attention=False):
		"""RNN decoder for the sequence-to-sequence model.

		"""
		with variable_scope.variable_scope(scope or "rnn_decoder"):
			batch_size = tf.shape(encode_embed[0])[0]# Needed for reshaping.
			attn_length = attention_states.get_shape()[1].value #number of output vector in sequence
			attn_size = attention_states.get_shape()[2].value #the dimension size of each output vector
			state_size = initial_state.get_shape()[1].value #the dimension size of state vector
			print(batch_size,attn_length,attn_size,state_size,"batch_size,attn_lengt,attn_size,state_size")
			# To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
			print(attention_states.get_shape(),"attention_states.get_shape()")
			hidden = tf.reshape(
				attention_states, [-1, attn_length, 1, attn_size])
			hidden_features = []
			hidden_features2 = []
			v = []
			u = []
			linear_w = []
			linear_b = []
			abstract_w = []
			abstract_b = []
			abstract_layers = [int((attn_size + state_size)/(2 + 2*i)) for i in xrange(2)] + [1]
			attention_vec_size = attn_size# Size of query vectors for attention.
			head_weights = []
			for a in xrange(num_heads):
				k = variable_scope.get_variable("AttnW_%d" % a,
												[1, 1, attn_size, attention_vec_size]) 
				hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))#[B,T,1,attn_vec_size]
				k2 = variable_scope.get_variable("AttnW2_%d" % a,
												[1, 1, attn_size, attention_vec_size])
				hidden_features2.append(nn_ops.conv2d(hidden, k2, [1, 1, 1, 1], "SAME"))
				v.append(variable_scope.get_variable("AttnV_%d" % a,
													 [attention_vec_size]))
				u.append(variable_scope.get_variable("AttnU_%d" % a,
													 [attention_vec_size]))
				head_weights.append(variable_scope.get_variable("head_weight_%d" % a,[1]))
				current_layer_size = attn_size + state_size
				linear_w.append(variable_scope.get_variable("linearW_%d" % a,
													 [1,1,current_layer_size, 1]))
				linear_b.append(variable_scope.get_variable("linearB_%d" % a,
													 [1]))
				abstract_w.append([])
				abstract_b.append([])
				for i in xrange(len(abstract_layers)):
					layer_size = abstract_layers[i]
					abstract_w[a].append(variable_scope.get_variable("Att_%d_layerW_%d" % (a,i),
													 [1,1,current_layer_size, layer_size]))
					abstract_b[a].append(variable_scope.get_variable("Att_%d_layerB_%d" % (a,i),
													 [layer_size]))
					current_layer_size = layer_size
				

			def attention(query):
				"""Put attention masks on hidden using hidden_features and query."""
				ds = []# Results of attention reads will be stored here.
				aw = []# Attention weights will be stored here
				tiled_query = tf.tile(tf.reshape(query, [-1, 1, 1, state_size]),[1,attn_length,1, 1])
				print(hidden.get_shape(),"hidden.get_shape()")
				print(tiled_query.get_shape(),"tiled_query.get_shape()")
				concat_input = tf.concat(axis=3, values=[hidden, tiled_query])
				#concat_input = tf.concat(3, [hidden, hidden])
				for a in xrange(num_heads):
					with variable_scope.variable_scope("Attention_%d" % a):
						s = None
						if self.hparams.att_strategy == 'multi':
							print('Attention: multiply')
							y = linear(query, attention_vec_size, True)
							y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
							#s = math_ops.reduce_sum(
							#	u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3])
							s = math_ops.reduce_sum(
								hidden * math_ops.tanh(y), [2, 3])
								#hidden_features[a] * math_ops.tanh(y), [2, 3])

						elif self.hparams.att_strategy == 'multi_add':
							print('Attention: multiply_add')
							y = linear(query, attention_vec_size, True, scope='y')
							y2 = linear(query, attention_vec_size, True , scope='y2')
							y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
							y2 = tf.reshape(y2, [-1, 1, 1, attention_vec_size])
							#s = math_ops.reduce_sum(
							#	u[a] * math_ops.tanh(y * hidden_features[a]), [2, 3])
							s = math_ops.reduce_sum(
								hidden * math_ops.tanh(y2), [2, 3])
							s = s + math_ops.reduce_sum(
								v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])

						elif self.hparams.att_strategy == 'NTN':
							print('Attention: NTN')
							y = linear(query, attn_size, False)
							y = tf.tile(tf.reshape(y, [-1, 1, 1, attn_size]),[1,attn_length,1,1])
							s = math_ops.reduce_sum(hidden * y, [2,3]) #bilnear
							s = s + math_ops.reduce_sum(nn_ops.conv2d(concat_input, linear_w[a], [1, 1, 1, 1], "SAME"), [2,3]) #linear
							s = s + linear_b[a] #bias
							#print(s.get_shape())
							#s = tf.tanh(s) #non linear

						elif self.hparams.att_strategy == 'elu':
							print('Attention: elu')

							cur_input = concat_input
							#for i in xrange(len(abstract_layers)):
							#	cur_input = tf.contrib.layers.fully_connected(cur_input, abstract_layers[i], activation_fn=tf.nn.elu)
							for i in xrange(len(abstract_layers)):
								cur_input = nn_ops.conv2d(cur_input, abstract_w[a][i], [1, 1, 1, 1], "SAME")
								cur_input = cur_input + abstract_b[a][i]
								cur_input = tf.nn.elu(cur_input)
							s = math_ops.reduce_sum(cur_input,[2,3])

						else:
							print('Attention: add')
							y = linear(query, attention_vec_size, True)
							y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
							s = math_ops.reduce_sum(
								v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])

						att = s * head_weights[a]#nn_ops.softmax(s)
						aw.append(att)
						# Now calculate the attention-weighted vector d.
						d = math_ops.reduce_sum(
							tf.reshape(att, [-1, attn_length, 1, 1]) * hidden,
								[1, 2])
						ds.append(tf.reshape(d, [-1, attn_size]))
				return aw, ds


			state = initial_state
			outputs = []
			prev = None
			batch_attn_size = tf.stack([batch_size, attn_size])
			batch_attw_size = tf.stack([batch_size, attn_length])
			attns = [tf.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
			attw = [1.0/attn_length * tf.ones(batch_attw_size, dtype=dtype) for _ in xrange(num_heads)]
			for a in attns:# Ensure the second shape of attention vectors is set.
				a.set_shape([None, attn_size])

			# Directly use previous state
			attw, attns = attention(initial_state)
			aw = math_ops.reduce_sum(attw,0)
			output = tf.scalar_mul(1.0/float(num_heads), aw)
			output = output - tf.reduce_min(output,1,keep_dims=True)
			outputs.append(output)

		return outputs, state
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "pointer_decoder".
    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (the i-th element of decoder_inputs).
        First, we run the cell
        on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      states: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = rnn_cell._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)

        attns.set_shape([None, attn_size])
        inps = []
        for i in xrange(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.pack(decoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, order internally

            # Merge input and previous attentions into one vector of the right size.
            x = rnn_cell._linear([inp, attns], cell.output_size, True)
            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
        def attention(decoder_state, temporal_e, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

            Args:
              decoder_state: state of the decoder
              temporal_e: store previous attentions for temporal attention mechanism
              coverage: Optional. Previous timestep's coverage vector, shape (batch_size, max_enc_steps, 1, 1).

            Returns:
              context_vector: weighted sum of _enc_states
              attn_dist: attention distribution
              coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1)
              masked_e: store the attention score for temporal attention mechanism.
            """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                # We can't have coverage with matrix attention
                if not _hps.matrix_attention and use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, max_enc_steps, 1, attention_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e_not_masked = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,max_enc_steps)
                    masked_e = nn_ops.softmax(
                        e_not_masked
                    ) * enc_padding_mask  # (batch_size, max_enc_steps)
                    masked_sums = tf.reduce_sum(masked_e,
                                                axis=1)  # shape (batch_size)
                    masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    # Equation 3 in
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0]
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize
                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_i, equation 2 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.unstack(
                            tf.matmul(
                                tf.squeeze(decoder_features, axis=[1, 2]),
                                w_attn),
                            axis=0)  # batch_size * (attention_vec_size)
                        _enc_states_lst = tf.unstack(
                            tf.squeeze(_enc_states, axis=2), axis=0
                        )  # batch_size * (max_enc_steps, attention_vec_size)

                        e_not_masked = tf.squeeze(
                            tf.stack([
                                tf.matmul(tf.reshape(_dec, [1, -1]),
                                          tf.transpose(_enc)) for _dec, _enc in
                                zip(_dec_attn, _enc_states_lst)
                            ]),
                            axis=1)  # (batch_size, max_enc_steps)
                        masked_e = tf.exp(
                            e_not_masked *
                            enc_padding_mask)  # (batch_size, max_enc_steps)
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e, (batch_size, max_enc_steps)
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * enc_padding_mask  # (batch_size, max_enc_steps)
                        masked_sums = tf.reduce_sum(
                            masked_e, axis=1)  # shape (batch_size)
                        masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0]
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    # Calculate attention distribution
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and _enc_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    _enc_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage, masked_e
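
A note on the temporal-attention step above: following https://arxiv.org/pdf/1705.04304.pdf, the current step's masked attention scores are divided by the running sum of the scores from all previous decoder steps and then renormalized. The snippet below is a minimal NumPy sketch of just that renormalization, using made-up shapes and names (prev_scores, cur_scores) rather than the graph tensors used in the examples.

import numpy as np

# Hypothetical scores: 3 previous decoder steps and the current step,
# for a batch of 2 and 5 encoder positions.
rng = np.random.default_rng(0)
prev_scores = rng.random((3, 2, 5))   # analogous to temporal_e stacked on axis 0
cur_scores = rng.random((2, 5))       # analogous to masked_e for the current step

# Divide by the sum over previous steps (1e-10 guards against all-masked columns),
# then renormalize so each row is a distribution again.
attn = cur_scores / (prev_scores.sum(axis=0) + 1e-10)
attn = attn / attn.sum(axis=1, keepdims=True)
assert np.allclose(attn.sum(axis=1), 1.0)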
Esempio n. 35
def attention_decoder(_hps,
                      v_size,
                      _max_art_oovs,
                      _enc_batch_extend_vocab,
                      emb_dec_inputs,
                      target_batch,
                      _dec_in_state,
                      _enc_states,
                      enc_padding_mask,
                      dec_padding_mask,
                      cell,
                      embedding,
                      sampling_probability,
                      alpha,
                      unk_id,
                      initial_state_attention=False,
                      pointer_gen=True,
                      use_coverage=False,
                      prev_coverage=None,
                      prev_decoder_outputs=[],
                      prev_encoder_es=[]):
    """
    Args:
      _hps: hyperparameters of the model.
      v_size: vocab size.
      _max_art_oovs: number of OOV tokens in the current batch.
      _enc_batch_extend_vocab: encoder extended vocab batch.
      emb_dec_inputs: A list of 2D Tensors [batch_size x emb_dim].
      target_batch: The indices of the target words. shape (max_dec_steps, batch_size)
      _dec_in_state: 2D Tensor [batch_size x cell.state_size].
      _enc_states: 3D Tensor [batch_size x max_enc_steps x attn_size].
      enc_padding_mask: 2D Tensor [batch_size x max_enc_steps] containing 1s and 0s; indicates which of the encoder locations are padding (0) or a real token (1).
      dec_padding_mask: 2D Tensor [batch_size x max_dec_steps] containing 1s and 0s; indicates which of the decoder locations are padding (0) or a real token (1).
      cell: rnn_cell.RNNCell defining the cell function and size.
      embedding: embedding matrix [vocab_size, emb_dim].
      sampling_probability: sampling probability for scheduled sampling.
      alpha: soft-argmax argument.
      initial_state_attention:
        Note that this attention decoder passes each decoder input through a linear
        layer with the previous step's context vector to get a modified version of
        the input. If initial_state_attention is False, on the first decoder step the
        "previous context vector" is just a zero vector. If initial_state_attention is
        True, we use _dec_in_state to (re)calculate the previous step's context vector.
        We set this to False for train/eval mode (because we call attention_decoder
        once for all decoder steps) and True for decode mode (because we call
        attention_decoder once for each decoder step).
      pointer_gen: boolean. If True, calculate the generation probability p_gen for each decoder step.
      use_coverage: boolean. If True, use coverage mechanism.
      prev_coverage:
        If not None, a tensor with shape (batch_size, max_enc_steps): the previous
        step's coverage vector. This is only not None in decode mode when using
        coverage.
      prev_decoder_outputs: if not empty, a tensor of shape
        (len(prev_decoder_outputs), batch_size, hidden_dim). The previous decoder
        outputs, used for calculating the intra-decoder attention in decode mode.
      prev_encoder_es: if not empty, a tensor of shape
        (len(prev_encoder_es), batch_size, hidden_dim). The previous attention
        scores, used for calculating the temporal attention in decode mode.
    Returns:
      outputs: A list of the same length as emb_dec_inputs of 2D Tensors of
        shape [batch_size x cell.output_size]. The output vectors.
      state: The final state of the decoder. A tensor shape [batch_size x cell.state_size].
      attn_dists: A list containing tensors of shape (batch_size,max_enc_steps).
        The attention distributions for each decoder step.
      p_gens: List of length max_dec_steps, containing tensors of shape [batch_size, 1]. The values of p_gen for each decoder step. Empty list if pointer_gen=False.
      coverage: Coverage vector on the last step computed. None if use_coverage=False.
      vocab_scores: vocabulary scores (pre-softmax logits) for each decoder step.
      final_dists: final output distribution.
      samples: contains sampled tokens.
      greedy_search_samples: contains greedy tokens.
      temporal_e: contains temporal attention.
    """
    with variable_scope.variable_scope("attention_decoder") as scope:
        batch_size = _enc_states.get_shape()[
            0]  # if this line fails, it's because the batch size isn't defined
        attn_size = _enc_states.get_shape(
        )[2]  # if this line fails, it's because the attention length isn't defined
        emb_size = emb_dec_inputs[0].get_shape()[
            1]  # if this line fails, it's because the embedding isn't defined
        decoder_attn_size = _dec_in_state.c.get_shape()[1]
        tf.logging.info("batch_size %i, attn_size: %i, emb_size: %i",
                        batch_size, attn_size, emb_size)
        # Reshape _enc_states (need to insert a dim)
        _enc_states = tf.expand_dims(
            _enc_states,
            axis=2)  # now is shape (batch_size, max_enc_steps, 1, attn_size)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features
        if _hps.matrix_attention:
            w_attn = variable_scope.get_variable(
                "w_attn", [attention_vec_size, attention_vec_size])
            if _hps.intradecoder:
                w_dec_attn = variable_scope.get_variable(
                    "w_dec_attn", [decoder_attn_size, decoder_attn_size])
        else:
            W_h = variable_scope.get_variable(
                "W_h", [1, 1, attn_size, attention_vec_size])
            v = variable_scope.get_variable("v", [attention_vec_size])
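            # The 1x1 conv below applies W_h to every encoder state h_i independently;
            # it is equivalent to a per-position matmul (see the numeric check after this example).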
            encoder_features = nn_ops.conv2d(_enc_states, W_h, [
                1, 1, 1, 1
            ], "SAME")  # shape (batch_size,max_enc_steps,1,attention_vec_size)
        if _hps.intradecoder:
            W_h_d = variable_scope.get_variable(
                "W_h_d", [1, 1, decoder_attn_size, decoder_attn_size])
            v_d = variable_scope.get_variable("v_d", [decoder_attn_size])

        # Get the weight vectors v and w_c (w_c is for coverage)
        if use_coverage:
            with variable_scope.variable_scope("coverage"):
                w_c = variable_scope.get_variable(
                    "w_c", [1, 1, 1, attention_vec_size])

        if prev_coverage is not None:  # for beam search mode with coverage
            # reshape from (batch_size, max_enc_steps) to (batch_size, max_enc_steps, 1, 1)
            prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3)

        def attention(decoder_state, temporal_e, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

            Args:
              decoder_state: state of the decoder
              temporal_e: store previous attentions for temporal attention mechanism
              coverage: Optional. Previous timestep's coverage vector, shape (batch_size, max_enc_steps, 1, 1).

            Returns:
              context_vector: weighted sum of _enc_states
              attn_dist: attention distribution
              coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1)
              masked_e: store the attention score for temporal attention mechanism.
            """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                # We can't have coverage with matrix attention
                if not _hps.matrix_attention and use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
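                    # A 1x1 conv with the [1, 1, 1, attention_vec_size] kernel w_c turns the
                    # scalar coverage value c_i at each encoder position into the vector c_i * w_c.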
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, max_enc_steps, 1, attention_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e_not_masked = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,max_enc_steps)
                    masked_e = nn_ops.softmax(
                        e_not_masked
                    ) * enc_padding_mask  # (batch_size, max_enc_steps)
                    masked_sums = tf.reduce_sum(masked_e,
                                                axis=1)  # shape (batch_size)
                    masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    # Equation 3 in https://arxiv.org/pdf/1705.04304.pdf (temporal attention)
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0]
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize
                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_i, equation 2 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.unstack(
                            tf.matmul(
                                tf.squeeze(decoder_features, axis=[1, 2]),
                                w_attn),
                            axis=0)  # batch_size * (attention_vec_size)
                        _enc_states_lst = tf.unstack(
                            tf.squeeze(_enc_states, axis=2), axis=0
                        )  # batch_size * (max_enc_steps, attention_vec_size)

                        e_not_masked = tf.squeeze(
                            tf.stack([
                                tf.matmul(tf.reshape(_dec, [1, -1]),
                                          tf.transpose(_enc)) for _dec, _enc in
                                zip(_dec_attn, _enc_states_lst)
                            ]),
                            axis=1)  # (batch_size, max_enc_steps)
                        masked_e = tf.exp(
                            e_not_masked *
                            enc_padding_mask)  # (batch_size, max_enc_steps)
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v *
                            math_ops.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e, (batch_size, max_enc_steps)
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * enc_padding_mask  # (batch_size, max_enc_steps)
                        masked_sums = tf.reduce_sum(
                            masked_e, axis=1)  # shape (batch_size)
                        masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])
                    if _hps.use_temporal_attention:
                        try:
                            len_temporal_e = temporal_e.get_shape()[0]
                        except:
                            len_temporal_e = 0
                        if len_temporal_e == 0:
                            attn_dist = masked_e
                        else:
                            masked_sums = tf.reduce_sum(
                                temporal_e, axis=0
                            ) + 1e-10  # if it's zero due to masking we set it to a small value
                            attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
                    else:
                        attn_dist = masked_e
                    # Calculate attention distribution
                    masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
                    attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                                       [-1, 1])  # re-normalize

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and _enc_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    _enc_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage, masked_e

        def intra_decoder_attention(decoder_state, outputs):
            """Calculate the context vector and attention distribution from the decoder state.

            Args:
              decoder_state: state of the decoder
              outputs: list of decoder states for implementing intra-decoder mechanism, len(decoder_states) * (batch_size, hidden_dim)
            Returns:
              context_decoder_vector: weighted sum of _dec_states
              decoder_attn_dist: intra-decoder attention distribution
            """
            attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape(
            )[1]  # hidden_dim
            try:
                len_dec_states = outputs.get_shape()[0]
            except:
                len_dec_states = 0
            attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape(
            )[1]  # hidden_dim
            _decoder_states = tf.expand_dims(
                tf.reshape(outputs, [batch_size, -1, attn_dec_size]), axis=2
            )  # now is shape (batch_size,len(decoder_states), 1, attn_size)
            _prev_decoder_features = nn_ops.conv2d(
                _decoder_states, W_h_d, [1, 1, 1, 1], "SAME"
            )  # shape (batch_size,len(decoder_states),1,attention_vec_size)
            with variable_scope.variable_scope("DecoderAttention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                try:
                    decoder_features = linear(
                        decoder_state, attention_dec_vec_size,
                        True)  # shape (batch_size, attention_vec_size)
                    decoder_features = tf.expand_dims(
                        tf.expand_dims(decoder_features, 1), 1
                    )  # reshape to (batch_size, 1, 1, attention_dec_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_d, equation 6 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.matmul(
                            tf.squeeze(decoder_features),
                            w_dec_attn)  # (batch_size, decoder_attn_size)
                        _dec_states_lst = tf.unstack(
                            tf.reshape(_prev_decoder_features,
                                       [batch_size, -1, decoder_attn_size])
                        )  # batch_size * (len(decoder_states), decoder_attn_size)
                        e_not_masked = tf.reshape(
                            tf.stack([
                                tf.matmul(_dec_attn, tf.transpose(k))
                                for k in _dec_states_lst
                            ]), [batch_size, -1
                                 ])  # (batch_size, len(decoder_states))
                        masked_e = tf.exp(
                            e_not_masked * dec_padding_mask[:, :len_dec_states]
                        )  # (batch_size, len(decoder_states))
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v_d * math_ops.tanh(_prev_decoder_features +
                                                decoder_features),
                            [
                                2, 3
                            ])  # calculate e, (batch_size,len(decoder_states))
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * dec_padding_mask[:, :
                                             len_dec_states]  # (batch_size,len(decoder_states))
                    if len_dec_states <= 1:
                        masked_e = array_ops.ones(
                            [batch_size,
                             1])  # first step is filled with equal values
                    masked_sums = tf.reshape(
                        tf.reduce_sum(masked_e, axis=1), [-1, 1]
                    )  # (batch_size,1), # if it's zero due to masking we set it to a small value
                    decoder_attn_dist = masked_e / masked_sums  # (batch_size,len(decoder_states))
                    context_decoder_vector = math_ops.reduce_sum(
                        array_ops.reshape(decoder_attn_dist,
                                          [batch_size, -1, 1, 1]) *
                        _decoder_states, [1, 2])  # (batch_size, attn_size)
                    context_decoder_vector = array_ops.reshape(
                        context_decoder_vector,
                        [-1, attn_dec_size])  # (batch_size, attn_size)
                except:
                    return array_ops.zeros(
                        [batch_size,
                         decoder_attn_size]), array_ops.zeros([batch_size, 0])
            return context_decoder_vector, decoder_attn_dist

        outputs = []
        temporal_e = []
        attn_dists = []
        vocab_scores = []
        vocab_dists = []
        final_dists = []
        p_gens = []
        samples = [
        ]  # this holds the words chosen by sampling based on the final distribution for each decoding step, list of max_dec_steps of (batch_size, 1)
        greedy_search_samples = [
        ]  # this holds the words chosen by greedy search (taking the max) on the final distribution for each decoding step, list of max_dec_steps of (batch_size, 1)
        sampling_rewards = []  # list of size max_dec_steps (batch_size, k)
        greedy_rewards = []  # list of size max_dec_steps (batch_size, k)
        state = _dec_in_state
        coverage = prev_coverage  # initialize coverage to None or whatever was passed in
        context_vector = array_ops.zeros([batch_size, attn_size])
        context_decoder_vector = array_ops.zeros(
            [batch_size, decoder_attn_size])
        context_vector.set_shape([
            None, attn_size
        ])  # Ensure the second shape of attention vectors is set.
        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that we can pass it through a linear layer with this step's input to get a modified version of the input
            context_vector, _, coverage, _ = attention(
                _dec_in_state, tf.stack(prev_encoder_es, axis=0), coverage
            )  # in decode mode, this is what updates the coverage vector
            if _hps.intradecoder:
                context_decoder_vector, _ = intra_decoder_attention(
                    _dec_in_state, tf.stack(prev_decoder_outputs, axis=0))
        for i, inp in enumerate(emb_dec_inputs):
            tf.logging.info("Adding attention_decoder timestep %i of %i", i,
                            len(emb_dec_inputs))
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            if _hps.mode in [
                    'train', 'eval'
            ] and _hps.scheduled_sampling and i > 0:  # start scheduled sampling after we received the first decoder's output
                # modify the input to next decoder using scheduled sampling
                if FLAGS.scheduled_sampling_final_dist:
                    inp = scheduled_sampling(_hps, sampling_probability,
                                             final_dist, embedding, inp, alpha)
                else:
                    inp = scheduled_sampling_vocab_dist(
                        _hps, sampling_probability, vocab_dist, embedding, inp,
                        alpha)

            # Merge input and previous attentions into one vector x of the same size as inp
            emb_dim = inp.get_shape().with_rank(2)[1]
            if emb_dim is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)

            x = linear([inp] + [context_vector], emb_dim, True)
            # Run the decoder RNN cell. cell_output = decoder state
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:  # always true in decode mode
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True
                ):  # you need this because you've already run the initial attention(...) call
                    context_vector, attn_dist, _, masked_e = attention(
                        state, tf.stack(prev_encoder_es, axis=0),
                        coverage)  # don't allow coverage to update
                    if _hps.intradecoder:
                        context_decoder_vector, _ = intra_decoder_attention(
                            state, tf.stack(prev_decoder_outputs, axis=0))
            else:
                context_vector, attn_dist, coverage, masked_e = attention(
                    state, tf.stack(temporal_e, axis=0), coverage)
                if _hps.intradecoder:
                    context_decoder_vector, _ = intra_decoder_attention(
                        state, tf.stack(outputs, axis=0))
            attn_dists.append(attn_dist)
            temporal_e.append(masked_e)

            with variable_scope.variable_scope("combined_context"):
                if _hps.intradecoder:
                    context_vector = linear(
                        [context_vector] + [context_decoder_vector], attn_size,
                        False)
            # Calculate p_gen
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    p_gen = linear([context_vector, state.c, state.h, x], 1,
                                   True)  # Tensor shape (batch_size, 1)
                    p_gen = tf.sigmoid(p_gen)
                    p_gens.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer
            # This is V[s_t, h*_t] + b in the paper
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            outputs.append(output)

            # Add the output projection to obtain the vocabulary distribution
            with tf.variable_scope('output_projection'):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                trunc_norm_init = tf.truncated_normal_initializer(
                    stddev=_hps.trunc_norm_init_std)
                w_out = tf.get_variable('w', [_hps.dec_hidden_dim, v_size],
                                        dtype=tf.float32,
                                        initializer=trunc_norm_init)
                # w_t_out = tf.transpose(w)
                v_out = tf.get_variable('v', [v_size],
                                        dtype=tf.float32,
                                        initializer=trunc_norm_init)
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                if FLAGS.share_decoder_weights:  # Eq. 13 in https://arxiv.org/pdf/1705.04304.pdf
                    w_out = tf.transpose(
                        math_ops.tanh(
                            linear([embedding] + [tf.transpose(w_out)],
                                   _hps.dec_hidden_dim,
                                   bias=False)))
                score = tf.nn.xw_plus_b(output, w_out, v_out)
                if _hps.scheduled_sampling and not _hps.greedy_scheduled_sampling:
                    # Gumbel reparametrization trick: https://arxiv.org/abs/1704.06970
                    U = tf.random_uniform(
                        score.get_shape(), 10e-12,
                        (1 - 10e-12))  # add a small number to avoid log(0)
                    G = -tf.log(-tf.log(U))
                    score = score + G
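                    # Gumbel-max trick: with U ~ Uniform(0, 1), G = -log(-log(U)) is Gumbel
                    # noise, so the argmax of the perturbed scores is a draw from softmax(score).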
                vocab_scores.append(score)  # apply the linear layer
                vocab_dist = tf.nn.softmax(score)
                vocab_dists.append(
                    vocab_dist
                )  # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

            # For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution
            if _hps.pointer_gen:
                final_dist = _calc_final_dist(_hps, v_size, _max_art_oovs,
                                              _enc_batch_extend_vocab, p_gen,
                                              vocab_dist, attn_dist)
            else:  # final distribution is just vocabulary distribution
                final_dist = vocab_dist
            final_dists.append(final_dist)

            # get the sampled token and greedy token
            # this will take the final_dist and sample from it for a total count of k (k samples)
            one_hot_k_samples = tf.distributions.Multinomial(
                total_count=1., probs=final_dist
            ).sample(
                _hps.k
            )  # sample k times according to https://arxiv.org/pdf/1705.04304.pdf, size (k, batch_size, extended_vsize)
            k_argmax = tf.argmax(one_hot_k_samples,
                                 axis=2,
                                 output_type=tf.int32)  # (k, batch_size)
            k_sample = tf.transpose(k_argmax)  # shape (batch_size, k)
            greedy_search_prob, greedy_search_sample = tf.nn.top_k(
                final_dist, k=_hps.k)  # (batch_size, k)
            greedy_search_samples.append(greedy_search_sample)
            samples.append(k_sample)
            if FLAGS.use_discounted_rewards:
                _sampling_rewards = []
                _greedy_rewards = []
                for _ in range(_hps.k):
                    rl_fscore = tf.reshape(
                        rouge_l_fscore(
                            tf.transpose(tf.stack(samples)[:, :, _]),
                            target_batch), [-1, 1])  # shape (batch_size, 1)
                    _sampling_rewards.append(tf.reshape(rl_fscore, [-1, 1]))
                    rl_fscore = tf.reshape(
                        rouge_l_fscore(
                            tf.transpose(
                                tf.stack(greedy_search_samples)[:, :, _]),
                            target_batch), [-1, 1])  # shape (batch_size, 1)
                    _greedy_rewards.append(tf.reshape(rl_fscore, [-1, 1]))
                sampling_rewards.append(
                    tf.squeeze(tf.stack(_sampling_rewards, axis=1),
                               axis=-1))  # (batch_size, k)
                greedy_rewards.append(
                    tf.squeeze(tf.stack(_greedy_rewards, axis=1),
                               axis=-1))  # (batch_size, k)

        if FLAGS.use_discounted_rewards:
            sampling_rewards = tf.stack(sampling_rewards)
            greedy_rewards = tf.stack(greedy_rewards)
        else:
            _sampling_rewards = []
            _greedy_rewards = []
            for _ in range(_hps.k):
                rl_fscore = rouge_l_fscore(
                    tf.transpose(tf.stack(samples)[:, :, _]),
                    target_batch)  # shape (batch_size, 1)
                _sampling_rewards.append(tf.reshape(rl_fscore, [-1, 1]))
                rl_fscore = rouge_l_fscore(
                    tf.transpose(tf.stack(greedy_search_samples)[:, :, _]),
                    target_batch)  # shape (batch_size, 1)
                _greedy_rewards.append(tf.reshape(rl_fscore, [-1, 1]))
            sampling_rewards = tf.squeeze(tf.stack(_sampling_rewards, axis=1),
                                          axis=-1)  # (batch_size, k)
            greedy_rewards = tf.squeeze(tf.stack(_greedy_rewards, axis=1),
                                        axis=-1)  # (batch_size, k)
        # If using coverage, reshape it
        if coverage is not None:
            coverage = array_ops.reshape(coverage, [batch_size, -1])

    return (outputs, state, attn_dists, p_gens, coverage, vocab_scores,
            final_dists, samples, greedy_search_samples, temporal_e,
            sampling_rewards, greedy_rewards)
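
Several tensors in the decoder above (encoder_features, coverage_features, and _prev_decoder_features) are produced with nn_ops.conv2d using a 1x1 kernel over states reshaped to (batch, steps, 1, channels). The NumPy sketch below, with assumed toy shapes, checks that this "1x1 convolution" is numerically the same as applying the weight matrix to every state independently.

import numpy as np

batch, T, attn_size, attention_vec_size = 2, 5, 4, 3
rng = np.random.default_rng(0)
enc_states = rng.normal(size=(batch, T, attn_size)).astype(np.float32)
W_h = rng.normal(size=(attn_size, attention_vec_size)).astype(np.float32)

# Per-position matmul: apply W_h to each encoder state h_i.
features_matmul = enc_states @ W_h                      # (batch, T, attention_vec_size)

# 1x1 conv view: reshape to (batch, T, 1, attn_size); each 1x1 window sees exactly
# one encoder state, so the kernel only mixes the channel dimension.
reshaped = enc_states.reshape(batch, T, 1, attn_size)
features_conv = np.einsum('bthc,cv->bthv', reshaped, W_h).reshape(batch, T, attention_vec_size)

np.testing.assert_allclose(features_matmul, features_conv, rtol=1e-4, atol=1e-5)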
Esempio n. 36
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=dtypes.float32, scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with variable_scope.variable_scope(scope or "attention_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_%d" % a,
                                      [1, 1, attn_size, attention_vec_size])
      hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(variable_scope.get_variable("AttnV_%d" % a,
                                           [attention_vec_size]))

    state = initial_state

    def attention(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    prev = None
    batch_attn_size = array_ops.pack([batch_size, attn_size])
    attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    if initial_state_attention:
      attns = attention(initial_state)
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i)
      # Merge input and previous attentions into one vector of the right size.
      input_size = inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)
      x = linear([inp] + attns, input_size, True)
      # Run the RNN.
      cell_output, state = cell(x, state)
      # Run the attention mechanism.
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      outputs.append(output)

  return outputs, state
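
The docstring above summarizes each attention read as new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) followed by a weighted sum over attention_states. The following NumPy sketch (independent of the TF graph code, with assumed shapes) works through one head of that formula.

import numpy as np

def additive_attention(hidden, query, W, U, v):
    # hidden: (batch, attn_length, attn_size), query: (batch, query_size)
    scores = np.tanh(hidden @ W + (query @ U)[:, None, :]) @ v   # (batch, attn_length)
    scores = scores - scores.max(axis=1, keepdims=True)          # numerical stability
    attn = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    context = (attn[:, :, None] * hidden).sum(axis=1)            # (batch, attn_size)
    return attn, context

rng = np.random.default_rng(0)
batch, attn_length, attn_size = 2, 6, 8
attn, ctx = additive_attention(rng.normal(size=(batch, attn_length, attn_size)),
                               rng.normal(size=(batch, attn_size)),
                               rng.normal(size=(attn_size, attn_size)),
                               rng.normal(size=(attn_size, attn_size)),
                               rng.normal(size=(attn_size,)))
assert attn.shape == (batch, attn_length) and ctx.shape == (batch, attn_size)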
Esempio n. 37
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s" %
                         attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder",
                                       dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        # TODO
        attention_vec_size = 100  #attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % a,
                                            [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [
            array_ops.zeros(batch_attn_size, dtype=dtype)
            for _ in xrange(num_heads)
        ]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
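
Compared with the previous example, this variant decouples attention_vec_size from attn_size (hard-coded to 100 behind a TODO) and flattens tuple-valued queries such as LSTM states with nest.flatten before the linear projection. A minimal sketch of that flattening, using hypothetical (c, h) arrays:

import numpy as np

# Hypothetical LSTM state (c, h); the decoder flattens such tuples and
# concatenates them along the feature axis to form a single query matrix.
c = np.zeros((2, 4), dtype=np.float32)
h = np.ones((2, 4), dtype=np.float32)
query = np.concatenate([c, h], axis=1)   # shape (batch, 2 * state_size) = (2, 8)

rng = np.random.default_rng(0)
attention_vec_size = 3                   # may differ from attn_size, as in the example
W_q = rng.normal(size=(query.shape[1], attention_vec_size))
y = query @ W_q                          # stands in for linear(query, attention_vec_size, True)
assert y.shape == (2, attention_vec_size)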
Esempio n. 38
        def intra_decoder_attention(decoder_state, outputs):
            """Calculate the context vector and attention distribution from the decoder state.

            Args:
              decoder_state: state of the decoder
              outputs: list of decoder states for implementing intra-decoder mechanism, len(decoder_states) * (batch_size, hidden_dim)
            Returns:
              context_decoder_vector: weighted sum of _dec_states
              decoder_attn_dist: intra-decoder attention distribution
            """
            attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape(
            )[1]  # hidden_dim
            try:
                len_dec_states = outputs.get_shape()[0]
            except:
                len_dec_states = 0
            attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape(
            )[1]  # hidden_dim
            _decoder_states = tf.expand_dims(
                tf.reshape(outputs, [batch_size, -1, attn_dec_size]), axis=2
            )  # now is shape (batch_size,len(decoder_states), 1, attn_size)
            _prev_decoder_features = nn_ops.conv2d(
                _decoder_states, W_h_d, [1, 1, 1, 1], "SAME"
            )  # shape (batch_size,len(decoder_states),1,attention_vec_size)
            with variable_scope.variable_scope("DecoderAttention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                try:
                    decoder_features = linear(
                        decoder_state, attention_dec_vec_size,
                        True)  # shape (batch_size, attention_vec_size)
                    decoder_features = tf.expand_dims(
                        tf.expand_dims(decoder_features, 1), 1
                    )  # reshape to (batch_size, 1, 1, attention_dec_vec_size)
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    if _hps.matrix_attention:
                        # Calculate h_d * W_attn * h_d, equation 6 in https://arxiv.org/pdf/1705.04304.pdf
                        _dec_attn = tf.matmul(
                            tf.squeeze(decoder_features),
                            w_dec_attn)  # (batch_size, decoder_attn_size)
                        _dec_states_lst = tf.unstack(
                            tf.reshape(_prev_decoder_features,
                                       [batch_size, -1, decoder_attn_size])
                        )  # batch_size * (len(decoder_states), decoder_attn_size)
                        e_not_masked = tf.reshape(
                            tf.stack([
                                tf.matmul(_dec_attn, tf.transpose(k))
                                for k in _dec_states_lst
                            ]), [batch_size, -1
                                 ])  # (batch_size, len(decoder_states))
                        masked_e = tf.exp(
                            e_not_masked * dec_padding_mask[:, :len_dec_states]
                        )  # (batch_size, len(decoder_states))
                    else:
                        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                        e_not_masked = math_ops.reduce_sum(
                            v_d * math_ops.tanh(_prev_decoder_features +
                                                decoder_features),
                            [
                                2, 3
                            ])  # calculate e, (batch_size,len(decoder_states))
                        masked_e = nn_ops.softmax(
                            e_not_masked
                        ) * dec_padding_mask[:, :
                                             len_dec_states]  # (batch_size,len(decoder_states))
                    if len_dec_states <= 1:
                        masked_e = array_ops.ones(
                            [batch_size,
                             1])  # first step is filled with equal values
                    masked_sums = tf.reshape(
                        tf.reduce_sum(masked_e, axis=1), [-1, 1]
                    )  # (batch_size,1), # if it's zero due to masking we set it to a small value
                    decoder_attn_dist = masked_e / masked_sums  # (batch_size,len(decoder_states))
                    context_decoder_vector = math_ops.reduce_sum(
                        array_ops.reshape(decoder_attn_dist,
                                          [batch_size, -1, 1, 1]) *
                        _decoder_states, [1, 2])  # (batch_size, attn_size)
                    context_decoder_vector = array_ops.reshape(
                        context_decoder_vector,
                        [-1, attn_dec_size])  # (batch_size, attn_size)
                except:
                    return array_ops.zeros(
                        [batch_size,
                         decoder_attn_size]), array_ops.zeros([batch_size, 0])
            return context_decoder_vector, decoder_attn_dist
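
The intra-decoder attention above scores the current decoder state against all previous decoder states (in the matrix_attention branch, via the bilinear form of equation 6 in https://arxiv.org/pdf/1705.04304.pdf) and returns a weighted sum of those states. The NumPy sketch below, with assumed shapes, condenses the unstack/stack loop into a single einsum.

import numpy as np

batch, prev_steps, dim = 2, 4, 6
rng = np.random.default_rng(1)
dec_state = rng.normal(size=(batch, dim))               # current decoder state
prev_states = rng.normal(size=(batch, prev_steps, dim)) # earlier decoder states
W_dec = rng.normal(size=(dim, dim))                     # stands in for w_dec_attn

# Bilinear scores e[t] = h_cur^T W_dec h_t for every previous step.
e = np.einsum('bd,de,bte->bt', dec_state, W_dec, prev_states)   # (batch, prev_steps)
attn = np.exp(e - e.max(axis=1, keepdims=True))
attn /= attn.sum(axis=1, keepdims=True)                          # intra-decoder distribution
context = np.einsum('bt,btd->bd', attn, prev_states)             # weighted sum of states
assert context.shape == (batch, dim)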
Esempio n. 39
def attention_decoder(decoder_inputs,
                      initial_for_state,
                      initial_bac_state,
                      attention_states,
                      for_cell,
                      bac_cell,
                      maxout_size,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      embed_function=None,
                      dtype=None,
                      scope=None,
                      embedding_size=620,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.
  """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s" %
                         attention_states.get_shape())
    if output_size is None:
        output_size = for_cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder",
                                       dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        # This is the number of encoders
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        # This is the output dimension of each encoder
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        # hidden is just the attention_states reshaped to 4 dimensions
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        # hidden_features = []
        # v = []
        # Divide by two because attention_vec_size consists of both forward
        # and backward encoder
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.
        # for a in range(num_heads):
        k = variable_scope.get_variable("AttnW_0",
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        # v = variable_scope.get_variable("AttnV_0", [attention_vec_size])

        # Set the end state of the reverse encoder as the initial state of decoder
        # Create a two layer RELU Feedforward network.
        # Uncomment below to not use FW
        # state = initial_bac_state
        # state = [None] * len(initial_state)
        state_size = initial_bac_state[0].get_shape()[1].value

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            # ds = []  # Results of attention reads will be stored here.
            # if nest.is_sequence(query):  # If the query is a tuple, flatten it.
            #     query_list = nest.flatten(query)
            #     for q in query_list:  # Check that ndims == 2 if specified.
            #         ndims = q.get_shape().ndims
            #         if ndims:
            #             assert ndims == 2
            #     query = array_ops.concat_v2(query_list, 1)
            # for a in range(num_heads):
            with variable_scope.variable_scope("Attention_0"):
                # y = linear(query, attention_vec_size, True)
                y = array_ops.reshape(query, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(tf.multiply(y, hidden_features),
                                        [2, 3])
                # s = math_ops.reduce_sum(v * math_ops.tanh(y*hidden_features),
                #                       [2, 3])
                a = nn_ops.softmax(s)
                # Now calculate the attention-weighted vector d.
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                    [1, 2])
                ds = array_ops.reshape(d, [-1, attn_size])
            return ds, a

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        # attns = [
        #     array_ops.zeros(
        #         batch_attn_size, dtype=dtype) for _ in range(num_heads)
        # ]

        # state = initial_bac_state
        hidState = [None] * len(initial_for_state)
        state = [None] * len(initial_for_state)

        for i in range(len(initial_for_state)):
            with variable_scope.variable_scope("F_init_for_%d" % i):
                hidState[i] = tf.nn.relu(linear(initial_for_state[i],
                                                state_size,
                                                True,
                                                scope="Linear0"),
                                         name="relu0")
                state[i] = tf.nn.relu(linear(hidState[i],
                                             state_size,
                                             True,
                                             scope="Linear1"),
                                      name="relu1")
                # state[i] = tf.nn.relu(linear(y, state_size, True, scope="Linear1"), name="relu1")
        # state = for_cell.zero_state(batch_size, dtype)

        # for a in attns:  # Ensure the second shape of attention vectors is set.
        #     a.set_shape([None, attn_size])
        # For first attention, use the input hidden state from backward decoder
        cell_output = state[0]

        contexts = []
        for_output = []
        collect_attn = []
        with variable_scope.variable_scope("Decoder_For"):
            for i, inp in enumerate(decoder_inputs):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                # If loop_function is set, we use it instead of decoder_inputs.
                # if loop_function is not None and prev is not None:
                #     with variable_scope.variable_scope("loop_function", reuse=True):
                #         inp = loop_function(prev, i)
                # Merge input and previous attentions into one vector of the right size.
                input_size = inp.get_shape().with_rank(2)[1]
                # if input_size.value is None:
                #     raise ValueError("Could not infer input size from input: %s" % inp.name)
                # Below should almost always be false

                # x = linear([inp] + attns, input_size, True)
                # Run the RNN.
                context, a = attention(cell_output)
                contexts.append(context)
                collect_attn.append(a)

                inp_concat = tf.concat([inp, context], 1)
                cell_output, state = for_cell(inp_concat, state)
                for_output.append(cell_output)

                # Run the attention mechanism.
                # with variable_scope.variable_scope("AttnOutputProjection"):
                #     # attns is a list of heads. Here I just have one though
                #     output = linear([cell_output] + attns, maxout_size, True)
                # output is t
                # output = maxout(t_tilda, maxout_size)
                # output = linear(t, output_size, True)

                # if loop_function is not None:
                #     prev = output
                # outputs.append(output)

        # Initialise the backward decoder state from initial_bac_state via two
        # relu-activated linear layers, mirroring the forward initialisation.
        hidState = [None] * len(initial_bac_state)
        state = [None] * len(initial_bac_state)
        for i in range(len(initial_bac_state)):
            with variable_scope.variable_scope("F_init_bac_%d" % i):
                hidState[i] = tf.nn.relu(linear(initial_bac_state[i],
                                                state_size,
                                                True,
                                                scope="Linear0"),
                                         name="relu0")
                state[i] = tf.nn.relu(linear(hidState[i],
                                             state_size,
                                             True,
                                             scope="Linear1"),
                                      name="relu1")
                # state[i] = tf.nn.relu(linear(y, state_size, True, scope="Linear1"), name="relu1")
        # state = bac_cell.zero_state(batch_size, dtype)
        bac_output = []
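        # Backward decoder: run right-to-left over decoder_inputs[2:], reusing
        # the attention contexts collected during the forward pass.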
        with variable_scope.variable_scope("Decoder_Back"):
            # for i, (inp, out) in enumerate(zip(reversed(input_attn[2:]), reversed(output_attn[:-2]))):
            for i, (inp, context) in enumerate(
                    zip(reversed(decoder_inputs[2:]),
                        reversed(contexts[:-2]))):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                inp_concat = tf.concat([inp, context], 1)
                cell_output, state = bac_cell(inp_concat, state)
                bac_output.insert(0, cell_output)

        q_vec = []
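        # Output projection: for each position, combine the forward state, the
        # backward state, the two neighbouring decoder inputs and the attention
        # context through a maxout layer, then project down to embedding_size.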
        with variable_scope.variable_scope("OutputProjection"):
            for i, (for_out, bac_out, context) in enumerate(
                    zip(for_output, bac_output, contexts)):
                # for i, (inp, out) in enumerate(zip(reversed(input_attn[2:]), reversed(output_attn[:-2]))):
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()
                t_tilda = linear(
                    tf.concat([
                        for_out, bac_out, decoder_inputs[i],
                        decoder_inputs[i + 2], context
                    ], 1),
                    2 * maxout_size,
                    True,
                    scope="sj")
                # temp2 = linear(tf.concat([decoder_inputs[i], decoder_inputs[i+2]]), 2*maxout_size, True, scope="ey")
                # temp3 = linear(context, 2*maxout_size, True, scope="ctxt")
                # t_tilda = temp + temp2 + temp3
                t_output = maxout(t_tilda, maxout_size)
                output = linear(t_output,
                                embedding_size,
                                True,
                                scope="t_tilda2")
                q_vec.append(output)
        # q_vec = q_vec[::-1]
        # print('collect_attn', len(collect_attn))
        return q_vec, state, collect_attn
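
A minimal NumPy sketch (not part of the original code) of the arithmetic inside the attention() helper above: a dot-product score against the projected encoder states, a softmax over positions, and a weighted sum that yields the context vector. All shapes and inputs below are made-up placeholders:

import numpy as np

batch, attn_length, attn_size = 2, 5, 4
attention_vec_size = attn_size

# Stand-ins for the decoder query, the projected encoder states
# (hidden_features) and the raw encoder states (hidden) used above.
query = np.random.randn(batch, attention_vec_size)
hidden_features = np.random.randn(batch, attn_length, 1, attention_vec_size)
hidden = np.random.randn(batch, attn_length, 1, attn_size)

y = query.reshape(-1, 1, 1, attention_vec_size)
s = np.sum(y * hidden_features, axis=(2, 3))      # scores: [batch, attn_length]
a = np.exp(s - s.max(axis=1, keepdims=True))
a /= a.sum(axis=1, keepdims=True)                 # softmax over positions
d = np.sum(a.reshape(-1, attn_length, 1, 1) * hidden, axis=(1, 2))
ds = d.reshape(-1, attn_size)                     # context vector per example
assert ds.shape == (batch, attn_size)
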
    def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, bias,
                              strides, padding, activation_mode, data_format,
                              filter_format, dtype):
        """Verifies the output values of the convolution function.

    Args:
      tensor_in_sizes: Input tensor dimensions in
        [batch, input_rows, input_cols, input_depth].
      filter_in_sizes: Filter tensor dimensions in
        [kernel_rows, kernel_cols, input_depth, output_depth].
      bias: 1-D bias tensor of length output_depth.
      strides: Stride: [col_stride, row_stride]
      padding: Padding type.
      activation_mode: Activation mode.
      data_format: Format of the data tensors.
      filter_format: Filter format to use for the fused convolution.
      dtype: Data type for inputs and outputs.
    Returns:
      Symbolic tensor value and reference value that can be used to
      execute the computation and verify the results.
    """
        input_size = np.prod(tensor_in_sizes)
        filter_size = np.prod(filter_in_sizes)
        bias_size = filter_in_sizes[-1]  # equal to the output depth
        # Initializes the input tensor with array containing incrementing
        # numbers from 1.
        x1 = [f * 1.0 for f in range(1, input_size + 1)]
        x2 = [f * 1.0 for f in range(1, filter_size + 1)]
        # The caller is expected to pass a bias that yields negative values
        # after the bias add, so that the relu activation is exercised.
        x3 = bias
        t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
        t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
        fused_t2 = t2
        if filter_format == "OIHW":
            fused_t2 = _HwioToOihw(t2)
        t3 = constant_op.constant(x3, shape=[bias_size], dtype=dtype)
        strides = [1] + strides + [1]
        if data_format == "NCHW":
            t1 = test_util.NHWCToNCHW(t1)
            strides = test_util.NHWCToNCHW(strides)
        output = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
            t1,
            fused_t2,
            t3,
            strides=strides,
            padding=padding,
            data_format=data_format,
            filter_format=filter_format,
            activation_mode=activation_mode)
        ref_conv_output = nn_ops.conv2d(t1,
                                        t2,
                                        strides=strides,
                                        padding=padding,
                                        data_format=data_format)
        ref_bias_output = nn_ops.bias_add(ref_conv_output,
                                          t3,
                                          data_format=data_format)
        ref_output = nn_ops.relu(ref_bias_output)
        if data_format == "NCHW":
            output = test_util.NCHWToNHWC(output)
            ref_output = test_util.NCHWToNHWC(ref_output)

        return output, ref_output
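
The helper above only builds the fused op and the unfused reference graph; a test still has to run both and compare them. A hedged sketch of such a comparison, with made-up shapes, bias values and tolerances (none of these numbers are taken from the original test suite, and the "Relu"/"NHWC"/"HWIO" strings are the usual defaults, assumed here):

with self.cached_session() as sess:
    output, ref_output = self._SetupValuesForDevice(
        tensor_in_sizes=[1, 4, 4, 3],
        filter_in_sizes=[2, 2, 3, 8],
        bias=[-100.0] * 8,  # negative bias, as the helper's comment expects
        strides=[1, 1],
        padding="SAME",
        activation_mode="Relu",
        data_format="NHWC",
        filter_format="HWIO",
        dtype=dtypes.float32)
    fused_val, ref_val = sess.run([output, ref_output])
    self.assertAllClose(fused_val, ref_val, rtol=1e-5, atol=1e-5)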