def testTransformerAutoencoder(self):
    hparams = imagetransformer_latent_tiny()
    hparams.mode = tf.estimator.ModeKeys.TRAIN
    block_dim = int(hparams.hidden_size // hparams.num_blocks)
    block_v_size = 2**(hparams.bottleneck_bits /
                       (hparams.num_residuals * hparams.num_blocks))
    block_v_size = int(block_v_size)
    means = tf.get_variable(
        name="means",
        shape=[hparams.num_residuals,
               hparams.num_blocks,
               block_v_size,
               block_dim],
        initializer=tf.uniform_unit_scaling_initializer())
    hparams.bottleneck = functools.partial(
        discretization.discrete_bottleneck,
        hidden_size=hparams.hidden_size,
        z_size=hparams.bottleneck_bits,
        filter_size=hparams.filter_size,
        startup_steps=hparams.startup_steps,
        bottleneck_kind=hparams.bottleneck_kind,
        num_blocks=hparams.num_blocks,
        num_residuals=hparams.num_residuals,
        reshape_method=hparams.reshape_method,
        beta=hparams.vq_beta,
        decay=hparams.vq_decay,
        soft_em=hparams.soft_em,
        num_samples=hparams.num_samples,
        epsilon=hparams.vq_epsilon,
        ema=hparams.ema,
        means=means)

    inputs = None
    batch_size = hparams.batch_size
    targets = tf.random_uniform([batch_size,
                                 hparams.img_len,
                                 hparams.img_len,
                                 hparams.hidden_size],
                                minval=-1., maxval=1.)
    target_space_id = None

    tf.train.create_global_step()
    decoder_output, losses, cache = latent_layers.transformer_autoencoder(
        inputs, targets, target_space_id, hparams)

    self.assertEqual(set(losses), {"extra", "extra_loss", "latent_pred"})

    self.evaluate(tf.global_variables_initializer())
    decoder_output_, extra_loss_, latent_pred_ = self.evaluate(
        [decoder_output, losses["extra_loss"], losses["latent_pred"]])
    self.assertEqual(decoder_output_.shape, (batch_size,
                                             hparams.img_len,
                                             hparams.img_len,
                                             hparams.hidden_size))
    self.assertEqual(extra_loss_.shape, (batch_size,))
    self.assertEqual(latent_pred_.shape, (batch_size,))
    self.assertAllGreaterEqual(extra_loss_, 0.)
    self.assertAllGreaterEqual(latent_pred_, 0.)
    self.assertEqual(cache, None)
Example No. 2
def conv1d(x,
           num_filters,
           filter_length,
           name,
           dilation=1,
           causal=True,
           kernel_initializer=tf.uniform_unit_scaling_initializer(1.0),
           biases_initializer=tf.constant_initializer(0.0),
           is_training=True):
    """Fast 1D convolution that supports causal padding and dilation.

  Args:
    x: The [mb, time, channels] float tensor that we convolve.
    num_filters: The number of filter maps in the convolution.
    filter_length: The integer length of the filter.
    name: The name of the scope for the variables.
    dilation: The amount of dilation.
    causal: Whether or not this is a causal convolution.
    kernel_initializer: The kernel initialization function.
    biases_initializer: The biases initialization function.
    is_training: Whether or not to use trainable variables.

  Returns:
    y: The output of the 1D convolution.
  """
    batch_size, length, num_input_channels = x.get_shape().as_list()
    assert length % dilation == 0

    kernel_shape = [1, filter_length, num_input_channels, num_filters]
    strides = [1, 1, 1, 1]
    biases_shape = [num_filters]
    padding = 'VALID' if causal else 'SAME'

    with tf.variable_scope(name):
        weights = tf.get_variable('W',
                                  shape=kernel_shape,
                                  initializer=kernel_initializer,
                                  trainable=is_training)
        biases = tf.get_variable('biases',
                                 shape=biases_shape,
                                 initializer=biases_initializer,
                                 trainable=is_training)

    x_ttb = time_to_batch(x, dilation)
    if filter_length > 1 and causal:
        x_ttb = tf.pad(x_ttb, [[0, 0], [filter_length - 1, 0], [0, 0]])

    x_ttb_shape = x_ttb.get_shape().as_list()
    x_4d = tf.reshape(x_ttb,
                      [x_ttb_shape[0], 1, x_ttb_shape[1], num_input_channels])
    y = tf.nn.conv2d(x_4d, weights, strides, padding=padding)
    y = tf.nn.bias_add(y, biases)
    y_shape = y.get_shape().as_list()
    y = tf.reshape(y, [y_shape[0], y_shape[2], num_filters])
    y = batch_to_time(y, dilation)
    y.set_shape([batch_size, length, num_filters])
    return y
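A minimal usage sketch for the helper above (hypothetical sizes, assuming the usual `import tensorflow as tf` under TF 1.x and that the `time_to_batch`/`batch_to_time` helpers referenced inside `conv1d` are in scope):

x = tf.random_uniform([8, 64, 32])    # [batch, time, channels]
y = conv1d(x,
           num_filters=128,
           filter_length=3,
           name='causal_conv',
           dilation=2,                # time (64) must be divisible by dilation
           causal=True)
# y has static shape [8, 64, 128]; causal padding keeps the time length intact.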
Example No. 3
def _fully_connected(self, x, out_dim):
    """FullyConnected layer for final output."""
    num_non_batch_dimensions = len(x.shape)
    prod_non_batch_dimensions = 1
    for ii in range(num_non_batch_dimensions - 1):
        prod_non_batch_dimensions *= int(x.shape[ii + 1])
    x = tf.reshape(x, [tf.shape(x)[0], -1])
    w = tf.get_variable(
        'DW', [prod_non_batch_dimensions, out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    b = tf.get_variable('biases', [out_dim],
                        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, w, b)
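As a quick, hypothetical illustration of the flattening logic: for an activation of shape [batch, 8, 8, 64], the product of the non-batch dimensions is 8 * 8 * 64 = 4096, so 'DW' is created with shape [4096, out_dim]:

# Hypothetical call from inside the enclosing model class.
logits = self._fully_connected(x, out_dim=10)   # [batch, 8, 8, 64] -> [batch, 10]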
Example No. 4
def init_vq_bottleneck(bottleneck_size, hidden_size):
    """Get lookup table for VQ bottleneck."""
    means = tf.get_variable(name="means",
                            shape=[bottleneck_size, hidden_size],
                            initializer=tf.uniform_unit_scaling_initializer())
    ema_count = tf.get_variable(name="ema_count",
                                shape=[bottleneck_size],
                                initializer=tf.constant_initializer(0),
                                trainable=False)
    with tf.colocate_with(means):
        ema_means = tf.get_variable(name="ema_means",
                                    initializer=means.initialized_value(),
                                    trainable=False)

    return means, ema_means, ema_count
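The snippet above only builds the tables; as a hedged sketch (not part of the original code), a standard VQ-VAE nearest-codebook assignment against `means` would look roughly like this:

def vq_nearest_neighbor(x, means):
    """Assign each row of x to its closest codebook entry (sketch)."""
    # x: [batch, hidden_size], means: [bottleneck_size, hidden_size]
    x_sq = tf.reduce_sum(tf.square(x), axis=-1, keepdims=True)
    means_sq = tf.reduce_sum(tf.square(means), axis=-1)
    dist = x_sq - 2 * tf.matmul(x, means, transpose_b=True) + means_sq
    codes = tf.argmin(dist, axis=-1)                 # [batch]
    return tf.nn.embedding_lookup(means, codes), codes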
Example No. 5
    def __init__(self,
                 embedding_dim,
                 num_embeddings,
                 commitment_cost,
                 name='vq_layer'):
        super(VectorQuantizer, self).__init__(name=name)
        self._embedding_dim = embedding_dim
        self._num_embeddings = num_embeddings
        self._commitment_cost = commitment_cost

        with self._enter_variable_scope():
            initializer = tf.uniform_unit_scaling_initializer()
            self._w = tf.get_variable('embedding',
                                      [embedding_dim, num_embeddings],
                                      initializer=initializer,
                                      trainable=True)
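For orientation, a minimal construction sketch with hypothetical sizes; note that the embedding variable is stored as [embedding_dim, num_embeddings], i.e. one code per column:

vq = VectorQuantizer(embedding_dim=64,
                     num_embeddings=512,
                     commitment_cost=0.25)
# vq._w is created under the 'vq_layer' variable scope with shape [64, 512].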
Example No. 6
def uniform_scaling(shape=None, factor=1.0, dtype=tf.float32, seed=None):
    """ Uniform Scaling.

    Initialization with random values from uniform distribution without scaling
    variance.

    When initializing a deep network, it is in principle advantageous to keep
    the scale of the input variance constant, so it does not explode or diminish
    by reaching the final layer. If the input is `x` and the operation `x * W`,
    and we want to initialize `W` uniformly at random, we need to pick `W` from

      [-sqrt(3) / sqrt(dim), sqrt(3) / sqrt(dim)]

    to keep the scale intact, where `dim = W.shape[0]` (the size of the input).
    A similar calculation for convolutional networks gives an analogous result
    with `dim` equal to the product of the first 3 dimensions.  When
    nonlinearities are present, we need to multiply this by a constant `factor`.
    See [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
    ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) for deeper motivation, experiments
    and the calculation of constants. In section 2.3 there, the constants were
    numerically computed: for a linear layer it's 1.0, relu: ~1.43, tanh: ~1.15.

    Arguments:
        shape: List of `int`. A shape to initialize a Tensor (optional).
        factor: `float`. A multiplicative factor by which the values will be
            scaled.
        dtype: The tensor data type. Only float types are supported.
        seed: `int`. Used to create a random seed for the distribution.

    Returns:
        The Initializer, or an initialized `Tensor` if shape is specified.

    """
    if shape:
        input_size = 1.0
        for dim in shape[:-1]:
            input_size *= float(dim)
        max_val = math.sqrt(3 / input_size) * factor
        return tf.random_uniform(shape, -max_val, max_val, dtype, seed=seed)
    else:
        return tf.uniform_unit_scaling_initializer(seed=seed, dtype=dtype)
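A small worked check of the bound described in the docstring (a sketch assuming TF 1.x session execution and `import math`): for shape [256, 128] and the ReLU factor of roughly 1.43, every weight should lie within +/- sqrt(3/256) * 1.43 ~ +/- 0.155.

shape = [256, 128]                              # dim = 256 (the input size)
factor = 1.43                                   # ReLU constant from Sussillo et al.
max_val = math.sqrt(3.0 / shape[0]) * factor    # ~0.155
w = uniform_scaling(shape=shape, factor=factor)
with tf.Session() as sess:
    assert abs(sess.run(w)).max() <= max_val + 1e-6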
Example No. 7
    def __init__(self, *args, **kwargs):
        super(TransformerAE, self).__init__(*args, **kwargs)
        self.predict_mask = 1.0

        # Define bottleneck function
        self._hparams.bottleneck = functools.partial(
            discretization.discrete_bottleneck,
            hidden_size=self._hparams.hidden_size,
            z_size=self._hparams.z_size,
            filter_size=self._hparams.filter_size,
            bottleneck_kind=self._hparams.bottleneck_kind,
            num_blocks=self._hparams.num_blocks,
            num_residuals=self.hparams.num_residuals,
            reshape_method=self._hparams.reshape_method,
            beta=self._hparams.beta,
            ema=self._hparams.ema,
            epsilon=self._hparams.epsilon,
            decay=self._hparams.decay,
            random_top_k=self._hparams.random_top_k,
            soft_em=self.hparams.soft_em,
            num_samples=self.hparams.num_samples,
            softmax_k=self._hparams.softmax_k,
            temperature_warmup_steps=self._hparams.temperature_warmup_steps,
            do_hard_gumbel_softmax=self._hparams.do_hard_gumbel_softmax,
            num_flows=self._hparams.num_flows,
            approximate_gs_entropy=self._hparams.approximate_gs_entropy,
            discrete_mix=self._hparams.d_mix,
            noise_dev=self._hparams.noise_dev,
            startup_steps=self.hparams.startup_steps,
            summary=_DO_SUMMARIES)
        # Set the discretization bottleneck specific things here
        if self._hparams.bottleneck_kind in ["dvq", "gumbel-softmax-dvq"]:
            z_size_per_residual = self._hparams.z_size / self._hparams.num_residuals
            block_dim = int(self._hparams.hidden_size //
                            self._hparams.num_blocks)
            block_v_size = 2**(z_size_per_residual / self._hparams.num_blocks)
            block_v_size = int(block_v_size)

            if self._hparams.reshape_method == "project":
                tf.logging.info("Using projections for DVQ")
                tf.logging.info("Trainable projections = {}".format(
                    self._hparams.trainable_projections))

                projection_tensors = tf.get_variable(
                    name="projection",
                    shape=[
                        self._hparams.num_residuals, self._hparams.num_blocks,
                        self._hparams.hidden_size, block_dim
                    ],
                    initializer=tf.initializers.glorot_uniform(),
                    trainable=self._hparams.trainable_projections)

                self._hparams.bottleneck = functools.partial(
                    self._hparams.bottleneck,
                    projection_tensors=projection_tensors)
            elif self._hparams.reshape_method == "slice":
                tf.logging.info("Using slices for DVQ")
            else:
                raise ValueError("Unknown reshape method")

            means = tf.get_variable(
                name="means",
                shape=[
                    self._hparams.num_residuals, self._hparams.num_blocks,
                    block_v_size, block_dim
                ],
                initializer=tf.uniform_unit_scaling_initializer())

            # Create the shadow variables if we are using EMA
            ema_count = None
            ema_means = None
            if self._hparams.ema:
                ema_count = []
                for i in range(self._hparams.num_residuals):
                    ema_count_i = tf.get_variable(
                        "ema_count_{}".format(i),
                        [self._hparams.num_blocks, block_v_size],
                        initializer=tf.constant_initializer(0),
                        trainable=False)
                    ema_count.append(ema_count_i)
                with tf.colocate_with(means):
                    ema_means = []
                    for i in range(self._hparams.num_residuals):
                        ema_means_i = tf.get_variable(
                            "ema_means_{}".format(i),
                            [
                                self._hparams.num_blocks, block_v_size,
                                block_dim
                            ],
                            initializer=(
                                lambda shape, dtype=None, partition_info=None,  # pylint: disable=g-long-lambda
                                verify_shape=None: means.initialized_value()[i]
                            ),
                            trainable=False)
                        ema_means.append(ema_means_i)

            # Update bottleneck
            self._hparams.bottleneck = functools.partial(
                self._hparams.bottleneck,
                means=means,
                ema_count=ema_count,
                ema_means=ema_means)
Example No. 8
def encoder(features, mode, vocab, hps):
  """Model function.

  Attention seq2seq model, augmented with an encoder
  over the targets of the nearest neighbors.

  Args:
    features: Dictionary of input Tensors.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    vocab: A list of strings of words in the vocabulary.
    hps: Hyperparams.

  Returns:
    Encoder outputs.
  """

  # [batch_size, src_len]
  src_inputs = features["src_inputs"]
  src_len = features["src_len"]

  with tf.variable_scope("embeddings"):
    embeddings = tf.get_variable(
        "embeddings",
        [vocab.size(), hps.emb_dim],
        dtype=tf.float32,
        initializer=tf.uniform_unit_scaling_initializer())

  # [batch_size, src_len, emb_dim]
  src_encoder_input_emb = tf.nn.embedding_lookup(embeddings, src_inputs)
  if mode == tf.estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
    src_encoder_input_emb = tf.nn.dropout(
        src_encoder_input_emb, keep_prob=1.0-hps.emb_drop)

  src_att_context, neighbor_att_context = None, None
  src_copy_context, neighbor_copy_context = None, None
  with tf.variable_scope("src_encoder"):

    # 2 * [batch_size, src_len, encoder_dim]
    src_encoder_outputs, src_encoder_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=get_rnn_cell(
            mode=mode, hps=hps,
            input_dim=hps.emb_dim,
            num_units=hps.encoder_dim,
            num_layers=hps.num_encoder_layers,
            dropout=hps.encoder_drop,
            cell_type=hps.rnn_cell),
        cell_bw=get_rnn_cell(
            mode=mode, hps=hps,
            input_dim=hps.emb_dim,
            num_units=hps.encoder_dim,
            num_layers=hps.num_encoder_layers,
            dropout=hps.encoder_drop,
            cell_type=hps.rnn_cell),
        inputs=src_encoder_input_emb,
        dtype=tf.float32,
        sequence_length=src_len)

    # [batch_size, src_len, 2*encoder_dim]
    src_encoder_outputs = tf.concat(src_encoder_outputs, 2)
    with tf.variable_scope("src_att_context"):
      src_att_context = _build_context(
          hps=hps,
          encoder_outputs=src_encoder_outputs)
    if hps.use_copy:
      with tf.variable_scope("src_copy_context"):
        src_copy_context = _build_context(
            hps=hps,
            encoder_outputs=src_encoder_outputs)

  if hps.model == "nn2seq":

    # [batch_size, neighbor_len]
    neighbor_inputs = features["neighbor_inputs"]
    neighbor_len = features["neighbor_len"]

    # [batch_size, neighbor_len, emb_dim]
    neighbor_input_emb = tf.nn.embedding_lookup(
        embeddings, neighbor_inputs)
    if mode == tf.estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
      neighbor_input_emb = tf.nn.dropout(
          neighbor_input_emb, keep_prob=1.0-hps.emb_drop)
    with tf.variable_scope("neighbor_encoder"):
      # 2 * [batch_size, neighbor_len, encoder_dim]
      neighbor_encoder_outputs, _ = \
          tf.nn.bidirectional_dynamic_rnn(
              cell_fw=get_rnn_cell(
                  mode=mode, hps=hps,
                  input_dim=hps.emb_dim,
                  num_units=hps.encoder_dim,
                  num_layers=1,
                  dropout=hps.encoder_drop,
                  cell_type=hps.rnn_cell),
              cell_bw=get_rnn_cell(
                  mode=mode, hps=hps,
                  input_dim=hps.emb_dim,
                  num_units=hps.encoder_dim,
                  num_layers=1,
                  dropout=hps.encoder_drop,
                  cell_type=hps.rnn_cell),
              inputs=neighbor_input_emb,
              dtype=tf.float32,
              sequence_length=neighbor_len)

      neighbor_encoder_outputs = tf.concat(neighbor_encoder_outputs, 2)
      with tf.variable_scope("neighbor_att_context"):
        neighbor_att_context = _build_context(
            hps=hps,
            encoder_outputs=neighbor_encoder_outputs)
      if hps.use_copy:
        with tf.variable_scope("neighbor_copy_context"):
          neighbor_copy_context = _build_context(
              hps=hps,
              encoder_outputs=neighbor_encoder_outputs)
  att_context, copy_context = None, None
  if hps.model == "nn2seq":
    att_context = tf.concat([src_att_context, neighbor_att_context], 1)
    if hps.use_copy:
      copy_context = tf.concat(
          [src_copy_context, neighbor_copy_context], 1)
  elif hps.model == "seq2seq":
    att_context = src_att_context
    if hps.use_copy:
      copy_context = src_copy_context
  else:
    assert False, "hps.model must be either `nn2seq` or `seq2seq`."
  if hps.use_bridge:
    with tf.variable_scope("bridge"):
      out_dim = hps.num_decoder_layers * hps.decoder_dim
      if hps.rnn_cell == "lstm":
        fw_states, bw_states = src_encoder_states
        hs = tf.concat([fw_states[-1].h, bw_states[-1].h], axis=1)
        cs = tf.concat([fw_states[-1].c, bw_states[-1].c], axis=1)

        h_state = tf.layers.dense(
            hs, units=out_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="h_layer")
        c_state = tf.layers.dense(
            cs, units=out_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="c_layer")
      elif hps.rnn_cell == "gru":
        fw_states, bw_states = src_encoder_states
        hs = tf.concat([fw_states[-1], bw_states[-1]], axis=1)
        h_state = tf.layers.dense(
            hs, units=out_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="h_layer")
        c_state = None
  else:
    h_state, c_state = None, None

  return EncoderOutputs(
      embeddings=embeddings,
      att_context=att_context,
      copy_context=copy_context,
      states=(h_state, c_state)
  )
Example No. 9
def lenet(use_pretrained=False):  # modified from the LeNet model

    if not use_pretrained:
        # Random initialization
        weights = {
            'conv1':
            tf.get_variable('LN_conv1_w', [5, 5, 3, 64],
                            initializer=tf.uniform_unit_scaling_initializer()),
            'conv2':
            tf.get_variable('LN_conv2_w', [5, 5, 64, 128],
                            initializer=tf.uniform_unit_scaling_initializer()),
            'ip1':
            tf.get_variable('LN_ip1_w', [5 * 5 * 128, 1024],
                            initializer=tf.uniform_unit_scaling_initializer()),
            'ip2':
            tf.get_variable('LN_ip2_w', [1024, 10],
                            initializer=tf.uniform_unit_scaling_initializer())
        }

        biases = {
            'conv1':
            tf.Variable(tf.random_normal(shape=[64], stddev=0.5),
                        name='LN_conv1_b'),
            'conv2':
            tf.Variable(tf.random_normal(shape=[128], stddev=0.5),
                        name='LN_conv2_b'),
            'ip1':
            tf.Variable(tf.random_normal(shape=[1024], stddev=0.5),
                        name='LN_ip1_b'),
            'ip2':
            tf.Variable(tf.random_normal(shape=[10], stddev=0.5),
                        name='LN_ip2_b')
        }
    else:
        # Initialize from pre-trained weights
        npyfile = np.load('student.npy')
        npyfile = npyfile.item()
        weights = {
            'conv1': tf.Variable(npyfile['conv1']['weights'],
                                 name='LN_conv1_w'),
            'conv2': tf.Variable(npyfile['conv2']['weights'],
                                 name='LN_conv2_w'),
            'ip1': tf.Variable(npyfile['ip1']['weights'], name='LN_ip1_w'),
            'ip2': tf.Variable(npyfile['ip2']['weights'], name='LN_ip2_w'),
        }

        biases = {
            'conv1': tf.Variable(npyfile['conv1']['biases'],
                                 name='LN_conv1_b'),
            'conv2': tf.Variable(npyfile['conv2']['biases'],
                                 name='LN_conv2_b'),
            'ip1': tf.Variable(npyfile['ip1']['biases'], name='LN_ip1_b'),
            'ip2': tf.Variable(npyfile['ip2']['biases'], name='LN_ip2_b'),
        }

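    # NOTE: `x` (the input tensor) and the `conv`/`maxpool2d` helpers used
    # below are defined elsewhere in the original script, not in this snippet.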
    conv1 = conv(x, weights['conv1'], biases['conv1'], padding='VALID')
    pool1 = maxpool2d(conv1, k=2, s=2)
    conv2 = conv(pool1, weights['conv2'], biases['conv2'], padding='VALID')
    pool2 = maxpool2d(conv2, k=2, s=2, padding='VALID')

    ip1 = tf.reshape(pool2, [-1, weights['ip1'].get_shape().as_list()[0]])
    ip1 = tf.add(tf.matmul(ip1, weights['ip1']), biases['ip1'])
    ip1_relu = tf.nn.relu(ip1)
    ip2 = tf.add(tf.matmul(ip1_relu, weights['ip2']), biases['ip2'])
    return ip2
Example No. 10
    def __init__(self,
                 num_units,
                 mem_input,
                 use_peepholes=False,
                 cell_clip=None,
                 initializer=None,
                 num_proj=None,
                 proj_clip=None,
                 num_unit_shards=None,
                 num_proj_shards=None,
                 forget_bias=1.0,
                 state_is_tuple=True,
                 activation=None,
                 reuse=None,
                 name=None,
                 dtype=None,
                 use_beam=False,
                 hps=None):
        """Initialize the HyperLSTM cell.

    Args:
      num_units: int, The number of units in the LSTM cell.
      mem_input: mem_input.
      use_peepholes: bool, use peephole connections or not.
      cell_clip: (optional) A float value, if provided the cell state is clipped
        by this value prior to the cell output activation.
      initializer: (optional) The initializer to use for the weight and
        projection matrices.
      num_proj: (optional) int, The output dimensionality for the projection
        matrices.  If None, no projection is performed.
      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
        provided, then the projected values are clipped elementwise to within
        `[-proj_clip, proj_clip]`.
      num_unit_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      num_proj_shards: Deprecated, will be removed by Jan. 2017.
        Use a variable_scope partitioner instead.
      forget_bias: float, The bias added to forget gates (see above).
        Must set to `0.0` manually when restoring from CudnnLSTM-trained
        checkpoints.
      state_is_tuple: If True, accepted and returned states are 2-tuples of
        the `c_state` and `m_state`.  If False, they are concatenated
        along the column axis.  The latter behavior will soon be deprecated.
      activation: Activation function of the inner states.  Default: `tanh`.
      reuse: (optional) Python boolean describing whether to reuse variables
        in an existing scope.  If not `True`, and the existing scope already has
        the given variables, an error is raised.
      name: String, the name of the layer. Layers with the same name will
        share weights, but to avoid mistakes we require reuse=True in such
        cases.
      dtype: Default dtype of the layer (default of `None` means use the type
        of the first input). Required when `build` is called before `call`.
      use_beam: Use beam search or not.
      hps: hyperparameters.
    """

        super(HyperLSTMCell, self).__init__(_reuse=reuse,
                                            name=name,
                                            dtype=dtype)
        if not state_is_tuple:
            tf.logging.warn(
                "%s: Using a concatenated state is slower and will soon "
                "be deprecated.  Use state_is_tuple=True.", self)
        if num_unit_shards is not None or num_proj_shards is not None:
            tf.logging.warn(
                "%s: The num_unit_shards and proj_unit_shards parameters are "
                "deprecated and will be removed in Jan 2017.  "
                "Use a variable scope with a partitioner instead.", self)

        assert not use_peepholes, "currently not supporting peephole connections"
        assert hps is not None
        # Inputs must be 2-dimensional.
        self.input_spec = tf.layers.InputSpec(ndim=2)

        self._num_units = num_units
        self._rank = hps.rank
        assert self._rank == self._num_units or self._rank == 2 * self._num_units
        self._use_peepholes = use_peepholes
        self._cell_clip = cell_clip
        self._initializer = initializer
        self._num_proj = num_proj
        self._proj_clip = proj_clip
        self._num_unit_shards = num_unit_shards
        self._num_proj_shards = num_proj_shards
        self._forget_bias = forget_bias
        self._state_is_tuple = state_is_tuple
        self._activation = activation or tf.tanh
        self._sigma_norm = hps.sigma_norm
        self._beam_width = hps.beam_width
        self._mem_input = mem_input
        self._use_beam = use_beam

        if num_proj:
            self._state_size = (tf.nn.rnn_cell.LSTMStateTuple(
                num_units, num_proj) if state_is_tuple else num_units +
                                num_proj)
            self._output_size = num_proj
        else:
            self._state_size = (tf.nn.rnn_cell.LSTMStateTuple(
                num_units, num_units) if state_is_tuple else 2 * num_units)
            self._output_size = num_units

        input_depth = hps.emb_dim + hps.decoder_dim
        # if hps.encode_neighbor:
        #   input_depth += hps.decoder_dim
        h_depth = self._num_units if self._num_proj is None else self._num_proj

        maybe_partitioner = (tf.fixed_size_partitioner(self._num_unit_shards)
                             if self._num_unit_shards is not None else None)

        # The `u`s are matrices of shape [input_shape, rank] and the `v`s of
        # shape [rank, hidden_size]; together they form a collection of
        # rank-1 parameter factors. The full parameter matrix is constructed
        # as `U \sigma V`, with the diagonal matrix `\sigma` computed in
        # `self.initialize`.

        redundant_rank = (self._rank > self._num_units)
        # `u`, `v` used to construct matrix from input `x` to input_gate `i`.
        u_xi, v_xi = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xi = tf.get_variable("u_xi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xi,
                                     partitioner=maybe_partitioner)
        self._v_xi = tf.get_variable("v_xi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xi,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix that maps input `x` to cell_state `j`.
        u_xj, v_xj = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xj = tf.get_variable("u_xj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xj,
                                     partitioner=maybe_partitioner)
        self._v_xj = tf.get_variable("v_xj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xj,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps input `x` to forget_gate `f`.
        u_xf, v_xf = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xf = tf.get_variable("u_xf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xf,
                                     partitioner=maybe_partitioner)
        self._v_xf = tf.get_variable("v_xf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xf,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps input `x` to output_gate `o`.
        u_xo, v_xo = self._orthogonal_init(
            shape=[input_depth, self._num_units],
            initializer=initializer,
            redundant_rank=redundant_rank)
        self._u_xo = tf.get_variable("u_xo/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_xo,
                                     partitioner=maybe_partitioner)
        self._v_xo = tf.get_variable("v_xo/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_xo,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to input_gate `i`.
        u_hi, v_hi = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hi = tf.get_variable("u_hi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hi,
                                     partitioner=maybe_partitioner)
        self._v_hi = tf.get_variable("v_hi/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hi,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to cell_state `j`.
        u_hj, v_hj = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hj = tf.get_variable("u_hj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hj,
                                     partitioner=maybe_partitioner)
        self._v_hj = tf.get_variable("v_hj/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hj,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to forget_gate `f`.
        u_hf, v_hf = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_hf = tf.get_variable("u_hf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_hf,
                                     partitioner=maybe_partitioner)
        self._v_hf = tf.get_variable("v_hf/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_hf,
                                     partitioner=maybe_partitioner)

        # `u`, `v` used to construct matrix
        # that maps hid_state `h` to output_gate `o`.
        u_ho, v_ho = self._orthogonal_init(shape=[h_depth, self._num_units],
                                           initializer=initializer,
                                           redundant_rank=redundant_rank)
        self._u_ho = tf.get_variable("u_ho/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=u_ho,
                                     partitioner=maybe_partitioner)
        self._v_ho = tf.get_variable("v_ho/%s" % _WEIGHTS_VARIABLE_NAME,
                                     initializer=v_ho,
                                     partitioner=maybe_partitioner)

        self._c = tf.get_variable(
            "c/%s" % _WEIGHTS_VARIABLE_NAME,
            shape=[self._num_units, self._rank],
            initializer=tf.contrib.layers.xavier_initializer(),
            partitioner=maybe_partitioner)

        initializer = tf.zeros_initializer(dtype=tf.float32)
        self._b = tf.get_variable("b/%s" % _BIAS_VARIABLE_NAME,
                                  shape=[4 * h_depth, self._rank],
                                  initializer=initializer)

        if self._num_proj is not None:
            if self._num_proj_shards is not None:
                maybe_proj_partitioner = tf.fixed_size_partitioner(
                    self._num_proj_shards)
            else:
                maybe_proj_partitioner = None
            self._proj_kernel = self.add_variable(
                "projection/%s" % _WEIGHTS_VARIABLE_NAME,
                shape=[self._num_units, self._num_proj],
                initializer=tf.uniform_unit_scaling_initializer(),
                partitioner=maybe_proj_partitioner)
        self.initialize()
        self.built = True