Python HeKaimingInitializerの例

プログラミング言語: Python

名前空間/パッケージ名: rembed.util

メソッド/関数: HeKaimingInitializer

hotexamples.comのコード掲載数: 7

Python HeKaimingInitializer - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのrembed.util.HeKaimingInitializerの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: recurrences.py プロジェクト: hitluobin/spinn

    def _context_sensitive_shift(self, inputs):
        """
        Compute a buffer top representation by mixing buffer top and hidden state.

        NB: This hasn't been an especially effective tool so far.

        Args:
          inputs: Sequence whose elements 2 and 3 are, respectively, the
            buffer top and the tracking LSTM state.

        Returns:
          The output of the "context_comb_unit" layer: a `util.ReLULayer` if
          `self.context_sensitive_use_relu` is set, otherwise a `util.Linear`.
        """
        assert self.use_tracking_lstm
        buffer_top, tracking_hidden = inputs[2:4]

        # Exclude the cell value from the computation. The slice width has to
        # be the tracking LSTM hidden size so that it agrees with `inp_dim`
        # below; the bare name `hidden_dim` that used to be here is not bound
        # anywhere in this method.
        tracking_hidden = tracking_hidden[:, :self.tracking_lstm_hidden_dim]

        inp = T.concatenate([tracking_hidden, buffer_top], axis=1)
        inp_dim = self._spec.word_embedding_dim + self.tracking_lstm_hidden_dim
        layer = util.ReLULayer if self.context_sensitive_use_relu else util.Linear
        return layer(inp, inp_dim, self._spec.model_dim, self._vs,
                     name="context_comb_unit", use_bias=True,
                     initializer=util.HeKaimingInitializer())

コード例 #2

ファイルを表示

ファイル: classifier.py プロジェクト: hitluobin/spinn

def build_sentence_model(cls, vocab_size, seq_length, tokens, transitions,
                         num_classes, training_mode, ground_truth_transitions_visible, vs,
                         initial_embeddings=None, project_embeddings=False, ss_mask_gen=None, ss_prob=0.0):
    """
    Construct a single-sentence classifier which makes use of some hard-stack model.

    Args:
      cls: Hard stack class to use (from e.g. `rembed.stack`)
      vocab_size:
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer matrix), `batch_size * seq_length`
      transitions: Theano batch (integer matrix), `batch_size * seq_length`
      num_classes: Number of output classes
      training_mode: A Theano scalar indicating whether to act as a training model
        with dropout (1.0) or to act as an eval model with rescaling (0.0).
      ground_truth_transitions_visible: A Theano scalar. If set (1.0), allow the model access
        to ground truth transitions. This can be disabled at evaluation time to force Model 1
        (or 2S) to evaluate in the Model 2 style with predicted transitions. Has no effect on Model 0.
      vs: Variable store.
      initial_embeddings: Forwarded unchanged to `ThinStack`.
      project_embeddings: If true, `util.Linear` is used as the embedding
        projection layer; otherwise `util.IdentityLayer` is used and
        `FLAGS.word_embedding_dim` must equal `FLAGS.model_dim`. Ignored when
        `cls` is `rembed.plain_rnn.RNN` (no projection network is passed).
      ss_mask_gen: Forwarded unchanged to `ThinStack`.
      ss_prob: Forwarded unchanged to `ThinStack`.

    Returns:
      A `(model, logits, zero_fn)` tuple: the `ThinStack` instance, the output
      of the final "semantic_classifier" linear layer, and a zero-argument
      callable which invokes `model.zero()`.
    """

    # Prepare layer which performs stack element composition.
    if cls is rembed.plain_rnn.RNN:
        compose_network = partial(util.LSTMLayer,
                                  initializer=util.HeKaimingInitializer())
        embedding_projection_network = None
    else:
        if FLAGS.lstm_composition:
            compose_network = partial(util.TreeLSTMLayer,
                                      initializer=util.HeKaimingInitializer())
        else:
            assert not FLAGS.connect_tracking_comp, "Can only connect tracking and composition unit while using TreeLSTM"
            compose_network = partial(util.ReLULayer,
                                      initializer=util.HeKaimingInitializer())

        if project_embeddings:
            embedding_projection_network = util.Linear
        else:
            assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
                "word_embedding_dim must equal model_dim unless a projection layer is used."
            embedding_projection_network = util.IdentityLayer

    # With LSTM composition only half of each stack element is exposed.
    # Floor division keeps the dimension an integer even under true-division
    # semantics (Python 3 / `from __future__ import division`).
    model_visible_dim = FLAGS.model_dim // 2 if FLAGS.lstm_composition else FLAGS.model_dim
    spec = util.ModelSpec(FLAGS.model_dim, FLAGS.word_embedding_dim,
                          FLAGS.batch_size, vocab_size, seq_length,
                          model_visible_dim=model_visible_dim)

    # TODO: Check non-Model0 support.
    recurrence = cls(spec, vs, compose_network,
                     use_context_sensitive_shift=FLAGS.context_sensitive_shift,
                     context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
                     use_tracking_lstm=FLAGS.use_tracking_lstm,
                     tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim)

    model = ThinStack(spec, recurrence, embedding_projection_network,
                      training_mode, ground_truth_transitions_visible, vs,
                      X=tokens,
                      transitions=transitions,
                      initial_embeddings=initial_embeddings,
                      embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
                      use_input_batch_norm=False,
                      ss_mask_gen=ss_mask_gen,
                      ss_prob=ss_prob)

    # Extract top element of final stack timestep.
    if FLAGS.lstm_composition:
        # Keep only the first half of each sentence embedding; the slice bound
        # must be an integer, hence floor division.
        sentence_vector_dim = FLAGS.model_dim // 2
        sentence_vector = model.sentence_embeddings[:, :sentence_vector_dim]
    else:
        sentence_vector = model.sentence_embeddings
        sentence_vector_dim = FLAGS.model_dim

    sentence_vector = util.BatchNorm(sentence_vector, sentence_vector_dim, vs, "sentence_vector", training_mode)
    sentence_vector = util.Dropout(sentence_vector, FLAGS.semantic_classifier_keep_rate, training_mode)

    # Feed forward through a single output layer
    logits = util.Linear(
        sentence_vector, sentence_vector_dim, num_classes, vs,
        name="semantic_classifier", use_bias=True)

    def zero_fn():
        model.zero()

    return model, logits, zero_fn

コード例 #3

ファイルを表示

ファイル: classifier.py プロジェクト: hitluobin/spinn

def build_sentence_pair_model(cls, vocab_size, seq_length, tokens, transitions,
                     num_classes, training_mode, ground_truth_transitions_visible, vs,
                     initial_embeddings=None, project_embeddings=False, ss_mask_gen=None, ss_prob=0.0):
    """
    Construct a sentence-pair classifier which makes use of some hard-stack model.

    Args:
      cls: Hard stack class to use (from e.g. `rembed.stack`)
      vocab_size:
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer 3-tensor), `batch_size * seq_length * 2`;
        index 0 on the last axis is the premise, index 1 the hypothesis.
      transitions: Theano batch (integer 3-tensor), `batch_size * seq_length * 2`,
        laid out like `tokens`.
      num_classes: Number of output classes
      training_mode: A Theano scalar indicating whether to act as a training model
        with dropout (1.0) or to act as an eval model with rescaling (0.0).
      ground_truth_transitions_visible: A Theano scalar. If set (1.0), allow the model access
        to ground truth transitions. This can be disabled at evaluation time to force Model 1
        (or 2S) to evaluate in the Model 2 style with predicted transitions. Has no effect on Model 0.
      vs: Variable store.
      initial_embeddings: Forwarded unchanged to both `ThinStack` instances.
      project_embeddings: If true, `util.Linear` is used as the embedding
        projection layer; otherwise `util.IdentityLayer` is used and
        `FLAGS.word_embedding_dim` must equal `FLAGS.model_dim`. Ignored when
        `cls` is `rembed.plain_rnn.RNN` (no projection network is passed).
      ss_mask_gen: Forwarded unchanged to both `ThinStack` instances.
      ss_prob: Forwarded unchanged to both `ThinStack` instances.

    Returns:
      A `(premise_model, hypothesis_model, logits, zero_fn)` tuple: the two
      `ThinStack` instances, the output of the final "semantic_classifier"
      linear layer, and a zero-argument callable which invokes `zero()` on
      both models.
    """

    # Prepare layer which performs stack element composition.
    if cls is rembed.plain_rnn.RNN:
        compose_network = partial(util.LSTMLayer,
                                  initializer=util.HeKaimingInitializer())
        embedding_projection_network = None
    else:
        if FLAGS.lstm_composition:
            compose_network = partial(util.TreeLSTMLayer,
                                      initializer=util.HeKaimingInitializer())
        else:
            assert not FLAGS.connect_tracking_comp, "Can only connect tracking and composition unit while using TreeLSTM"
            compose_network = partial(util.ReLULayer,
                                      initializer=util.HeKaimingInitializer())

        if project_embeddings:
            embedding_projection_network = util.Linear
        else:
            assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
                "word_embedding_dim must equal model_dim unless a projection layer is used."
            embedding_projection_network = util.IdentityLayer

    # Floor division keeps every derived dimension an integer even under
    # true-division semantics (Python 3 / `from __future__ import division`).
    model_visible_dim = FLAGS.model_dim // 2 if FLAGS.lstm_composition else FLAGS.model_dim
    spec = util.ModelSpec(FLAGS.model_dim, FLAGS.word_embedding_dim,
                          FLAGS.batch_size, vocab_size, seq_length,
                          model_visible_dim=model_visible_dim)

    # Split the two sentences
    premise_tokens = tokens[:, :, 0]
    hypothesis_tokens = tokens[:, :, 1]

    premise_transitions = transitions[:, :, 0]
    hypothesis_transitions = transitions[:, :, 1]

    # TODO: Check non-Model0 support.
    recurrence = cls(spec, vs, compose_network,
                     use_context_sensitive_shift=FLAGS.context_sensitive_shift,
                     context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
                     use_tracking_lstm=FLAGS.use_tracking_lstm,
                     tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim)

    # Build two hard stack models which scan over input sequences. Both share
    # the same recurrence object and variable store.
    premise_model = ThinStack(spec, recurrence, embedding_projection_network,
        training_mode, ground_truth_transitions_visible, vs,
        X=premise_tokens,
        transitions=premise_transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        use_input_batch_norm=False,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob,
        use_attention=FLAGS.use_attention,
        name="premise")

    # NOTE(review): this value is not consumed anywhere below in this
    # function; the attribute read is kept as-is in case it has side effects.
    premise_stack_tops = premise_model.stack_tops if FLAGS.use_attention != "None" else None

    hypothesis_model = ThinStack(spec, recurrence, embedding_projection_network,
        training_mode, ground_truth_transitions_visible, vs,
        X=hypothesis_tokens,
        transitions=hypothesis_transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        use_input_batch_norm=False,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob,
        use_attention=FLAGS.use_attention,
        name="hypothesis")

    # Extract top element of final stack timestep. The raw sentence vectors
    # are needed either as the main MLP input (no attention) or for the
    # optional difference / product features.
    if FLAGS.use_attention == "None" or FLAGS.use_difference_feature or FLAGS.use_product_feature:
        premise_vector = premise_model.sentence_embeddings
        hypothesis_vector = hypothesis_model.sentence_embeddings

        if FLAGS.lstm_composition:
            # Keep only the first half of each sentence embedding.
            sentence_vector_dim = FLAGS.model_dim // 2
            premise_vector = premise_vector[:, :sentence_vector_dim]
            hypothesis_vector = hypothesis_vector[:, :sentence_vector_dim]
        else:
            sentence_vector_dim = FLAGS.model_dim

    if FLAGS.use_attention != "None":
        # Use the attention weighted representation
        h_dim = FLAGS.model_dim // 2
        mlp_input = hypothesis_model.final_weighed_representation.reshape((-1, h_dim))
        mlp_input_dim = h_dim
    else:
        # Create standard MLP features
        mlp_input = T.concatenate([premise_vector, hypothesis_vector], axis=1)
        mlp_input_dim = 2 * sentence_vector_dim

    if FLAGS.use_difference_feature:
        mlp_input = T.concatenate([mlp_input, premise_vector - hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    if FLAGS.use_product_feature:
        mlp_input = T.concatenate([mlp_input, premise_vector * hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    mlp_input = util.BatchNorm(mlp_input, mlp_input_dim, vs, "sentence_vectors", training_mode)
    mlp_input = util.Dropout(mlp_input, FLAGS.semantic_classifier_keep_rate, training_mode)

    # Apply a combining MLP
    prev_features = mlp_input
    prev_features_dim = mlp_input_dim
    for layer in range(FLAGS.num_sentence_pair_combination_layers):
        prev_features = util.ReLULayer(prev_features, prev_features_dim, FLAGS.sentence_pair_combination_layer_dim, vs,
            name="combining_mlp/" + str(layer),
            initializer=util.HeKaimingInitializer())
        prev_features_dim = FLAGS.sentence_pair_combination_layer_dim

        prev_features = util.BatchNorm(prev_features, prev_features_dim, vs, "combining_mlp/" + str(layer), training_mode)
        prev_features = util.Dropout(prev_features, FLAGS.semantic_classifier_keep_rate, training_mode)

    # Feed forward through a single output layer
    logits = util.Linear(
        prev_features, prev_features_dim, num_classes, vs,
        name="semantic_classifier", use_bias=True)

    def zero_fn():
        premise_model.zero()
        hypothesis_model.zero()

    return premise_model, hypothesis_model, logits, zero_fn

コード例 #4

ファイルを表示

    def _step(self, transitions_t, ss_mask_gen_matrix_t, stack_t, buffer_cur_t,
              tracking_hidden, attention_hidden, stack_pushed, stack_merged,
              buffer, ground_truth_transitions_visible, premise_stack_tops,
              projected_stack_tops):
        """
        Compute one step of the stack recurrence.

        The step reads the current buffer top for every example, optionally
        runs the prediction / tracking network, chooses the transition mask,
        computes the merge (compose) result, updates the stack, optionally
        runs the attention unit, and advances the buffer cursors.

        Args:
          transitions_t: Transitions supplied for this step; used directly as
            the mask in the Model 0 case.
          ss_mask_gen_matrix_t: Values used to interpolate between supplied
            and predicted transitions when `self.interpolate` is set.
          stack_t: Current stack; elements 0 and 1 along axis 1 are the two
            items passed to the composition network.
          buffer_cur_t: Per-example buffer cursor for this step.
          tracking_hidden: Tracking state; its first
            `self.tracking_lstm_hidden_dim` columns are used as hidden state.
          attention_hidden: Attention state carried between steps.
          stack_pushed: Passed through to `update_hard_stack`.
          stack_merged: Passed through to `update_hard_stack`.
          buffer: Buffer indexed by `buffer_cur_t + arange(batch_size) * seq_length`.
          ground_truth_transitions_visible: Multiplier selecting supplied
            transitions over predicted ones.
          premise_stack_tops: Passed through to the attention unit.
          projected_stack_tops: Passed through to the attention unit.

        Returns:
          A tuple `(stack_next, buffer_cur_next, tracking_hidden,
          attention_hidden)`, with `actions_t` appended when
          `self._predict_transitions` is set, and with `ss_mask_gen_matrix_t`
          prepended when `self.interpolate` is not set.
        """
        batch_size, _ = self.X.shape

        # Extract top buffer values.
        idxs = buffer_cur_t + (T.arange(batch_size) * self.seq_length)

        if self.context_sensitive_shift:
            # Combine with the hidden state from previous unit.
            tracking_h_t = tracking_hidden[:, :self.tracking_lstm_hidden_dim]
            context_comb_input_t = T.concatenate([tracking_h_t, buffer[idxs]],
                                                 axis=1)
            context_comb_input_dim = self.word_embedding_dim + self.tracking_lstm_hidden_dim
            comb_layer = util.ReLULayer if self.context_sensitive_use_relu else util.Linear
            buffer_top_t = comb_layer(context_comb_input_t,
                                      context_comb_input_dim,
                                      self.model_dim,
                                      self._vs,
                                      name="context_comb_unit",
                                      use_bias=True,
                                      initializer=util.HeKaimingInitializer())
        else:
            buffer_top_t = buffer[idxs]

        if self._prediction_and_tracking_network is not None:
            # We are predicting our own stack operations.
            # Floor division: h_dim is used as a slice bound and a layer
            # dimension, so it must stay an integer under true division too.
            h_dim = self.model_dim // 2  # TODO(SB): Turn this off when not using TreeLSTM.

            predict_inp = T.concatenate(
                [stack_t[:, 0, :h_dim], stack_t[:, 1, :h_dim],
                 buffer_top_t[:, :h_dim]],
                axis=1)

            if self.use_tracking_lstm:
                # Update the hidden state and obtain predicted actions.
                tracking_hidden, actions_t = self._prediction_and_tracking_network(
                    tracking_hidden,
                    predict_inp,
                    h_dim * 3,
                    self.tracking_lstm_hidden_dim,
                    self._vs,
                    name="prediction_and_tracking")
            else:
                # Obtain predicted actions directly.
                actions_t = self._prediction_and_tracking_network(
                    predict_inp,
                    h_dim * 3,
                    util.NUM_TRANSITION_TYPES,
                    self._vs,
                    name="prediction_and_tracking")

        if self.train_with_predicted_transitions:
            # Model 2 case.
            if self.interpolate:
                # Only use ground truth transitions if they are marked as visible to the model.
                effective_ss_mask_gen_matrix_t = ss_mask_gen_matrix_t * ground_truth_transitions_visible
                # Interpolate between truth and prediction using bernoulli RVs
                # generated prior to the step.
                mask = (transitions_t * effective_ss_mask_gen_matrix_t +
                        actions_t.argmax(axis=1) *
                        (1 - effective_ss_mask_gen_matrix_t))
            else:
                # Use predicted actions to build a mask.
                mask = actions_t.argmax(axis=1)
        elif self._predict_transitions:
            # Use transitions provided from external parser when not masked out
            mask = (transitions_t * ground_truth_transitions_visible +
                    actions_t.argmax(axis=1) *
                    (1 - ground_truth_transitions_visible))
        else:
            # Model 0 case.
            mask = transitions_t

        # Now update the stack: first precompute merge results.
        if self.model_dim != self.stack_dim:
            # Stack elements carry extra columns beyond model_dim; only the
            # first model_dim columns take part in composition.
            stack1 = stack_t[:, 0, :self.model_dim].reshape(
                (-1, self.model_dim))
            stack2 = stack_t[:, 1, :self.model_dim].reshape(
                (-1, self.model_dim))
        else:
            stack1 = stack_t[:, 0].reshape((-1, self.model_dim))
            stack2 = stack_t[:, 1].reshape((-1, self.model_dim))
        merge_items = (stack1, stack2)
        if self.connect_tracking_comp:
            tracking_h_t = tracking_hidden[:, :self.tracking_lstm_hidden_dim]
            merge_value = self._compose_network(
                merge_items,
                tracking_h_t,
                self.model_dim,
                self._vs,
                name="compose",
                external_state_dim=self.tracking_lstm_hidden_dim)
        else:
            merge_value = self._compose_network(merge_items,
                                                (self.model_dim, ) * 2,
                                                self.model_dim,
                                                self._vs,
                                                name="compose")

        # Compute new stack value.
        stack_next = update_hard_stack(stack_t, stack_pushed, stack_merged,
                                       buffer_top_t, merge_value, mask,
                                       self.model_dim)

        # If attention is to be used and premise_stack_tops is not None (i.e.
        # we're processing the hypothesis) calculate the attention weighed representation.
        if self.use_attention != "None" and self.is_hypothesis:
            h_dim = self.model_dim // 2
            if self.use_attention in {"TreeWangJiang", "TreeThang"}:
                mask_ = mask.dimshuffle(0, "x")
                mask_ = T.cast(mask_, dtype=theano.config.floatX)
                # The columns past model_dim of each stack element hold the
                # per-node attention state for the tree-structured variants.
                attention_hidden_l = stack_t[:, 0, self.model_dim:] * mask_
                attention_hidden_r = stack_t[:, 1, self.model_dim:] * mask_

                tree_attention_hidden = self._attention_unit(
                    attention_hidden_l,
                    attention_hidden_r,
                    stack_next[:, 0, :h_dim],
                    premise_stack_tops,
                    projected_stack_tops,
                    h_dim,
                    self._vs,
                    name="attention_unit")
                stack_next = T.set_subtensor(stack_next[:, 0, self.model_dim:],
                                             tree_attention_hidden)
            else:
                attention_hidden = self._attention_unit(
                    attention_hidden,
                    stack_next[:, 0, :h_dim],
                    premise_stack_tops,
                    projected_stack_tops,
                    h_dim,
                    self._vs,
                    name="attention_unit")

        # Move buffer cursor as necessary. Since mask == 1 when merge, we
        # should increment each buffer cursor by 1 - mask.
        buffer_cur_next = buffer_cur_t + (1 - mask)

        if self._predict_transitions:
            ret_val = stack_next, buffer_cur_next, tracking_hidden, attention_hidden, actions_t
        else:
            ret_val = stack_next, buffer_cur_next, tracking_hidden, attention_hidden

        if not self.interpolate:
            # Use ss_mask as a redundant return value.
            ret_val = (ss_mask_gen_matrix_t, ) + ret_val

        return ret_val

コード例 #5

ファイルを表示

def build_sentence_model(cls,
                         vocab_size,
                         seq_length,
                         tokens,
                         transitions,
                         num_classes,
                         apply_dropout,
                         vs,
                         initial_embeddings=None,
                         project_embeddings=False,
                         ss_mask_gen=None,
                         ss_prob=0.0):
    """
    Build a single-sentence classifier on top of a hard-stack model.

    Args:
      cls: Hard stack class to use (from e.g. `rembed.stack`)
      vocab_size:
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer matrix), `batch_size * seq_length`
      transitions: Theano batch (integer matrix), `batch_size * seq_length`
      num_classes: Number of output classes
      apply_dropout: 1.0 at training time, 0.0 at eval time (to avoid corrupting outputs in dropout)
      vs: Variable store.

    Returns:
      A `(transitions_pred, logits)` pair: the stack's `transitions_pred`
      attribute and the output of the final linear layer.
    """

    # Choose the layer class used to compose stack elements; the initializer
    # is the same either way.
    composition_layer = util.TreeLSTMLayer if FLAGS.lstm_composition else util.ReLULayer
    compose_network = partial(composition_layer,
                              initializer=util.HeKaimingInitializer())

    # Without a projection layer the embeddings feed the stack directly, so
    # their width has to match the model width.
    if not project_embeddings:
        assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
            "word_embedding_dim must equal model_dim unless a projection layer is used."
        embedding_projection_network = util.IdentityLayer
    else:
        embedding_projection_network = util.Linear

    # Build hard stack which scans over input sequence.
    stack = cls(
        FLAGS.model_dim, FLAGS.word_embedding_dim, vocab_size, seq_length,
        compose_network, embedding_projection_network, apply_dropout, vs,
        X=tokens,
        transitions=transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob)

    # The sentence representation is the top element of the final stack.
    sentence_vector = stack.final_stack[:, 0].reshape((-1, FLAGS.model_dim))
    sentence_vector = util.Dropout(
        sentence_vector, FLAGS.semantic_classifier_keep_rate, apply_dropout)

    # Single linear output layer producing the class logits.
    logits = util.Linear(
        sentence_vector, FLAGS.model_dim, num_classes, vs, use_bias=True)

    return stack.transitions_pred, logits

コード例 #6

ファイルを表示

def build_sentence_pair_model(cls,
                              vocab_size,
                              seq_length,
                              tokens,
                              transitions,
                              num_classes,
                              apply_dropout,
                              vs,
                              initial_embeddings=None,
                              project_embeddings=False,
                              ss_mask_gen=None,
                              ss_prob=0.0):
    """
    Build a sentence-pair classifier on top of two hard-stack models.

    Args:
      cls: Hard stack class to use (from e.g. `rembed.stack`)
      vocab_size:
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch of integers; index 0 on the third axis selects the
        premise and index 1 the hypothesis.
      transitions: Theano batch of integers, split along the third axis in the
        same way as `tokens`.
      num_classes: Number of output classes
      apply_dropout: 1.0 at training time, 0.0 at eval time (to avoid corrupting outputs in dropout)
      vs: Variable store.

    Returns:
      A `(premise_transitions_pred, hypothesis_transitions_pred, logits)`
      triple: the `transitions_pred` attribute of each stack model and the
      output of the final linear layer.
    """

    # Choose the layer class used to compose stack elements; the initializer
    # is the same either way.
    composition_layer = util.TreeLSTMLayer if FLAGS.lstm_composition else util.ReLULayer
    compose_network = partial(composition_layer,
                              initializer=util.HeKaimingInitializer())

    # Without a projection layer the embeddings feed the stack directly, so
    # their width has to match the model width.
    if not project_embeddings:
        assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
            "word_embedding_dim must equal model_dim unless a projection layer is used."
        embedding_projection_network = util.IdentityLayer
    else:
        embedding_projection_network = util.Linear

    # Separate the premise (index 0) and hypothesis (index 1) inputs.
    premise_tokens = tokens[:, :, 0]
    hypothesis_tokens = tokens[:, :, 1]
    premise_transitions = transitions[:, :, 0]
    hypothesis_transitions = transitions[:, :, 1]

    def _build_stack(sentence_tokens, sentence_transitions):
        # Both stacks are configured identically apart from their inputs.
        return cls(
            FLAGS.model_dim, FLAGS.word_embedding_dim, vocab_size, seq_length,
            compose_network, embedding_projection_network, apply_dropout, vs,
            X=sentence_tokens,
            transitions=sentence_transitions,
            initial_embeddings=initial_embeddings,
            embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
            ss_mask_gen=ss_mask_gen,
            ss_prob=ss_prob)

    # Build two hard stack models which scan over input sequences.
    premise_model = _build_stack(premise_tokens, premise_transitions)
    hypothesis_model = _build_stack(hypothesis_tokens, hypothesis_transitions)

    # Each sentence is represented by the top element of its final stack.
    premise_vector = premise_model.final_stack[:, 0].reshape(
        (-1, FLAGS.model_dim))
    hypothesis_vector = hypothesis_model.final_stack[:, 0].reshape(
        (-1, FLAGS.model_dim))

    # Join the two sentence vectors side by side, then apply dropout.
    mlp_input = T.concatenate([premise_vector, hypothesis_vector], axis=1)
    dropout_mlp_input = util.Dropout(
        mlp_input, FLAGS.semantic_classifier_keep_rate, apply_dropout)

    # Combining MLP over the concatenated pair representation.
    pair_features = util.MLP(
        dropout_mlp_input, 2 * FLAGS.model_dim, FLAGS.model_dim, vs,
        hidden_dims=[FLAGS.model_dim],
        name="combining_mlp",
        initializer=util.HeKaimingInitializer())

    # Single linear output layer producing the class logits.
    logits = util.Linear(
        pair_features, FLAGS.model_dim, num_classes, vs, use_bias=True)

    return premise_model.transitions_pred, hypothesis_model.transitions_pred, logits

コード例 #7

ファイルを表示

ファイル: stack.py プロジェクト: mihail911/rembed

    def _step(self, transitions_t, ss_mask_gen_matrix_t, stack_t, buffer_cur_t,
              tracking_hidden, stack_pushed, stack_merged, buffer,
              ground_truth_transitions_visible):
        """
        Compute one step of the stack recurrence.

        The step reads the current buffer top for every example, optionally
        runs the prediction / tracking network, chooses the transition mask,
        computes the merge (compose) result, updates the stack and advances
        the buffer cursors.

        Args:
          transitions_t: Transitions supplied for this step; used directly as
            the mask in the Model 0 case.
          ss_mask_gen_matrix_t: Values used to interpolate between supplied
            and predicted transitions when `self.interpolate` is set.
          stack_t: Current stack; elements 0 and 1 along axis 1 are the two
            items that get merged.
          buffer_cur_t: Per-example buffer cursor for this step.
          tracking_hidden: Tracking state; its first
            `self.tracking_lstm_hidden_dim` columns are used as hidden state.
          stack_pushed: Passed through to `update_hard_stack`.
          stack_merged: Passed through to `update_hard_stack`.
          buffer: Buffer indexed by `buffer_cur_t + arange(batch_size) * seq_length`
            (presumably flattened to one row per token -- TODO confirm).
          ground_truth_transitions_visible: Multiplier selecting supplied
            transitions over predicted ones.

        Returns:
          A tuple `(stack_next, buffer_cur_next, tracking_hidden)`, with
          `actions_t` appended when `self._predict_transitions` is set, and
          with `ss_mask_gen_matrix_t` prepended when `self.interpolate` is
          not set.
        """
        batch_size, _ = self.X.shape

        # Extract top buffer values.
        # Each example's cursor is offset by its own block of seq_length rows.
        idxs = buffer_cur_t + (T.arange(batch_size) * self.seq_length)

        if self.context_sensitive_shift:
            # Combine with the hidden state from previous unit.
            tracking_h_t = tracking_hidden[:, :self.tracking_lstm_hidden_dim]
            context_comb_input_t = T.concatenate([tracking_h_t, buffer[idxs]],
                                                 axis=1)
            context_comb_input_dim = self.word_embedding_dim + self.tracking_lstm_hidden_dim
            comb_layer = util.ReLULayer if self.context_sensitive_use_relu else util.Linear
            buffer_top_t = comb_layer(context_comb_input_t,
                                      context_comb_input_dim,
                                      self.model_dim,
                                      self._vs,
                                      name="context_comb_unit",
                                      use_bias=True,
                                      initializer=util.HeKaimingInitializer())
        else:
            buffer_top_t = buffer[idxs]

        if self._prediction_and_tracking_network is not None:
            # We are predicting our own stack operations.
            # The predictor sees the top two stack elements and the buffer top.
            predict_inp = T.concatenate(
                [stack_t[:, 0], stack_t[:, 1], buffer_top_t], axis=1)

            if self.use_tracking_lstm:
                # Update the hidden state and obtain predicted actions.
                tracking_hidden, actions_t = self._prediction_and_tracking_network(
                    tracking_hidden,
                    predict_inp,
                    self.model_dim * 3,
                    self.tracking_lstm_hidden_dim,
                    self._vs,
                    name="prediction_and_tracking")
            else:
                # Obtain predicted actions directly.
                actions_t = self._prediction_and_tracking_network(
                    predict_inp,
                    self.model_dim * 3,
                    util.NUM_TRANSITION_TYPES,
                    self._vs,
                    name="prediction_and_tracking")

        # NOTE(review): the branches below that read `actions_t` rely on the
        # prediction network having been run above -- confirm the flags are
        # always configured consistently by the constructor.
        if self.train_with_predicted_transitions:
            # Model 2 case.
            if self.interpolate:
                # Only use ground truth transitions if they are marked as visible to the model.
                effective_ss_mask_gen_matrix_t = ss_mask_gen_matrix_t * ground_truth_transitions_visible
                # Interpolate between truth and prediction using bernoulli RVs
                # generated prior to the step.
                mask = (transitions_t * effective_ss_mask_gen_matrix_t +
                        actions_t.argmax(axis=1) *
                        (1 - effective_ss_mask_gen_matrix_t))
            else:
                # Use predicted actions to build a mask.
                mask = actions_t.argmax(axis=1)
        elif self._predict_transitions:
            # Use transitions provided from external parser when not masked out
            mask = (transitions_t * ground_truth_transitions_visible +
                    actions_t.argmax(axis=1) *
                    (1 - ground_truth_transitions_visible))
        else:
            # Model 0 case.
            mask = transitions_t

        # Now update the stack: first precompute merge results.
        # The top two stack elements are laid side by side in one row.
        merge_items = stack_t[:, :2].reshape((-1, self.model_dim * 2))
        if self.connect_tracking_comp:
            # Feed the tracking hidden state to the composition network too.
            tracking_h_t = tracking_hidden[:, :self.tracking_lstm_hidden_dim]
            merge_value = self._compose_network(
                merge_items,
                tracking_h_t,
                self.model_dim,
                self._vs,
                name="compose",
                external_state_dim=self.tracking_lstm_hidden_dim)
        else:
            merge_value = self._compose_network(merge_items,
                                                self.model_dim * 2,
                                                self.model_dim,
                                                self._vs,
                                                name="compose")

        # Compute new stack value.
        stack_next = update_hard_stack(stack_t, stack_pushed, stack_merged,
                                       buffer_top_t, merge_value, mask)

        # Move buffer cursor as necessary. Since mask == 1 when merge, we
        # should increment each buffer cursor by 1 - mask.
        buffer_cur_next = buffer_cur_t + (1 - mask)

        # The layout of the returned tuple depends on the configuration flags.
        if self._predict_transitions:
            ret_val = stack_next, buffer_cur_next, tracking_hidden, actions_t
        else:
            ret_val = stack_next, buffer_cur_next, tracking_hidden

        if not self.interpolate:
            # Use ss_mask as a redundant return value.
            ret_val = (ss_mask_gen_matrix_t, ) + ret_val

        return ret_val