def build_sentence_pair_model(cls, vocab_size, seq_length, tokens, transitions,
                              num_classes, training_mode, ground_truth_transitions_visible, vs,
                              initial_embeddings=None, project_embeddings=False,
                              ss_mask_gen=None, ss_prob=0.0):
    """
    Construct a classifier which makes use of some hard-stack model.

    Args:
      cls: Hard stack class to use (from e.g. `spinn.fat_stack`)
      vocab_size: Size of the vocabulary
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer tensor), `batch_size * seq_length * 2`
      transitions: Theano batch (integer tensor), `batch_size * seq_length * 2`
      num_classes: Number of output classes
      training_mode: A Theano scalar indicating whether to act as a training
        model with dropout (1.0) or to act as an eval model with rescaling
        (0.0).
      ground_truth_transitions_visible: A Theano scalar. If set (1.0), allow
        the model access to ground truth transitions. This can be disabled at
        evaluation time to force Model 1 (or 2S) to evaluate in the Model 2
        style with predicted transitions. Has no effect on Model 0.
      vs: Variable store.
    """

    # Prepare layer which performs stack element composition.
    if cls is spinn.plain_rnn.RNN:
        compose_network = partial(util.LSTMLayer,
                                  initializer=util.HeKaimingInitializer())
        embedding_projection_network = None
    elif cls is spinn.cbow.CBOW:
        compose_network = None
        embedding_projection_network = None
    else:
        if FLAGS.lstm_composition:
            compose_network = partial(util.TreeLSTMLayer,
                                      initializer=util.HeKaimingInitializer())
        else:
            assert not FLAGS.connect_tracking_comp, \
                "Can only connect tracking and composition unit while using TreeLSTM"
            compose_network = partial(util.ReLULayer,
                                      initializer=util.HeKaimingInitializer())

        if project_embeddings:
            embedding_projection_network = util.Linear
        else:
            assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
                "word_embedding_dim must equal model_dim unless a projection layer is used."
            embedding_projection_network = util.IdentityLayer

    # Split the two sentences.
    premise_tokens = tokens[:, :, 0]
    hypothesis_tokens = tokens[:, :, 1]

    premise_transitions = transitions[:, :, 0]
    hypothesis_transitions = transitions[:, :, 1]

    # Build two hard stack models which scan over the input sequences.
    premise_model = cls(
        FLAGS.model_dim, FLAGS.word_embedding_dim, vocab_size, seq_length,
        compose_network, embedding_projection_network, training_mode,
        ground_truth_transitions_visible, vs,
        predict_use_cell=FLAGS.predict_use_cell,
        use_tracking_lstm=FLAGS.use_tracking_lstm,
        tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim,
        X=premise_tokens,
        transitions=premise_transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob,
        connect_tracking_comp=FLAGS.connect_tracking_comp,
        context_sensitive_shift=FLAGS.context_sensitive_shift,
        context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
        use_attention=FLAGS.use_attention,
        initialize_hyp_tracking_state=FLAGS.initialize_hyp_tracking_state)

    premise_stack_tops = premise_model.stack_tops if FLAGS.use_attention != "None" else None
    premise_tracking_c_state_final = premise_model.tracking_c_state_final \
        if cls not in [spinn.plain_rnn.RNN, spinn.cbow.CBOW] else None

    hypothesis_model = cls(
        FLAGS.model_dim, FLAGS.word_embedding_dim, vocab_size, seq_length,
        compose_network, embedding_projection_network, training_mode,
        ground_truth_transitions_visible, vs,
        predict_use_cell=FLAGS.predict_use_cell,
        use_tracking_lstm=FLAGS.use_tracking_lstm,
        tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim,
        X=hypothesis_tokens,
        transitions=hypothesis_transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob,
        connect_tracking_comp=FLAGS.connect_tracking_comp,
        context_sensitive_shift=FLAGS.context_sensitive_shift,
        context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
        use_attention=FLAGS.use_attention,
        premise_stack_tops=premise_stack_tops,
        is_hypothesis=True,
        initialize_hyp_tracking_state=FLAGS.initialize_hyp_tracking_state,
        premise_tracking_c_state_final=premise_tracking_c_state_final)

    # Extract top element of final stack timestep.
    if FLAGS.use_attention == "None" or FLAGS.use_difference_feature or FLAGS.use_product_feature:
        premise_vector = premise_model.final_representations
        hypothesis_vector = hypothesis_model.final_representations

        if (FLAGS.lstm_composition and cls is not spinn.cbow.CBOW) or cls is spinn.plain_rnn.RNN:
            # Keep only the hidden (not cell) half of the LSTM state.
            premise_vector = premise_vector[:, :FLAGS.model_dim // 2].reshape(
                (-1, FLAGS.model_dim // 2))
            hypothesis_vector = hypothesis_vector[:, :FLAGS.model_dim // 2].reshape(
                (-1, FLAGS.model_dim // 2))
            sentence_vector_dim = FLAGS.model_dim // 2
        else:
            premise_vector = premise_vector.reshape((-1, FLAGS.model_dim))
            hypothesis_vector = hypothesis_vector.reshape((-1, FLAGS.model_dim))
            sentence_vector_dim = FLAGS.model_dim

    if FLAGS.use_attention != "None":
        # Use the attention-weighted representation.
        h_dim = FLAGS.model_dim // 2
        mlp_input = hypothesis_model.final_weighed_representation.reshape((-1, h_dim))
        mlp_input_dim = h_dim
    else:
        # Create standard MLP features.
        mlp_input = T.concatenate([premise_vector, hypothesis_vector], axis=1)
        mlp_input_dim = 2 * sentence_vector_dim

    if FLAGS.use_difference_feature:
        mlp_input = T.concatenate(
            [mlp_input, premise_vector - hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    if FLAGS.use_product_feature:
        mlp_input = T.concatenate(
            [mlp_input, premise_vector * hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    mlp_input = util.BatchNorm(mlp_input, mlp_input_dim, vs, "sentence_vectors", training_mode)
    mlp_input = util.Dropout(mlp_input, FLAGS.semantic_classifier_keep_rate, training_mode)

    if FLAGS.classifier_type == "ResNet":
        features = util.Linear(
            mlp_input, mlp_input_dim, FLAGS.sentence_pair_combination_layer_dim, vs,
            name="resnet/linear", use_bias=True)
        features_dim = FLAGS.sentence_pair_combination_layer_dim

        for layer in range(FLAGS.num_sentence_pair_combination_layers):
            features = util.HeKaimingResidualLayerSet(
                features, features_dim, vs, training_mode,
                name="resnet/" + str(layer),
                dropout_keep_rate=FLAGS.semantic_classifier_keep_rate,
                depth=FLAGS.resnet_unit_depth,
                initializer=util.HeKaimingInitializer())
            features = util.BatchNorm(features, features_dim, vs,
                                      "combining_mlp/" + str(layer), training_mode)
            features = util.Dropout(features, FLAGS.semantic_classifier_keep_rate, training_mode)
    elif FLAGS.classifier_type == "Highway":
        features = util.Linear(
            mlp_input, mlp_input_dim, FLAGS.sentence_pair_combination_layer_dim, vs,
            name="highway/linear", use_bias=True)
        features_dim = FLAGS.sentence_pair_combination_layer_dim

        for layer in range(FLAGS.num_sentence_pair_combination_layers):
            features = util.HighwayLayer(
                features, features_dim, vs, training_mode,
                name="highway/" + str(layer),
                dropout_keep_rate=FLAGS.semantic_classifier_keep_rate,
                initializer=util.HeKaimingInitializer())
            features = util.BatchNorm(features, features_dim, vs,
                                      "combining_mlp/" + str(layer), training_mode)
            features = util.Dropout(features, FLAGS.semantic_classifier_keep_rate, training_mode)
    else:
        # Apply a combining MLP.
        features = mlp_input
        features_dim = mlp_input_dim
        for layer in range(FLAGS.num_sentence_pair_combination_layers):
            features = util.ReLULayer(
                features, features_dim, FLAGS.sentence_pair_combination_layer_dim, vs,
                name="combining_mlp/" + str(layer),
                initializer=util.HeKaimingInitializer())
            features_dim = FLAGS.sentence_pair_combination_layer_dim
            features = util.BatchNorm(features, features_dim, vs,
                                      "combining_mlp/" + str(layer), training_mode)
            features = util.Dropout(features, FLAGS.semantic_classifier_keep_rate, training_mode)

    # Feed forward through a single output layer.
    logits = util.Linear(
        features, features_dim, num_classes, vs,
        name="semantic_classifier", use_bias=True)

    return premise_model.transitions_pred, hypothesis_model.transitions_pred, logits
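
# A minimal usage sketch for `build_sentence_pair_model`, assuming the
# `spinn.fat_stack.HardStack` class and the FLAGS parsed elsewhere in this
# module; `util.VariableStore` and `util.UniformInitializer` are assumed from
# this codebase's `util` module, and all other names below are illustrative
# placeholders. The function is defined but never called.
def _example_sentence_pair_usage(vocab_size=100, num_classes=3):
    tokens = T.itensor3("tokens")            # batch_size * seq_length * 2
    transitions = T.itensor3("transitions")  # batch_size * seq_length * 2
    y = T.ivector("y")                       # gold labels
    training_mode = T.scalar("training_mode")
    ground_truth_transitions_visible = T.scalar("ground_truth_transitions_visible")
    vs = util.VariableStore(default_initializer=util.UniformInitializer(0.005))

    premise_pred, hypothesis_pred, logits = build_sentence_pair_model(
        spinn.fat_stack.HardStack, vocab_size, FLAGS.seq_length,
        tokens, transitions, num_classes, training_mode,
        ground_truth_transitions_visible, vs)

    # A standard cross-entropy objective over the classifier logits.
    xent = T.nnet.categorical_crossentropy(T.nnet.softmax(logits), y).mean()
    return xent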
def build_sentence_model(cls, vocab_size, seq_length, tokens, transitions,
                         num_classes, training_mode, ground_truth_transitions_visible, vs,
                         initial_embeddings=None, project_embeddings=False,
                         ss_mask_gen=None, ss_prob=0.0):
    """
    Construct a classifier which makes use of some hard-stack model.

    Args:
      cls: Hard stack class to use (from e.g. `spinn.fat_stack`)
      vocab_size: Size of the vocabulary
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer matrix), `batch_size * seq_length`
      transitions: Theano batch (integer matrix), `batch_size * seq_length`
      num_classes: Number of output classes
      training_mode: A Theano scalar indicating whether to act as a training
        model with dropout (1.0) or to act as an eval model with rescaling
        (0.0).
      ground_truth_transitions_visible: A Theano scalar. If set (1.0), allow
        the model access to ground truth transitions. This can be disabled at
        evaluation time to force Model 1 (or 2S) to evaluate in the Model 2
        style with predicted transitions. Has no effect on Model 0.
      vs: Variable store.
    """

    # Prepare layer which performs stack element composition.
    if cls is spinn.plain_rnn.RNN:
        compose_network = partial(util.LSTMLayer,
                                  initializer=util.HeKaimingInitializer())
        embedding_projection_network = None
    elif cls is spinn.cbow.CBOW:
        compose_network = None
        embedding_projection_network = None
    else:
        if FLAGS.lstm_composition:
            compose_network = partial(util.TreeLSTMLayer,
                                      initializer=util.HeKaimingInitializer())
        else:
            assert not FLAGS.connect_tracking_comp, \
                "Can only connect tracking and composition unit while using TreeLSTM"
            compose_network = partial(util.ReLULayer,
                                      initializer=util.HeKaimingInitializer())

        if project_embeddings:
            embedding_projection_network = util.Linear
        else:
            assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
                "word_embedding_dim must equal model_dim unless a projection layer is used."
            embedding_projection_network = util.IdentityLayer

    # Build hard stack which scans over the input sequence.
    sentence_model = cls(
        FLAGS.model_dim, FLAGS.word_embedding_dim, vocab_size, seq_length,
        compose_network, embedding_projection_network, training_mode,
        ground_truth_transitions_visible, vs,
        predict_use_cell=FLAGS.predict_use_cell,
        use_tracking_lstm=FLAGS.use_tracking_lstm,
        tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim,
        X=tokens,
        transitions=transitions,
        initial_embeddings=initial_embeddings,
        embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
        ss_mask_gen=ss_mask_gen,
        ss_prob=ss_prob,
        connect_tracking_comp=FLAGS.connect_tracking_comp,
        context_sensitive_shift=FLAGS.context_sensitive_shift,
        context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
        use_input_batch_norm=False)

    # Extract top element of final stack timestep.
    if FLAGS.lstm_composition or cls is spinn.plain_rnn.RNN:
        # Keep only the hidden (not cell) half of the LSTM state.
        sentence_vector = sentence_model.final_representations[:, :FLAGS.model_dim // 2].reshape(
            (-1, FLAGS.model_dim // 2))
        sentence_vector_dim = FLAGS.model_dim // 2
    else:
        sentence_vector = sentence_model.final_representations.reshape((-1, FLAGS.model_dim))
        sentence_vector_dim = FLAGS.model_dim

    sentence_vector = util.BatchNorm(sentence_vector, sentence_vector_dim, vs,
                                     "sentence_vector", training_mode)
    sentence_vector = util.Dropout(sentence_vector, FLAGS.semantic_classifier_keep_rate,
                                   training_mode)

    # Feed forward through a single output layer.
    logits = util.Linear(
        sentence_vector, sentence_vector_dim, num_classes, vs,
        name="semantic_classifier", use_bias=True)

    return sentence_model.transitions_pred, logits
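
# A similar sketch for the single-sentence classifier, under the same
# assumptions as the pair-model example above; names are illustrative
# placeholders and the function is never called.
def _example_sentence_usage(vocab_size=100, num_classes=3):
    tokens = T.imatrix("tokens")             # batch_size * seq_length
    transitions = T.imatrix("transitions")   # batch_size * seq_length
    training_mode = T.scalar("training_mode")
    ground_truth_transitions_visible = T.scalar("ground_truth_transitions_visible")
    vs = util.VariableStore(default_initializer=util.UniformInitializer(0.005))

    transitions_pred, logits = build_sentence_model(
        spinn.fat_stack.HardStack, vocab_size, FLAGS.seq_length,
        tokens, transitions, num_classes, training_mode,
        ground_truth_transitions_visible, vs)
    return logits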
def build_sentence_pair_model(cls, vocab_size, seq_length, tokens, transitions,
                              num_classes, training_mode, ground_truth_transitions_visible, vs,
                              initial_embeddings=None, project_embeddings=False,
                              ss_mask_gen=None, ss_prob=0.0):
    """
    Construct a classifier which makes use of some hard-stack model.

    Args:
      cls: Stack recurrence class to use (from e.g. `spinn.stack`)
      vocab_size: Size of the vocabulary
      seq_length: Length of each sequence provided to the stack model
      tokens: Theano batch (integer tensor), `batch_size * seq_length * 2`
      transitions: Theano batch (integer tensor), `batch_size * seq_length * 2`
      num_classes: Number of output classes
      training_mode: A Theano scalar indicating whether to act as a training
        model with dropout (1.0) or to act as an eval model with rescaling
        (0.0).
      ground_truth_transitions_visible: A Theano scalar. If set (1.0), allow
        the model access to ground truth transitions. This can be disabled at
        evaluation time to force Model 1 (or 2S) to evaluate in the Model 2
        style with predicted transitions. Has no effect on Model 0.
      vs: Variable store.
    """

    # Prepare layer which performs stack element composition.
    if cls is spinn.plain_rnn.RNN:
        compose_network = partial(util.LSTMLayer,
                                  initializer=util.HeKaimingInitializer())
        embedding_projection_network = None
    else:
        if FLAGS.lstm_composition:
            compose_network = partial(util.TreeLSTMLayer,
                                      initializer=util.HeKaimingInitializer())
        else:
            assert not FLAGS.connect_tracking_comp, \
                "Can only connect tracking and composition unit while using TreeLSTM"
            compose_network = partial(util.ReLULayer,
                                      initializer=util.HeKaimingInitializer())

        if project_embeddings:
            embedding_projection_network = util.Linear
        else:
            assert FLAGS.word_embedding_dim == FLAGS.model_dim, \
                "word_embedding_dim must equal model_dim unless a projection layer is used."
            embedding_projection_network = util.IdentityLayer

    model_visible_dim = FLAGS.model_dim // 2 if FLAGS.lstm_composition else FLAGS.model_dim
    spec = util.ModelSpec(FLAGS.model_dim, FLAGS.word_embedding_dim,
                          FLAGS.batch_size, vocab_size, seq_length,
                          model_visible_dim=model_visible_dim)

    # Split the two sentences.
    premise_tokens = tokens[:, :, 0]
    hypothesis_tokens = tokens[:, :, 1]

    premise_transitions = transitions[:, :, 0]
    hypothesis_transitions = transitions[:, :, 1]

    # TODO: Check non-Model0 support.
    recurrence = cls(spec, vs, compose_network,
                     use_context_sensitive_shift=FLAGS.context_sensitive_shift,
                     context_sensitive_use_relu=FLAGS.context_sensitive_use_relu,
                     use_tracking_lstm=FLAGS.use_tracking_lstm,
                     tracking_lstm_hidden_dim=FLAGS.tracking_lstm_hidden_dim)

    # Build two hard stack models which scan over the input sequences.
    premise_model = ThinStack(spec, recurrence, embedding_projection_network,
                              training_mode, ground_truth_transitions_visible, vs,
                              X=premise_tokens,
                              transitions=premise_transitions,
                              initial_embeddings=initial_embeddings,
                              embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
                              use_input_batch_norm=False,
                              ss_mask_gen=ss_mask_gen,
                              ss_prob=ss_prob,
                              name="premise")

    hypothesis_model = ThinStack(spec, recurrence, embedding_projection_network,
                                 training_mode, ground_truth_transitions_visible, vs,
                                 X=hypothesis_tokens,
                                 transitions=hypothesis_transitions,
                                 initial_embeddings=initial_embeddings,
                                 embedding_dropout_keep_rate=FLAGS.embedding_keep_rate,
                                 use_input_batch_norm=False,
                                 ss_mask_gen=ss_mask_gen,
                                 ss_prob=ss_prob,
                                 name="hypothesis")

    # Extract the visible portion of each final sentence representation.
    # NOTE: `final_representations` is an assumption here, carried over from the
    # fat-stack models above; substitute whatever attribute ThinStack actually
    # exposes for its final stack tops.
    premise_vector = premise_model.final_representations[:, :spec.model_visible_dim].reshape(
        (-1, spec.model_visible_dim))
    hypothesis_vector = hypothesis_model.final_representations[:, :spec.model_visible_dim].reshape(
        (-1, spec.model_visible_dim))
    sentence_vector_dim = spec.model_visible_dim

    # Create standard MLP features.
    mlp_input = T.concatenate([premise_vector, hypothesis_vector], axis=1)
    mlp_input_dim = 2 * sentence_vector_dim

    if FLAGS.use_difference_feature:
        mlp_input = T.concatenate(
            [mlp_input, premise_vector - hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    if FLAGS.use_product_feature:
        mlp_input = T.concatenate(
            [mlp_input, premise_vector * hypothesis_vector], axis=1)
        mlp_input_dim += sentence_vector_dim

    mlp_input = util.BatchNorm(mlp_input, mlp_input_dim, vs, "sentence_vectors", training_mode)
    mlp_input = util.Dropout(mlp_input, FLAGS.semantic_classifier_keep_rate, training_mode)

    # Apply a combining MLP.
    prev_features = mlp_input
    prev_features_dim = mlp_input_dim
    for layer in range(FLAGS.num_sentence_pair_combination_layers):
        prev_features = util.ReLULayer(
            prev_features, prev_features_dim, FLAGS.sentence_pair_combination_layer_dim, vs,
            name="combining_mlp/" + str(layer),
            initializer=util.HeKaimingInitializer())
        prev_features_dim = FLAGS.sentence_pair_combination_layer_dim
        prev_features = util.BatchNorm(prev_features, prev_features_dim, vs,
                                       "combining_mlp/" + str(layer), training_mode)
        prev_features = util.Dropout(prev_features, FLAGS.semantic_classifier_keep_rate,
                                     training_mode)

    # Feed forward through a single output layer.
    logits = util.Linear(prev_features, prev_features_dim, num_classes, vs,
                         name="semantic_classifier", use_bias=True)

    def zero_fn():
        premise_model.zero()
        hypothesis_model.zero()

    return premise_model, hypothesis_model, logits, zero_fn
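
# A sketch of the thin-stack builder's calling convention. Unlike the fat-stack
# builder above, it returns the two stack models plus a `zero_fn` that resets
# their preallocated stack storage; a training loop would presumably call
# `zero_fn()` between batches. `spinn.stack.Model0` is a guess based on the
# module path in the docstring and the Model 0 naming used there; all other
# names are illustrative placeholders, and the function is never called.
def _example_thin_stack_usage(vocab_size=100, num_classes=3):
    tokens = T.itensor3("tokens")            # batch_size * seq_length * 2
    transitions = T.itensor3("transitions")  # batch_size * seq_length * 2
    training_mode = T.scalar("training_mode")
    ground_truth_transitions_visible = T.scalar("ground_truth_transitions_visible")
    vs = util.VariableStore(default_initializer=util.UniformInitializer(0.005))

    premise_model, hypothesis_model, logits, zero_fn = build_sentence_pair_model(
        spinn.stack.Model0, vocab_size, FLAGS.seq_length,
        tokens, transitions, num_classes, training_mode,
        ground_truth_transitions_visible, vs)

    # After each compiled training/eval step, reset the stacks:
    #   zero_fn()
    return logits, zero_fn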