Example #1
 def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, **kwargs):
     super(OntoLSTMEntailmentModel, self).__init__(**kwargs)
     # Set self.data_processor again, now with the right arguments.
     self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps)
     self.num_senses = num_senses
     self.num_hyps = num_hyps
     self.attention_model = None  # Keras model with just embedding and encoder to output attention.
     self.set_sense_priors = set_sense_priors
     self.use_attention = use_attention
     # If bidirectional, we'll do pooling here. So make the encoder return sequences iff bidirectional.
     self.encoder_model = OntoLSTMEncoder(num_senses=num_senses, num_hyps=num_hyps,
                                          use_attention=use_attention, set_sense_priors=set_sense_priors,
                                          data_processor=self.data_processor, embed_dim=self.embed_dim,
                                          bidirectional=self.bidirectional,
                                          tune_embedding=self.tune_embedding,
                                          return_sequences=self.bidirectional)
     self.model_name_prefix = "ontolstm_ent_att=%s_senses=%d_hyps=%d_sense-priors=%s_tune-embedding=%s_bi=%s_pool-att=%s" % (
         str(self.use_attention), self.num_senses, self.num_hyps, str(set_sense_priors), str(self.tune_embedding),
         str(self.bidirectional), str(self.intra_attention))
     self.custom_objects = {"OntoAttentionLSTM": OntoAttentionLSTM, "OntoAwareEmbedding": OntoAwareEmbedding}
     if self.bidirectional:
         if self.intra_attention:
             self.custom_objects["IntraAttention"] = IntraAttention
         else: 
             self.custom_objects["AveragePooling"] = AveragePooling
Example #2
 def __init__(self, bidirectional=False, intra_attention=False, tune_embedding=False, **kwargs):
     self.data_processor = DataProcessor()
     if "embed_dim" in kwargs:
         self.embed_dim = kwargs["embed_dim"]
     else:
         self.embed_dim = 50
     self.bidirectional = bidirectional
     self.intra_attention = intra_attention
     self.tune_embedding = tune_embedding
     self.numpy_rng = numpy.random.RandomState(12345)
     self.label_map = {}  # Maps labels to integers.
     self.model = None
     self.best_epoch = 0  # index of the best epoch
     self.model_name_prefix = None
     self.custom_objects = None
     self.encoder_model = None
class OntoLSTMAttachmentModel(PPAttachmentModel):
    '''
    A PP Attachment prediction model that uses an OntoLSTM as the encoder.
    '''
    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, prep_senses_dir, **kwargs):
        super(OntoLSTMAttachmentModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        process_preps = False if prep_senses_dir is None else True
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps,
                                            process_preps=process_preps, prep_senses_dir=prep_senses_dir)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        use_prep_senses = False if prep_senses_dir is None else True
        self.encoder = OntoLSTMEncoder(self.num_senses, self.num_hyps, self.use_attention, self.set_sense_priors,
                                       data_processor=self.data_processor, embed_dim=self.embed_dim,
                                       bidirectional=self.bidirectional, tune_embedding=self.tune_embedding)
        self.model_name_prefix = ("ontolstm_models/ontolstm_ppa_att=%s_senses=%d_hyps=%d"
                                  "_sense-priors=%s_prep-senses=%s_tune-embedding=%s_bi=%s") % (
                                      str(self.use_attention), self.num_senses, self.num_hyps,
                                      str(set_sense_priors), str(use_prep_senses), str(self.tune_embedding),
                                      str(self.bidirectional))
        self.custom_objects.update(self.encoder.get_custom_objects())

    def get_attention(self, inputs):
        '''
        Takes inputs and returns pairs of synsets and corresponding attention values.
        '''
        if not self.attention_model:
            self.define_attention_model()
        attention_outputs = self.attention_model.predict(inputs)
        sent_attention_values = []
        for sentence_input, sentence_attention in zip(inputs, attention_outputs):
            word_attention_values = []
            for word_input, word_attention in zip(sentence_input, sentence_attention):
                # Size of word input is (senses, hyps+1)
                # Ignoring the last hyp index because that is just the word index put there by
                # OntoAwareEmbedding for sense priors.
                if word_input.sum() == 0:
                    # This is just padding
                    continue
                word_input = word_input[:, :-1]  # removing last hyp index.
                sense_hyp_prod = self.num_senses * self.num_hyps
                assert len(word_attention) == sense_hyp_prod or len(word_attention) == 2 * sense_hyp_prod
                attention_per_sense = []
                if len(word_attention) == 2 * sense_hyp_prod:
                    # The encoder is Bidirectional. We have attentions from both directions.
                    forward_sense_attention = word_attention[:len(word_attention) // 2]
                    backward_sense_attention = word_attention[len(word_attention) // 2:]
                    processed_attention = zip(forward_sense_attention, backward_sense_attention)
                else:
                    # Encoder is not bidirectional
                    processed_attention = word_attention
                hyp_ind = 0
                while hyp_ind < len(processed_attention):
                    attention_per_sense.append(processed_attention[hyp_ind:hyp_ind+self.num_hyps])
                    hyp_ind += self.num_hyps

                sense_attention_values = []
                for sense_input, attention_per_hyp in zip(word_input, attention_per_sense):
                    hyp_attention_values = []
                    for hyp_input, hyp_attention in zip(sense_input, attention_per_hyp):
                        if hyp_input == 0:
                            continue
                        hyp_attention_values.append((self.data_processor.get_token_from_index(hyp_input,
                                                                                              onto_aware=True),
                                                     hyp_attention))
                    sense_attention_values.append(hyp_attention_values)
                word_attention_values.append(sense_attention_values)
            sent_attention_values.append(word_attention_values)
        return sent_attention_values

    def define_attention_model(self):
        '''
        Take necessary parts out of the model to get OntoLSTM attention.
        '''
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        input_shape = self.model.get_input_shape_at(0)
        input_layer = Input(input_shape[1:], dtype='int32')  # removing batch size
        embedding_layer = None
        encoder_layer = None
        for layer in self.model.layers:
            if layer.name == "embedding":
                embedding_layer = layer
            elif layer.name == "onto_lstm":
                # We need to redefine the OntoLSTM layer with the learned weights and set return attention to True.
                # Assuming we'll want attention values for all words (return_sequences = True)
                if isinstance(layer, Bidirectional):
                    onto_lstm = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim,
                                                  num_senses=self.num_senses, num_hyps=self.num_hyps,
                                                  use_attention=True, return_attention=True, return_sequences=True,
                                                  consume_less='gpu')
                    encoder_layer = Bidirectional(onto_lstm, weights=layer.get_weights())
                else:
                    encoder_layer = OntoAttentionLSTM(input_dim=self.embed_dim,
                                                      output_dim=self.embed_dim, num_senses=self.num_senses,
                                                      num_hyps=self.num_hyps, use_attention=True,
                                                      return_attention=True, return_sequences=True,
                                                      consume_less='gpu', weights=layer.get_weights())
                break
        if not embedding_layer or not encoder_layer:
            raise RuntimeError("Required layers not found!")
        attention_output = encoder_layer(embedding_layer(input_layer))
        self.attention_model = Model(input=input_layer, output=attention_output)
        print >>sys.stderr, "Attention model summary:"
        self.attention_model.summary()
        self.attention_model.compile(loss="mse", optimizer="sgd")  # Loss and optimizer do not matter!

    def print_attention_values(self, input_file, test_inputs, output_file):
        sent_attention_outputs = self.get_attention(test_inputs)
        tagged_sentences = [x.strip().split("\t")[1] for x in codecs.open(input_file).readlines()]
        outfile = codecs.open(output_file, "w", "utf-8")
        full_json_struct = []
        for sent_attention, tagged_sentence in zip(sent_attention_outputs, tagged_sentences):
            sent_json = {}
            sent_json["input"] = tagged_sentence
            sent_json["tokens"] = []
            tagged_words = tagged_sentence.split()
            for tagged_word, word_attention in zip(tagged_words, sent_attention):
                token_json = {}
                token_json["surface_form"] = tagged_word
                token_json["senses"] = []
                for sense_num, sense_attention in enumerate(word_attention):
                    if len(sense_attention) == 0:
                        continue
                    sense_json = {}
                    sense_json["id"] = sense_num
                    sense_json["hypernyms"] = []
                    for hyp_name, hyp_att in sense_attention:
                        if isinstance(hyp_att, tuple):
                            # Averaging forward and backward attention
                            sense_json["hypernyms"].append({hyp_name: {"forward": float(hyp_att[0]),
                                                                       "backward": float(hyp_att[1])}})
                        else:
                            sense_json["hypernyms"].append({hyp_name: float(hyp_att)})
                    token_json["senses"].append(sense_json)
                sent_json["tokens"].append(token_json)
            full_json_struct.append(sent_json)
        print >>outfile, json.dumps(full_json_struct, indent=2)
        outfile.close()
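A hypothetical attention-inspection flow for the class above. The PPAttachmentModel base class is not shown in these examples, so its load_model method is assumed to be analogous to EntailmentModel's in Example #9; test_inputs stands for PP-attachment test data indexed with the model's own DataProcessor (preparation not shown), and all paths and hyperparameter values are placeholders.

ppa_model = OntoLSTMAttachmentModel(num_senses=3, num_hyps=5, use_attention=True, set_sense_priors=True, prep_senses_dir=None)
ppa_model.load_model()  # assumed to restore a previously trained Keras model, as in Example #9
# test_inputs: indexed test data prepared with ppa_model.data_processor (not shown here)
ppa_model.print_attention_values("data/ppa_test.tsv", test_inputs, "ppa_attention.json")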
class OntoLSTMEntailmentModel(EntailmentModel):
    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors,
                 **kwargs):
        super(OntoLSTMEntailmentModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses,
                                            syn_path_cutoff=num_hyps)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        # If bidirectional, we'll do pooling here. So make the encoder return sequences iff bidirectional.
        self.encoder_model = OntoLSTMEncoder(
            num_senses=num_senses,
            num_hyps=num_hyps,
            use_attention=use_attention,
            set_sense_priors=set_sense_priors,
            data_processor=self.data_processor,
            embed_dim=self.embed_dim,
            bidirectional=self.bidirectional,
            tune_embedding=self.tune_embedding,
            return_sequences=self.bidirectional)
        self.model_name_prefix = "ontolstm_ent_att=%s_senses=%d_hyps=%d_sense-priors=%s_tune-embedding=%s_bi=%s_pool-att=%s" % (
            str(self.use_attention), self.num_senses, self.num_hyps,
            str(set_sense_priors), str(self.tune_embedding),
            str(self.bidirectional), str(self.intra_attention))
        self.custom_objects = {
            "OntoAttentionLSTM": OntoAttentionLSTM,
            "OntoAwareEmbedding": OntoAwareEmbedding
        }
        if self.bidirectional:
            if self.intra_attention:
                self.custom_objects["IntraAttention"] = IntraAttention
            else:
                self.custom_objects["AveragePooling"] = AveragePooling

    def get_encoder(self, return_sequences=False):
        lstm = OntoAttentionLSTM(input_dim=self.embed_dim,
                                 output_dim=self.embed_dim,
                                 num_senses=self.num_senses,
                                 num_hyps=self.num_hyps,
                                 use_attention=self.use_attention,
                                 consume_less="gpu",
                                 return_sequences=return_sequences,
                                 name="onto_lstm")
        return lstm

    def get_attention(self, inputs):
        # Takes inputs and returns pairs of synsets and corresponding attention values.
        if not self.attention_model:
            self.define_attention_model()
        attention_outputs = self.attention_model.predict(inputs)
        sent_attention_values = []
        for sentence_input, sentence_attention in zip(inputs,
                                                      attention_outputs):
            word_attention_values = []
            for word_input, word_attention in zip(sentence_input,
                                                  sentence_attention):
                if word_input.sum() == 0:
                    # This is just padding
                    continue
                sense_attention_values = []
                for sense_input, sense_attention in zip(
                        word_input, word_attention):
                    if sense_input.sum() == 0:
                        continue
                    hyp_attention_values = []
                    for hyp_input, hyp_attention in zip(
                            sense_input, sense_attention):
                        if hyp_input == 0:
                            continue
                        hyp_attention_values.append(
                            (self.data_processor.get_token_from_index(
                                hyp_input, onto_aware=True), hyp_attention))
                    sense_attention_values.append(hyp_attention_values)
                word_attention_values.append(sense_attention_values)
            sent_attention_values.append(word_attention_values)
        return sent_attention_values

    def define_attention_model(self):
        # Take necessary parts out of the entailment model to get OntoLSTM attention.
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        # We need just one input to get attention. input_shape_at(0) gives a list with two shapes.
        input_shape = self.model.get_input_shape_at(0)[0]
        input_layer = Input(input_shape[1:],
                            dtype='int32')  # removing batch size
        embedding_layer = None
        encoder_layer = None
        for layer in self.model.layers:
            if layer.name == "embedding":
                embedding_layer = layer
            elif layer.name == "encoder":
                # We need to redefine the OntoLSTM layer with the learned weights and set return attention to True.
                # Assuming we'll want attention values for all words (return_sequences = True)
                encoder_layer = OntoAttentionLSTM(input_dim=self.embed_dim,
                                                  output_dim=self.embed_dim,
                                                  num_senses=self.num_senses,
                                                  num_hyps=self.num_hyps,
                                                  use_attention=True,
                                                  return_attention=True,
                                                  return_sequences=True,
                                                  weights=layer.get_weights())
        if not embedding_layer or not encoder_layer:
            raise RuntimeError("Required layers not found!")
        attention_output = encoder_layer(embedding_layer(input_layer))
        self.attention_model = Model(input=input_layer,
                                     output=attention_output)
        self.attention_model.compile(
            loss="mse", optimizer="sgd")  # Loss and optimizer do not matter!

    def print_attention_values(self, input_file, test_inputs, output_file):
        onto_aware = True
        sent1_attention_outputs = self.get_attention(test_inputs[0])
        sent2_attention_outputs = self.get_attention(test_inputs[1])
        tagged_sentences = [
            x.strip().split("\t")[1]
            for x in codecs.open(input_file).readlines()
        ]
        outfile = codecs.open(output_file, "w", "utf-8")
        for sent1_attention, sent2_attention, tagged_sentence in zip(
                sent1_attention_outputs, sent2_attention_outputs,
                tagged_sentences):
            print >> outfile, tagged_sentence
            print >> outfile, "Sentence 1:"
            for word_attention in sent1_attention:
                for sense_attention in word_attention:
                    print >> outfile, " ".join([
                        "%s:%f" % (hyp, hyp_att)
                        for hyp, hyp_att in sense_attention
                    ])
                print >> outfile
            print >> outfile, "\nSentence 2:"
            for word_attention in sent2_attention:
                for sense_attention in word_attention:
                    print >> outfile, " ".join([
                        "%s:%f" % (hyp, hyp_att)
                        for hyp, hyp_att in sense_attention
                    ])
                print >> outfile
        outfile.close()
class EntailmentModel(object):
    def __init__(self,
                 bidirectional=False,
                 intra_attention=False,
                 tune_embedding=False,
                 **kwargs):
        self.data_processor = DataProcessor()
        if "embed_dim" in kwargs:
            self.embed_dim = kwargs["embed_dim"]
        else:
            self.embed_dim = 50
        self.bidirectional = bidirectional
        self.intra_attention = intra_attention
        self.tune_embedding = tune_embedding
        self.numpy_rng = numpy.random.RandomState(12345)
        self.label_map = {}  # Maps labels to integers.
        self.model = None
        self.best_epoch = 0  # index of the best epoch
        self.model_name_prefix = None
        self.custom_objects = None
        self.encoder_model = None

    def train(self,
              max_sent_len,
              train_inputs,
              train_labels,
              num_epochs=20,
              mlp_size=1024,
              mlp_activation='relu',
              dropout=None,
              embedding_file=None,
              tune_embedding=True,
              num_mlp_layers=2,
              batch=None,
              patience=5):
        '''
        train_inputs (list(numpy_array)): The two sentence inputs
        train_labels (numpy_array): One-hot matrix indicating labels
        num_epochs (int): Maximum number of epochs to run
        mlp_size (int): Dimensionality of each layer in the MLP
        dropout (dict(str->float)): Probabilities in Dropout layers after "embedding" and "encoder" (lstm)
        embedding_file (str): Optional gzipped pretrained embedding file
        tune_embedding (bool): If pretrained embedding is given, tune it.
        patience (int): Early stopping patience
        '''
        if dropout is None:
            dropout = {}
        num_label_types = train_labels.shape[
            1]  # train_labels is of shape (num_samples, num_label_types)
        sent1_input_layer = Input(name='sent1',
                                  shape=train_inputs[0].shape[1:],
                                  dtype='int32')
        sent2_input_layer = Input(name='sent2',
                                  shape=train_inputs[1].shape[1:],
                                  dtype='int32')
        encoded_sent1, encoded_sent2 = self._get_encoded_sentence_variables(
            max_sent_len,
            sent1_input_layer,
            sent2_input_layer,
            dropout,
            embedding_file,
            tune_embedding,
            batch=32 if batch == None else batch)
        concat_sent_rep = merge([encoded_sent1, encoded_sent2], mode='concat')
        mul_sent_rep = merge([encoded_sent1, encoded_sent2], mode='mul')
        diff_sent_rep = merge([encoded_sent1, encoded_sent2],
                              mode=lambda l: l[0] - l[1],
                              output_shape=lambda l: l[0])
        # Use heuristic from Mou et al. (2015) to get final merged representation
        merged_sent_rep = merge([concat_sent_rep, mul_sent_rep, diff_sent_rep],
                                mode='concat')
        current_projection = merged_sent_rep
        for i in range(num_mlp_layers):
            mlp_layer_i = Dense(output_dim=mlp_size,
                                activation=mlp_activation,
                                name="%s_layer_%d" % (mlp_activation, i))
            current_projection = mlp_layer_i(current_projection)
        if dropout is not None:
            if "output" in dropout:
                current_projection = Dropout(
                    dropout["output"])(current_projection)
        softmax = Dense(output_dim=num_label_types,
                        activation='softmax',
                        name='softmax_layer')
        label_probs = softmax(current_projection)
        model = Model(input=[sent1_input_layer, sent2_input_layer],
                      output=label_probs)
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        self.model = model
        print >> sys.stderr, "Entailment model summary:"
        model.summary()
        plot(model,
             to_file='model_plot.png',
             show_shapes=True,
             show_layer_names=True)
        best_accuracy = 0.0
        num_worse_epochs = 0
        for epoch_id in range(num_epochs):
            print >> sys.stderr, "Epoch: %d" % epoch_id
            history = model.fit(train_inputs,
                                train_labels,
                                validation_split=0.1,
                                nb_epoch=1)
            validation_accuracy = history.history['val_acc'][
                0]  # history['val_acc'] is a list of size nb_epoch
            if validation_accuracy > best_accuracy:
                self.save_model(epoch_id)
                self.best_epoch = epoch_id
                num_worse_epochs = 0
                best_accuracy = validation_accuracy
            elif validation_accuracy < best_accuracy:
                num_worse_epochs += 1
                if num_worse_epochs >= patience:
                    print >> sys.stderr, "Stopping training."
                    break
        self.save_best_model()

    def _get_encoded_sentence_variables(self,
                                        max_sent_len,
                                        sent1_input_layer,
                                        sent2_input_layer,
                                        dropout,
                                        embedding_file,
                                        tune_embedding,
                                        batch=None):
        if self.bidirectional:
            encoded_sent1_seq = self.encoder_model.get_encoded_phrase(
                sent1_input_layer, dropout=dropout, embedding=embedding_file)
            encoded_sent2_seq = self.encoder_model.get_encoded_phrase(
                sent2_input_layer, dropout=dropout, embedding=embedding_file)
            if self.intra_attention:
                pooling_layer = IntraAttention(name='intra_attention')
            else:
                pooling_layer = AveragePooling(name='average_pooling')
            encoded_sent1 = pooling_layer(encoded_sent1_seq)
            encoded_sent2 = pooling_layer(encoded_sent2_seq)
        else:
            encoded_sent1 = self.encoder_model.get_encoded_phrase(
                sent1_input_layer, dropout=dropout, embedding=embedding_file)
            encoded_sent2 = self.encoder_model.get_encoded_phrase(
                sent2_input_layer, dropout=dropout, embedding=embedding_file)
        return encoded_sent1, encoded_sent2

    def process_train_data(self, input_file, onto_aware):
        print >> sys.stderr, "Reading training data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            label, tagged_sentence = lnstrp.split("\t")
            if label not in self.label_map:
                self.label_map[label] = len(self.label_map)
            label_ind.append(self.label_map[label])
            tagged_sentences.append(tagged_sentence)
        # Shuffling so that when Keras does validation split, it is not always at the end.
        sentences_and_labels = zip(tagged_sentences, label_ind)
        random.shuffle(sentences_and_labels)
        tagged_sentences, label_ind = zip(*sentences_and_labels)
        print >> sys.stderr, "Indexing training data"
        max_sent_len, train_inputs = self.data_processor.prepare_paired_input(
            tagged_sentences,
            onto_aware=onto_aware,
            for_test=False,
            remove_singletons=True)
        train_labels = self.data_processor.make_one_hot(label_ind)
        return max_sent_len, train_inputs, train_labels

    def process_test_data(self, input_file, onto_aware, is_labeled=True):
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        print >> sys.stderr, "Reading test data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            if is_labeled:
                label, tagged_sentence = lnstrp.split("\t")
                if label not in self.label_map:
                    self.label_map[label] = len(self.label_map)
                label_ind.append(self.label_map[label])
            else:
                tagged_sentence = lnstrp
            tagged_sentences.append(tagged_sentence)
        print >> sys.stderr, "Indexing test data"
        # Infer max sentence length if the model is trained
        input_shape = self.model.get_input_shape_at(0)[
            0]  # take the shape of the first of two inputs at 0.
        sentlenlimit = input_shape[
            1]  # (num_sentences, num_words, num_senses, num_hyps)
        max_sent_len, test_inputs = self.data_processor.prepare_paired_input(
            tagged_sentences,
            onto_aware=onto_aware,
            sentlenlimit=sentlenlimit,
            for_test=True)
        test_labels = self.data_processor.make_one_hot(label_ind)
        return max_sent_len, test_inputs, test_labels

    def test(self, inputs, targets):
        if not self.model:
            raise RuntimeError("Model not trained!")
        metrics = self.model.evaluate(inputs, targets)
        print >> sys.stderr, "Test accuracy: %.4f" % (
            metrics[1])  # The first metric is loss.
        predictions = numpy.argmax(self.model.predict(inputs), axis=1)
        rev_label_map = {ind: label for label, ind in self.label_map.items()}
        predicted_labels = [rev_label_map[pred] for pred in predictions]
        return predicted_labels

    def save_model(self, epoch):
        '''
        Saves the current model using the epoch id to identify the file.
        '''
        self.model.save("%s_%d.model" % (self.model_name_prefix, epoch))
        pickle.dump(self.data_processor,
                    open("%s.dataproc" % self.model_name_prefix, "wb"))
        pickle.dump(self.label_map,
                    open("%s.labelmap" % self.model_name_prefix, "wb"))

    def save_best_model(self):
        '''
        Copies the model corresponding to the best epoch as the final model file.
        '''
        from shutil import copyfile
        best_model_file = "%s_%d.model" % (self.model_name_prefix,
                                           self.best_epoch)
        final_model_file = "%s.model" % self.model_name_prefix
        copyfile(best_model_file, final_model_file)

    def load_model(self, epoch=None):
        '''
        Loads a saved model. If epoch id is provided, will load the corresponding model. Or else,
        will load the best model.
        '''
        if not epoch:
            self.model = load_model("%s.model" % self.model_name_prefix,
                                    custom_objects=self.custom_objects)
        else:
            self.model = load_model("%s_%d.model" %
                                    (self.model_name_prefix, epoch),
                                    custom_objects=self.custom_objects)
        self.data_processor = pickle.load(
            open("%s.dataproc" % self.model_name_prefix, "rb"))
        self.label_map = pickle.load(
            open("%s.labelmap" % self.model_name_prefix, "rb"))
Example #8
class OntoLSTMEntailmentModel(EntailmentModel):
    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, **kwargs):
        super(OntoLSTMEntailmentModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        # If bidirectional, we'll do pooling here. So make the encoder return sequences iff bidirectional.
        self.encoder_model = OntoLSTMEncoder(num_senses=num_senses, num_hyps=num_hyps,
                                             use_attention=use_attention, set_sense_priors=set_sense_priors,
                                             data_processor=self.data_processor, embed_dim=self.embed_dim,
                                             bidirectional=self.bidirectional,
                                             tune_embedding=self.tune_embedding,
                                             return_sequences=self.bidirectional)
        self.model_name_prefix = "ontolstm_ent_att=%s_senses=%d_hyps=%d_sense-priors=%s_tune-embedding=%s_bi=%s_pool-att=%s" % (
            str(self.use_attention), self.num_senses, self.num_hyps, str(set_sense_priors), str(self.tune_embedding),
            str(self.bidirectional), str(self.intra_attention))
        self.custom_objects = {"OntoAttentionLSTM": OntoAttentionLSTM, "OntoAwareEmbedding": OntoAwareEmbedding}
        if self.bidirectional:
            if self.intra_attention:
                self.custom_objects["IntraAttention"] = IntraAttention
            else: 
                self.custom_objects["AveragePooling"] = AveragePooling

    def get_encoder(self, return_sequences=False):
        lstm = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim, num_senses=self.num_senses,
                                 num_hyps=self.num_hyps, use_attention=self.use_attention, consume_less="gpu",
                                 return_sequences=return_sequences, name="onto_lstm")
        return lstm

    def get_attention(self, inputs):
        # Takes inputs and returns pairs of synsets and corresponding attention values.
        if not self.attention_model:
            self.define_attention_model()
        attention_outputs = self.attention_model.predict(inputs)
        sent_attention_values = []
        for sentence_input, sentence_attention in zip(inputs, attention_outputs):
            word_attention_values = []
            for word_input, word_attention in zip(sentence_input, sentence_attention):
                if word_input.sum() == 0:
                    # This is just padding
                    continue
                sense_attention_values = []
                for sense_input, sense_attention in zip(word_input, word_attention):
                    if sense_input.sum() == 0:
                        continue
                    hyp_attention_values = []
                    for hyp_input, hyp_attention in zip(sense_input, sense_attention):
                        if hyp_input == 0:
                            continue
                        hyp_attention_values.append((self.data_processor.get_token_from_index(hyp_input,
                                                        onto_aware=True), hyp_attention))
                    sense_attention_values.append(hyp_attention_values)
                word_attention_values.append(sense_attention_values)
            sent_attention_values.append(word_attention_values)
        return sent_attention_values

    def define_attention_model(self):
        # Take necessary parts out of the entailment model to get OntoLSTM attention.
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        # We need just one input to get attention. input_shape_at(0) gives a list with two shapes.
        input_shape = self.model.get_input_shape_at(0)[0]
        input_layer = Input(input_shape[1:], dtype='int32')  # removing batch size
        embedding_layer = None
        encoder_layer = None
        for layer in self.model.layers:
            if layer.name == "embedding":
                embedding_layer = layer
            elif layer.name == "encoder":
                # We need to redefine the OntoLSTM layer with the learned weights and set return attention to True.
                # Assuming we'll want attention values for all words (return_sequences = True)
                encoder_layer = OntoAttentionLSTM(input_dim=self.embed_dim,
                                                  output_dim=self.embed_dim, num_senses=self.num_senses,
                                                  num_hyps=self.num_hyps, use_attention=True,
                                                  return_attention=True, return_sequences=True,
                                                  weights=layer.get_weights())
        if not embedding_layer or not encoder_layer:
            raise RuntimeError("Required layers not found!")
        attention_output = encoder_layer(embedding_layer(input_layer))
        self.attention_model = Model(input=input_layer, output=attention_output)
        self.attention_model.compile(loss="mse", optimizer="sgd")  # Loss and optimizer do not matter!

    def print_attention_values(self, input_file, test_inputs, output_file):
        onto_aware = True
        sent1_attention_outputs = self.get_attention(test_inputs[0])
        sent2_attention_outputs = self.get_attention(test_inputs[1])
        tagged_sentences = [x.strip().split("\t")[1] for x in codecs.open(input_file).readlines()]
        outfile = codecs.open(output_file, "w", "utf-8")
        for sent1_attention, sent2_attention, tagged_sentence in zip(sent1_attention_outputs, sent2_attention_outputs, tagged_sentences):
            print >>outfile, tagged_sentence
            print >>outfile, "Sentence 1:"
            for word_attention in sent1_attention:
                for sense_attention in word_attention:
                    print >>outfile, " ".join(["%s:%f" % (hyp, hyp_att) for hyp, hyp_att in sense_attention])
                print >>outfile
            print >>outfile, "\nSentence 2:"
            for word_attention in sent2_attention:
                for sense_attention in word_attention:
                    print >>outfile, " ".join(["%s:%f" % (hyp, hyp_att) for hyp, hyp_att in sense_attention])
                print >>outfile
        outfile.close()
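A hypothetical attention dump for the entailment model above. load_model and process_test_data come from the EntailmentModel base class in Example #9, and a previously trained model file matching model_name_prefix is assumed to exist; paths are placeholders.

ent_model = OntoLSTMEntailmentModel(num_senses=3, num_hyps=5, use_attention=True, set_sense_priors=True)
ent_model.load_model()  # restores "<model_name_prefix>.model" saved by an earlier training run
test_inputs, _ = ent_model.process_test_data("data/test.tsv", onto_aware=True)
ent_model.print_attention_values("data/test.tsv", test_inputs, "entailment_attention.txt")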
Example #9
class EntailmentModel(object):
    def __init__(self, bidirectional=False, intra_attention=False, tune_embedding=False, **kwargs):
        self.data_processor = DataProcessor()
        if "embed_dim" in kwargs:
            self.embed_dim = kwargs["embed_dim"]
        else:
            self.embed_dim = 50
        self.bidirectional = bidirectional
        self.intra_attention = intra_attention
        self.tune_embedding = tune_embedding
        self.numpy_rng = numpy.random.RandomState(12345)
        self.label_map = {}  # Maps labels to integers.
        self.model = None
        self.best_epoch = 0  # index of the best epoch
        self.model_name_prefix = None
        self.custom_objects = None
        self.encoder_model = None

    def train(self, train_inputs, train_labels, num_epochs=20, mlp_size=1024, mlp_activation='relu',
              dropout=None, embedding_file=None, tune_embedding=True, num_mlp_layers=2,
              patience=5):
        '''
        train_inputs (list(numpy_array)): The two sentence inputs
        train_labels (numpy_array): One-hot matrix indicating labels
        num_epochs (int): Maximum number of epochs to run
        mlp_size (int): Dimensionality of each layer in the MLP
        dropout (dict(str->float)): Probabilities in Dropout layers after "embedding" and "encoder" (lstm)
        embedding_file (str): Optional gzipped pretrained embedding file
        tune_embedding (bool): If pretrained embedding is given, tune it.
        patience (int): Early stopping patience
        '''
        if dropout is None:
            dropout = {}
        num_label_types = train_labels.shape[1]  # train_labels is of shape (num_samples, num_label_types)
        sent1_input_layer = Input(name='sent1', shape=train_inputs[0].shape[1:], dtype='int32')
        sent2_input_layer = Input(name='sent2', shape=train_inputs[1].shape[1:], dtype='int32')
        encoded_sent1, encoded_sent2 = self._get_encoded_sentence_variables(sent1_input_layer,
                                                                            sent2_input_layer, dropout,
                                                                            embedding_file, tune_embedding)
        concat_sent_rep = merge([encoded_sent1, encoded_sent2], mode='concat')
        mul_sent_rep = merge([encoded_sent1, encoded_sent2], mode='mul')
        diff_sent_rep = merge([encoded_sent1, encoded_sent2], mode=lambda l: l[0]-l[1],
                              output_shape=lambda l: l[0])
        # Use heuristic from Mou et al. (2015) to get final merged representation
        merged_sent_rep = merge([concat_sent_rep, mul_sent_rep, diff_sent_rep], mode='concat')
        current_projection = merged_sent_rep
        for i in range(num_mlp_layers):
            mlp_layer_i = Dense(output_dim=mlp_size, activation=mlp_activation,
                                name="%s_layer_%d" % (mlp_activation, i))
            current_projection = mlp_layer_i(current_projection)
        if dropout is not None:
            if "output" in dropout:
                current_projection = Dropout(dropout["output"])(current_projection)
        softmax = Dense(output_dim=num_label_types, activation='softmax', name='softmax_layer')
        label_probs = softmax(current_projection)
        model = Model(input=[sent1_input_layer, sent2_input_layer], output=label_probs)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        self.model = model
        print >>sys.stderr, "Entailment model summary:"
        model.summary()
        best_accuracy = 0.0
        num_worse_epochs = 0
        for epoch_id in range(num_epochs):
            print >>sys.stderr, "Epoch: %d" % epoch_id
            history = model.fit(train_inputs, train_labels, validation_split=0.1, nb_epoch=1)
            validation_accuracy = history.history['val_acc'][0]  # history['val_acc'] is a list of size nb_epoch
            if validation_accuracy > best_accuracy:
                self.save_model(epoch_id)
                self.best_epoch = epoch_id
                num_worse_epochs = 0
                best_accuracy = validation_accuracy
            elif validation_accuracy < best_accuracy:
                num_worse_epochs += 1
                if num_worse_epochs >= patience:
                    print >>sys.stderr, "Stopping training."
                    break
        self.save_best_model()

    def _get_encoded_sentence_variables(self, sent1_input_layer, sent2_input_layer, dropout,
                                        embedding_file, tune_embedding):
        if self.bidirectional:
            encoded_sent1_seq = self.encoder_model.get_encoded_phrase(sent1_input_layer, dropout=dropout,
                                                                      embedding=embedding_file)
            encoded_sent2_seq = self.encoder_model.get_encoded_phrase(sent2_input_layer, dropout=dropout,
                                                                      embedding=embedding_file)
            if self.intra_attention:
                pooling_layer = IntraAttention(name='intra_attention')
            else:
                pooling_layer = AveragePooling(name='average_pooling')
            encoded_sent1 = pooling_layer(encoded_sent1_seq)
            encoded_sent2 = pooling_layer(encoded_sent2_seq)
        else:
            encoded_sent1 = self.encoder_model.get_encoded_phrase(sent1_input_layer, dropout=dropout,
                                                                  embedding=embedding_file)
            encoded_sent2 = self.encoder_model.get_encoded_phrase(sent2_input_layer, dropout=dropout,
                                                                  embedding=embedding_file)
        return encoded_sent1, encoded_sent2

    def process_train_data(self, input_file, onto_aware):
        print >>sys.stderr, "Reading training data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            label, tagged_sentence = lnstrp.split("\t")
            if label not in self.label_map:
                self.label_map[label] = len(self.label_map)
            label_ind.append(self.label_map[label])
            tagged_sentences.append(tagged_sentence)
        # Shuffling so that when Keras does validation split, it is not always at the end.
        sentences_and_labels = zip(tagged_sentences, label_ind)
        random.shuffle(sentences_and_labels)
        tagged_sentences, label_ind = zip(*sentences_and_labels)
        print >>sys.stderr, "Indexing training data"
        train_inputs = self.data_processor.prepare_paired_input(tagged_sentences, onto_aware=onto_aware,
                                                                for_test=False, remove_singletons=True)
        train_labels = self.data_processor.make_one_hot(label_ind)
        return train_inputs, train_labels

    def process_test_data(self, input_file, onto_aware, is_labeled=True):
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        print >>sys.stderr, "Reading test data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            if is_labeled:
                label, tagged_sentence = lnstrp.split("\t")
                if label not in self.label_map:
                    self.label_map[label] = len(self.label_map)
                label_ind.append(self.label_map[label])
            else:
                tagged_sentence = lnstrp
            tagged_sentences.append(tagged_sentence)
        print >>sys.stderr, "Indexing test data"
        # Infer max sentence length if the model is trained
        input_shape = self.model.get_input_shape_at(0)[0]  # take the shape of the first of two inputs at 0.
        sentlenlimit = input_shape[1]  # (num_sentences, num_words, num_senses, num_hyps)
        test_inputs = self.data_processor.prepare_paired_input(tagged_sentences, onto_aware=onto_aware,
                                                               sentlenlimit=sentlenlimit, for_test=True)
        test_labels = self.data_processor.make_one_hot(label_ind)
        return test_inputs, test_labels

    def test(self, inputs, targets):
        if not self.model:
            raise RuntimeError("Model not trained!")
        metrics = self.model.evaluate(inputs, targets)
        print >>sys.stderr, "Test accuracy: %.4f" % (metrics[1])  # The first metric is loss.
        predictions = numpy.argmax(self.model.predict(inputs), axis=1)
        rev_label_map = {ind: label for label, ind in self.label_map.items()}
        predicted_labels = [rev_label_map[pred] for pred in predictions]
        return predicted_labels

    def save_model(self, epoch):
        '''
        Saves the current model using the epoch id to identify the file.
        '''
        self.model.save("%s_%d.model" % (self.model_name_prefix, epoch))
        pickle.dump(self.data_processor, open("%s.dataproc" % self.model_name_prefix, "wb"))
        pickle.dump(self.label_map, open("%s.labelmap" % self.model_name_prefix, "wb"))

    def save_best_model(self):
        '''
        Copies the model corresponding to the best epoch as the final model file.
        '''
        from shutil import copyfile
        best_model_file = "%s_%d.model" % (self.model_name_prefix, self.best_epoch)
        final_model_file = "%s.model" % self.model_name_prefix
        copyfile(best_model_file, final_model_file)

    def load_model(self, epoch=None):
        '''
        Loads a saved model. If epoch id is provided, will load the corresponding model. Or else,
        will load the best model.
        '''
        if not epoch:
            self.model = load_model("%s.model" % self.model_name_prefix,
                                    custom_objects=self.custom_objects)
        else:
            self.model = load_model("%s_%d.model" % (self.model_name_prefix, epoch),
                                    custom_objects=self.custom_objects)
        self.data_processor = pickle.load(open("%s.dataproc" % self.model_name_prefix, "rb"))
        self.label_map = pickle.load(open("%s.labelmap" % self.model_name_prefix, "rb"))
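EntailmentModel never builds an encoder itself: _get_encoded_sentence_variables only requires that self.encoder_model (set by a subclass such as OntoLSTMEntailmentModel in Example #8) expose get_encoded_phrase(input_layer, dropout=..., embedding=...), and the subclasses also call get_custom_objects() on their encoders. The sketch below is a hypothetical, purely word-level stand-in for that interface built from standard Keras 1.x layers; it is not the OntoLSTMEncoder from the repository, and it ignores the pretrained-embedding argument.

from keras.layers import Embedding, LSTM, Dropout

class WordLSTMEncoder(object):
    '''
    Hypothetical stand-in for the encoder interface expected by EntailmentModel.
    '''
    def __init__(self, vocab_size, embed_dim, return_sequences=False):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.return_sequences = return_sequences

    def get_encoded_phrase(self, input_layer, dropout=None, embedding=None):
        # `embedding` (a pretrained embedding file) is ignored in this stand-in.
        dropout = dropout if dropout is not None else {}
        embedded = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dim,
                             mask_zero=True, name="embedding")(input_layer)
        if "embedding" in dropout:
            embedded = Dropout(dropout["embedding"])(embedded)
        encoded = LSTM(output_dim=self.embed_dim, return_sequences=self.return_sequences,
                       name="encoder")(embedded)
        if "encoder" in dropout:
            encoded = Dropout(dropout["encoder"])(encoded)
        return encoded

    def get_custom_objects(self):
        return {}  # no custom layers to register when loading a saved model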
Example #10
NUM_SENSES = 2  # Senses per word; assumed value, not defined in the original snippet.
NUM_HYPS = 5  # Hypernyms per sense; assumed value, not defined in the original snippet.
ONTO_ATTENTION = True
SENSE_PRIORS = True
EMBED_DIM = 50
BIDIRECTIONAL = False
TUNE_EMBEDDING = True
EMBEDDING_FILE = None  # Replace with a gzipped embedding file if needed.

## Reading text file
test_file = open('data/test_data.tsv')
labeled_sentences = [x.strip().split('\t') for x in test_file]
labels, tagged_sentences = zip(*labeled_sentences)

## Preparing (indexing) data for classification.
# word_syn_cutoff is the number of senses per word,
# and syn_path_cutoff is the number of hypernyms per sense
data_processor = DataProcessor(word_syn_cutoff=NUM_SENSES, syn_path_cutoff=NUM_HYPS)
indexed_input = data_processor.prepare_input(tagged_sentences, onto_aware=True)
one_hot_labels = data_processor.make_one_hot([int(x) for x in labels])

## Defining Keras model
input_layer = Input(shape=indexed_input.shape[1:], dtype='int32')
onto_lstm = OntoLSTMEncoder(num_senses=NUM_SENSES, num_hyps=NUM_HYPS, use_attention=ONTO_ATTENTION,
                            set_sense_priors=SENSE_PRIORS, data_processor=data_processor,
                            embed_dim=EMBED_DIM, return_sequences=False, bidirectional=BIDIRECTIONAL,
                            tune_embedding=TUNE_EMBEDDING)
encoded_input = onto_lstm.get_encoded_phrase(input_layer, embedding_file=EMBEDDING_FILE)
softmax_layer = Dense(2, activation='softmax')
output_predictions = softmax_layer(encoded_input)
model = Model(input=input_layer, output=output_predictions)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
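A hypothetical training and evaluation call for the model compiled above; the number of epochs, batch size, and validation split are illustrative, and in practice the model would be fit on training data and evaluated on held-out data rather than on the same file.

model.fit(indexed_input, one_hot_labels, validation_split=0.1, nb_epoch=10, batch_size=32)
loss, accuracy = model.evaluate(indexed_input, one_hot_labels)
print "Accuracy: %.4f" % accuracy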
class SentenceModel(object):
  def __init__(self, word_dim=50, num_senses=2, num_hyps=5):
    self.dp = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps)
    self.num_hyps = num_hyps
    self.num_senses = num_senses
    self.numpy_rng = numpy.random.RandomState(12345)
    self.word_dim = word_dim
    self.model = None

  def read_sentences(self, tagged_sentences, sentlenlimit=None, test=False, remove_singletons=False):
    num_sentences = len(tagged_sentences)
    all_words = []
    all_pos_tags = []
    maxsentlen = 0
    for tagged_sentence in tagged_sentences:
      words = []
      pos_tags = []
      # Expects each token to be a "_" separated combination of word and POS tag.
      tagged_words = tagged_sentence.split(" ")
      if sentlenlimit is not None:
        tagged_words = tagged_words[:sentlenlimit] 
      for word_tag in tagged_words:
        parts = word_tag.split("_")
        tag = parts[-1]
        word = "_".join(parts[:-1]).lower()
        words.append(word)
        pos_tags.append(tag)
      if len(words) > maxsentlen:
        maxsentlen = len(words)
      all_words.append(words)
      all_pos_tags.append(pos_tags)
    if not sentlenlimit:
      sentlenlimit = maxsentlen
    C_ind = numpy.zeros((num_sentences, sentlenlimit, self.num_senses, self.num_hyps), dtype='int32')
    S_ind = numpy.zeros((num_sentences, sentlenlimit), dtype='int32')
    for i, (words, pos_tags) in enumerate(zip(all_words, all_pos_tags)):
      sentlen = len(words)
      # test=True locks the word and syn index dicts. No new keys will be added
      word_inds, syn_inds = self.dp.index_sentence(words, pos_tags, test=test, remove_singletons=remove_singletons)
      S_ind[i][-sentlen:] = word_inds
      for j in range(sentlen):
        sense_syn_ind = syn_inds[j]
        sense_syn_ind_len = len(sense_syn_ind)
        for k, syn_ind in enumerate(sense_syn_ind):
          C_ind[i][-sentlen+j][-sense_syn_ind_len+k][-len(syn_ind):] = syn_ind
    return S_ind, C_ind

  def _make_one_hot(self, word_inds, vec_size):
    onehot = numpy.zeros((word_inds.shape + (vec_size,)))
    for inds in itertools.product(*[numpy.arange(s) for s in word_inds.shape]):
      onehot[inds+(word_inds[inds],)] = 1
    return onehot

  def _factor_target_indices(self, Y_inds, vocab_size=None, base=2):
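    # Writes each target word index in base `base` (least significant digit first) and
    # one-hot encodes each digit with a vector of size `base`. Returns one array per digit,
    # so that when hierarchical=True the model predicts several small softmaxes instead of
    # a single softmax over the full vocabulary.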
    if vocab_size is None:
      vocab_size = len(self.dp.word_index)
    print >>sys.stderr, "Factoring targets of vocabulary size: %d"%(vocab_size)
    num_vecs = int(math.ceil(math.log(vocab_size)/math.log(base))) + 1
    base_inds = []
    div_Y_inds = Y_inds
    print >>sys.stderr, "Number of factors: %d"%num_vecs
    for i in range(num_vecs):
      new_inds = div_Y_inds % base
      if i == num_vecs - 1:
        if new_inds.sum() == 0:
          # Most significant "digit" is a zero. Omit it.
          break
      base_inds.append(new_inds)
      div_Y_inds = numpy.copy(div_Y_inds/base)
    base_vecs = [self._make_one_hot(base_inds_i, base) for base_inds_i in base_inds]
    return base_vecs
    
  def train(self, S_ind, C_ind, use_onto_lstm=True, use_attention=True, num_epochs=20,  hierarchical=False, base=2):
    # Predict next word from current synsets
    X = C_ind[:,:-1] if use_onto_lstm else S_ind[:,:-1] # remove the last words' hyps in all sentences
    Y_inds = S_ind[:,1:] # remove the first words in all sentences
    if hierarchical:
      train_targets = self._factor_target_indices(Y_inds, base=base)
    else:
      train_targets = [self._make_one_hot(Y_inds, Y_inds.max() + 1)]
    length = Y_inds.shape[1]
    lstm_outdim = self.word_dim
    
    num_words = len(self.dp.word_index)
    num_syns = len(self.dp.synset_index)
    input = Input(shape=X.shape[1:], dtype='int32')
    embed_input_dim = num_syns if use_onto_lstm else num_words
    embed_layer = HigherOrderEmbedding(name='embedding', input_dim=embed_input_dim, output_dim=self.word_dim, input_shape=X.shape[1:], mask_zero=True)
    sent_rep = embed_layer(input)
    reg_sent_rep = Dropout(0.5)(sent_rep)
    if use_onto_lstm:
      lstm_out = OntoAttentionLSTM(name='sent_lstm', input_dim=self.word_dim, output_dim=lstm_outdim, input_length=length, num_senses=self.num_senses, num_hyps=self.num_hyps, return_sequences=True, use_attention=use_attention)(reg_sent_rep)
    else:
      lstm_out = LSTM(name='sent_lstm', input_dim=self.word_dim, output_dim=lstm_outdim, input_length=length, return_sequences=True)(reg_sent_rep)
    output_nodes = []
    # Make one node for each factored target
    for target in train_targets:
      node = TimeDistributed(Dense(input_dim=lstm_outdim, output_dim=target.shape[-1], activation='softmax'))(lstm_out)
      output_nodes.append(node)

    model = Model(input=input, output=output_nodes)
    model.summary()
    early_stopping = EarlyStopping()
    precompile_time = time.time()
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    postcompile_time = time.time()
    print >>sys.stderr, "Model compilation took %d s"%(postcompile_time - precompile_time)
    model.fit(X, train_targets, nb_epoch=num_epochs, validation_split=0.1, callbacks=[early_stopping])
    posttrain_time = time.time()
    print >>sys.stderr, "Training took %d s"%(posttrain_time - postcompile_time)
    concept_reps = model.layers[1].get_weights()
    self.model = model
    return concept_reps

  def test(self, vocab_size, use_onto_lstm, S_ind_test=None, C_ind_test=None, hierarchical=False, base=2, oov_list=None):
    X_test = C_ind_test[:,:-1] if use_onto_lstm else S_ind_test[:,:-1] # remove the last words' hyps in all sentences
    Y_inds_test = S_ind_test[:,1:]
    if hierarchical:
      test_targets = self._factor_target_indices(Y_inds_test, vocab_size, base=base)
    else:
      test_targets = [self._make_one_hot(Y_inds_test, vocab_size)]
    print >>sys.stderr, "Evaluating model on test data"
    test_loss = self.model.evaluate(X_test, test_targets)
    print >>sys.stderr, "Test loss: %.4f"%test_loss
    if oov_list is not None:
      oov_inds = [self.dp.word_index[w] for w in oov_list]
      non_oov_Y_inds = numpy.copy(Y_inds_test)
      for ind in oov_inds:
        non_oov_Y_inds[non_oov_Y_inds == ind] = 0
      non_oov_test_targets = self._factor_target_indices(non_oov_Y_inds, vocab_size, base=base)
      non_oov_test_loss = self.model.evaluate(X_test, non_oov_test_targets)
      print >>sys.stderr, "Non-oov test loss: %.4f"%non_oov_test_loss
    factored_test_preds = [-((numpy.log(pred) * target).sum(axis=-1)) for pred, target in zip(self.model.predict(X_test), test_targets)]
    test_preds = sum(factored_test_preds)
    #non_null_probs = []
    #for test_pred, inds in zip(test_preds, Y_inds_test):
    #  wanted_probs = []
    #  for tp, ind in zip(test_pred, inds):
    #    if ind != 0:
    #      wanted_probs.append(tp)
    #  non_null_probs.append(wanted_probs)
    #return non_null_probs
    return test_preds

  def get_attention(self, C_ind):
    if not self.model:
      raise RuntimeError("Model not trained!")
    model_embedding = None
    model_lstm = None
    for layer in self.model.layers:
      if layer.name.lower() == "embedding":
        model_embedding = layer
      if layer.name.lower() == "sent_lstm":
        model_lstm = layer
    if model_embedding is None or model_lstm is None:
      raise RuntimeError("Did not find expected layers")
    lstm_weights = model_lstm.get_weights()
    embedding_weights = model_embedding.get_weights()
    embed_in_dim, embed_out_dim = embedding_weights[0].shape
    att_embedding = HigherOrderEmbedding(input_dim=embed_in_dim, output_dim=embed_out_dim, weights=embedding_weights)
    onto_lstm = OntoAttentionLSTM(input_dim=embed_out_dim, output_dim=embed_out_dim, input_length=model_lstm.input_length, num_senses=self.num_senses, num_hyps=self.num_hyps, use_attention=True, return_attention=True, weights=lstm_weights)
    att_input = Input(shape=C_ind.shape[1:], dtype='int32')
    att_sent_rep = att_embedding(att_input)
    att_output = onto_lstm(att_sent_rep)
    att_model = Model(input=att_input, output=att_output)
    att_model.compile(optimizer='adam', loss='mse') # optimizer and loss are not needed since we are not going to train this model.
    C_att = att_model.predict(C_ind)
    print >>sys.stderr, "Got attention values. Input, output shapes:", C_ind.shape, C_att.shape
    return C_att
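A hypothetical driver for SentenceModel. The corpus path is a placeholder; read_sentences expects one sentence per line, with underscore-joined word_POS tokens separated by spaces, as noted in the method's comment.

sentence_model = SentenceModel(word_dim=50, num_senses=2, num_hyps=5)
tagged_sentences = [line.strip() for line in open("data/lm_corpus.txt")]
S_ind, C_ind = sentence_model.read_sentences(tagged_sentences)
concept_reps = sentence_model.train(S_ind, C_ind, use_onto_lstm=True, use_attention=True, num_epochs=20)
C_att = sentence_model.get_attention(C_ind)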