class OntoLSTMAttachmentModel(PPAttachmentModel):
    '''
    A PP Attachment prediction model that uses an OntoLSTM as the encoder.
    '''
    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, prep_senses_dir, **kwargs):
        super(OntoLSTMAttachmentModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        process_preps = False if prep_senses_dir is None else True
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps,
                                            process_preps=process_preps, prep_senses_dir=prep_senses_dir)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        use_prep_senses = False if prep_senses_dir is None else True
        self.encoder = OntoLSTMEncoder(self.num_senses, self.num_hyps, self.use_attention, self.set_sense_priors,
                                       data_processor=self.data_processor, embed_dim=self.embed_dim,
                                       bidirectional=self.bidirectional, tune_embedding=self.tune_embedding)
        self.model_name_prefix = ("ontolstm_models/ontolstm_ppa_att=%s_senses=%d_hyps=%d"
                                  "_sense-priors=%s_prep-senses=%s_tune-embedding=%s_bi=%s") % (
                                      str(self.use_attention), self.num_senses, self.num_hyps,
                                      str(set_sense_priors), str(use_prep_senses), str(self.tune_embedding),
                                      str(self.bidirectional))
        self.custom_objects.update(self.encoder.get_custom_objects())

    def get_attention(self, inputs):
        '''
        Takes inputs and returns pairs of synsets and corresponding attention values.
        '''
        if not self.attention_model:
            self.define_attention_model()
        attention_outputs = self.attention_model.predict(inputs)
        sent_attention_values = []
        for sentence_input, sentence_attention in zip(inputs, attention_outputs):
            word_attention_values = []
            for word_input, word_attention in zip(sentence_input, sentence_attention):
                # Size of word input is (senses, hyps+1).
                # Ignoring the last hyp index because that is just the word index put there by
                # OntoAwareEmbedding for sense priors.
                if word_input.sum() == 0:
                    # This is just padding
                    continue
                word_input = word_input[:, :-1]  # removing last hyp index.
                sense_hyp_prod = self.num_senses * self.num_hyps
                assert len(word_attention) == sense_hyp_prod or len(word_attention) == 2 * sense_hyp_prod
                attention_per_sense = []
                if len(word_attention) == 2 * sense_hyp_prod:
                    # The encoder is Bidirectional. We have attentions from both directions.
                    forward_sense_attention = word_attention[:len(word_attention) // 2]
                    backward_sense_attention = word_attention[len(word_attention) // 2:]
                    processed_attention = zip(forward_sense_attention, backward_sense_attention)
                else:
                    # Encoder is not bidirectional
                    processed_attention = word_attention
                hyp_ind = 0
                while hyp_ind < len(processed_attention):
                    attention_per_sense.append(processed_attention[hyp_ind:hyp_ind + self.num_hyps])
                    hyp_ind += self.num_hyps
                sense_attention_values = []
                for sense_input, attention_per_hyp in zip(word_input, attention_per_sense):
                    hyp_attention_values = []
                    for hyp_input, hyp_attention in zip(sense_input, attention_per_hyp):
                        if hyp_input == 0:
                            continue
                        hyp_attention_values.append(
                            (self.data_processor.get_token_from_index(hyp_input, onto_aware=True),
                             hyp_attention))
                    sense_attention_values.append(hyp_attention_values)
                word_attention_values.append(sense_attention_values)
            sent_attention_values.append(word_attention_values)
        return sent_attention_values

    def define_attention_model(self):
        '''
        Take necessary parts out of the model to get OntoLSTM attention.
        '''
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        input_shape = self.model.get_input_shape_at(0)
        input_layer = Input(input_shape[1:], dtype='int32')  # removing batch size
        embedding_layer = None
        encoder_layer = None
        for layer in self.model.layers:
            if layer.name == "embedding":
                embedding_layer = layer
            elif layer.name == "onto_lstm":
                # We need to redefine the OntoLSTM layer with the learned weights and set return_attention to True.
                # Assuming we'll want attention values for all words (return_sequences=True).
                if isinstance(layer, Bidirectional):
                    onto_lstm = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim,
                                                  num_senses=self.num_senses, num_hyps=self.num_hyps,
                                                  use_attention=True, return_attention=True,
                                                  return_sequences=True, consume_less='gpu')
                    encoder_layer = Bidirectional(onto_lstm, weights=layer.get_weights())
                else:
                    encoder_layer = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim,
                                                      num_senses=self.num_senses, num_hyps=self.num_hyps,
                                                      use_attention=True, return_attention=True,
                                                      return_sequences=True, consume_less='gpu',
                                                      weights=layer.get_weights())
                break
        if not embedding_layer or not encoder_layer:
            raise RuntimeError("Required layers not found!")
        attention_output = encoder_layer(embedding_layer(input_layer))
        self.attention_model = Model(inputs=input_layer, outputs=attention_output)
        print >>sys.stderr, "Attention model summary:"
        self.attention_model.summary()
        self.attention_model.compile(loss="mse", optimizer="sgd")  # Loss and optimizer do not matter!

    def print_attention_values(self, input_file, test_inputs, output_file):
        sent_attention_outputs = self.get_attention(test_inputs)
        tagged_sentences = [x.strip().split("\t")[1] for x in codecs.open(input_file).readlines()]
        outfile = codecs.open(output_file, "w", "utf-8")
        full_json_struct = []
        for sent_attention, tagged_sentence in zip(sent_attention_outputs, tagged_sentences):
            sent_json = {}
            sent_json["input"] = tagged_sentence
            sent_json["tokens"] = []
            tagged_words = tagged_sentence.split()
            for tagged_word, word_attention in zip(tagged_words, sent_attention):
                token_json = {}
                token_json["surface_form"] = tagged_word
                token_json["senses"] = []
                for sense_num, sense_attention in enumerate(word_attention):
                    if len(sense_attention) == 0:
                        continue
                    sense_json = {}
                    sense_json["id"] = sense_num
                    sense_json["hypernyms"] = []
                    for hyp_name, hyp_att in sense_attention:
                        if isinstance(hyp_att, tuple):
                            # Bidirectional encoder: report forward and backward attention separately.
                            sense_json["hypernyms"].append({hyp_name: {"forward": float(hyp_att[0]),
                                                                       "backward": float(hyp_att[1])}})
                        else:
                            sense_json["hypernyms"].append({hyp_name: float(hyp_att)})
                    token_json["senses"].append(sense_json)
                sent_json["tokens"].append(token_json)
            full_json_struct.append(sent_json)
        print >>outfile, json.dumps(full_json_struct, indent=2)
        outfile.close()
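# A minimal sketch of how the attention introspection above might be used. It assumes the caller
# already has a trained OntoLSTMAttachmentModel (i.e. its .model attribute is set) and indexed
# test inputs produced by the same DataProcessor; the file names are placeholders, not part of
# the original code.
def dump_ppa_attention(trained_ppa_model, test_inputs, input_file, output_file):
    # get_attention() lazily builds the attention model from the trained layers,
    # so no extra setup is needed beyond a trained model.
    attention_values = trained_ppa_model.get_attention(test_inputs)
    print >>sys.stderr, "Got attention for %d sentences" % len(attention_values)
    # Write the per-sense, per-hypernym attention as JSON, as defined above.
    trained_ppa_model.print_attention_values(input_file, test_inputs, output_file)
    return attention_values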
class OntoLSTMEntailmentModel(EntailmentModel):
    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, **kwargs):
        super(OntoLSTMEntailmentModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        # If bidirectional, we'll do pooling here. So make the encoder return sequences iff bidirectional.
        self.encoder_model = OntoLSTMEncoder(num_senses=num_senses, num_hyps=num_hyps,
                                             use_attention=use_attention, set_sense_priors=set_sense_priors,
                                             data_processor=self.data_processor, embed_dim=self.embed_dim,
                                             bidirectional=self.bidirectional,
                                             tune_embedding=self.tune_embedding,
                                             return_sequences=self.bidirectional)
        self.model_name_prefix = "ontolstm_ent_att=%s_senses=%d_hyps=%d_sense-priors=%s_tune-embedding=%s_bi=%s_pool-att=%s" % (
            str(self.use_attention), self.num_senses, self.num_hyps, str(set_sense_priors),
            str(self.tune_embedding), str(self.bidirectional), str(self.intra_attention))
        self.custom_objects = {"OntoAttentionLSTM": OntoAttentionLSTM,
                               "OntoAwareEmbedding": OntoAwareEmbedding}
        if self.bidirectional:
            if self.intra_attention:
                self.custom_objects["IntraAttention"] = IntraAttention
            else:
                self.custom_objects["AveragePooling"] = AveragePooling

    def get_encoder(self, return_sequences=False):
        lstm = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim,
                                 num_senses=self.num_senses, num_hyps=self.num_hyps,
                                 use_attention=self.use_attention, consume_less="gpu",
                                 return_sequences=return_sequences, name="onto_lstm")
        return lstm

    def get_attention(self, inputs):
        # Takes inputs and returns pairs of synsets and corresponding attention values.
        if not self.attention_model:
            self.define_attention_model()
        attention_outputs = self.attention_model.predict(inputs)
        sent_attention_values = []
        for sentence_input, sentence_attention in zip(inputs, attention_outputs):
            word_attention_values = []
            for word_input, word_attention in zip(sentence_input, sentence_attention):
                if word_input.sum() == 0:
                    # This is just padding
                    continue
                sense_attention_values = []
                for sense_input, sense_attention in zip(word_input, word_attention):
                    if sense_input.sum() == 0:
                        continue
                    hyp_attention_values = []
                    for hyp_input, hyp_attention in zip(sense_input, sense_attention):
                        if hyp_input == 0:
                            continue
                        hyp_attention_values.append(
                            (self.data_processor.get_token_from_index(hyp_input, onto_aware=True),
                             hyp_attention))
                    sense_attention_values.append(hyp_attention_values)
                word_attention_values.append(sense_attention_values)
            sent_attention_values.append(word_attention_values)
        return sent_attention_values

    def define_attention_model(self):
        # Take necessary parts out of the entailment model to get OntoLSTM attention.
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        # We need just one input to get attention. input_shape_at(0) gives a list with two shapes.
        input_shape = self.model.get_input_shape_at(0)[0]
        input_layer = Input(input_shape[1:], dtype='int32')  # removing batch size
        embedding_layer = None
        encoder_layer = None
        for layer in self.model.layers:
            if layer.name == "embedding":
                embedding_layer = layer
            elif layer.name == "encoder":
                # We need to redefine the OntoLSTM layer with the learned weights and set return_attention to True.
                # Assuming we'll want attention values for all words (return_sequences=True).
                encoder_layer = OntoAttentionLSTM(input_dim=self.embed_dim, output_dim=self.embed_dim,
                                                  num_senses=self.num_senses, num_hyps=self.num_hyps,
                                                  use_attention=True, return_attention=True,
                                                  return_sequences=True, weights=layer.get_weights())
        if not embedding_layer or not encoder_layer:
            raise RuntimeError("Required layers not found!")
        attention_output = encoder_layer(embedding_layer(input_layer))
        self.attention_model = Model(input=input_layer, output=attention_output)
        self.attention_model.compile(loss="mse", optimizer="sgd")  # Loss and optimizer do not matter!

    def print_attention_values(self, input_file, test_inputs, output_file):
        sent1_attention_outputs = self.get_attention(test_inputs[0])
        sent2_attention_outputs = self.get_attention(test_inputs[1])
        tagged_sentences = [x.strip().split("\t")[1] for x in codecs.open(input_file).readlines()]
        outfile = codecs.open(output_file, "w", "utf-8")
        for sent1_attention, sent2_attention, tagged_sentence in zip(sent1_attention_outputs,
                                                                     sent2_attention_outputs,
                                                                     tagged_sentences):
            print >>outfile, tagged_sentence
            print >>outfile, "Sentence 1:"
            for word_attention in sent1_attention:
                for sense_attention in word_attention:
                    print >>outfile, " ".join(["%s:%f" % (hyp, hyp_att) for hyp, hyp_att in sense_attention])
                print >>outfile
            print >>outfile, "\nSentence 2:"
            for word_attention in sent2_attention:
                for sense_attention in word_attention:
                    print >>outfile, " ".join(["%s:%f" % (hyp, hyp_att) for hyp, hyp_att in sense_attention])
                print >>outfile
        outfile.close()
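# A small, hypothetical construction example for the entailment model above. The hyper-parameter
# values are placeholders; the keyword arguments are the ones consumed by EntailmentModel.__init__
# defined below.
def build_onto_lstm_entailment_model():
    model = OntoLSTMEntailmentModel(num_senses=3, num_hyps=5, use_attention=True,
                                    set_sense_priors=True, bidirectional=True,
                                    intra_attention=False, tune_embedding=False,
                                    embed_dim=50)
    # With bidirectional=True the encoder returns sequences, and train() adds an
    # AveragePooling (or IntraAttention) layer on top of each sentence encoding.
    return model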
class EntailmentModel(object):
    def __init__(self, bidirectional=False, intra_attention=False, tune_embedding=False, **kwargs):
        self.data_processor = DataProcessor()
        if "embed_dim" in kwargs:
            self.embed_dim = kwargs["embed_dim"]
        else:
            self.embed_dim = 50
        self.bidirectional = bidirectional
        self.intra_attention = intra_attention
        self.tune_embedding = tune_embedding
        self.numpy_rng = numpy.random.RandomState(12345)
        self.label_map = {}  # Maps labels to integers.
        self.model = None
        self.best_epoch = 0  # index of the best epoch
        self.model_name_prefix = None
        self.custom_objects = None
        self.encoder_model = None

    def train(self, train_inputs, train_labels, num_epochs=20, mlp_size=1024, mlp_activation='relu',
              dropout=None, embedding_file=None, tune_embedding=True, num_mlp_layers=2, patience=5):
        '''
        train_inputs (list(numpy_array)): The two sentence inputs
        train_labels (numpy_array): One-hot matrix indicating labels
        num_epochs (int): Maximum number of epochs to run
        mlp_size (int): Dimensionality of each layer in the MLP
        mlp_activation (str): Activation of each layer in the MLP
        dropout (dict(str->float)): Probabilities in Dropout layers after "embedding" and "encoder" (lstm)
        embedding_file (str): Optional pretrained embedding file
        tune_embedding (bool): If a pretrained embedding is given, tune it.
        num_mlp_layers (int): Number of layers in the MLP
        patience (int): Early stopping patience
        '''
        if dropout is None:
            dropout = {}
        num_label_types = train_labels.shape[1]  # train_labels is of shape (num_samples, num_label_types)
        sent1_input_layer = Input(name='sent1', shape=train_inputs[0].shape[1:], dtype='int32')
        sent2_input_layer = Input(name='sent2', shape=train_inputs[1].shape[1:], dtype='int32')
        encoded_sent1, encoded_sent2 = self._get_encoded_sentence_variables(sent1_input_layer,
                                                                            sent2_input_layer, dropout,
                                                                            embedding_file, tune_embedding)
        concat_sent_rep = merge([encoded_sent1, encoded_sent2], mode='concat')
        mul_sent_rep = merge([encoded_sent1, encoded_sent2], mode='mul')
        diff_sent_rep = merge([encoded_sent1, encoded_sent2], mode=lambda l: l[0] - l[1],
                              output_shape=lambda l: l[0])
        # Use heuristic from Mou et al. (2015) to get final merged representation.
        merged_sent_rep = merge([concat_sent_rep, mul_sent_rep, diff_sent_rep], mode='concat')
        current_projection = merged_sent_rep
        for i in range(num_mlp_layers):
            mlp_layer_i = Dense(output_dim=mlp_size, activation=mlp_activation,
                                name="%s_layer_%d" % (mlp_activation, i))
            current_projection = mlp_layer_i(current_projection)
        if dropout is not None:
            if "output" in dropout:
                current_projection = Dropout(dropout["output"])(current_projection)
        softmax = Dense(output_dim=num_label_types, activation='softmax', name='softmax_layer')
        label_probs = softmax(current_projection)
        model = Model(input=[sent1_input_layer, sent2_input_layer], output=label_probs)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        self.model = model
        print >>sys.stderr, "Entailment model summary:"
        model.summary()
        best_accuracy = 0.0
        num_worse_epochs = 0
        for epoch_id in range(num_epochs):
            print >>sys.stderr, "Epoch: %d" % epoch_id
            history = model.fit(train_inputs, train_labels, validation_split=0.1, nb_epoch=1)
            validation_accuracy = history.history['val_acc'][0]  # history['val_acc'] is a list of size nb_epoch
            if validation_accuracy > best_accuracy:
                self.save_model(epoch_id)
                self.best_epoch = epoch_id
                num_worse_epochs = 0
                best_accuracy = validation_accuracy
            elif validation_accuracy < best_accuracy:
                num_worse_epochs += 1
                if num_worse_epochs >= patience:
                    print >>sys.stderr, "Stopping training."
                    break
        self.save_best_model()

    def _get_encoded_sentence_variables(self, sent1_input_layer, sent2_input_layer, dropout,
                                        embedding_file, tune_embedding):
        if self.bidirectional:
            # The bidirectional encoder returns sequences; pool them into one vector per sentence.
            encoded_sent1_seq = self.encoder_model.get_encoded_phrase(sent1_input_layer, dropout=dropout,
                                                                      embedding=embedding_file)
            encoded_sent2_seq = self.encoder_model.get_encoded_phrase(sent2_input_layer, dropout=dropout,
                                                                      embedding=embedding_file)
            if self.intra_attention:
                pooling_layer = IntraAttention(name='intra_attention')
            else:
                pooling_layer = AveragePooling(name='average_pooling')
            encoded_sent1 = pooling_layer(encoded_sent1_seq)
            encoded_sent2 = pooling_layer(encoded_sent2_seq)
        else:
            encoded_sent1 = self.encoder_model.get_encoded_phrase(sent1_input_layer, dropout=dropout,
                                                                  embedding=embedding_file)
            encoded_sent2 = self.encoder_model.get_encoded_phrase(sent2_input_layer, dropout=dropout,
                                                                  embedding=embedding_file)
        return encoded_sent1, encoded_sent2

    def process_train_data(self, input_file, onto_aware):
        print >>sys.stderr, "Reading training data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            label, tagged_sentence = lnstrp.split("\t")
            if label not in self.label_map:
                self.label_map[label] = len(self.label_map)
            label_ind.append(self.label_map[label])
            tagged_sentences.append(tagged_sentence)
        # Shuffling so that when Keras does validation split, it is not always at the end.
        sentences_and_labels = zip(tagged_sentences, label_ind)
        random.shuffle(sentences_and_labels)
        tagged_sentences, label_ind = zip(*sentences_and_labels)
        print >>sys.stderr, "Indexing training data"
        train_inputs = self.data_processor.prepare_paired_input(tagged_sentences, onto_aware=onto_aware,
                                                                for_test=False, remove_singletons=True)
        train_labels = self.data_processor.make_one_hot(label_ind)
        return train_inputs, train_labels

    def process_test_data(self, input_file, onto_aware, is_labeled=True):
        if not self.model:
            raise RuntimeError("Model not trained yet!")
        print >>sys.stderr, "Reading test data"
        label_ind = []
        tagged_sentences = []
        for line in open(input_file):
            lnstrp = line.strip()
            if is_labeled:
                label, tagged_sentence = lnstrp.split("\t")
                if label not in self.label_map:
                    self.label_map[label] = len(self.label_map)
                label_ind.append(self.label_map[label])
            else:
                tagged_sentence = lnstrp
            tagged_sentences.append(tagged_sentence)
        print >>sys.stderr, "Indexing test data"
        # Infer the sentence length limit from the trained model.
        input_shape = self.model.get_input_shape_at(0)[0]  # take the shape of the first of the two inputs.
        sentlenlimit = input_shape[1]  # (num_sentences, num_words, num_senses, num_hyps)
        test_inputs = self.data_processor.prepare_paired_input(tagged_sentences, onto_aware=onto_aware,
                                                               sentlenlimit=sentlenlimit, for_test=True)
        test_labels = self.data_processor.make_one_hot(label_ind)
        return test_inputs, test_labels

    def test(self, inputs, targets):
        if not self.model:
            raise RuntimeError("Model not trained!")
        metrics = self.model.evaluate(inputs, targets)
        print >>sys.stderr, "Test accuracy: %.4f" % metrics[1]  # The first metric is loss.
        predictions = numpy.argmax(self.model.predict(inputs), axis=1)
        rev_label_map = {ind: label for label, ind in self.label_map.items()}
        predicted_labels = [rev_label_map[pred] for pred in predictions]
        return predicted_labels

    def save_model(self, epoch):
        '''
        Saves the current model using the epoch id to identify the file.
        '''
        self.model.save("%s_%d.model" % (self.model_name_prefix, epoch))
        pickle.dump(self.data_processor, open("%s.dataproc" % self.model_name_prefix, "wb"))
        pickle.dump(self.label_map, open("%s.labelmap" % self.model_name_prefix, "wb"))

    def save_best_model(self):
        '''
        Copies the model corresponding to the best epoch as the final model file.
        '''
        from shutil import copyfile
        best_model_file = "%s_%d.model" % (self.model_name_prefix, self.best_epoch)
        final_model_file = "%s.model" % self.model_name_prefix
        copyfile(best_model_file, final_model_file)

    def load_model(self, epoch=None):
        '''
        Loads a saved model. If an epoch id is provided, loads the corresponding model.
        Otherwise, loads the best model.
        '''
        if not epoch:
            self.model = load_model("%s.model" % self.model_name_prefix,
                                    custom_objects=self.custom_objects)
        else:
            self.model = load_model("%s_%d.model" % (self.model_name_prefix, epoch),
                                    custom_objects=self.custom_objects)
        self.data_processor = pickle.load(open("%s.dataproc" % self.model_name_prefix, "rb"))
        self.label_map = pickle.load(open("%s.labelmap" % self.model_name_prefix, "rb"))
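# A minimal end-to-end sketch for the entailment model, assuming tab-separated label/tagged-sentence
# lines in whatever format DataProcessor.prepare_paired_input expects. The file names, hyper-parameter
# values, and the dropout keys (taken from the train() docstring above) are assumptions.
def run_entailment_experiment(train_file, test_file):
    model = OntoLSTMEntailmentModel(num_senses=3, num_hyps=5, use_attention=True,
                                    set_sense_priors=True, embed_dim=50)
    train_inputs, train_labels = model.process_train_data(train_file, onto_aware=True)
    model.train(train_inputs, train_labels, num_epochs=20,
                dropout={"embedding": 0.5, "encoder": 0.2, "output": 0.2}, patience=5)
    model.load_model()  # reload the best epoch copied by save_best_model()
    test_inputs, test_labels = model.process_test_data(test_file, onto_aware=True)
    predicted_labels = model.test(test_inputs, test_labels)
    return predicted_labels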
ONTO_ATTENTION = True
SENSE_PRIORS = True
EMBED_DIM = 50
BIDIRECTIONAL = False
TUNE_EMBEDDING = True
EMBEDDING_FILE = None  # Replace with a gzipped embedding file if needed.

## Reading text file
test_file = open('data/test_data.tsv')
labeled_sentences = [x.strip().split('\t') for x in test_file]
labels, tagged_sentences = zip(*labeled_sentences)

## Preparing (indexing) data for classification.
# word_syn_cutoff is the number of senses per word,
# and syn_path_cutoff is the number of hypernyms per sense.
data_processor = DataProcessor(word_syn_cutoff=NUM_SENSES, syn_path_cutoff=NUM_HYPS)
indexed_input = data_processor.prepare_input(tagged_sentences, onto_aware=True)
one_hot_labels = data_processor.make_one_hot([int(x) for x in labels])

## Defining Keras model
input_layer = Input(shape=indexed_input.shape[1:], dtype='int32')
onto_lstm = OntoLSTMEncoder(num_senses=NUM_SENSES, num_hyps=NUM_HYPS, use_attention=ONTO_ATTENTION,
                            set_sense_priors=SENSE_PRIORS, data_processor=data_processor,
                            embed_dim=EMBED_DIM, return_sequences=False,
                            bidirectional=BIDIRECTIONAL, tune_embedding=TUNE_EMBEDDING)
encoded_input = onto_lstm.get_encoded_phrase(input_layer, embedding_file=EMBEDDING_FILE)
softmax_layer = Dense(2, activation='softmax')
output_predictions = softmax_layer(encoded_input)
model = Model(input=input_layer, output=output_predictions)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
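## Fitting and applying the model
# The script above stops after compiling the classifier. A hedged continuation, purely to exercise
# the graph end to end: it assumes the labels in data/test_data.tsv are binary (0/1) so that the
# two-way softmax defined above matches, and the epoch count is a placeholder.
model.fit(indexed_input, one_hot_labels, nb_epoch=10, validation_split=0.1)
predictions = model.predict(indexed_input)
print "Prediction matrix shape:", predictions.shape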
class SentenceModel(object):
    def __init__(self, word_dim=50, num_senses=2, num_hyps=5):
        self.dp = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps)
        self.num_hyps = num_hyps
        self.num_senses = num_senses
        self.numpy_rng = numpy.random.RandomState(12345)
        self.word_dim = word_dim
        self.model = None

    def read_sentences(self, tagged_sentences, sentlenlimit=None, test=False, remove_singletons=False):
        num_sentences = len(tagged_sentences)
        all_words = []
        all_pos_tags = []
        maxsentlen = 0
        for tagged_sentence in tagged_sentences:
            words = []
            pos_tags = []
            # Expects each token to be a "_" separated combination of word and POS tag.
            tagged_words = tagged_sentence.split(" ")
            if sentlenlimit is not None:
                tagged_words = tagged_words[:sentlenlimit]
            for word_tag in tagged_words:
                parts = word_tag.split("_")
                tag = parts[-1]
                word = "_".join(parts[:-1]).lower()
                words.append(word)
                pos_tags.append(tag)
            if len(words) > maxsentlen:
                maxsentlen = len(words)
            all_words.append(words)
            all_pos_tags.append(pos_tags)
        if not sentlenlimit:
            sentlenlimit = maxsentlen
        C_ind = numpy.zeros((num_sentences, sentlenlimit, self.num_senses, self.num_hyps), dtype='int32')
        S_ind = numpy.zeros((num_sentences, sentlenlimit), dtype='int32')
        for i, (words, pos_tags) in enumerate(zip(all_words, all_pos_tags)):
            sentlen = len(words)
            # test=True locks the word and syn index dicts. No new keys will be added.
            word_inds, syn_inds = self.dp.index_sentence(words, pos_tags, test=test,
                                                         remove_singletons=remove_singletons)
            S_ind[i][-sentlen:] = word_inds
            for j in range(sentlen):
                sense_syn_ind = syn_inds[j]
                sense_syn_ind_len = len(sense_syn_ind)
                for k, syn_ind in enumerate(sense_syn_ind):
                    C_ind[i][-sentlen + j][-sense_syn_ind_len + k][-len(syn_ind):] = syn_ind
        return S_ind, C_ind

    def _make_one_hot(self, word_inds, vec_size):
        onehot = numpy.zeros((word_inds.shape + (vec_size,)))
        for inds in itertools.product(*[numpy.arange(s) for s in word_inds.shape]):
            onehot[inds + (word_inds[inds],)] = 1
        return onehot

    def _factor_target_indices(self, Y_inds, vocab_size=None, base=2):
        if vocab_size is None:
            vocab_size = len(self.dp.word_index)
        print >>sys.stderr, "Factoring targets of vocabulary size: %d" % vocab_size
        num_vecs = int(math.ceil(math.log(vocab_size) / math.log(base))) + 1
        base_inds = []
        div_Y_inds = Y_inds
        print >>sys.stderr, "Number of factors: %d" % num_vecs
        for i in range(num_vecs):
            new_inds = div_Y_inds % base
            if i == num_vecs - 1:
                if new_inds.sum() == 0:
                    # Most significant "digit" is a zero. Omit it.
                    break
            base_inds.append(new_inds)
            div_Y_inds = numpy.copy(div_Y_inds / base)
        base_vecs = [self._make_one_hot(base_inds_i, base) for base_inds_i in base_inds]
        return base_vecs

    def train(self, S_ind, C_ind, use_onto_lstm=True, use_attention=True, num_epochs=20,
              hierarchical=False, base=2):
        # Predict next word from current synsets.
        X = C_ind[:, :-1] if use_onto_lstm else S_ind[:, :-1]  # remove the last words' hyps in all sentences
        Y_inds = S_ind[:, 1:]  # remove the first words in all sentences
        if hierarchical:
            train_targets = self._factor_target_indices(Y_inds, base=base)
        else:
            train_targets = [self._make_one_hot(Y_inds, Y_inds.max() + 1)]
        length = Y_inds.shape[1]
        lstm_outdim = self.word_dim
        num_words = len(self.dp.word_index)
        num_syns = len(self.dp.synset_index)
        input = Input(shape=X.shape[1:], dtype='int32')
        embed_input_dim = num_syns if use_onto_lstm else num_words
        embed_layer = HigherOrderEmbedding(name='embedding', input_dim=embed_input_dim,
                                           output_dim=self.word_dim, input_shape=X.shape[1:],
                                           mask_zero=True)
        sent_rep = embed_layer(input)
        reg_sent_rep = Dropout(0.5)(sent_rep)
        if use_onto_lstm:
            lstm_out = OntoAttentionLSTM(name='sent_lstm', input_dim=self.word_dim, output_dim=lstm_outdim,
                                         input_length=length, num_senses=self.num_senses,
                                         num_hyps=self.num_hyps, return_sequences=True,
                                         use_attention=use_attention)(reg_sent_rep)
        else:
            lstm_out = LSTM(name='sent_lstm', input_dim=self.word_dim, output_dim=lstm_outdim,
                            input_length=length, return_sequences=True)(reg_sent_rep)
        output_nodes = []
        # Make one node for each factored target.
        for target in train_targets:
            node = TimeDistributed(Dense(input_dim=lstm_outdim, output_dim=target.shape[-1],
                                         activation='softmax'))(lstm_out)
            output_nodes.append(node)
        model = Model(input=input, output=output_nodes)
        print >>sys.stderr, model.summary()
        early_stopping = EarlyStopping()
        precompile_time = time.time()
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        postcompile_time = time.time()
        print >>sys.stderr, "Model compilation took %d s" % (postcompile_time - precompile_time)
        model.fit(X, train_targets, nb_epoch=num_epochs, validation_split=0.1, callbacks=[early_stopping])
        posttrain_time = time.time()
        print >>sys.stderr, "Training took %d s" % (posttrain_time - postcompile_time)
        concept_reps = model.layers[1].get_weights()
        self.model = model
        return concept_reps

    def test(self, vocab_size, use_onto_lstm, S_ind_test=None, C_ind_test=None, hierarchical=False,
             base=2, oov_list=None):
        X_test = C_ind_test[:, :-1] if use_onto_lstm else S_ind_test[:, :-1]  # remove the last words' hyps
        Y_inds_test = S_ind_test[:, 1:]
        if hierarchical:
            test_targets = self._factor_target_indices(Y_inds_test, vocab_size, base=base)
        else:
            test_targets = [self._make_one_hot(Y_inds_test, vocab_size)]
        print >>sys.stderr, "Evaluating model on test data"
        test_loss = self.model.evaluate(X_test, test_targets)
        print >>sys.stderr, "Test loss: %.4f" % test_loss
        if oov_list is not None:
            oov_inds = [self.dp.word_index[w] for w in oov_list]
            non_oov_Y_inds = numpy.copy(Y_inds_test)
            for ind in oov_inds:
                non_oov_Y_inds[non_oov_Y_inds == ind] = 0
            non_oov_test_targets = self._factor_target_indices(non_oov_Y_inds, vocab_size, base=base)
            non_oov_test_loss = self.model.evaluate(X_test, non_oov_test_targets)
            print >>sys.stderr, "Non-oov test loss: %.4f" % non_oov_test_loss
        factored_test_preds = [-((numpy.log(pred) * target).sum(axis=-1))
                               for pred, target in zip(self.model.predict(X_test), test_targets)]
        test_preds = sum(factored_test_preds)
        #non_null_probs = []
        #for test_pred, inds in zip(test_preds,
        #                          Y_inds_test):
        #    wanted_probs = []
        #    for tp, ind in zip(test_pred, inds):
        #        if ind != 0:
        #            wanted_probs.append(tp)
        #    non_null_probs.append(wanted_probs)
        #return non_null_probs
        return test_preds

    def get_attention(self, C_ind):
        if not self.model:
            raise RuntimeError("Model not trained!")
        model_embedding = None
        model_lstm = None
        for layer in self.model.layers:
            if layer.name.lower() == "embedding":
                model_embedding = layer
            if layer.name.lower() == "sent_lstm":
                model_lstm = layer
        if model_embedding is None or model_lstm is None:
            raise RuntimeError("Did not find expected layers")
        lstm_weights = model_lstm.get_weights()
        embedding_weights = model_embedding.get_weights()
        embed_in_dim, embed_out_dim = embedding_weights[0].shape
        att_embedding = HigherOrderEmbedding(input_dim=embed_in_dim, output_dim=embed_out_dim,
                                             weights=embedding_weights)
        onto_lstm = OntoAttentionLSTM(input_dim=embed_out_dim, output_dim=embed_out_dim,
                                      input_length=model_lstm.input_length, num_senses=self.num_senses,
                                      num_hyps=self.num_hyps, use_attention=True, return_attention=True,
                                      weights=lstm_weights)
        att_input = Input(shape=C_ind.shape[1:], dtype='int32')
        att_sent_rep = att_embedding(att_input)
        att_output = onto_lstm(att_sent_rep)
        att_model = Model(input=att_input, output=att_output)
        # Optimizer and loss do not matter since we are not going to train this model.
        att_model.compile(optimizer='adam', loss='mse')
        C_att = att_model.predict(C_ind)
        print >>sys.stderr, "Got attention values. Input, output shapes:", C_ind.shape, C_att.shape
        return C_att
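# A minimal, hypothetical driver for the language model above. The tagged sentences are placeholders;
# read_sentences() expects word_POS tokens as documented in that method, and the epoch count is kept
# small purely for illustration.
def run_sentence_model_example():
    sentence_model = SentenceModel(word_dim=50, num_senses=2, num_hyps=5)
    tagged_sentences = ["the_DT dog_NN barked_VBD", "a_DT cat_NN slept_VBD"]
    S_ind, C_ind = sentence_model.read_sentences(tagged_sentences)
    # Train the OntoLSTM language model and keep the learned concept representations.
    concept_reps = sentence_model.train(S_ind, C_ind, use_onto_lstm=True, use_attention=True,
                                        num_epochs=2)
    # Inspect synset-level attention for the same indexed input.
    attention = sentence_model.get_attention(C_ind)
    return concept_reps, attention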