def __init__(self, **args):
    """
    Initialize and compile the model's params
    Arguments:
    seed - the random seed to use
    sep - the delimiter to be used in the csv files
    batch_size - (=input_length) batch size in which to partition the elements
    maximum_output_length - the maximum number of words in the output
    emb_fn - filename of the pretrained embeddings (Glove)
    hidden_dim - number of hidden units
    input_depth - the number of layers in the encoder
    output_depth - the number of layers in the decoder
    peek - (binary) add the peek feature
    attention - (binary) use attention model
    epochs - number of epochs to train the model
    loss - (string) the loss function, one of keras options
    optimizer - (string) the optimizer function, one of keras options
    """
    self.args = args
    self.sep = str(self.args['sep'])
    self.emb = Glove(self.args['emb_fn'])
    self.epochs = self.args['epochs']
    np.random.seed(self.args['seed'])
    self.model = Seq2seq_OIE.compile_model(
        input_length=self.args['batch_size'],
        input_depth=self.args['input_depth'],
        input_dim=self.emb.dim,
        hidden_dim=self.args['hidden_dim'],
        output_length=self.args['maximum_output_length'],
        output_depth=self.args['output_depth'],
        output_dim=self.emb.dim,
        peek=self.args['peek'],
        attention=self.args['attention'],
        loss=self.args['loss'],
        optimizer=self.args['optimizer'],
    )
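# A minimal usage sketch for the seq2seq constructor above, kept as a comment
# so it is not executed on import. The keyword names mirror the keys read from
# **args in __init__; the embedding path and hyperparameter values are
# placeholders, not values taken from this repo.
#
# seq2seq = Seq2seq_OIE(seed=42,
#                       sep='\t',
#                       batch_size=50,
#                       maximum_output_length=20,
#                       emb_fn="glove.6B.50d.txt",   # placeholder path
#                       hidden_dim=128,
#                       input_depth=1,
#                       output_depth=1,
#                       peek=True,
#                       attention=True,
#                       epochs=10,
#                       loss="categorical_crossentropy",
#                       optimizer="adam")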
class RNN_model:
    """ Represents an RNN model for supervised OIE """
    def __init__(self,
                 model_fn,
                 sent_maxlen=None,
                 emb_filename=None,
                 batch_size=5,
                 seed=42,
                 sep='\t',
                 hidden_units=pow(2, 7),
                 trainable_emb=True,
                 emb_dropout=0.1,
                 num_of_latent_layers=2,
                 epochs=10,
                 pred_dropout=0.1,
                 model_dir="./models/",
                 classes=None,
                 pos_tag_embedding_size=5,
                 ):
        """
        Initialize the model
        model_fn - a model generating function, to be called when
                   training with self as a single argument.
        sent_maxlen - the maximum length in words of each sentence -
                      will be used for padding / truncating
        emb_filename - the filename from which to load the embedding
                       (currently only Glove. Idea: parse by filename)
        batch_size - batch size for training
        seed - the random seed for reproducibility
        sep - separator in the csv dataset files for this model
        hidden_units - number of hidden units per layer
        trainable_emb - controls if the loss should propagate to the word embeddings during training
        emb_dropout - the percentage of dropout during embedding
        num_of_latent_layers - how many LSTMs to stack
        epochs - the number of epochs to train the model
        pred_dropout - the proportion to dropout before prediction
        model_dir - the path in which to save the model
        classes - the classes to be encoded (list of strings)
        pos_tag_embedding_size - the number of features to use when encoding pos tags
        """
        self.model_fn = lambda: model_fn(self)
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.encoder = LabelEncoder()
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.classes = classes
        self.pos_tag_embedding_size = pos_tag_embedding_size
        np.random.seed(self.seed)

    def get_callbacks(self, X):
        """
        Sets these callbacks as a class member.
        X is the encoded dataset used to print a sample of the output.
        Callbacks created:
        1. Sample output each epoch
        2. Save best performing model each epoch
        """
        sample_output_callback = LambdaCallback(
            on_epoch_end=lambda epoch, logs:
            logging.debug(pformat(self.sample_labels(self.model.predict(X)))))
        checkpoint = ModelCheckpoint(
            os.path.join(self.model_dir, "weights.hdf5"),
            verbose=1,
            save_best_only=False)  # TODO: is there a way to save by best val_acc?
        return [sample_output_callback, checkpoint]

    def plot(self, fn, train_fn):
        """
        Plot this model to an image file
        Train file is needed as it influences the dimensions of the RNN
        """
        from keras.utils.visualize_util import plot
        X, Y = self.load_dataset(train_fn)
        self.model_fn()
        plot(self.model, to_file=fn)

    def classes_(self):
        """ Return the classes which are classified by this model """
        try:
            return self.encoder.classes_
        except:
            return self.classes

    def train_and_test(self, train_fn, dev_fn, test_fn, eval_metrics):
        """ Train and then test on given files """
        logging.info("Training..")
        self.train(train_fn, dev_fn)
        logging.info("Testing..")
        ret = self.test(test_fn, eval_metrics)
        logging.info("Done!")
        return ret

    def train(self, train_fn, dev_fn):
        """
        Train this model on a given train dataset
        Dev set is used for model checkpointing
        """
        X_train, Y_train = self.load_dataset(train_fn)
        X_dev, Y_dev = self.load_dataset(dev_fn)
        logging.debug("Classes: {}".format(
            (self.num_of_classes(), self.classes_())))

        # Set model params, called here after labels have been identified in load dataset
        self.model_fn()

        # Create a callback to print a sample after each epoch
        logging.debug("Training model on {}".format(train_fn))
        self.model.fit(X_train, Y_train,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       validation_data=(X_dev, Y_dev),
                       callbacks=self.get_callbacks(X_train))

    @staticmethod
    def consolidate_labels(labels):
        """
        Return a consolidated list of labels, e.g., O-A1 -> O, A1-I -> A
        """
        return list(map(RNN_model.consolidate_label, labels))

    @staticmethod
    def consolidate_label(label):
        """ Return a consolidated label, e.g., O-A1 -> O, A1-I -> A """
        return label.split("-")[0] if label.startswith("O") else label

    def predict_sentence(self, sent):
        """
        Return a predicted label for each word in an arbitrary length sentence
        sent - a list of string tokens
        """
        ret = []
        sent_str = " ".join(sent)

        # Extract predicates by looking at verbal POS
        preds = [(word.i, str(word))
                 for word in spacy_ws(sent_str)
                 if word.tag_.startswith("V")]

        # Calculate num of samples (round up to the nearest multiple of sent_maxlen)
        num_of_samples = np.ceil(
            float(len(sent)) / self.sent_maxlen) * self.sent_maxlen

        # Run RNN for each predicate on this sentence
        for ind, pred in preds:
            cur_sample = self.create_sample(sent, ind)
            X = self.encode_inputs([cur_sample])
            ret.append(((ind, pred),
                        [[(self.consolidate_label(label), float(prob))
                          for (label, prob) in label_list]
                         for label_list in self.transform_output_probs(
                             self.model.predict(X),
                             get_prob=True)[0][:len(sent)]]))  # "flatten" and truncate
        return ret

    def create_sample(self, sent, head_pred_id):
        """ Return a dataframe which could be given to encode_inputs """
        return pandas.DataFrame({"word": sent,
                                 "run_id": [-1] * len(sent),  # Mock running id
                                 "head_pred_id": head_pred_id})

    def test(self, test_fn, eval_metrics):
        """
        Evaluate this model on a test file
        eval_metrics is a list composed of:
        (name, f: (y_true, y_pred) -> float (some performance metric))
        Prints and returns the metrics' names and numbers
        """
        # Load gold and predict
        X, Y = self.load_dataset(test_fn)
        y = self.model.predict(X)

        # Get most probable predictions and flatten
        Y = RNN_model.consolidate_labels(
            self.transform_output_probs(Y).flatten())
        y = RNN_model.consolidate_labels(
            self.transform_output_probs(y).flatten())

        # Run evaluation metrics and report
        # TODO: is it possible to compare without the padding?
        ret = []
        for (metric_name, metric_func) in eval_metrics:
            ret.append((metric_name, metric_func(Y, y)))
            logging.debug("calculating {}".format(ret[-1]))

        for (metric_name, metric_val) in ret:
            logging.info("{}: {:.4f}".format(metric_name, metric_val))
        return Y, y, ret

    def load_dataset(self, fn):
        """ Load a supervised OIE dataset from file """
        df = pandas.read_csv(fn,
                             sep=self.sep,
                             header=0,
                             keep_default_na=False)

        # Encode one-hot representation of the labels
        if self.classes_() is None:
            self.encoder.fit(df.label.values)

        # Split according to sentences and encode
        sents = self.get_sents_from_df(df)
        return (self.encode_inputs(sents), self.encode_outputs(sents))

    def get_sents_from_df(self, df):
        """ Split a data frame by rows according to the sentences """
        return [
            df[df.run_id == run_id]
            for run_id in sorted(set(df.run_id.values))
        ]

    def get_fixed_size(self, sents):
        """
        Partition sents into lists of sent_maxlen elements
        (except the last in each sentence, which might be shorter)
        """
        return [
            sent[s_ind:s_ind + self.sent_maxlen]
            for sent in sents
            for s_ind in range(0, len(sent), self.sent_maxlen)
        ]

    def get_head_pred_word(self, full_sent):
        """ Get the head predicate word from a full sentence conll. """
        assert (len(set(full_sent.head_pred_id.values)) == 1)  # Sanity check
        pred_ind = full_sent.head_pred_id.values[0]

        return full_sent.word.values[pred_ind] \
            if pred_ind != -1 \
            else full_sent.pred.values[0].split(" ")[0]

    def encode_inputs(self, sents):
        """
        Given a dataframe which is already split to sentences,
        encode inputs for rnn classification.
        Should return a dictionary of sequences of samples of length maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []

        # Preproc to get all preds per run_id
        # Sanity check - make sure that all sents agree on run_id
        assert (all([len(set(sent.run_id.values)) == 1 for sent in sents]))
        run_id_to_pred = dict([(int(sent.run_id.values[0]),
                                self.get_head_pred_word(sent))
                               for sent in sents])

        # Construct a mapping from running word index to pos
        word_id_to_pos = {}
        for sent in sents:
            indices = sent.index.values
            words = sent.word.values

            for index, word in zip(indices, spacy_ws(" ".join(words))):
                word_id_to_pos[index] = word.tag_

        fixed_size_sents = self.get_fixed_size(sents)

        for sent in fixed_size_sents:
            assert (len(set(sent.run_id.values)) == 1)

            word_indices = sent.index.values
            sent_words = sent.word.values
            sent_str = " ".join(sent_words)

            pos_tags_encodings = [(SPACY_POS_TAGS.index(word_id_to_pos[word_ind])
                                   if word_id_to_pos[word_ind] in SPACY_POS_TAGS
                                   else 0)
                                  for word_ind in word_indices]

            word_encodings = [self.emb.get_word_index(w) for w in sent_words]

            # Same pred word encodings for all words in the sentence
            pred_word = run_id_to_pred[int(sent.run_id.values[0])]
            pred_word_encodings = [
                self.emb.get_word_index(pred_word) for _ in sent_words
            ]

            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = defaultdict(lambda: [])

        for name, sequence in zip(
                ["word_inputs", "predicate_inputs", "postags_inputs"],
                [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func=lambda: Pad_sample(),
                                         maxlen=self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.items()}

    def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list of sequences of samples of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)

        # Encode outputs
        for sent in sents:
            output_encodings.append(
                list(
                    np_utils.to_categorical(
                        list(self.transform_labels(sent.label.values)),
                        num_classes=self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape=(len(sents),
                                 self.sent_maxlen,
                                 self.num_of_classes()),
                          buffer=np.array(
                              pad_sequences(output_encodings,
                                            lambda: np.zeros(self.num_of_classes()),
                                            maxlen=self.sent_maxlen)))

    def transform_labels(self, labels):
        """ Encode a list of textual labels """
        # Fallback:
        # return self.encoder.transform(labels)
        classes = list(self.classes_())
        return [classes.index(label) for label in labels]

    def transform_output_probs(self, y, get_prob=False):
        """
        Given a list of probabilities over labels,
        get the textual representation of the most probable assignment
        """
        return np.array(
            self.sample_labels(
                y,
                num_of_sents=len(y),  # all sentences
                num_of_samples=max(map(len, y)),  # all words
                num_of_classes=1,  # only top probability
                start_index=0,  # start from the first word
                get_prob=get_prob,  # indicate whether to get only labels
            ))

    def inverse_transform_labels(self, indices):
        """ Decode a list of label indices back to their textual labels """
        classes = self.classes_()
        return [classes[ind] for ind in indices]

    def num_of_classes(self):
        """ Return the number of output classes """
        return len(self.classes_())

    # Functional Keras -- all of the following are currying functions expecting models as input
    # https://keras.io/getting-started/functional-api-guide/

    def embed_word(self):
        """ Embed word sequences using self's embedding class """
        return self.emb.get_keras_embedding(dropout=self.emb_dropout,
                                            trainable=self.trainable_emb,
                                            input_length=self.sent_maxlen)

    def embed_pos(self):
        """ Embed Part of Speech using this instance params """
        return Embedding(output_dim=self.pos_tag_embedding_size,
                         input_dim=len(SPACY_POS_TAGS),
                         input_length=self.sent_maxlen)

    def predict_classes(self):
        """
        Predict to the number of classes
        Named arguments are passed to the keras function
        """
        return lambda x: self.stack(x, [
            lambda: TimeDistributed(
                Dense(output_dim=self.num_of_classes(), activation="softmax"))
        ] + [
            lambda: TimeDistributed(Dense(self.hidden_units, activation='relu'))
        ] * 3)

    def stack_latent_layers(self, n):
        """ Stack n bidi LSTMs """
        return lambda x: self.stack(x, [
            lambda: Bidirectional(
                LSTM(self.hidden_units, return_sequences=True))
        ] * n)

    def stack(self, x, layers):
        """
        Stack layers (FIFO) by applying recursively on the output,
        until returning the input as the base case for the recursion
        """
        if not layers:
            return x  # Base case of the recursion is just returning the input
        else:
            return layers[0]()(self.stack(x, layers[1:]))

    def set_model_from_file(self):
        """
        Receives an instance of RNN and returns a model from the self.model_dir
        path which should contain a file named: model.json,
        and a single file with the hdf5 extension.
Note: Use this function for a pretrained model, running model training on the loaded model will override the files in the model_dir """ from glob import glob weights_fn = glob(os.path.join(self.model_dir, "*.hdf5")) assert len(weights_fn ) == 1, "More/Less than one weights file in {}: {}".format( self.model_dir, weights_fn) weights_fn = weights_fn[0] logging.debug("Weights file: {}".format(weights_fn)) self.model = model_from_json( open(os.path.join(self.model_dir, "./model.json")).read()) self.model.load_weights(weights_fn) self.model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=["accuracy"]) def set_vanilla_model(self): """ Set a Keras model for predicting OIE as a member of this class Can be passed as model_fn to the constructor """ logging.debug("Setting vanilla model") # Build model ## Embedding Layer word_embedding_layer = self.embed_word() pos_embedding_layer = self.embed_pos() ## Deep layers latent_layers = self.stack_latent_layers(self.num_of_latent_layers) ## Dropout dropout = Dropout(self.pred_dropout) ## Prediction predict_layer = self.predict_classes() ## Prepare input features, and indicate how to embed them inputs_and_embeddings = [ (Input(shape=(self.sent_maxlen, ), dtype="int32", name="word_inputs"), word_embedding_layer), (Input(shape=(self.sent_maxlen, ), dtype="int32", name="predicate_inputs"), word_embedding_layer), (Input(shape=(self.sent_maxlen, ), dtype="int32", name="postags_inputs"), pos_embedding_layer), ] ## Concat all inputs and run on deep network output = predict_layer( dropout( latent_layers( merge([embed(inp) for inp, embed in inputs_and_embeddings], mode="concat", concat_axis=-1)))) # Build model self.model = Model(input=map(itemgetter(0), inputs_and_embeddings), output=[output]) # Loss self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy']) self.model.summary() # Save model json to file self.save_model_to_file(os.path.join(self.model_dir, "model.json")) def to_json(self): """ Encode a json of the parameters needed to reload this model """ return { "sent_maxlen": self.sent_maxlen, "batch_size": self.batch_size, "seed": self.seed, "sep": self.sep, "classes": list(self.classes_()), "hidden_units": self.hidden_units, "trainable_emb": self.trainable_emb, "emb_dropout": self.emb_dropout, "num_of_latent_layers": self.num_of_latent_layers, "epochs": self.epochs, "pred_dropout": self.pred_dropout, "emb_filename": self.emb_filename, "pos_tag_embedding_size": self.pos_tag_embedding_size, } def save_model_to_file(self, fn): """ Saves this model to file, also encodes class inits in the model's json """ js = json.loads(self.model.to_json()) # Add this model's params js["rnn"] = self.to_json() with open(fn, 'w') as fout: json.dump(js, fout) def sample_labels(self, y, num_of_sents=5, num_of_samples=10, num_of_classes=3, start_index=5, get_prob=True): """ Get a sense of how labels in y look like """ classes = self.classes_() ret = [] for sent in y[:num_of_sents]: cur = [] for word in sent[start_index:start_index + num_of_samples]: sorted_prob = am(word) cur.append([(classes[ind], word[ind]) if get_prob else classes[ind] for ind in sorted_prob[:num_of_classes]]) ret.append(cur) return ret
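# A minimal usage sketch for RNN_model, kept as a comment so it is not executed
# on import. set_vanilla_model is passed as model_fn, as its docstring suggests;
# the embedding path, dataset files, and label inventory below are placeholders,
# not values taken from this repo.
#
# rnn = RNN_model(model_fn=RNN_model.set_vanilla_model,
#                 sent_maxlen=20,
#                 emb_filename="glove.6B.50d.txt",        # placeholder path
#                 classes=["O", "P-B", "A0-B", "A0-I"],   # placeholder label set
#                 epochs=10)
# rnn.train("train.conll", "dev.conll")                   # placeholder files
# rnn.predict_sentence("John gave Mary a book".split())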
class Confidence_model:
    """ Represents an RNN model for computing the confidence of an extraction """
    def __init__(self,
                 model_fn,
                 sent_maxlen=None,
                 emb_filename=None,
                 batch_size=5,
                 seed=42,
                 sep='\t',
                 hidden_units=pow(2, 7),
                 trainable_emb=True,
                 emb_dropout=0.1,
                 num_of_latent_layers=2,
                 epochs=10,
                 pred_dropout=0.1,
                 model_dir="./models/",
                 pos_tag_embedding_size=5,
                 ):
        """
        Initialize the model
        model_fn - a model generating function, to be called when
                   training with self as a single argument.
        sent_maxlen - the maximum length in words of each sentence -
                      will be used for padding / truncating
        batch_size - batch size for training
        seed - the random seed for reproducibility
        sep - separator in the csv dataset files for this model
        hidden_units - number of hidden units per layer
        trainable_emb - controls if the loss should propagate to the word embeddings during training
        emb_dropout - the percentage of dropout during embedding
        num_of_latent_layers - how many LSTMs to stack
        epochs - the number of epochs to train the model
        pred_dropout - the proportion to dropout before prediction
        model_dir - the path in which to save the model
        pos_tag_embedding_size - the number of features to use when encoding pos tags
        """
        self.model_fn = lambda: model_fn(self)
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.encoder = LabelEncoder()
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.pos_tag_embedding_size = pos_tag_embedding_size
        np.random.seed(self.seed)

        # TODO: this is not needed for confidence, which calculates a real value
        self.num_of_classes = lambda: 1
        self.classes_ = lambda: None

    def confidence_prediction(self, inputs_and_embeddings):
        """ Return a network computing confidence of the given OIE inputs """
        return predict_layer(
            dropout(
                latent_layers(
                    merge([embed(inp) for inp, embed in inputs_and_embeddings],
                          mode="concat",
                          concat_axis=-1))))

    # TODO: these should probably be deleted
    def transform_labels(self, labels):
        """ Encode a list of textual labels """
        # Fallback:
        # return self.encoder.transform(labels)
        classes = list(self.classes_())
        return [classes.index(label) for label in labels]

    # TODO: put all of the functions below in a super class (Functional_keras_model, Functional_sentenial_model)
    # General utils
    def plot(self, fn, train_fn):
        """
        Plot this model to an image file
        Train file is needed as it influences the dimensions of the RNN
        """
        from keras.utils.visualize_util import plot
        X, Y = self.load_dataset(train_fn)
        self.model_fn()
        plot(self.model, to_file=fn)

    def load_dataset(self, fn):
        """ Load a supervised OIE dataset from file """
        df = pandas.read_csv(fn, sep=self.sep, header=0)

        # Encode one-hot representation of the labels
        if self.classes_() is None:
            self.encoder.fit(df.label.values)

        # Split according to sentences and encode
        sents = self.get_sents_from_df(df)
        return (self.encode_inputs(sents), self.encode_outputs(sents))

    def get_sents_from_df(self, df):
        """ Split a data frame by rows according to the sentences """
        return [
            df[df.run_id == i]
            for i in range(min(df.run_id), max(df.run_id) + 1)  # include the last run_id
        ]

    def encode_inputs(self, sents):
        """
        Given a dataframe split to sentences, encode inputs for rnn classification.
        Should return a dictionary of sequences of samples of length maxlen.
""" word_inputs = [] pred_inputs = [] pos_inputs = [] sents = self.get_fixed_size(sents) for sent in sents: # pd assigns NaN for very infreq. empty string (see wiki train) sent_words = [ word if not (isinstance(word, float) and math.isnan(word)) else " " for word in sent.word.values ] pos_tags_encodings = [ NLTK_POS_TAGS.index(tag) for (_, tag) in nltk.pos_tag(sent_words) ] word_encodings = [self.emb.get_word_index(w) for w in sent_words] pred_word_encodings = [ self.emb.get_word_index(w) for w in sent_words ] word_inputs.append([Sample(w) for w in word_encodings]) pred_inputs.append([Sample(w) for w in pred_word_encodings]) pos_inputs.append([Sample(pos) for pos in pos_tags_encodings]) # Pad / truncate to desired maximum length ret = {"word_inputs": [], "predicate_inputs": []} ret = defaultdict(lambda: []) for name, sequence in zip( ["word_inputs", "predicate_inputs", "postags_inputs"], [word_inputs, pred_inputs, pos_inputs]): for samples in pad_sequences(sequence, pad_func=lambda: Pad_sample(), maxlen=self.sent_maxlen): ret[name].append([sample.encode() for sample in samples]) return {k: np.array(v) for k, v in ret.iteritems()} def get_fixed_size(self, sents): """ Partition sents into lists of sent_maxlen elements (execept the last in each sentence, which might be shorter) """ return [ sent[s_ind:s_ind + self.sent_maxlen] for sent in sents for s_ind in range(0, len(sent), self.sent_maxlen) ] def encode_outputs(self, sents): """ Given a dataframe split to sentences, encode outputs for rnn classification. Should return a list sequence of sample of length maxlen. """ output_encodings = [] sents = self.get_fixed_size(sents) # Encode outputs for sent in sents: output_encodings.append(list(np_utils.to_categorical(\ list(self.transform_labels(sent.label.values)), nb_classes = self.num_of_classes()))) # Pad / truncate to maximum length return np.ndarray(shape = (len(sents), self.sent_maxlen, self.num_of_classes()), buffer = np.array(pad_sequences(output_encodings, lambda : \ np.zeros(self.num_of_classes()), maxlen = self.sent_maxlen))) # Functional Keras -- all of the following are currying functions expecting models as input # https://keras.io/getting-started/functional-api-guide/ def embed_word(self): """ Embed word sequences using self's embedding class """ return self.emb.get_keras_embedding(dropout=self.emb_dropout, trainable=self.trainable_emb, input_length=self.sent_maxlen) def embed_pos(self): """ Embed Part of Speech using this instance params """ return Embedding(output_dim=self.pos_tag_embedding_size, input_dim=len(NLTK_POS_TAGS), input_length=self.sent_maxlen) def predict_classes(self): """ Predict to the number of classes Named arguments are passed to the keras function """ return lambda x: self.stack(x, [ lambda: TimeDistributed( Dense(output_dim=self.num_of_classes(), activation="softmax")) ] + [ lambda: TimeDistributed(Dense(self.hidden_units, activation='relu') ) ] * 3) def stack_latent_layers(self, n): """ Stack n bidi LSTMs """ return lambda x: self.stack(x, [ lambda: Bidirectional( LSTM(self.hidden_units, return_sequences=True)) ] * n) def stack(self, x, layers): """ Stack layers (FIFO) by applying recursively on the output, until returing the input as the base case for the recursion """ if not layers: return x # Base case of the recursion is the just returning the input else: return layers[0]()(self.stack(x, layers[1:])) def set_model(self): """ Set a Keras model for predicting OIE as a member of this class Can be passed as model_fn to the constructor """ 
logging.debug("Setting vanilla model") # Build model ## Embedding Layer word_embedding_layer = self.embed_word() pos_embedding_layer = self.embed_pos() # label_embedding_layer = self.embed_label() ## Deep layers latent_layers = self.stack_latent_layers(self.num_of_latent_layers) ## Dropout dropout = Dropout(self.pred_dropout) ## Prediction predict_layer = self.predict_classes() ## Prepare input features, and indicate how to embed them # True input true_input = [(Input(shape=(self.sent_maxlen, ), dtype="int32", name="word_inputs"), word_embedding_layer), (Input(shape=(self.sent_maxlen, ), dtype="int32", name="postags_inputs"), pos_embedding_layer)] corrupt_input = [(Input(shape=(self.sent_maxlen, ), dtype="int32", name="neg_word_inputs"), word_embedding_layer), (Input(shape=(self.sent_maxlen, ), dtype="int32", name="neg_postags_inputs"), pos_embedding_layer)] # true_input = [(Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "word_inputs"), # word_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "predicate_inputs"), # word_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "postags_inputs"), # pos_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "postags_inputs"), # label_embedding_layer), # ] # # Corrput negative sample # corrupt_input = [(Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "neg_word_inputs"), # word_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "neg_predicate_inputs"), # word_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "neg_postags_inputs"), # pos_embedding_layer), # (Input(shape = (self.sent_maxlen,), # dtype="int32", # name = "neg_postags_inputs"), # label_embedding_layer), # ] confidence_prediction = lambda inputs_and_embeddings:\ predict_layer(dropout(latent_layers(merge([embed(inp) for inp, embed in inputs_and_embeddings], mode = "concat", concat_axis = -1)))) # Compute two "branches" for confidence estimation - one true and one corrput true_confidence = confidence_prediction(true_input) neg_confidence = confidence_prediction(corrupt_input) # Combine these output = merge([true_confidence, neg_confidence], mode="sum") # Build model self.model = Model(input=map(itemgetter(0), true_input + corrupt_input), output=[output]) # Loss self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy']) self.model.summary() # Save model json to file self.save_model_to_file( os.path.join(self.model_dir, "confidence_model.json")) def to_json(self): """ Encode a json of the parameters needed to reload this model """ return { "sent_maxlen": self.sent_maxlen, "batch_size": self.batch_size, "seed": self.seed, "sep": self.sep, "hidden_units": self.hidden_units, "trainable_emb": self.trainable_emb, "emb_dropout": self.emb_dropout, "num_of_latent_layers": self.num_of_latent_layers, "epochs": self.epochs, "pred_dropout": self.pred_dropout, "emb_filename": self.emb_filename, "pos_tag_embedding_size": self.pos_tag_embedding_size, } def save_model_to_file(self, fn): """ Saves this model to file, also encodes class inits in the model's json """ js = json.loads(self.model.to_json()) # Add this model's params js["rnn"] = self.to_json() with open(fn, 'w') as fout: json.dump(js, fout)
class RNN_Model(LightningModule): def __init__(self, train_file, dev_file, test_file, emb_filename=None, sent_maxlen=300, batch_size=32, seed=42, sep='\t', hidden_units=pow(2, 7), trainable_emb=True, emb_dropout=0.1, num_of_latent_layers=2, epochs=10, pred_dropout=0.1, model_dir="./models/", classes=None, pos_tag_embedding_size=5, num_classes=15, num_workers=0, lr=0.001): # NOTE: So far, num classes must be provided at the beginning super(RNN_Model, self).__init__() self.train_file = train_file self.dev_file = dev_file self.test_file = test_file self.model_dir = model_dir self.sent_maxlen = sent_maxlen self.batch_size = batch_size self.seed = seed self.sep = sep self.hidden_units = hidden_units self.emb_filename = emb_filename self.emb = Glove(emb_filename) self.embedding_size = self.emb.dim self.trainable_emb = trainable_emb self.emb_dropout = emb_dropout self.num_of_latent_layers = num_of_latent_layers self.epochs = epochs self.pred_dropout = pred_dropout self.classes = classes self.label_map = None self.num_classes = num_classes self.num_workers = num_workers self.lr = lr if self.classes is not None: self.label_map = LabelEncoder() self.label_map.fit(self.classes) self.pos_tag_embedding_size = pos_tag_embedding_size np.random.seed(self.seed) # build_model self.build_model() def build_model(self): self.word_embedding = self.embed_word() self.pos_embedding = self.embed_pos() self.lstm = nn.LSTM(self.embedding_size * 2 + self.pos_tag_embedding_size, self.hidden_units, num_layers=self.num_of_latent_layers, bidirectional=True) ## Dropout self.dropout = nn.Dropout(self.pred_dropout) self.linears = nn.ModuleList( [nn.Linear(self.hidden_units * 2, self.hidden_units)]) self.linears.extend([ nn.Linear(self.hidden_units, self.hidden_units) for i in range(1) ]) linear_pred = nn.Linear(self.hidden_units, self.num_classes) self.linears.append(linear_pred) def forward(self, x): lengths = [len(i) for i in x[0]] batch_size = len(lengths) x = [rnn.pad_sequence(i) for i in x] sents, predicates, tags = x[0], x[1], x[2] embed_sent = self.word_embedding(sents) embed_predicate = self.word_embedding(predicates) embed_pos = self.pos_embedding(tags) embed = torch.cat([embed_sent, embed_predicate, embed_pos], dim=-1) out = rnn.pack_padded_sequence(embed, lengths, enforce_sorted=False) out, _ = self.lstm(out) out, _ = rnn.pad_packed_sequence(out) out = out.view(-1, batch_size, 2 * self.hidden_units) for i in self.linears: out = i(out) out = F.relu(out) out = rnn.pack_padded_sequence(out, lengths, enforce_sorted=False) return out def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.lr) def train_dataloader(self): self.train_dataset = OpenIE_CONLL_Dataset(self.train_file, self.emb, sep=self.sep, label_map=self.label_map, sent_maxlen=self.sent_maxlen) print("Num train instances:", len(self.train_dataset)) self.label_map = self.train_dataset.label_map self.classes = self.label_map.classes_ loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.train_dataset.collate, num_workers=self.num_workers) return loader def val_dataloader(self): self.dev_dataset = OpenIE_CONLL_Dataset(self.dev_file, self.emb, sep=self.sep, label_map=self.label_map, sent_maxlen=self.sent_maxlen) print("Num dev instances:", len(self.dev_dataset)) loader = DataLoader(self.dev_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.dev_dataset.collate, num_workers=self.num_workers) return loader def test_dataloader(self): dataset = 
OpenIE_CONLL_Dataset(self.test_file,
                                          self.emb,
                                          sep=self.sep,
                                          label_map=self.label_map,
                                          sent_maxlen=self.sent_maxlen)
        loader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=dataset.collate,
                            num_workers=self.num_workers)
        return loader

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)
        loss = F.cross_entropy(y_hat.data, y.data)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)
        loss = F.cross_entropy(y_hat.data, y.data)
        _, y_hat = torch.max(y_hat.data, dim=-1)  # 1 is for the class
        acc = accuracy_score(y.data.cpu(), y_hat.cpu())
        acc = torch.tensor(acc, dtype=torch.float)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss, 'avg_val_acc': avg_acc}
        return {
            'val_loss': avg_loss,
            'log': tensorboard_logs,
            'progress_bar': tensorboard_logs
        }

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)
        _, y_hat = torch.max(y_hat.data, dim=-1)  # 1 is for the class
        acc = accuracy_score(y.data.cpu(), y_hat.cpu())
        acc = torch.tensor(acc, dtype=torch.float)
        return {'test_acc': acc}

    def test_epoch_end(self, outputs):
        avg_acc = torch.stack([x['test_acc'] for x in outputs]).mean()
        tensorboard_logs = {'test_acc': avg_acc}
        return {
            'avg_test_acc': avg_acc,
            'log': tensorboard_logs,
            'progress_bar': tensorboard_logs
        }

    def compute_accuracy_packed(self, y_hat, y):
        pass

    def predict_sentence(self, sent):
        """
        Return a predicted label for each word in an arbitrary length sentence
        sent - a list of string tokens
        """
        # NOTE: relies on helpers (create_sample, encode_inputs, consolidate_label,
        # transform_output_probs) which are defined on the Keras RNN_model above,
        # not on this class.
        ret = []
        sent_str = " ".join(sent)

        # Extract predicates by looking at verbal POS
        preds = [(word.i, str(word))
                 for word in spacy_ws(sent_str)
                 if word.tag_.startswith("V")]

        # Calculate num of samples (round up to the nearest multiple of sent_maxlen)
        num_of_samples = int(
            np.ceil(float(len(sent)) / self.sent_maxlen) * self.sent_maxlen)

        # Run RNN for each predicate on this sentence
        for ind, pred in preds:
            cur_sample = self.create_sample(sent, ind)
            X = self.encode_inputs([cur_sample])
            ret.append(((ind, pred),
                        [(self.consolidate_label(label), float(prob))
                         for (label, prob) in self.transform_output_probs(
                             self.model.predict(X),  # "flatten" and truncate
                             get_prob=True).reshape(num_of_samples, 2)[:len(sent)]]))
        return ret

    def get_head_pred_word(self, full_sent):
        """ Get the head predicate word from a full sentence conll. """
        assert (len(set(full_sent.head_pred_id.values)) == 1)  # Sanity check
        pred_ind = full_sent.head_pred_id.values[0]

        return full_sent.word.values[pred_ind] \
            if pred_ind != -1 \
            else full_sent.pred.values[0].split(" ")[0]

    def embed_word(self):  # TODO: dropout and maxlen
        """ Embed word sequences using self's embedding class """
        return self.emb.get_torch_embedding(freeze=not self.trainable_emb)

    def embed_pos(self):
        """ Embed Part of Speech using this instance params """
        return nn.Embedding(len(SPACY_POS_TAGS), self.pos_tag_embedding_size)
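# A minimal training sketch for the Lightning model above, kept as a comment so
# it is not executed on import. The dataset and embedding paths are
# placeholders; the pytorch_lightning Trainer import is assumed to be available
# in this environment.
#
# import pytorch_lightning as pl
# model = RNN_Model(train_file="train.conll",        # placeholder paths
#                   dev_file="dev.conll",
#                   test_file="test.conll",
#                   emb_filename="glove.6B.50d.txt",
#                   num_classes=15)
# trainer = pl.Trainer(max_epochs=model.epochs)
# trainer.fit(model)
# trainer.test(model)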