Example #1
    def __init__(
        self,
        model_fn,
        sent_maxlen=None,
        emb_filename=None,
        batch_size=5,
        seed=42,
        sep='\t',
        hidden_units=pow(2, 7),
        trainable_emb=True,
        emb_dropout=0.1,
        num_of_latent_layers=2,
        epochs=10,
        pred_dropout=0.1,
        model_dir="./models/",
        classes=None,
        pos_tag_embedding_size=5,
    ):
        """
        Initialize the model
        model_fn - a model generating function, to be called when
                   training with self as a single argument.
        sent_maxlen - the maximum length in words of each sentence -
                      will be used for padding / truncating
        emb_filename - the filename from which to load the embedding
                       (Currently only Glove. Idea: parse by filename)
        batch_size - batch size for training
        seed - the random seed for reproducibility
        sep  - separator in the csv dataset files for this model
        hidden_units - number of hidden units per layer
        trainable_emb - controls if the loss should propagate to the word embeddings during training
        emb_dropout - the percentage of dropout during embedding
        num_of_latent_layers - how many LSTMs to stack
        epochs - the number of epochs to train the model
        pred_dropout - the proportion to dropout before prediction
        model_dir - the path in which to save model
        classes - the classes to be encoded (list of strings)
        pos_tag_embedding_size - The number of features to use when encoding pos tags
        """
        self.model_fn = lambda: model_fn(self)
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.encoder = LabelEncoder()
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.classes = classes
        self.pos_tag_embedding_size = pos_tag_embedding_size

        np.random.seed(self.seed)
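    # Usage sketch (hypothetical values; model_fn is an unbound method that receives
    # this instance and sets self.model, e.g. RNN_model.set_vanilla_model from Example #4):
    #   rnn = RNN_model(model_fn=RNN_model.set_vanilla_model,
    #                   sent_maxlen=20,
    #                   emb_filename="/path/to/glove.txt",
    #                   classes=["A0-B", "A0-I", "P-B", "P-I", "O"])
    #   rnn.model_fn()  # builds and compiles the underlying Keras model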
Example #2
    def __init__(self,
                 train_file,
                 dev_file,
                 test_file,
                 emb_filename=None,
                 sent_maxlen=300,
                 batch_size=32,
                 seed=42,
                 sep='\t',
                 hidden_units=pow(2, 7),
                 trainable_emb=True,
                 emb_dropout=0.1,
                 num_of_latent_layers=2,
                 epochs=10,
                 pred_dropout=0.1,
                 model_dir="./models/",
                 classes=None,
                 pos_tag_embedding_size=5,
                 num_classes=15,
                 num_workers=0,
                 lr=0.001):
        # NOTE: For now, num_classes must be provided at construction time
        super(RNN_Model, self).__init__()
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.classes = classes
        self.label_map = None
        self.num_classes = num_classes
        self.num_workers = num_workers
        self.lr = lr
        if self.classes is not None:
            self.label_map = LabelEncoder()
            self.label_map.fit(self.classes)
        self.pos_tag_embedding_size = pos_tag_embedding_size

        np.random.seed(self.seed)
        # build_model
        self.build_model()
Example #3
    def __init__(self, **args):
        """
        Init and compile model's params
        Arguments:
        seed - the random seed to use
        sep - the delimiter to be used in the csv files
        batch_size - (= input_length) Batch size in which to partition the elements
        maximum_output_length - The maximum number of words in output
        emb_fn - filename of the pretrained embeddings (loaded with Glove)
        hidden_dim - number of hidden units
        input_depth - the number of layers in encoder
        output_depth - the number of layers in decoder
        peek - (binary) add the peek feature
        attention - (binary) use attention model
        epochs - Number of epochs to train the model
        loss - (string) the loss function, one of keras options
        optimizer - (string) the optimizer function, one of keras options
        """
        self.args = args
        self.sep = str(self.args['sep'])
        self.emb = Glove(self.args['emb_fn'])
        self.epochs = self.args['epochs']
        np.random.seed(self.args['seed'])
        self.model = Seq2seq_OIE.compile_model(
            input_length=self.args['batch_size'],
            input_depth=self.args['input_depth'],
            input_dim=self.emb.dim,
            hidden_dim=self.args['hidden_dim'],
            output_length=self.args['maximum_output_length'],
            output_depth=self.args['output_depth'],
            output_dim=self.emb.dim,
            peek=self.args['peek'],
            attention=self.args['attention'],
            loss=self.args['loss'],
            optimizer=self.args['optimizer'],
        )
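    # Usage sketch (hypothetical values; the keys mirror the arguments read above):
    #   s2s = Seq2seq_OIE(seed=42, sep='\t', batch_size=32,
    #                     maximum_output_length=10, emb_fn="/path/to/glove.txt",
    #                     hidden_dim=128, input_depth=1, output_depth=1,
    #                     peek=True, attention=True, epochs=10,
    #                     loss='categorical_crossentropy', optimizer='adam')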
Example #4
class RNN_model:
    """
    Represents an RNN model for supervised OIE
    """
    def __init__(
        self,
        model_fn,
        sent_maxlen=None,
        emb_filename=None,
        batch_size=5,
        seed=42,
        sep='\t',
        hidden_units=pow(2, 7),
        trainable_emb=True,
        emb_dropout=0.1,
        num_of_latent_layers=2,
        epochs=10,
        pred_dropout=0.1,
        model_dir="./models/",
        classes=None,
        pos_tag_embedding_size=5,
    ):
        """
        Initialize the model
        model_fn - a model generating function, to be called when
                   training with self as a single argument.
        sent_maxlen - the maximum length in words of each sentence -
                      will be used for padding / truncating
        emb_filename - the filename from which to load the embedding
                       (Currently only Glove. Idea: parse by filename)
        batch_size - batch size for training
        seed - the random seed for reproducibility
        sep  - separator in the csv dataset files for this model
        hidden_units - number of hidden units per layer
        trainable_emb - controls if the loss should propagate to the word embeddings during training
        emb_dropout - the percentage of dropout during embedding
        num_of_latent_layers - how many LSTMs to stack
        epochs - the number of epochs to train the model
        pred_dropout - the proportion to dropout before prediction
        model_dir - the path in which to save model
        classes - the classes to be encoded (list of strings)
        pos_tag_embedding_size - The number of features to use when encoding pos tags
        """
        self.model_fn = lambda: model_fn(self)
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.encoder = LabelEncoder()
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.classes = classes
        self.pos_tag_embedding_size = pos_tag_embedding_size

        np.random.seed(self.seed)

    def get_callbacks(self, X):
        """
        Sets these callbacks as a class member.
        X is the encoded dataset used to print a sample of the output.
        Callbacks created:
        1. Sample output each epoch
        2. Save best performing model each epoch
        """

        sample_output_callback = LambdaCallback(
            on_epoch_end=lambda epoch, logs: logging.debug(
                pformat(self.sample_labels(self.model.predict(X)))))
        checkpoint = ModelCheckpoint(
            os.path.join(self.model_dir, "weights.hdf5"),
            verbose=1,
            save_best_only=False
        )  # TODO: is there a way to save by best val_acc?

        return [sample_output_callback, checkpoint]

    def plot(self, fn, train_fn):
        """
        Plot this model to an image file
        Train file is needed as it influences the dimensions of the RNN
        """
        from keras.utils.visualize_util import plot
        X, Y = self.load_dataset(train_fn)
        self.model_fn()
        plot(self.model, to_file=fn)

    def classes_(self):
        """
        Return the classes which are classified by this model
        """
        try:
            return self.encoder.classes_
        except AttributeError:
            return self.classes

    def train_and_test(self, train_fn, test_fn):
        """
        Train and then test on given files
        """
        logging.info("Training..")
        self.train(train_fn)
        logging.info("Testing..")
        return self.test(test_fn)
        logging.info("Done!")

    def train(self, train_fn, dev_fn):
        """
        Train this model on a given train dataset
        Dev test is used for model checkpointing
        """
        X_train, Y_train = self.load_dataset(train_fn)
        X_dev, Y_dev = self.load_dataset(dev_fn)
        logging.debug("Classes: {}".format(
            (self.num_of_classes(), self.classes_())))
        # Set model params, called here after labels have been identified in load dataset
        self.model_fn()

        # Create a callback to print a sample after each epoch
        logging.debug("Training model on {}".format(train_fn))
        self.model.fit(X_train,
                       Y_train,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       validation_data=(X_dev, Y_dev),
                       callbacks=self.get_callbacks(X_train))

    @staticmethod
    def consolidate_labels(labels):
        """
        Return a consolidated list of labels, e.g., "O-A1" -> "O"; labels not starting with "O" are returned unchanged
        """
        return map(RNN_model.consolidate_label, labels)

    @staticmethod
    def consolidate_label(label):
        """
        Return a consolidated label, e.g., "O-A1" -> "O"; labels not starting with "O" are returned unchanged
        """
        return label.split("-")[0] if label.startswith("O") else label
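    # For illustration (this matches the implementation above):
    #   consolidate_label("O-A1")  -> "O"     (labels starting with "O" are cut at the first "-")
    #   consolidate_label("A1-I")  -> "A1-I"  (all other labels are returned unchanged)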

    def predict_sentence(self, sent):
        """
        Return a predicted label for each word in an arbitrary length sentence
        sent - a list of string tokens
        """
        ret = []
        sent_str = " ".join(sent)

        # Extract predicates by looking at verbal POS

        preds = [(word.i, str(word)) for word in spacy_ws(sent_str)
                 if word.tag_.startswith("V")]

        # Calculate num of samples (round up to the nearest multiple of sent_maxlen)
        num_of_samples = np.ceil(
            float(len(sent)) / self.sent_maxlen) * self.sent_maxlen

        # Run RNN for each predicate on this sentence
        for ind, pred in preds:
            cur_sample = self.create_sample(sent, ind)
            X = self.encode_inputs([cur_sample])
            ret.append((
                (ind, pred),
                [
                    [(self.consolidate_label(label), float(prob))
                     for (label, prob) in label_list]
                    for label_list in  #for (label, prob) in
                    self.transform_output_probs(
                        self.model.predict(X),  # "flatten" and truncate
                        get_prob=True)[0][:len(sent)]
                ]))
        return ret

    def create_sample(self, sent, head_pred_id):
        """
        Return a dataframe which could be given to encode_inputs
        """
        return pandas.DataFrame({
            "word": sent,
            "run_id": [-1] * len(sent),  # Mock running id
            "head_pred_id": head_pred_id
        })

    def test(self, test_fn, eval_metrics):
        """
        Evaluate this model on a test file
        eval metrics is a list composed of:
        (name, f: (y_true, y_pred) -> float (some performance metric))
        Prints and returns the metrics name and numbers
        """
        # Load gold and predict
        X, Y = self.load_dataset(test_fn)
        y = self.model.predict(X)

        # Get most probable predictions and flatten
        Y = RNN_model.consolidate_labels(
            self.transform_output_probs(Y).flatten())
        y = RNN_model.consolidate_labels(
            self.transform_output_probs(y).flatten())

        # Run evaluation metrics and report
        # TODO: is it possible to compare without the padding?
        ret = []
        for (metric_name, metric_func) in eval_metrics:
            ret.append((metric_name, metric_func(Y, y)))
            logging.debug("calculating {}".format(ret[-1]))

        for (metric_name, metric_val) in ret:
            logging.info("{}: {:.4f}".format(metric_name, metric_val))
        return Y, y, ret

    def load_dataset(self, fn):
        """
        Load a supervised OIE dataset from file
        """
        df = pandas.read_csv(fn, sep=self.sep, header=0, keep_default_na=False)

        # Encode one-hot representation of the labels
        if self.classes_() is None:
            self.encoder.fit(df.label.values)

        # Split according to sentences and encode
        sents = self.get_sents_from_df(df)
        return (self.encode_inputs(sents), self.encode_outputs(sents))

    def get_sents_from_df(self, df):
        """
        Split a data frame by rows according to the sentences
        """
        return [
            df[df.run_id == run_id] for run_id in sorted(set(df.run_id.values))
        ]

    def get_fixed_size(self, sents):
        """
        Partition sents into lists of sent_maxlen elements
        (except the last in each sentence, which might be shorter)
        """
        return [
            sent[s_ind:s_ind + self.sent_maxlen] for sent in sents
            for s_ind in range(0, len(sent), self.sent_maxlen)
        ]
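    # For illustration: with sent_maxlen = 3, a sentence dataframe of 7 rows is split
    # into chunks of 3, 3 and 1 rows -- only the last chunk may be shorter.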

    def get_head_pred_word(self, full_sent):
        """
        Get the head predicate word from a full sentence conll.
        """
        assert (len(set(full_sent.head_pred_id.values)) == 1)  # Sanity check
        pred_ind = full_sent.head_pred_id.values[0]

        return full_sent.word.values[pred_ind] \
            if pred_ind != -1 \
               else full_sent.pred.values[0].split(" ")[0]

    def encode_inputs(self, sents):
        """
        Given a dataframe which is already split to sentences,
        encode inputs for rnn classification.
        Should return a dictionary of sequences of sample of length maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []

        # Preproc to get all preds per run_id
        # Sanity check - make sure that all sents agree on run_id
        assert (all([len(set(sent.run_id.values)) == 1 for sent in sents]))
        run_id_to_pred = dict([(int(sent.run_id.values[0]),
                                self.get_head_pred_word(sent))
                               for sent in sents])

        # Construct a mapping from running word index to pos
        word_id_to_pos = {}
        for sent in sents:
            indices = sent.index.values
            words = sent.word.values

            for index, word in zip(indices, spacy_ws(" ".join(words))):
                word_id_to_pos[index] = word.tag_

        fixed_size_sents = self.get_fixed_size(sents)

        for sent in fixed_size_sents:

            assert (len(set(sent.run_id.values)) == 1)

            word_indices = sent.index.values
            sent_words = sent.word.values

            sent_str = " ".join(sent_words)



            pos_tags_encodings = [(SPACY_POS_TAGS.index(word_id_to_pos[word_ind]) \
                                   if word_id_to_pos[word_ind] in SPACY_POS_TAGS \
                                   else 0)
                                  for word_ind
                                  in word_indices]

            word_encodings = [self.emb.get_word_index(w) for w in sent_words]

            # Same pred word encodings for all words in the sentence
            pred_word = run_id_to_pred[int(sent.run_id.values[0])]
            pred_word_encodings = [
                self.emb.get_word_index(pred_word) for _ in sent_words
            ]

            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = defaultdict(lambda: [])

        for name, sequence in zip(
            ["word_inputs", "predicate_inputs", "postags_inputs"],
            [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func=lambda: Pad_sample(),
                                         maxlen=self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.iteritems()}

    def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list sequence of sample of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(
                list(
                    np_utils.to_categorical(
                        list(self.transform_labels(sent.label.values)),
                        num_classes=self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape=(len(sents),
                                 self.sent_maxlen,
                                 self.num_of_classes()),
                          buffer=np.array(pad_sequences(output_encodings,
                                                        lambda: np.zeros(self.num_of_classes()),
                                                        maxlen=self.sent_maxlen)))

    def transform_labels(self, labels):
        """
        Encode a list of textual labels
        """
        # Fallback:
        # return self.encoder.transform(labels)
        classes = list(self.classes_())
        return [classes.index(label) for label in labels]

    def transform_output_probs(self, y, get_prob=False):
        """
        Given a list of probabilities over labels, get the textual representation of the
        most probable assignment
        """
        return np.array(
            self.sample_labels(
                y,
                num_of_sents=len(y),  # all sentences
                num_of_samples=max(map(len, y)),  # all words
                num_of_classes=1,  # Only top probability
                start_index=0,  # all sentences
                get_prob=get_prob,  # Indicate whether to get only labels
            ))

    def inverse_transform_labels(self, indices):
        """
        Decode a list of label indices back into their textual labels
        """
        classes = self.classes_()
        return [classes[ind] for ind in indices]

    def num_of_classes(self):
        """
        Return the number of output classes
        """
        return len(self.classes_())

    # Functional Keras -- all of the following are currying functions expecting models as input
    # https://keras.io/getting-started/functional-api-guide/

    def embed_word(self):
        """
        Embed word sequences using self's embedding class
        """
        return self.emb.get_keras_embedding(dropout=self.emb_dropout,
                                            trainable=self.trainable_emb,
                                            input_length=self.sent_maxlen)

    def embed_pos(self):
        """
        Embed Part of Speech using this instance params
        """
        return Embedding(output_dim=self.pos_tag_embedding_size,
                         input_dim=len(SPACY_POS_TAGS),
                         input_length=self.sent_maxlen)

    def predict_classes(self):
        """
        Predict to the number of classes
        Named arguments are passed to the keras function
        """
        return lambda x: self.stack(x, [
            lambda: TimeDistributed(
                Dense(output_dim=self.num_of_classes(), activation="softmax"))
        ] + [
            lambda: TimeDistributed(Dense(self.hidden_units, activation='relu')
                                    )
        ] * 3)
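    # Note: stack() applies the layer list back to front (see below), so the three
    # ReLU TimeDistributed(Dense) layers run first and the softmax layer is applied last.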

    def stack_latent_layers(self, n):
        """
        Stack n bidi LSTMs
        """
        return lambda x: self.stack(x, [
            lambda: Bidirectional(
                LSTM(self.hidden_units, return_sequences=True))
        ] * n)

    def stack(self, x, layers):
        """
        Stack layers (FIFO) by applying recursively on the output,
        until returning the input as the base case for the recursion
        """
        if not layers:
            return x  # Base case of the recursion: just return the input
        else:
            return layers[0]()(self.stack(x, layers[1:]))
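    # For illustration: stack(x, [a, b, c]) evaluates to a()(b()(c()(x))),
    # i.e. the last layer factory in the list is applied to the input first.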

    def set_model_from_file(self):
        """
        Receives an instance of RNN and returns a model from the self.model_dir
        path which should contain a file named: model.json,
        and a single file with the hdf5 extension.
        Note: Use this function for a pretrained model, running model training
        on the loaded model will override the files in the model_dir
        """
        from glob import glob

        weights_fn = glob(os.path.join(self.model_dir, "*.hdf5"))
        assert len(weights_fn
                   ) == 1, "More/Less than one weights file in {}: {}".format(
                       self.model_dir, weights_fn)
        weights_fn = weights_fn[0]
        logging.debug("Weights file: {}".format(weights_fn))
        self.model = model_from_json(
            open(os.path.join(self.model_dir, "./model.json")).read())
        self.model.load_weights(weights_fn)
        self.model.compile(optimizer="adam",
                           loss='categorical_crossentropy',
                           metrics=["accuracy"])

    def set_vanilla_model(self):
        """
        Set a Keras model for predicting OIE as a member of this class
        Can be passed as model_fn to the constructor
        """
        logging.debug("Setting vanilla model")
        # Build model

        ## Embedding Layer
        word_embedding_layer = self.embed_word()
        pos_embedding_layer = self.embed_pos()

        ## Deep layers
        latent_layers = self.stack_latent_layers(self.num_of_latent_layers)

        ## Dropout
        dropout = Dropout(self.pred_dropout)

        ## Prediction
        predict_layer = self.predict_classes()

        ## Prepare input features, and indicate how to embed them
        inputs_and_embeddings = [
            (Input(shape=(self.sent_maxlen, ),
                   dtype="int32",
                   name="word_inputs"), word_embedding_layer),
            (Input(shape=(self.sent_maxlen, ),
                   dtype="int32",
                   name="predicate_inputs"), word_embedding_layer),
            (Input(shape=(self.sent_maxlen, ),
                   dtype="int32",
                   name="postags_inputs"), pos_embedding_layer),
        ]

        ## Concat all inputs and run on deep network
        output = predict_layer(
            dropout(
                latent_layers(
                    merge([embed(inp) for inp, embed in inputs_and_embeddings],
                          mode="concat",
                          concat_axis=-1))))

        # Build model
        self.model = Model(input=map(itemgetter(0), inputs_and_embeddings),
                           output=[output])

        # Loss
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        self.model.summary()

        # Save model json to file
        self.save_model_to_file(os.path.join(self.model_dir, "model.json"))
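    # Usage sketch (hypothetical file paths; assumes tab-separated datasets in the
    # format expected by load_dataset above):
    #   rnn = RNN_model(model_fn=RNN_model.set_vanilla_model,
    #                   emb_filename="/path/to/glove.txt", sent_maxlen=20)
    #   rnn.train("train.tsv", "dev.tsv")   # builds the model via model_fn, then fits it
    #   preds = rnn.predict_sentence("John gave Mary a book".split())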

    def to_json(self):
        """
        Encode a json of the parameters needed to reload this model
        """
        return {
            "sent_maxlen": self.sent_maxlen,
            "batch_size": self.batch_size,
            "seed": self.seed,
            "sep": self.sep,
            "classes": list(self.classes_()),
            "hidden_units": self.hidden_units,
            "trainable_emb": self.trainable_emb,
            "emb_dropout": self.emb_dropout,
            "num_of_latent_layers": self.num_of_latent_layers,
            "epochs": self.epochs,
            "pred_dropout": self.pred_dropout,
            "emb_filename": self.emb_filename,
            "pos_tag_embedding_size": self.pos_tag_embedding_size,
        }

    def save_model_to_file(self, fn):
        """
        Saves this model to file, also encodes class inits in the model's json
        """
        js = json.loads(self.model.to_json())

        # Add this model's params
        js["rnn"] = self.to_json()
        with open(fn, 'w') as fout:
            json.dump(js, fout)

    def sample_labels(self,
                      y,
                      num_of_sents=5,
                      num_of_samples=10,
                      num_of_classes=3,
                      start_index=5,
                      get_prob=True):
        """
        Get a sense of how labels in y look like
        """
        classes = self.classes_()
        ret = []
        for sent in y[:num_of_sents]:
            cur = []
            for word in sent[start_index:start_index + num_of_samples]:
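                # am() is a helper defined elsewhere in the source module; as used here
                # it returns class indices sorted by descending probability.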
                sorted_prob = am(word)
                cur.append([(classes[ind],
                             word[ind]) if get_prob else classes[ind]
                            for ind in sorted_prob[:num_of_classes]])
            ret.append(cur)
        return ret
class Confidence_model:
    """
    Represents an RNN model for computing the confidence of an extraction
    """
    def __init__(
        self,
        model_fn,
        sent_maxlen=None,
        emb_filename=None,
        batch_size=5,
        seed=42,
        sep='\t',
        hidden_units=pow(2, 7),
        trainable_emb=True,
        emb_dropout=0.1,
        num_of_latent_layers=2,
        epochs=10,
        pred_dropout=0.1,
        model_dir="./models/",
        pos_tag_embedding_size=5,
    ):
        """
        Initialize the model
        model_fn - a model generating function, to be called when
                   training with self as a single argument.
        sent_maxlen - the maximum length in words of each sentence -
                      will be used for padding / truncating
        batch_size - batch size for training
        seed - the random seed for reproducibility
        sep  - separator in the csv dataset files for this model
        hidden_units - number of hidden units per layer
        trainable_emb - controls if the loss should propagate to the word embeddings during training
        emb_dropout - the percentage of dropout during embedding
        num_of_latent_layers - how many LSTMs to stack
        epochs - the number of epochs to train the model
        pred_dropout - the proportion to dropout before prediction
        model_dir - the path in which to save the model
        pos_tag_embedding_size - The number of features to use when encoding pos tags
        """
        self.model_fn = lambda: model_fn(self)
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.encoder = LabelEncoder()
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.pos_tag_embedding_size = pos_tag_embedding_size

        np.random.seed(self.seed)

        # TODO: this is not needed for confidence, which calculates a real value
        self.num_of_classes = lambda: 1
        self.classes_ = lambda: None

    def confidence_prediction(self, inputs_and_embeddings):
        """
        Return a network computing confidence of the given OIE inputs
        """
        return predict_layer(
            dropout(
                latent_layers(
                    merge([embed(inp) for inp, embed in inputs_and_embeddings],
                          mode="concat",
                          concat_axis=-1))))

    # TODO: these should probably be deleted

    def transform_labels(self, labels):
        """
        Encode a list of textual labels
        """
        # Fallback:
        # return self.encoder.transform(labels)
        classes = list(self.classes_())
        return [classes.index(label) for label in labels]

    # TODO: put all of the functions below in a super class (Functional_keras_model, Functional_sentenial_model)
    # General utils

    def plot(self, fn, train_fn):
        """
        Plot this model to an image file
        Train file is needed as it influences the dimensions of the RNN
        """
        from keras.utils.visualize_util import plot
        X, Y = self.load_dataset(train_fn)
        self.model_fn()
        plot(self.model, to_file=fn)

    def load_dataset(self, fn):
        """
        Load a supervised OIE dataset from file
        """
        df = pandas.read_csv(fn, sep=self.sep, header=0)

        # Encode one-hot representation of the labels
        if self.classes_() is None:
            self.encoder.fit(df.label.values)

        # Split according to sentences and encode
        sents = self.get_sents_from_df(df)
        return (self.encode_inputs(sents), self.encode_outputs(sents))

    def get_sents_from_df(self, df):
        """
        Split a data frame by rows according to the sentences
        """
        return [
            df[df.run_id == i] for i in range(min(df.run_id), max(df.run_id) + 1)
        ]

    def encode_inputs(self, sents):
        """
        Given a dataframe split to sentences, encode inputs for rnn classification.
        Should return a dictionary of sequences of sample of length maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []
        sents = self.get_fixed_size(sents)

        for sent in sents:
            # pandas occasionally parses a rare empty-string cell as NaN (seen in the wiki train set)
            sent_words = [
                word
                if not (isinstance(word, float) and math.isnan(word)) else " "
                for word in sent.word.values
            ]

            pos_tags_encodings = [
                NLTK_POS_TAGS.index(tag)
                for (_, tag) in nltk.pos_tag(sent_words)
            ]
            word_encodings = [self.emb.get_word_index(w) for w in sent_words]
            pred_word_encodings = [
                self.emb.get_word_index(w) for w in sent_words
            ]
            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = {"word_inputs": [], "predicate_inputs": []}
        ret = defaultdict(lambda: [])

        for name, sequence in zip(
            ["word_inputs", "predicate_inputs", "postags_inputs"],
            [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func=lambda: Pad_sample(),
                                         maxlen=self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.iteritems()}

    def get_fixed_size(self, sents):
        """
        Partition sents into lists of sent_maxlen elements
        (except the last in each sentence, which might be shorter)
        """
        return [
            sent[s_ind:s_ind + self.sent_maxlen] for sent in sents
            for s_ind in range(0, len(sent), self.sent_maxlen)
        ]

    def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list sequence of sample of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(
                list(
                    np_utils.to_categorical(
                        list(self.transform_labels(sent.label.values)),
                        nb_classes=self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape=(len(sents),
                                 self.sent_maxlen,
                                 self.num_of_classes()),
                          buffer=np.array(pad_sequences(output_encodings,
                                                        lambda: np.zeros(self.num_of_classes()),
                                                        maxlen=self.sent_maxlen)))

    # Functional Keras -- all of the following are currying functions expecting models as input
    # https://keras.io/getting-started/functional-api-guide/

    def embed_word(self):
        """
        Embed word sequences using self's embedding class
        """
        return self.emb.get_keras_embedding(dropout=self.emb_dropout,
                                            trainable=self.trainable_emb,
                                            input_length=self.sent_maxlen)

    def embed_pos(self):
        """
        Embed Part of Speech using this instance params
        """
        return Embedding(output_dim=self.pos_tag_embedding_size,
                         input_dim=len(NLTK_POS_TAGS),
                         input_length=self.sent_maxlen)

    def predict_classes(self):
        """
        Predict to the number of classes
        Named arguments are passed to the keras function
        """
        return lambda x: self.stack(x, [
            lambda: TimeDistributed(
                Dense(output_dim=self.num_of_classes(), activation="softmax"))
        ] + [
            lambda: TimeDistributed(Dense(self.hidden_units, activation='relu')
                                    )
        ] * 3)

    def stack_latent_layers(self, n):
        """
        Stack n bidi LSTMs
        """
        return lambda x: self.stack(x, [
            lambda: Bidirectional(
                LSTM(self.hidden_units, return_sequences=True))
        ] * n)

    def stack(self, x, layers):
        """
        Stack layers (FIFO) by applying recursively on the output,
        until returning the input as the base case for the recursion
        """
        if not layers:
            return x  # Base case of the recursion: just return the input
        else:
            return layers[0]()(self.stack(x, layers[1:]))

    def set_model(self):
        """
        Set a Keras model for predicting OIE as a member of this class
        Can be passed as model_fn to the constructor
        """
        logging.debug("Setting vanilla model")
        # Build model

        ## Embedding Layer
        word_embedding_layer = self.embed_word()
        pos_embedding_layer = self.embed_pos()
        #        label_embedding_layer = self.embed_label()

        ## Deep layers
        latent_layers = self.stack_latent_layers(self.num_of_latent_layers)

        ## Dropout
        dropout = Dropout(self.pred_dropout)

        ## Prediction
        predict_layer = self.predict_classes()

        ## Prepare input features, and indicate how to embed them

        # True input
        true_input = [(Input(shape=(self.sent_maxlen, ),
                             dtype="int32",
                             name="word_inputs"), word_embedding_layer),
                      (Input(shape=(self.sent_maxlen, ),
                             dtype="int32",
                             name="postags_inputs"), pos_embedding_layer)]

        corrupt_input = [(Input(shape=(self.sent_maxlen, ),
                                dtype="int32",
                                name="neg_word_inputs"), word_embedding_layer),
                         (Input(shape=(self.sent_maxlen, ),
                                dtype="int32",
                                name="neg_postags_inputs"),
                          pos_embedding_layer)]

        # true_input = [(Input(shape = (self.sent_maxlen,),
        #                      dtype="int32",
        #                      name = "word_inputs"),
        #                word_embedding_layer),
        #               (Input(shape = (self.sent_maxlen,),
        #                      dtype="int32",
        #                      name = "predicate_inputs"),
        #                word_embedding_layer),
        #               (Input(shape = (self.sent_maxlen,),
        #                      dtype="int32",
        #                      name = "postags_inputs"),
        #                pos_embedding_layer),
        #               (Input(shape = (self.sent_maxlen,),
        #                      dtype="int32",
        #                      name = "postags_inputs"),
        #                label_embedding_layer),
        # ]

        # # Corrupt negative sample
        # corrupt_input = [(Input(shape = (self.sent_maxlen,),
        #                                 dtype="int32",
        #                         name = "neg_word_inputs"),
        #                   word_embedding_layer),
        #                  (Input(shape = (self.sent_maxlen,),
        #                         dtype="int32",
        #                         name = "neg_predicate_inputs"),
        #                   word_embedding_layer),
        #                  (Input(shape = (self.sent_maxlen,),
        #                         dtype="int32",
        #                         name = "neg_postags_inputs"),
        #                   pos_embedding_layer),
        #                  (Input(shape = (self.sent_maxlen,),
        #                         dtype="int32",
        #                         name = "neg_postags_inputs"),
        #                   label_embedding_layer),
        # ]

        confidence_prediction = lambda inputs_and_embeddings: \
            predict_layer(
                dropout(
                    latent_layers(
                        merge([embed(inp) for inp, embed in inputs_and_embeddings],
                              mode="concat",
                              concat_axis=-1))))

        # Compute two "branches" for confidence estimation - one true and one corrupt
        true_confidence = confidence_prediction(true_input)
        neg_confidence = confidence_prediction(corrupt_input)

        # Combine these
        output = merge([true_confidence, neg_confidence], mode="sum")

        # Build model
        self.model = Model(input=map(itemgetter(0),
                                     true_input + corrupt_input),
                           output=[output])

        # Loss
        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['categorical_accuracy'])
        self.model.summary()

        # Save model json to file
        self.save_model_to_file(
            os.path.join(self.model_dir, "confidence_model.json"))

    def to_json(self):
        """
        Encode a json of the parameters needed to reload this model
        """
        return {
            "sent_maxlen": self.sent_maxlen,
            "batch_size": self.batch_size,
            "seed": self.seed,
            "sep": self.sep,
            "hidden_units": self.hidden_units,
            "trainable_emb": self.trainable_emb,
            "emb_dropout": self.emb_dropout,
            "num_of_latent_layers": self.num_of_latent_layers,
            "epochs": self.epochs,
            "pred_dropout": self.pred_dropout,
            "emb_filename": self.emb_filename,
            "pos_tag_embedding_size": self.pos_tag_embedding_size,
        }

    def save_model_to_file(self, fn):
        """
        Saves this model to file, also encodes class inits in the model's json
        """
        js = json.loads(self.model.to_json())

        # Add this model's params
        js["rnn"] = self.to_json()
        with open(fn, 'w') as fout:
            json.dump(js, fout)
Example #6
class RNN_Model(LightningModule):
    def __init__(self,
                 train_file,
                 dev_file,
                 test_file,
                 emb_filename=None,
                 sent_maxlen=300,
                 batch_size=32,
                 seed=42,
                 sep='\t',
                 hidden_units=pow(2, 7),
                 trainable_emb=True,
                 emb_dropout=0.1,
                 num_of_latent_layers=2,
                 epochs=10,
                 pred_dropout=0.1,
                 model_dir="./models/",
                 classes=None,
                 pos_tag_embedding_size=5,
                 num_classes=15,
                 num_workers=0,
                 lr=0.001):
        # NOTE: For now, num_classes must be provided at construction time
        super(RNN_Model, self).__init__()
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.model_dir = model_dir
        self.sent_maxlen = sent_maxlen
        self.batch_size = batch_size
        self.seed = seed
        self.sep = sep
        self.hidden_units = hidden_units
        self.emb_filename = emb_filename
        self.emb = Glove(emb_filename)
        self.embedding_size = self.emb.dim
        self.trainable_emb = trainable_emb
        self.emb_dropout = emb_dropout
        self.num_of_latent_layers = num_of_latent_layers
        self.epochs = epochs
        self.pred_dropout = pred_dropout
        self.classes = classes
        self.label_map = None
        self.num_classes = num_classes
        self.num_workers = num_workers
        self.lr = lr
        if self.classes is not None:
            self.label_map = LabelEncoder()
            self.label_map.fit(self.classes)
        self.pos_tag_embedding_size = pos_tag_embedding_size

        np.random.seed(self.seed)
        # build_model
        self.build_model()

    def build_model(self):
        self.word_embedding = self.embed_word()
        self.pos_embedding = self.embed_pos()
        self.lstm = nn.LSTM(self.embedding_size * 2 +
                            self.pos_tag_embedding_size,
                            self.hidden_units,
                            num_layers=self.num_of_latent_layers,
                            bidirectional=True)

        ## Dropout
        self.dropout = nn.Dropout(self.pred_dropout)
        self.linears = nn.ModuleList(
            [nn.Linear(self.hidden_units * 2, self.hidden_units)])
        self.linears.extend([
            nn.Linear(self.hidden_units, self.hidden_units) for i in range(1)
        ])
        linear_pred = nn.Linear(self.hidden_units, self.num_classes)
        self.linears.append(linear_pred)

    def forward(self, x):
        lengths = [len(i) for i in x[0]]
        batch_size = len(lengths)
        x = [rnn.pad_sequence(i) for i in x]
        sents, predicates, tags = x[0], x[1], x[2]

        embed_sent = self.word_embedding(sents)
        embed_predicate = self.word_embedding(predicates)
        embed_pos = self.pos_embedding(tags)
        embed = torch.cat([embed_sent, embed_predicate, embed_pos], dim=-1)

        out = rnn.pack_padded_sequence(embed, lengths, enforce_sorted=False)
        out, _ = self.lstm(out)
        out, _ = rnn.pad_packed_sequence(out)
        out = out.view(-1, batch_size, 2 * self.hidden_units)
        for i in self.linears:
            out = i(out)
            out = F.relu(out)

        out = rnn.pack_padded_sequence(out, lengths, enforce_sorted=False)
        return out
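    # Note on forward(): x is expected to be a 3-element batch of per-sentence index
    # tensors -- words, predicates and POS tags -- and the return value is a
    # PackedSequence of per-token class scores.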

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def train_dataloader(self):
        self.train_dataset = OpenIE_CONLL_Dataset(self.train_file,
                                                  self.emb,
                                                  sep=self.sep,
                                                  label_map=self.label_map,
                                                  sent_maxlen=self.sent_maxlen)
        print("Num train instances:", len(self.train_dataset))
        self.label_map = self.train_dataset.label_map
        self.classes = self.label_map.classes_
        loader = DataLoader(self.train_dataset,
                            batch_size=self.batch_size,
                            shuffle=True,
                            collate_fn=self.train_dataset.collate,
                            num_workers=self.num_workers)
        return loader

    def val_dataloader(self):
        self.dev_dataset = OpenIE_CONLL_Dataset(self.dev_file,
                                                self.emb,
                                                sep=self.sep,
                                                label_map=self.label_map,
                                                sent_maxlen=self.sent_maxlen)
        print("Num dev instances:", len(self.dev_dataset))
        loader = DataLoader(self.dev_dataset,
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=self.dev_dataset.collate,
                            num_workers=self.num_workers)
        return loader

    def test_dataloader(self):
        dataset = OpenIE_CONLL_Dataset(self.test_file,
                                       self.emb,
                                       sep=self.sep,
                                       label_map=self.label_map,
                                       sent_maxlen=self.sent_maxlen)
        loader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            shuffle=False,
                            collate_fn=dataset.collate,
                            num_workers=self.num_workers)
        return loader

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)
        loss = F.cross_entropy(y_hat.data, y.data)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)
        loss = F.cross_entropy(y_hat.data, y.data)

        _, y_hat = torch.max(y_hat.data, dim=-1)  # argmax over the class dimension
        acc = accuracy_score(y.data.cpu(), y_hat.cpu())
        acc = torch.tensor(acc, dtype=torch.float)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss, 'avg_val_acc': avg_acc}
        return {
            'val_loss': avg_loss,
            'log': tensorboard_logs,
            'progress_bar': tensorboard_logs
        }

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y = rnn.pack_sequence(y, enforce_sorted=False)

        _, y_hat = torch.max(y_hat.data, dim=-1)  # argmax over the class dimension
        acc = accuracy_score(y.data.cpu(), y_hat.cpu())
        acc = torch.tensor(acc, dtype=torch.float)
        return {'test_acc': acc}

    def test_epoch_end(self, outputs):
        avg_acc = torch.stack([x['test_acc'] for x in outputs]).mean()
        tensorboard_logs = {'test_acc': avg_acc}
        return {
            'avg_test_acc': avg_acc,
            'log': tensorboard_logs,
            'progress_bar': tensorboard_logs
        }

    def compute_accuracy_packed(self, y_hat, y):
        pass

    def predict_sentence(self, sent):
        """
        Return a predicted label for each word in an arbitrary length sentence
        sent - a list of string tokens
        """
        ret = []
        sent_str = " ".join(sent)

        # Extract predicates by looking at verbal POS

        preds = [(word.i, str(word)) for word in spacy_ws(sent_str)
                 if word.tag_.startswith("V")]

        # Calculate num of samples (round up to the nearest multiple of sent_maxlen)
        num_of_samples = int(
            np.ceil(float(len(sent)) / self.sent_maxlen) * self.sent_maxlen)

        # Run RNN for each predicate on this sentence
        for ind, pred in preds:
            cur_sample = self.create_sample(sent, ind)
            X = self.encode_inputs([cur_sample])
            ret.append((
                (ind, pred),
                [
                    (self.consolidate_label(label), float(prob))
                    for (label, prob) in self.transform_output_probs(
                        self.model.predict(X),  # "flatten" and truncate
                        get_prob=True).reshape(num_of_samples, 2)[:len(sent)]
                ]))
        return ret

    def get_head_pred_word(self, full_sent):
        """
        Get the head predicate word from a full sentence conll.
        """
        assert (len(set(full_sent.head_pred_id.values)) == 1)  # Sanity check
        pred_ind = full_sent.head_pred_id.values[0]

        return full_sent.word.values[pred_ind] \
            if pred_ind != -1 \
               else full_sent.pred.values[0].split(" ")[0]

    def embed_word(self):
        #TODO: dropout and maxlen
        """
        Embed word sequences using self's embedding class
        """
        return self.emb.get_torch_embedding(freeze=not self.trainable_emb)

    def embed_pos(self):
        """
        Embed Part of Speech using this instance params
        """
        return nn.Embedding(len(SPACY_POS_TAGS), self.pos_tag_embedding_size)
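
# Usage sketch (hypothetical file paths; assumes the standard pytorch_lightning Trainer):
#   model = RNN_Model("train.tsv", "dev.tsv", "test.tsv",
#                     emb_filename="/path/to/glove.txt", num_classes=15)
#   trainer = Trainer(max_epochs=model.epochs)
#   trainer.fit(model)
#   trainer.test(model)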