Example #1
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)
    x_test, y_test = load_data_and_labels(args.test_data)
    x_train = np.r_[x_train, x_valid]
    y_train = np.r_[y_train, y_valid]

    print('Transforming datasets...')
    p = ELMoTransformer()
    p.fit(x_train, y_train)

    print('Loading word embeddings...')
    embeddings = load_glove(EMBEDDING_PATH)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, 100)

    print('Building a model.')
    model = ELModel(char_embedding_dim=args.char_emb_size,
                    word_embedding_dim=args.word_emb_size,
                    char_lstm_size=args.char_lstm_units,
                    word_lstm_size=args.word_lstm_units,
                    char_vocab_size=p.char_vocab_size,
                    word_vocab_size=p.word_vocab_size,
                    num_labels=p.label_size,
                    embeddings=embeddings,
                    dropout=args.dropout)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_test, y_test)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
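This script reads everything from `args`, but the CLI definition itself is not shown. Below is a minimal sketch of the argparse wiring it implies, using only the attribute names referenced in `main`; the flag names mirror those attributes, while the defaults are assumptions, not anago's actual script:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train an ELMo-based tagger.')
    parser.add_argument('--train_data', required=True, help='training set path')
    parser.add_argument('--valid_data', required=True, help='validation set path')
    parser.add_argument('--test_data', required=True, help='test set path')
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--char_emb_size', type=int, default=25)
    parser.add_argument('--word_emb_size', type=int, default=100)
    parser.add_argument('--char_lstm_units', type=int, default=25)
    parser.add_argument('--word_lstm_units', type=int, default=100)
    parser.add_argument('--dropout', type=float, default=0.5)
    main(parser.parse_args())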
Example #2
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
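To close the loop on the three artifacts saved above, here is a hedged sketch of reloading them for prediction. `load_model`, `IndexTransformer.load`, and `Tagger.predict` are the same anago calls used in the later examples; the module paths are assumed from anago's layout and the file paths are placeholders:

from anago.models import load_model
from anago.preprocessing import IndexTransformer
from anago.tagger import Tagger

# Restore the artifacts written by main() above (paths are placeholders).
model = load_model('weights.h5', 'params.json')
p = IndexTransformer.load('preprocessor.pkl')

# Tag a whitespace-tokenized sentence.
tagger = Tagger(model, preprocessor=p)
print(tagger.predict('President Obama visited Paris'))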
Example #3
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
            print('Successfully made a directory: {}'.format(self.log_dir))
        self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
        self.model_config.save(os.path.join(self.log_dir, self.config_file))
        print('Successfully saved config and preprocessor files')

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)
Example #4
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)  # presumably a store_false flag: True unless --no_char_feature is passed
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
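Passing `args.no_char_feature` and `args.no_use_crf` straight into `use_char` and `use_crf` only makes sense if those flags are declared with `action='store_false'`, so each attribute defaults to True and flips to False when the flag is given. A sketch of the assumed declarations (the remaining arguments are abbreviated):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_data', required=True)
parser.add_argument('--valid_data', required=True)
# store_false: True by default, False only when the flag is passed,
# which is why the value can be fed directly into use_char / use_crf.
parser.add_argument('--no_char_feature', action='store_false')
parser.add_argument('--no_use_crf', action='store_false')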
Example #5
    def test_save(self):
        # Train the model.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

        # Save the model.
        save_model(self.model, self.weights_file, self.params_file)
        self.p.save(self.preprocessor_file)
Example #6
    def test_train_no_crf(self):
        model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                          word_vocab_size=self.p.word_vocab_size,
                          num_labels=self.p.label_size,
                          use_crf=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=self.p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)
Example #7
    def test_train_no_character(self):
        p = IndexTransformer(use_char=False)
        p.fit(self.x_train, self.y_train)
        model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          use_crf=False,
                          use_char=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)
Example #8
    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab, self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model
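This `fit` has the signature of anago's `Sequence` wrapper; assuming that is the enclosing class, a minimal training call looks like the following sketch (the data paths are placeholders, and `load_data_and_labels` expects CoNLL-style files):

import anago
from anago.utils import load_data_and_labels

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')

# Build, train, and keep the fitted preprocessor and model on the wrapper.
model = anago.Sequence()
model.fit(x_train, y_train, x_valid, y_valid, epochs=3, batch_size=32)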
Example #9
    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example #10
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p,
                          save_path='./models')
        trainer.train(x_train, y_train, x_valid, y_valid)
Example #11
class BiLstmCrfNER(NERModel):

    def __init__(self,
            word_embedding_dim=100,
            char_embedding_dim=25,
            word_lstm_size=100,
            char_lstm_size=25,
            fc_dim=100,
            dropout=0.5,
            embeddings=None,
            use_char=True,
            use_crf=True,
            batch_size=16, 
            learning_rate=0.001, 
            max_iter=10):
        """ Construct a BiLSTM-CRF NER model. Model is augmented with character
            level embeddings as well as word embeddings by default. Implementation 
            is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add char feature.
            use_crf : bool, optional, default True
                use crf as last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 10
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to preprocessor
            model_ : reference to generated model
            trainer_ : internal reference to Anago Trainer (model)
            tagger_ : internal reference to Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None


    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self


    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds


    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath 
        
            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved.
                Model saves a weights.h5 weights file, a params.json parameter
                file, and a preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)

        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))


    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
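A usage sketch for `BiLstmCrfNER` covering the full fit/predict/save/load round trip; the two-sentence corpus is obviously a toy and only illustrates the expected input shapes, and the model directory is hypothetical:

X = [['John', 'lives', 'in', 'London'], ['Mary', 'works', 'at', 'Google']]
y = [['B-PER', 'O', 'O', 'B-LOC'], ['B-PER', 'O', 'O', 'B-ORG']]

ner = BiLstmCrfNER(max_iter=1)
ner.fit(X, y)                        # train on the toy corpus
print(ner.predict(X))                # list of predicted BIO tag lists

ner.save('models/bilstm_crf')        # weights.h5 + params.json + preprocessor.pkl
restored = BiLstmCrfNER().load('models/bilstm_crf')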
Example #12
    def bertFitV2(self,
                  x_train,
                  y_train,
                  x_valid=None,
                  y_valid=None,
                  epochs=1,
                  batch_size=32,
                  verbose=1,
                  callbacks=None,
                  shuffle=True):

        sess = tf.Session()
        bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
        max_seq_length = self._bretMaxLen

        p = IndexTransformer(initial_vocab=self.initial_vocab,
                             use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)
        # NOTE: the filtered embeddings are not passed to the BERT model below.

        # tokenizer = create_tokenizer_from_hub_module()
        # print("tokenizer done")

        #train_examples = convert_text_to_examples(x_train, y_train)

        # (train_input_ids, train_input_masks, train_segment_ids,
        #  train_labels) = convert_examples_to_features(
        #     tokenizer, train_examples, max_seq_length=max_seq_length)

        model = ABM.BertBiLSTMCRF(num_labels=p.label_size,
                                  char_embedding_dim=self.char_embedding_dim,
                                  word_lstm_size=self.word_lstm_size,
                                  char_lstm_size=self.char_lstm_size,
                                  fc_dim=self.fc_dim,
                                  use_char=self.use_char,
                                  char_vocab_size=None,
                                  use_crf=self.use_crf,
                                  layer2Flag=self._layer2Flag,
                                  layerdropout=self._layerdropout,
                                  bretFlag=self._bretFlag,
                                  bretMaxLen=self._bretMaxLen,
                                  bert_path=self._bert_path)

        model, loss = model.build()

        # Instantiate variables
        ABM.initialize_vars(sess)

        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model
Example #13
    preprocessor = IndexTransformer(use_char=True)
    x = x_train + x_valid
    y = y_train + y_valid
    preprocessor.fit(x, y)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = filter_embeddings(wv_model, preprocessor._word_vocab.vocab,
                                   wv_model.vector_size)
    # Use pre-trained word embeddings

    model = anago.models.BiLSTMCRF(
        embeddings=embeddings,
        use_crf=False,
        use_char=True,
        num_labels=preprocessor.label_size,
        word_vocab_size=preprocessor.word_vocab_size,
        char_vocab_size=preprocessor.char_vocab_size,
        dropout=.5,
        word_lstm_size=120)
    model.build()
    model.compile(loss=model.get_loss(), optimizer='adam', metrics=["acc"])
    model.summary()

    trainer = Trainer(model, preprocessor=preprocessor)
    trainer.train(x_train,
                  y_train,
                  x_valid=x_valid,
                  y_valid=y_valid,
                  epochs=100)
Example #14
class ElmoNER(NERModel):
    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 embeddings_file="glove.6B.100d.txt",
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=2):
        """ Construct a ELMo based NER model. Model is similar to the BiLSTM-CRF
            model except that the word embeddings are contextual, since they are
            returned by a trained ELMo model. ELMo model requires an additional 
            embedding, which is Glove-100 by default. ELMo model is provided by
            the (dev) Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            embeddings_file : str
                path to embedding file.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 2
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to Anago preprocessor.
            model_ : reference to the internal Anago ELModel
            trainer_ : reference to the internal Anago Trainer object.
            tagger_ : reference to the internal Anago Tagger object.
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.embeddings_file = embeddings_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings, self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)

        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train,
                            y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.
            
            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError(
                "No tagger found, either run fit() to train or load() a trained model"
            )

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath 
        
            Parameters
            -----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a 
                weights.h5 weights file, a params.json parameter file, and a 
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError(
                "No model artifacts to save, either run fit() to train or load() a trained model"
            )

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)

        write_param_file(self.get_params(),
                         os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        if not (os.path.exists(weights_file) and os.path.exists(params_file)
                and os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = ELMoTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example #15
    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings, self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)

        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train,
                            y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example #16
p.fit(x_train, y_train)

print('Loading word embeddings...')
embeddings = load_glove(EMBEDDING_PATH)
embeddings = filter_embeddings(embeddings, p._word_vocab.vocab, EMBEDDING_DIM)

print('Building a model.')
model = ELModel(char_embedding_dim=32,
                word_embedding_dim=EMBEDDING_DIM,
                char_lstm_size=32,
                word_lstm_size=EMBEDDING_DIM,
                char_vocab_size=p.char_vocab_size,
                word_vocab_size=p.word_vocab_size,
                num_labels=p.label_size,
                embeddings=embeddings)
model, loss = model.build()
model.compile(loss=loss, optimizer='adam')

print('Training the model...')
trainer = Trainer(model, preprocessor=p)
trainer.train(x_train, y_train, x_test, y_test,
              callbacks=[
                  TensorBoard(log_dir=log_dir, write_graph=False),
                  ModelCheckpoint(weights_path, save_weights_only=True),
                  ReduceLROnPlateau(),
                  EarlyStopping(patience=EARLY_STOP)])

print('Saving the model...')
save_model(model, os.path.join(log_dir, 'weights.h5'), os.path.join(log_dir, 'params.json'))
p.save(os.path.join(log_dir, 'preprocessor.pkl'))
# model.save('weights.h5', 'params.json')
Example #17
    def test_train_no_valid(self):
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)
Example #18
    def test_train(self):
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)
Example #19
def train_anago(keras_model_name="WCP",
                data_name="laptops",
                task_name="ATEPC2",
                hand_features=None):
    DATA_ROOT = 'data'
    SAVE_ROOT = './models'  # trained models
    LOG_ROOT = './logs'  # checkpoint, tensorboard
    w_embedding_path = '/home/s1610434/Documents/Data/Vector/glove.twitter.27B.100d.txt'
    c_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.char.100.txt'
    pos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.pos.100.txt'
    unipos_embedding_path = '/home/s1610434/Documents/Data/Vector/AmaYelp/GloVe/glove.unipos.100.txt'

    model_config = prepare_modelconfig(keras_model_name)
    training_config = TrainingConfig()
    training_config.max_epoch = 100
    training_config.early_stopping = 30

    print("-----{0}-----{1}-----{2}-----{3}-----".format(
        task_name, data_name, keras_model_name, hand_features))
    save_path = SAVE_ROOT + "/{0}/{1}".format(data_name, task_name)
    train_path = os.path.join(DATA_ROOT,
                              '{0}.{1}.train.tsv'.format(data_name, task_name))
    test_path = os.path.join(DATA_ROOT,
                             '{0}.{1}.test.tsv'.format(data_name, task_name))
    train_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.train.dep.tsv'.format(data_name, task_name))
    test_dep_path = os.path.join(
        DATA_ROOT, '{0}.{1}.test.dep.tsv'.format(data_name, task_name))

    # train set
    x_train_valid, y_train_valid, _ = collect_data_from_tsv(train_path)
    x_train_valid_dep = collect_dept_data_from_tsv(train_dep_path)

    # test set
    X_test, Y_test, _ = collect_data_from_tsv(test_path)
    X_test_dep = collect_dept_data_from_tsv(test_dep_path)

    # train_test set
    X_train_test = np.concatenate((x_train_valid, X_test), 0)
    X_train_test_dep = np.concatenate((x_train_valid_dep, X_test_dep), 0)
    Y_train_test = np.concatenate((y_train_valid, Y_test), 0)

    # preprocessor
    p = prepare_preprocessor(list(zip(X_train_test, X_train_test_dep)),
                             Y_train_test,
                             keras_model_name=keras_model_name,
                             hand_features=hand_features)

    print(len(p.vocab_word))
    print(len(p.vocab_char))
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)
    if keras_model_name.find("P") != -1:
        if hand_features is not None:
            if "UNIPOS" in hand_features:
                pos_embedding_path = unipos_embedding_path
        model_config.pos_vocab_size = len(p.pos_extractor.features_dict)
    if keras_model_name.find("H") != -1:
        # model_config.hand_feature_size = gen_no_hand_dimension(data_name, hand_features, keras_model_name)
        model_config.hand_feature_size = 53
        print("model_config.hand_feature_size: ",
              str(model_config.hand_feature_size))

    # load embedding
    W_embeddings = load_word_embeddings(p.vocab_word, w_embedding_path,
                                        model_config.word_embedding_size)
    print("Load W_embeddings: {0}".format(W_embeddings.shape))
    C_embeddings = None
    POS_embeddings = None
    # if "C" in keras_model_name:
    #     C_embeddings = load_word_embeddings(p.vocab_char, c_embedding_path, model_config.char_embedding_size)
    #     print("Load C_embeddings: {0}".format(C_embeddings.shape))
    # if "P" in keras_model_name:
    #     POS_embeddings = load_word_embeddings(p.pos_extractor.features_dict, pos_embedding_path, model_config.pos_embedding_size)
    #     print("Load POS_embeddings: {0}".format(POS_embeddings.shape))

    atepc_evaluator = ATEPCEvaluator()
    results = []

    # 10-fold cross-validation split
    kf = KFold(n_splits=10)
    i_fold = 0
    for train_index, valid_index in kf.split(x_train_valid):
        model_name = "{0}.{1}.{2}".format(keras_model_name,
                                          "{0}".format(hand_features), i_fold)
        X_train, X_valid = x_train_valid[train_index], x_train_valid[valid_index]
        X_train_dep = x_train_valid_dep[train_index]
        X_valid_dep = x_train_valid_dep[valid_index]
        Y_train, Y_valid = y_train_valid[train_index], y_train_valid[valid_index]

        print("Data train: ", X_train.shape, Y_train.shape)
        print("Data valid: ", X_valid.shape, Y_valid.shape)
        print("Data  test: ", X_test.shape, Y_test.shape)

        trainer = Trainer(model_config=model_config,
                          training_config=training_config,
                          checkpoint_path=LOG_ROOT,
                          save_path=save_path,
                          preprocessor=p,
                          W_embeddings=W_embeddings,
                          C_embeddings=C_embeddings,
                          POS_embeddings=POS_embeddings,
                          keras_model_name=keras_model_name,
                          model_name=model_name)

        # trainer = Trainer2(model_config=model_config,
        #                         training_config=training_config,
        #                         checkpoint_path=LOG_ROOT,
        #                         save_path=save_path,
        #                         preprocessor=p,
        #                         W_embeddings=W_embeddings,
        #                         C_embeddings=C_embeddings,
        #                         POS_embeddings=POS_embeddings,
        #                         keras_model_name = keras_model_name,
        #                         model_name=model_name)

        trainer.train(list(zip(X_train, X_train_dep)), Y_train,
                      list(zip(X_valid, X_valid_dep)), Y_valid)

        evaluator = anago.Evaluator(model_config,
                                    weights=model_name,
                                    save_path=save_path,
                                    preprocessor=p,
                                    keras_model_name=keras_model_name)
        print("--- Test phrase --- " + model_name)
        print("Train ")
        f1_score_train = evaluator.eval(list(zip(X_train, X_train_dep)),
                                        Y_train)
        print("Validation ")
        f1_score_valid = evaluator.eval(list(zip(X_valid, X_valid_dep)),
                                        Y_valid)
        print("Test ")
        f1_score_test = evaluator.eval(list(zip(X_test, X_test_dep)), Y_test)
        print("---")
        i_fold += 1

        f_out_name = "data/{0}.{1}.test.pred.tsv".format(data_name, task_name)
        tagger = anago.Tagger(model_config,
                              model_name,
                              save_path=save_path,
                              preprocessor=p,
                              keras_model_name=keras_model_name)
        with open(f_out_name, "w") as f_out:
            for x, y in zip(list(zip(X_test, X_test_dep)), Y_test):
                result = tagger.predict(x)
                for word, label, pred in zip(x[0], y, result):
                    f_out.write("{0}\t{1}\t{2}\n".format(word, label, pred))
                f_out.write("\n")
        ate_f1, apc_acc, c_apc_acc = atepc_evaluator.evaluate(f_out_name)
        results.append([ate_f1, apc_acc, c_apc_acc])
        print(results[-1])

    print("-----All-----{0}--{1}".format(keras_model_name, data_name))
    for result in results:
        print(result)
    print("-----AVG-----")
    results_np = np.array(results, dtype=np.float32)
    print(results_np.mean(axis=0))
    print("-------------")