Example #1
    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        # all three artifacts must exist for a valid model directory
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
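A minimal usage sketch for this load() method (the wrapping class here is assumed to be the BiLstmCrfNER shown in Example #11; the directory path and tokens are placeholders):

ner = BiLstmCrfNER()
ner.load("models/bilstm_crf")  # directory containing weights.h5, params.json, preprocessor.pkl
print(ner.predict([["John", "lives", "in", "New", "York"]]))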
Example #2
    def __init__(self, process_proper_nouns=False):
        super().__init__(process_proper_nouns)
        model = load_model(os.path.join(ELMO_TAGGER_PATH, 'weights.h5'),
                           os.path.join(ELMO_TAGGER_PATH, 'params.json'))
        it = IndexTransformer.load(
            os.path.join(ELMO_TAGGER_PATH, 'preprocessor.pkl'))
        self.pos_tagger = Tagger(model,
                                 preprocessor=it,
                                 tokenizer=wordpunct_tokenize)
Example #3
def main(args):
    print('Loading objects...')
    model = BiLSTMCRF.load(args.weights_file, args.params_file)
    it = IndexTransformer.load(args.preprocessor_file)
    tagger = Tagger(model, preprocessor=it)

    print('Tagging a sentence...')
    res = tagger.analyze(args.sent)
    pprint(res)
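For context, a hypothetical CLI wrapper that could feed main() above; the argument names mirror the attributes the snippet reads from args, while the parser itself and the defaults are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tag a sentence with a trained model.')
    parser.add_argument('--weights_file', default='weights.h5')
    parser.add_argument('--params_file', default='params.json')
    parser.add_argument('--preprocessor_file', default='preprocessor.pkl')
    parser.add_argument('--sent', default='President Obama is speaking at the White House.')
    main(parser.parse_args())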
Example #4
    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model, preprocessor=self.p,
                                 tokenizer=self.tokenizer)

        return self.tagger.analyze(text)
Example #5
    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)
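Since analyze() delegates to Tagger.analyze(), the result is anago's response dict of words plus detected entities. A rough sketch of its shape (the sentence, labels, and scores are illustrative only):

res = model.analyze('President Obama is speaking at the White House.')
# res is roughly:
# {'words': ['President', 'Obama', 'is', ...],
#  'entities': [{'text': 'Obama', 'type': 'PER', 'score': 0.99,
#                'beginOffset': 1, 'endOffset': 2}]}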
Example #6
class ElmoBiLSTM_CRFProcessor(CustomProcessor):
    def __init__(self, process_proper_nouns=False):
        super().__init__(process_proper_nouns)
        model = load_model(os.path.join(ELMO_TAGGER_PATH, 'weights.h5'),
                           os.path.join(ELMO_TAGGER_PATH, 'params.json'))
        it = IndexTransformer.load(
            os.path.join(ELMO_TAGGER_PATH, 'preprocessor.pkl'))
        self.pos_tagger = Tagger(model,
                                 preprocessor=it,
                                 tokenizer=wordpunct_tokenize)

    def extract_phrase_by_type(self, token, type):
        return self._extract_phrase(
            list(
                zip(self.pos_tagger.tokenizer(token),
                    self.pos_tagger.predict(token))), type)
Example #7
    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example #8
def test_model(sentence: str, model_dir: str) -> None:
    """Performs NER analysis on sentence

    (defaults to using base model which can be trained with train_base_model())

    Args:
        sentence (str): sentence to perform analysis on
        model_dir (str): path to model to use for analysis

    """
    model = anago.Sequence.load(model_dir)
    tagger = Tagger(model.model, preprocessor=model.p)

    data = sentence.strip().split()
    pred = tagger.predict(data)
    tags = tagger._get_tags(pred)
    probs = tagger._get_prob(pred)
    res = tagger._build_response(data, tags, probs)

    print()
    print(list(zip(data, tags, probs)))
    print()

    if not res['entities']:
        print("No entities found.")
    else:
        print("Entities Found: ")

    for entity in res['entities']:
        print(f"\t{entity['text']} = {entity['type']}")
Example #9
def run_model(text: str, model_dir: str = BASE_MODEL_PATH) -> List:
    """Performs NER analysis on sentence

    (defaults to using base model which can be trained with train_base_model())

    Args:
        text (str): text to perform analysis on
        model_dir (str): path to model to use for analysis

    Returns:
        List: the entities found in the text

    """
    model = anago.Sequence.load(model_dir)
    tagger = Tagger(model.model, preprocessor=model.p)

    data = text.strip().split()
    pred = tagger.predict(data)
    tags = tagger._get_tags(pred)
    probs = tagger._get_prob(pred)
    res = tagger._build_response(data, tags, probs)

    return res['entities']
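A hypothetical call to run_model(); the sentence is a placeholder, and the entity fields follow the response format built by Tagger._build_response:

entities = run_model('Steve Jobs founded Apple in Cupertino.')
for entity in entities:
    print(entity['text'], entity['type'])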
Example #10
class Sequence(object):

    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 use_char=True,
                 use_crf=True,
                 initial_vocab=None,
                 optimizer='adam'):

        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer

    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab, self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
            Test samples.

            y_test : array-like, shape = (n_samples, sent_length)
            True labels for x.

        Returns:
            score : float, f1-micro score.
        """
        if self.model:
            x_test = self.p.transform(x_test)
            lengths = map(len, y_test)
            y_pred = self.model.predict(x_test)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            score = f1_score(y_test, y_pred)
            return score
        else:
            raise OSError('Could not find a model. Call load(dir_path).')

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)

        return self
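A minimal end-to-end sketch with this Sequence class; load_data_and_labels is anago's bundled CoNLL-style reader, and the file paths are assumptions:

from anago.utils import load_data_and_labels

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')

model = Sequence()
model.fit(x_train, y_train, x_valid, y_valid, epochs=3)
model.save('weights.h5', 'params.json', 'preprocessor.pkl')

model = Sequence.load('weights.h5', 'params.json', 'preprocessor.pkl')
print(model.analyze('President Obama is speaking at the White House.'))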
Example #11
class BiLstmCrfNER(NERModel):

    def __init__(self,
            word_embedding_dim=100,
            char_embedding_dim=25,
            word_lstm_size=100,
            char_lstm_size=25,
            fc_dim=100,
            dropout=0.5,
            embeddings=None,
            use_char=True,
            use_crf=True,
            batch_size=16, 
            learning_rate=0.001, 
            max_iter=10):
        """ Construct a BiLSTM-CRF NER model. Model is augmented with character
            level embeddings as well as word embeddings by default. Implementation 
            is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add char feature.
            use_crf : bool, optional, default True
                use crf as last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 10
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to preprocessor
            model_ : reference to generated model
            trainer_ : internal reference to Anago Trainer (model)
            tagger_ : internal reference to Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None


    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self


    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds


    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath 
        
            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved.
                Model saves a weights.h5 weights file, a params.json parameter
                file, and a preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)

        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))


    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        # all three artifacts must exist for a valid model directory
        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")
        
        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
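A sketch of training and saving with BiLstmCrfNER (the load() side is sketched under Example #1); the tokens, tags, and output path are illustrative, and real training needs enough sentences for fit()'s internal 90/10 train/validation split:

X = [["John", "lives", "in", "New", "York"]]   # list of list of tokens
y = [["B-PER", "O", "O", "B-LOC", "I-LOC"]]    # matching BIO tags

ner = BiLstmCrfNER(batch_size=16, max_iter=10)
ner.fit(X, y)
ner.save("models/bilstm_crf")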
Example #12
from CRF.anago.data import prepare_preprocessor

DATA_ROOT = 'data/phenebank/'
train_path = os.path.join(DATA_ROOT, 'train.txt')

x_train, y_train = load_data_and_labels(train_path)

p = prepare_preprocessor(x_train, y_train)
model_config = ModelConfig()
SAVE_ROOT = './models'  # trained model
weights = 'model_weights.h5'
tagger = anago.Tagger(model_config,
                      weights,
                      save_path=SAVE_ROOT,
                      preprocessor=p)

test_path = "data/phenebank/test.txt"

with open(test_path) as ifile:
    this_sentence = []
    all_sentences = []
    this_output = []
    all_outputs = []
    for line in ifile:
        line = line.strip()
        if len(line) == 0:
            # end of sentence: predict tags for it, then reset
            this_output = tagger.predict(this_sentence)
            print(this_sentence, this_output)
            all_sentences.append(this_sentence)
            all_outputs.append(this_output)
            this_sentence = []
        else:
            this_sentence.append(line.split("\t")[0])
Example #13
    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            # infer the embedding dimension from the first loaded vector
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings, self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)

        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train,
                            y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example #14
class ElmoNER(NERModel):
    def __init__(self,
                 word_embedding_dim=100,
                 char_embedding_dim=25,
                 word_lstm_size=100,
                 char_lstm_size=25,
                 fc_dim=100,
                 dropout=0.5,
                 embeddings=None,
                 embeddings_file="glove.6B.100d.txt",
                 batch_size=16,
                 learning_rate=0.001,
                 max_iter=2):
        """ Construct a ELMo based NER model. Model is similar to the BiLSTM-CRF
            model except that the word embeddings are contextual, since they are
            returned by a trained ELMo model. ELMo model requires an additional 
            embedding, which is Glove-100 by default. ELMo model is provided by
            the (dev) Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            embeddings_file : str
                path to embedding file.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer.
            max_iter : int, optional, default 2
                number of epochs of training.

            Attributes
            ----------
            preprocessor_ : reference to Anago preprocessor.
            model_ : reference to the internal Anago ELModel
            trainer_ : reference to the internal Anago Trainer object.
            tagger_ : reference to the internal Anago Tagger object.
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.embeddings_file = embeddings_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None

    def fit(self, X, y):
        """ Trains the NER model. Input is list of AnnotatedDocuments.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        if self.embeddings is None and self.embeddings_file is None:
            raise ValueError(
                "Either embeddings or embeddings_file should be provided, exiting."
            )

        log.info("Preprocessing dataset...")
        self.preprocessor_ = ELMoTransformer()
        self.preprocessor_.fit(X, y)

        if self.embeddings is None:
            self.embeddings = load_glove(self.embeddings_file)
            # infer the embedding dimension from the first loaded vector
            embeddings_dim = self.embeddings[list(
                self.embeddings.keys())[0]].shape[0]
            self.embeddings = filter_embeddings(
                self.embeddings, self.preprocessor_._word_vocab.vocab,
                embeddings_dim)

        log.info("Building model...")
        self.model_ = ELModel(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            embeddings=self.embeddings,
            dropout=self.dropout)

        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                              y,
                                                              test_size=0.1,
                                                              random_state=42)
        self.trainer_.train(x_train,
                            y_train,
                            x_valid=x_valid,
                            y_valid=y_valid,
                            batch_size=self.batch_size,
                            epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self

    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.
            
            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError(
                "No tagger found, either run fit() to train or load() a trained model"
            )

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds

    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath 
        
            Parameters
            -----------
            dirpath : str
                a directory where model artifacts will be saved. Model saves a 
                weights.h5 weights file, a params.json parameter file, and a 
                preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError(
                "No model artifacts to save, either run fit() to train or load() a trained model"
            )

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)

        write_param_file(self.get_params(),
                         os.path.join(dirpath, "params.yaml"))

    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        # all three artifacts must exist for a valid model directory
        if not (os.path.exists(weights_file) and os.path.exists(params_file)
                and os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = ELMoTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
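ElmoNER falls back to reading static vectors from embeddings_file when no embeddings dict is passed in. A sketch of what load_glove is assumed to return, namely a word-to-vector dict parsed from the standard GloVe text format (this reader is an assumption, not necessarily the project's actual implementation):

import numpy as np

def load_glove(path):
    # assumed GloVe text format: one "word v1 v2 ... v100" per line
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings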
Example #15
class Sequence(object):
    def __init__(
        self,
        word_embedding_dim=100,
        char_embedding_dim=25,
        word_lstm_size=100,
        char_lstm_size=25,
        fc_dim=100,
        dropout=0.5,
        embeddings=None,
        use_char=True,
        use_crf=True,
        initial_vocab=None,
        optimizer='adam',
        layer2Flag=False,
        layerdropout=0,
        # fastArFlag=False,
        # fastModelAr="",
        # fastEnFlag=False,
        # fastModelEn="",ArTwitterFlag=False,ArTwitterModel="",fileToWrite="Invalid.txt",
        bretFlag=False,
        bretMaxLen=100,
        bert_path="https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
    ):

        self.model = None
        self.p = None
        self.tagger = None

        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.initial_vocab = initial_vocab
        self.optimizer = optimizer
        self._layer2Flag = layer2Flag
        self._layerdropout = layerdropout
        # self._fastArFlag=fastArFlag
        # self._fastEnFlag=fastEnFlag
        # self._fastModelAr=fastModelAr
        # self._fastModelEn=fastModelEn
        # self._ArTwitterFlag=ArTwitterFlag
        # self._ArTwitterModel=ArTwitterModel
        # self._fileToWrite=fileToWrite
        self._bretFlag = bretFlag
        self._bretMaxLen = bretMaxLen
        self._bert_path = bert_path

    def bertFit(self,
                x_train,
                y_train,
                x_valid=None,
                y_valid=None,
                epochs=1,
                batch_size=32,
                verbose=1,
                callbacks=None,
                shuffle=True):

        sess = tf.Session()
        bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
        max_seq_length = self._bretMaxLen

        tokenizer = create_tokenizer_from_hub_module()
        print("tokenizar done")

        train_examples = convert_text_to_examples(x_train, y_train)

        (train_input_ids, train_input_masks, train_segment_ids,
         train_labels) = convert_examples_to_features(
             tokenizer, train_examples, max_seq_length=max_seq_length)

        model = ABM.BertBiLSTMCRF(num_labels=9,
                                  char_embedding_dim=self.char_embedding_dim,
                                  word_lstm_size=self.word_lstm_size,
                                  char_lstm_size=self.char_lstm_size,
                                  fc_dim=self.fc_dim,
                                  use_char=self.use_char,
                                  char_vocab_size=None,
                                  use_crf=self.use_crf,
                                  layer2Flag=self._layer2Flag,
                                  layerdropout=self._layerdropout,
                                  bretFlag=self._bretFlag,
                                  bretMaxLen=self._bretMaxLen,
                                  bert_path=self._bert_path)

        model, loss = model.build()

        # Instantiate variables
        ABM.initialize_vars(sess)

        model.fit([train_input_ids, train_input_masks, train_segment_ids],
                  train_labels,
                  epochs=epochs,
                  batch_size=batch_size)

    def bertFitV2(self,
                  x_train,
                  y_train,
                  x_valid=None,
                  y_valid=None,
                  epochs=1,
                  batch_size=32,
                  verbose=1,
                  callbacks=None,
                  shuffle=True):

        sess = tf.Session()
        bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
        max_seq_length = self._bretMaxLen

        p = IndexTransformer(initial_vocab=self.initial_vocab,
                             use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)

        #tokenizer = create_tokenizer_from_hub_module()
        #print("tokenizar done")

        #train_examples = convert_text_to_examples(x_train, y_train)

        #(train_input_ids, train_input_masks, train_segment_ids, train_labels) = convert_examples_to_features(tokenizer,train_examples,max_seq_length=max_seq_length)

        model = ABM.BertBiLSTMCRF(num_labels=p.label_size,
                                  char_embedding_dim=self.char_embedding_dim,
                                  word_lstm_size=self.word_lstm_size,
                                  char_lstm_size=self.char_lstm_size,
                                  fc_dim=self.fc_dim,
                                  use_char=self.use_char,
                                  char_vocab_size=None,
                                  use_crf=self.use_crf,
                                  layer2Flag=self._layer2Flag,
                                  layerdropout=self._layerdropout,
                                  bretFlag=self._bretFlag,
                                  bretMaxLen=self._bretMaxLen,
                                  bert_path=self._bert_path)

        model, loss = model.build()

        # Instantiate variables
        ABM.initialize_vars(sess)

        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def fit(self,
            x_train,
            y_train,
            x_valid=None,
            y_valid=None,
            epochs=1,
            batch_size=32,
            verbose=1,
            callbacks=None,
            shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            epochs: Integer. Number of epochs to train the model.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab,
                             use_char=self.use_char)
        p.fit(x_train,
              y_train,
              bretFlag=self._bretFlag,
              max_len=self._bretMaxLen)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                       self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf,
                          layer2Flag=self._layer2Flag,
                          layerdropout=self._layerdropout,
                          bretFlag=self._bretFlag,
                          bretMaxLen=self._bretMaxLen,
                          bert_path=self._bert_path)
        model, loss = model.build()
        #if(self.optimizer.lower()=="adam"):
        #self.optimizer=keras.optimizers.Adamax(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.1)
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model

    def score(self, x_test, y_test, fileToWrite):
        """Returns the f1-micro score on the given test data and labels.

        Args:
            x_test : array-like, shape = (n_samples, sent_length)
            Test samples.

            y_test : array-like, shape = (n_samples, sent_length)
            True labels for x.

        Returns:
            score : float, f1-micro score.
        """
        if self.model:
            # if(self._fastArFlag):
            #     ArText=KeyedVectors.load_word2vec_format(self._fastModelAr)
            # if(self._fastEnFlag):
            #     EnText=KeyedVectors.load_word2vec_format(self._fastModelEn)
            # if(self._ArTwitterFlag):
            #     ArTwitter=gensim.models.Word2Vec.load(self._ArTwitterModel)

            x_test_org = x_test
            x_test = self.p.transform(x_test)
            lengths = map(len, y_test)
            y_pred = self.model.predict(x_test)
            y_pred = self.p.inverse_transform(y_pred, lengths)
            # adjust here
            # vector similarity approach

            # if(self._ArTwitterFlag and self._fastEnFlag):
            #     print("here")
            #     AdjustPredTag(t_model=ArTwitter,t_en_model=EnText,x_test_org=x_test_org,y_pred=y_pred,ratioSimilarity=0.6,topn=30)

            writeTupleArray(x_test_org, y_pred, fileToWrite)

            #checkerLen(x_test_org,y_pred)
            #print(y_pred)
            print(classification_report(y_test, y_pred))
            score = f1_score(y_test, y_pred)
            print("F-score is")
            return score
        else:
            raise OSError('Could not find a model. Call load(dir_path).')

    def analyze(self, text, tokenizer=str.split):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.
            tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

        Returns:
            res: dict.
        """
        if not self.tagger:
            self.tagger = Tagger(self.model,
                                 preprocessor=self.p,
                                 tokenizer=tokenizer)

        return self.tagger.analyze(text)

    def save(self, weights_file, params_file, preprocessor_file):
        self.p.save(preprocessor_file)
        save_model(self.model, weights_file, params_file)

    @classmethod
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)

        return self
Example #16
    def analyze(self, words):
        if self.model:
            tagger = Tagger(self.model, preprocessor=self.p)
            return tagger.analyze(words)
        else:
            raise OSError('Could not find a model. Call load(dir_path).')