Example #1
    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):

        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        word_embedding_size=word_emb_size,
                                        char_emb_size=char_emb_size,
                                        char_lstm_units=char_lstm_units,
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        use_crf=use_crf,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep)
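
A minimal construction sketch for the signature above (the enclosing class is shown as Sequence in Example #5). The model name "ner-example" and the embeddings name "glove-840B" are illustrative placeholders, not values taken from the source:

model = Sequence("ner-example",
                 model_type="BidLSTM_CRF",
                 embeddings_name="glove-840B",
                 use_ELMo=False,
                 batch_size=20,
                 max_epoch=50)
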
Example #2
    def load(self, dir_path='data/models/sequenceLabelling/'):
        self.p = WordPreprocessor.load(os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))
        
        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo) 
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))
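
A hedged usage sketch for this load method (the model name is a placeholder): the wrapper is first constructed with the name of a previously saved model so that self.model_config.model_name points at the right subfolder, then load() restores the preprocessor, the config, the embeddings and the weights:

model = Sequence("ner-example")
model.load(dir_path='data/models/sequenceLabelling/')
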
Example #3
    def load(self, dir_path='data/models/textClassification/'):
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(
                    dir_path, self.model_config.model_name,
                    self.model_config.model_type + "." + self.weight_file))
        else:
            self.models = []
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(
                    os.path.join(
                        dir_path, self.model_config.model_name,
                        self.model_config.model_type +
                        ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)
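
The fold handling above can be driven as in this sketch, assuming the enclosing class is the text-classification wrapper (named Classifier here purely for illustration) and that weights were saved beforehand; the model name is a placeholder:

classifier = Classifier("toxic-example")
classifier.load(dir_path='data/models/textClassification/')
# whether a single "<model_type>." + weight_file is loaded or one
# "<model_type>.model{i}_weights.hdf5" file per fold depends on the
# fold_number stored in the reloaded config
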
Example #4
    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=[],
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 embeddings=()):

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              patience, use_roc_auc)
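
A construction sketch for this classifier signature; the model name, the class labels and the embeddings name are placeholders, not values from the source:

classifier = Classifier("toxic-example",
                        model_type="gru",
                        embeddings_name="glove-840B",
                        list_classes=["toxic", "non_toxic"],
                        batch_size=256,
                        maxlen=300,
                        fold_number=1)
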
Example #5
class Sequence(object):

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.pkl'

    # number of parallel workers for the data generator when not using ELMo
    nb_workers = 6

    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):

        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        word_embedding_size=word_emb_size,
                                        char_emb_size=char_emb_size,
                                        char_lstm_units=char_lstm_units,
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        use_crf=use_crf,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep)

    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        # TBD if valid is None, segment train to get one
        x_all = np.concatenate((x_train, x_valid), axis=0)
        y_all = np.concatenate((y_train, y_valid), axis=0)
        self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        """
        if self.embeddings.use_ELMo:
            # dump token context independent data for the train set, done once for the training
            x_train_local = x_train
            if not self.training_config.early_stop:
                # in case we want to train with the validation set too, we dump also
                # the ELMo embeddings for the token of the valid set
                x_train_local = np.concatenate((x_train, x_valid), axis=0)
            self.embeddings.dump_ELMo_token_embeddings(x_train_local)
        """
        self.model = get_model(self.model_config, self.p,
                               len(self.p.vocab_tag))
        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train(x_train, y_train, x_valid, y_valid)
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    fold_number=10):
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
            self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        else:
            self.p = prepare_preprocessor(x_train, y_train, self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        #self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        self.models = []

        for k in range(0, fold_number):
            model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
            self.models.append(model)

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train_nfold(x_train, y_train, x_valid, y_valid)
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def eval(self, x_test, y_test):
        if self.model_config.fold_number > 1 and self.models and len(
                self.models) == self.model_config.fold_number:
            self.eval_nfold(x_test, y_test)
        else:
            self.eval_single(x_test, y_test)

    def eval_single(self, x_test, y_test):
        if self.model:
            # Prepare test data (steps, generator)
            test_generator = DataGenerator(
                x_test,
                y_test,
                batch_size=self.training_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=False)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.model
            scorer.on_epoch_end(epoch=-1)
        else:
            raise OSError('Could not find a model.')

    def eval_nfold(self, x_test, y_test):
        if self.models is not None:
            total_f1 = 0
            best_f1 = 0
            best_index = 0
            worst_f1 = 1
            worst_index = 0
            reports = []
            total_precision = 0
            total_recall = 0
            for i in range(0, self.model_config.fold_number):
                print('\n------------------------ fold ' + str(i) +
                      '--------------------------------------')

                # Prepare test data (steps, generator)
                test_generator = DataGenerator(
                    x_test,
                    y_test,
                    batch_size=self.training_config.batch_size,
                    preprocessor=self.p,
                    char_embed_size=self.model_config.char_embedding_size,
                    embeddings=self.embeddings,
                    shuffle=False)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.models[i]
                scorer.on_epoch_end(epoch=-1)
                f1 = scorer.f1
                precision = scorer.precision
                recall = scorer.recall
                reports.append(scorer.report)

                if best_f1 < f1:
                    best_f1 = f1
                    best_index = i
                if worst_f1 > f1:
                    worst_f1 = f1
                    worst_index = i
                total_f1 += f1
                total_precision += precision
                total_recall += recall

            macro_f1 = total_f1 / self.model_config.fold_number
            macro_precision = total_precision / self.model_config.fold_number
            macro_recall = total_recall / self.model_config.fold_number

            print("\naverage over", self.model_config.fold_number, "folds")
            print("\tmacro f1 =", macro_f1)
            print("\tmacro precision =", macro_precision)
            print("\tmacro recall =", macro_recall, "\n")

            print("\n** Worst ** model scores - \n")
            print(reports[worst_index])

            self.model = self.models[best_index]
            print("\n** Best ** model scores - \n")
            print(reports[best_index])

    def tag(self, texts, output_format):
        # annotate a list of sentences, return the list of annotations in the
        # specified output_format
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            annotations = tagger.tag(texts, output_format)
            runtime = round(time.time() - start_time, 3)
            if output_format == 'json':
                annotations["runtime"] = runtime
            else:
                print("runtime: %s seconds " % (runtime))
            return annotations
        else:
            raise OSError('Could not find a model.')

    def tag_file(self, file_in, output_format, file_out):
        # Annotate a text file containing one sentence per line. The annotations are
        # written to the output file if it is not None, otherwise to standard output.
        # Processing is streamed in batches so that huge files can be processed without
        # memory issues.
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            if file_out is not None:
                out = open(file_out, 'w')
            first = True
            with open(file_in, 'r') as f:
                texts = None
                while texts is None or len(texts) == self.model_config.batch_size * self.nb_workers:

                    texts = next_n_lines(
                        f, self.model_config.batch_size * self.nb_workers)
                    annotations = tagger.tag(texts, output_format)
                    # if the following is true, we just output the JSON returned by the tagger without any modification
                    directDump = False
                    if first:
                        first = False
                        if len(texts) < self.model_config.batch_size * self.nb_workers:
                            runtime = round(time.time() - start_time, 3)
                            annotations['runtime'] = runtime
                            jsonString = json.dumps(annotations,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            if file_out is None:
                                print(jsonString)
                            else:
                                out.write(jsonString)
                            directDump = True
                        else:
                            # we need to tweak the JSON produced by the tagger to glue the different batches together
                            # output the general information attributes
                            jsonString = '{\n    "software": ' + json.dumps(
                                annotations["software"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "date": ' + json.dumps(
                                annotations["date"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "model": ' + json.dumps(
                                annotations["model"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "texts": ['
                            if file_out is None:
                                print(jsonString, end='', flush=True)
                            else:
                                out.write(jsonString)
                            first = True
                            for jsonStr in annotations["texts"]:
                                jsonString = json.dumps(jsonStr,
                                                        sort_keys=False,
                                                        indent=4,
                                                        ensure_ascii=False)
                                #jsonString = jsonString.replace('\n', '\n\t\t')
                                jsonString = re.sub('\n', '\n        ',
                                                    jsonString)
                                if file_out is None:
                                    if not first:
                                        print(',\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                    else:
                                        first = False
                                        print('\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                else:
                                    if not first:
                                        out.write(',\n        ')
                                        out.write(jsonString)
                                    else:
                                        first = False
                                        out.write('\n        ')
                                        out.write(jsonString)
                    else:
                        for jsonStr in annotations["texts"]:
                            jsonString = json.dumps(jsonStr,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            jsonString = re.sub('\n', '\n        ', jsonString)
                            if file_out is None:
                                print(',\n        ' + jsonString,
                                      end='',
                                      flush=True)
                            else:
                                out.write(',\n        ')
                                out.write(jsonString)

            runtime = round(time.time() - start_time, 3)
            if not directDump:
                jsonString = "\n    ],\n"
                jsonString += '    "runtime": ' + str(runtime)
                jsonString += "\n}\n"
                if file_out is None:
                    print(jsonString)
                else:
                    out.write(jsonString)

            if file_out is not None:
                out.close()
            #print("runtime: %s seconds " % (runtime))
        else:
            raise OSError('Could not find a model.')

    def save(self, dir_path='data/models/sequenceLabelling/'):
        # create a subfolder for the model if it does not already exist
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.preprocessor_file))

        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(
            dir_path, self.model_config.model_name, self.weight_file))
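
An end-to-end sketch of the API shown in this example: construct, train with a validation set, evaluate, save, reload and tag. The model name, the embeddings name and the data variables (x_train, y_train, x_valid, y_valid, x_test, y_test) are placeholders, and loading the actual data is out of scope here:

model = Sequence("ner-example",
                 embeddings_name="glove-840B",
                 use_ELMo=False)
model.train(x_train, y_train, x_valid, y_valid)
model.eval(x_test, y_test)
model.save()    # writes config.json, preprocessor.pkl and model_weights.hdf5

model = Sequence("ner-example")
model.load()
annotations = model.tag(["John Smith works at Inria in Paris."], "json")
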
Example #6
class Sequence(object):

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.pkl'

    def __init__(self, 
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25, 
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100, 
                 dropout=0.5, 
                 recurrent_dropout=0.25,
                 use_char_feature=True, 
                 use_crf=True,
                 batch_size=20, 
                 optimizer='adam', 
                 learning_rate=0.001, 
                 lr_decay=0.9,
                 clip_gradients=5.0, 
                 max_epoch=50, 
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5, 
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):

        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo) 
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name, 
                                        model_type=model_type, 
                                        embeddings_name=embeddings_name, 
                                        word_embedding_size=word_emb_size, 
                                        char_emb_size=char_emb_size, 
                                        char_lstm_units=char_lstm_units, 
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units, 
                                        dropout=dropout, 
                                        recurrent_dropout=recurrent_dropout, 
                                        use_char_feature=use_char_feature, 
                                        use_crf=use_crf, 
                                        fold_number=fold_number, 
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stop, patience, 
                                              max_checkpoints_to_keep)


    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        # TBD if valid is None, segment train to get one
        x_all = np.concatenate((x_train, x_valid), axis=0)
        y_all = np.concatenate((y_train, y_valid), axis=0)
        self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        """
        if self.embeddings.use_ELMo:
            # dump token context independent data for the train set, done once for the training
            x_train_local = x_train
            if not self.training_config.early_stop:
                # in case we want to train with the validation set too, we dump also
                # the ELMo embeddings for the token of the valid set
                x_train_local = np.concatenate((x_train, x_valid), axis=0)
            self.embeddings.dump_ELMo_token_embeddings(x_train_local)
        """
        self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        trainer = Trainer(self.model, 
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p
                          )
        trainer.train(x_train, y_train, x_valid, y_valid)
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, fold_number=10):
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
            self.p = prepare_preprocessor(x_all, y_all, self.model_config)
        else:
            self.p = prepare_preprocessor(x_train, y_train, self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True
        
        #self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        self.models = []

        for k in range(0, fold_number):
            model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
            self.models.append(model)

        trainer = Trainer(self.model, 
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p
                          )
        trainer.train_nfold(x_train, y_train, x_valid, y_valid)
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def eval(self, x_test, y_test):
        if self.model_config.fold_number > 1 and self.models and len(self.models) == self.model_config.fold_number:
            self.eval_nfold(x_test, y_test)
        else:
            self.eval_single(x_test, y_test)


    def eval_single(self, x_test, y_test):   
        if self.model:
            # Prepare test data (steps, generator)
            test_generator = DataGenerator(x_test, y_test, 
              batch_size=self.training_config.batch_size, preprocessor=self.p, 
              char_embed_size=self.model_config.char_embedding_size, 
              embeddings=self.embeddings, shuffle=False)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.model
            scorer.on_epoch_end(epoch=-1) 
        else:
            raise OSError('Could not find a model.')


    def eval_nfold(self, x_test, y_test):
        if self.models is not None:
            total_f1 = 0
            best_f1 = 0
            best_index = 0
            worst_f1 = 1
            worst_index = 0
            reports = []
            total_precision = 0
            total_recall = 0
            for i in range(0, self.model_config.fold_number):
                print('\n------------------------ fold ' + str(i) + '--------------------------------------')

                # Prepare test data (steps, generator)
                test_generator = DataGenerator(x_test, y_test, 
                  batch_size=self.training_config.batch_size, preprocessor=self.p, 
                  char_embed_size=self.model_config.char_embedding_size, 
                  embeddings=self.embeddings, shuffle=False)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.models[i]
                scorer.on_epoch_end(epoch=-1) 
                f1 = scorer.f1
                precision = scorer.precision
                recall = scorer.recall
                reports.append(scorer.report)
                
                if best_f1 < f1:
                    best_f1 = f1
                    best_index = i
                if worst_f1 > f1:
                    worst_f1 = f1
                    worst_index = i
                total_f1 += f1
                total_precision += precision
                total_recall += recall

            macro_f1 = total_f1 / self.model_config.fold_number
            macro_precision = total_precision / self.model_config.fold_number
            macro_recall = total_recall / self.model_config.fold_number

            print("\naverage over", self.model_config.fold_number, "folds")
            print("\tmacro f1 =", macro_f1)
            print("\tmacro precision =", macro_precision)
            print("\tmacro recall =", macro_recall, "\n")

            print("\n** Worst ** model scores - \n")
            print(reports[worst_index])

            self.model = self.models[best_index]
            print("\n** Best ** model scores - \n")
            print(reports[best_index])
        

    def tag(self, texts, output_format):
        if self.model:
            tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p)
            return tagger.tag(texts, output_format)
        else:
            raise OSError('Could not find a model.')


    def save(self, dir_path='data/models/sequenceLabelling/'):
        
        # create a subfolder for the model if it does not already exist
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')
        
        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')
        
        self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')


    def load(self, dir_path='data/models/sequenceLabelling/'):
        self.p = WordPreprocessor.load(os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))
        
        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo) 
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))
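
For the n-fold path in this variant, a hedged sketch (the fold count, model name and data variables are again placeholders): train_nfold() builds one model per fold, and eval() dispatches to eval_nfold() when the number of trained models matches the configured fold_number, keeping the best fold as self.model afterwards:

model = Sequence("ner-example",
                 embeddings_name="glove-840B",
                 use_ELMo=False,
                 fold_number=10)
model.train_nfold(x_train, y_train, x_valid, y_valid, fold_number=10)
model.eval(x_test, y_test)    # runs eval_nfold() over the 10 fold models
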