Python Embeddings Examples

Programming Language: Python

Namespace/Package Name: delft.utilities.Embeddings

Class/Type: Embeddings

Examples at hotexamples.com: 15

Python Embeddings - 15 examples found. These are the top rated real world Python examples of delft.utilities.Embeddings.Embeddings extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Embeddings(10)

clean_ELMo_cache(5)

clean_BERT_cache(4)

clean_downloads(1)

env(1)

get_description(1)

load_embeddings_from_file(1)

Example #1

Show file

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the configuration, preprocessor and model found under ``dir_path``.

        The configuration file is looked up with the model name currently held
        by ``self.model_config``; every later path uses the model name of the
        configuration that was just loaded.

        For a model type containing "bert" (case-insensitive) the model is
        built with ``dir_path`` and restored through ``load_model()``; no
        embeddings are loaded in that case.

        Args:
            dir_path: root directory holding one sub-directory per model.
        """
        config_path = os.path.join(dir_path, self.model_config.model_name,
                                   self.config_file)
        self.model_config = ModelConfig.load(config_path)

        preprocessor_path = os.path.join(dir_path,
                                         self.model_config.model_name,
                                         self.preprocessor_file)
        self.p = WordPreprocessor.load(preprocessor_path)

        if "bert" in self.model_config.model_type.lower():
            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag),
                                   dir_path=dir_path)
            self.model.load_model()
            return

        # Embeddings are loaded without cache ('production' mode).
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT,
                                     use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag))
        weights_path = os.path.join(dir_path, self.model_config.model_name,
                                    self.weight_file)
        self.model.load(filepath=weights_path)

Example #2

Show file

    def load(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Restore configuration, embeddings, preprocessor and weights of a model.

        The configuration is read from the directory named after the model
        name held by ``self.model_config`` before the call; the preprocessor
        and the weights are read from the directory named after the model
        name stored in the loaded configuration.

        Args:
            dir_path: root directory holding one sub-directory per model.
            weight_file: name of the weight file inside the model directory.
        """
        model_path = os.path.join(dir_path, self.model_config.model_name)
        config_path = os.path.join(model_path, CONFIG_FILE_NAME)
        self.model_config = ModelConfig.load(config_path)

        if self.model_config.embeddings_name is None:
            self.embeddings = None
            self.model_config.word_embedding_size = 0
        else:
            # Embeddings are loaded without cache in
            # 'prediction/production' mode.
            self.embeddings = Embeddings(self.model_config.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=self.model_config.use_ELMo,
                                         use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        # Directory named after the freshly loaded configuration.
        loaded_dir = os.path.join(dir_path, self.model_config.model_name)

        self.p = Preprocessor.load(
            os.path.join(loaded_dir, PROCESSOR_FILE_NAME))
        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag),
                               load_pretrained_weights=False,
                               local_path=loaded_dir)

        weights_path = os.path.join(loaded_dir, weight_file)
        print("load weights from", weights_path)
        self.model.load(filepath=weights_path)
        self.model.print_summary()

Example #3

Show file

    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration, embeddings and model weights from disk.

        With a single fold, one model is built and its weights are read from
        ``<model_type>.<weight_file>``. Otherwise one model per fold is built
        and stored in ``self.models``, each reading
        ``<model_type>.model<i>_weights.hdf5``.

        Args:
            dir_path: root directory holding one sub-directory per model.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        model_dir = os.path.join(dir_path, self.model_config.model_name)
        weight_prefix = self.model_config.model_type

        self.model = getModel(self.model_config, self.training_config)
        # Compare by value: `is 1` relies on int caching and raises a
        # SyntaxWarning on recent Python versions.
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(model_dir,
                             weight_prefix + "." + self.weight_file))
        else:
            self.models = []
            for i in range(self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(
                    os.path.join(
                        model_dir,
                        weight_prefix + ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)

Example #4

Show file

File: wrapper.py Project: aarushiibisht/delft

    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):
        """Set up the model and training configurations of the wrapper.

        Embeddings are only loaded when ``embeddings_name`` is given; their
        size is then recorded as the word embedding size of the model
        configuration (0 otherwise). The remaining arguments are forwarded
        unchanged to ``ModelConfig`` and ``TrainingConfig``.
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so that later accesses do not fail with
        # a missing-attribute error when no embeddings were requested.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        word_embedding_size=word_emb_size,
                                        char_emb_size=char_emb_size,
                                        char_lstm_units=char_lstm_units,
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        use_crf=use_crf,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep)

Example #5

Show file

File: wrapper.py Project: aarushiibisht/delft

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore preprocessor, configuration, embeddings and model weights.

        The preprocessor and the configuration are read from the directory
        named after the model name held before the call; the weights are read
        from the directory named after the loaded configuration.

        Args:
            dir_path: root directory holding one sub-directory per model.
        """
        saved_dir = os.path.join(dir_path, self.model_config.model_name)
        self.p = WordPreprocessor.load(
            os.path.join(saved_dir, self.preprocessor_file))
        self.model_config = ModelConfig.load(
            os.path.join(saved_dir, self.config_file))

        # The embedding size of the configuration follows the loaded embeddings.
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        tag_count = len(self.p.vocab_tag)
        self.model = get_model(self.model_config, self.p, ntags=tag_count)
        weights_path = os.path.join(dir_path, self.model_config.model_name,
                                    self.weight_file)
        self.model.load(filepath=weights_path)

Example #6

Show file

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=[],
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None,
                 multiprocessing=True):
        """Set up the model and training configurations of the classifier.

        Embeddings are loaded only when ``embeddings_name`` is given and
        ``model_type`` does not contain "bert"; their size then becomes the
        word embedding size of the model configuration (0 otherwise).

        The ``embeddings`` argument is accepted but not used by this
        constructor. The remaining arguments are forwarded to ``ModelConfig``
        and ``TrainingConfig``.

        NOTE(review): ``list_classes`` defaults to a shared list object that
        is handed to ``ModelConfig`` as is; callers should pass their own list.
        """
        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so that later accesses do not fail with
        # a missing-attribute error when no embeddings are loaded here.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None and "bert" not in model_type:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              patience, use_roc_auc,
                                              class_weights=class_weights, multiprocessing=multiprocessing)

Example #7

Show file

File: preload_embeddings.py Project: krkh-1702/grobid

def preload(embeddings_name, input_path=None):
    """Load an embeddings resource into its LMDB database.

    The embeddings file is taken from ``input_path`` when given. Otherwise it
    is downloaded from the "url" entry of the embeddings description found in
    the registry. The function prints a message and returns early when no
    file could be obtained or opened.

    Args:
        embeddings_name: name of the embeddings as declared in the registry.
        input_path: optional path to a local embeddings file.
    """
    registry_path = './embedding-registry.json'
    embeddings = Embeddings(embeddings_name, path=registry_path, load=False)

    description = embeddings.get_description(embeddings_name)
    if description is None:
        # The original message referenced an undefined name here, which
        # raised a NameError instead of reporting the problem.
        print("Error: embedding name", embeddings_name, "is not registered in", registry_path)

    if input_path is None:
        embeddings_path = None
        # download if url is available
        if description is not None and "url" in description and len(description["url"]) > 0:
            url = description["url"]
            download_path = embeddings.registry['embedding-download-path']
            # if the download path does not exist, we create it
            if not os.path.isdir(download_path):
                try:
                    os.mkdir(download_path)
                except OSError:
                    print("Creation of the download directory", download_path, "failed")

            print("Downloading resource file for", embeddings_name, "...")
            embeddings_path = download_file(url, download_path)
            if embeddings_path is not None and os.path.isfile(embeddings_path):
                print("Download sucessful:", embeddings_path)
        else:
            print("Embeddings resource is not specified in the embeddings registry:", embeddings_name)
    else:
        embeddings_path = input_path

    if embeddings_path is None:
        print("Fail to retrive embedding file for", embeddings_name)
        # Nothing to open or load: stop instead of passing None further down.
        return

    # NOTE(review): the handle returned here is only used as an "is it
    # readable" check and is not closed in this function - confirm whether
    # the helper's return value needs an explicit close.
    embedding_file = open_embedding_file(embeddings_path)
    if embedding_file is None:
        print("Error: could not open embeddings file", embeddings_path)
        return

    # create and load the database in write mode
    embedding_lmdb_path = embeddings.registry["embedding-lmdb-path"]
    if not os.path.isdir(embedding_lmdb_path):
        os.makedirs(embedding_lmdb_path)

    envFilePath = os.path.join(embedding_lmdb_path, embeddings_name)
    # `map_size` is not defined in this function; presumably a module-level
    # setting - verify against the enclosing file.
    embeddings.env = lmdb.open(envFilePath, map_size=map_size)
    embeddings.load_embeddings_from_file(embeddings_path)
    embeddings.clean_downloads()

Example #8

Show file

    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration and the model(s) of a text classifier.

        Static embeddings are loaded (without cache) only when the loaded
        configuration has no transformer name. With a single fold the weights
        of ``self.model`` are loaded. With several folds ``self.models`` is
        filled: one loaded model per fold without a transformer, or a single
        un-loaded model with a transformer.

        Args:
            dir_path: root directory holding one sub-directory per model.
        """
        model_path = os.path.join(dir_path, self.model_config.model_name)
        config_path = os.path.join(model_path, self.config_file)
        self.model_config = ModelConfig.load(config_path)

        if self.model_config.transformer_name is not None:
            self.transformer_name = self.model_config.transformer_name
            self.embeddings = None
        else:
            # Embeddings are loaded without cache in 'production' mode.
            self.embeddings = Embeddings(self.model_config.embeddings_name, resource_registry=self.registry, use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        def build_model():
            # Every model of this loader is created with the same arguments.
            return getModel(self.model_config,
                            self.training_config,
                            load_pretrained_weights=False,
                            local_path=model_path)

        self.model = build_model()
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.fold_number == 1:
            print("load weights from", os.path.join(model_path, self.weight_file))
            self.model.load(os.path.join(model_path, self.weight_file))
            return

        self.models = []
        if self.model_config.transformer_name is not None:
            # Only the first fold is initialised here; the others, and all
            # the weights, are handled at prediction time.
            self.models.append(build_model())
            return

        for fold in range(0, self.model_config.fold_number):
            fold_model = build_model()
            fold_model.load(os.path.join(model_path, "model{0}_weights.hdf5".format(fold)))
            self.models.append(fold_model)

Example #9

Show file

File: wrapper.py Project: aarushiibisht/delft

class Sequence(object):
    """Wrapper to train, evaluate, apply, save and load a sequence labelling model.

    The instance keeps the model configuration, the training configuration,
    the preprocessor (``self.p``), the embeddings and either a single model
    (``self.model``) or one model per fold (``self.models``).
    """

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.pkl'

    # number of parallel worker for the data generator when not using ELMo
    nb_workers = 6

    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):
        """Set up the model and training configurations.

        Embeddings are only loaded when ``embeddings_name`` is given; their
        size is then recorded as the word embedding size of the model
        configuration (0 otherwise). The remaining arguments are forwarded
        unchanged to ``ModelConfig`` and ``TrainingConfig``.
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so that the other methods can test it
        # instead of failing on a missing attribute.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        word_embedding_size=word_emb_size,
                                        char_emb_size=char_emb_size,
                                        char_lstm_units=char_lstm_units,
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        use_crf=use_crf,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep)

    def _fit_preprocessor(self, x_train, y_train, x_valid=None, y_valid=None):
        """Build the preprocessor from the training data, plus the validation data when both parts are given."""
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
            return prepare_preprocessor(x_all, y_all, self.model_config)
        return prepare_preprocessor(x_train, y_train, self.model_config)

    def _clean_elmo_cache(self):
        """Clean the ELMo cache when embeddings are loaded and use ELMo."""
        if self.embeddings is not None and self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        """Train a single model and keep it in ``self.model``.

        The preprocessor is fitted on the training data, extended with the
        validation data when both ``x_valid`` and ``y_valid`` are given.
        ``x_valid`` and ``y_valid`` are passed to the trainer as received;
        how the trainer deals with a missing validation set is not visible
        from this class.
        """
        # TBD if valid is None, segment train to get one
        self.p = self._fit_preprocessor(x_train, y_train, x_valid, y_valid)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        # Disabled: dump of the context independent ELMo token embeddings.
        # if self.embeddings.use_ELMo:
        #     # dump token context independent data for the train set, done once for the training
        #     x_train_local = x_train
        #     if not self.training_config.early_stop:
        #         # in case we want to train with the validation set too, we dump also
        #         # the ELMo embeddings for the token of the valid set
        #         x_train_local = np.concatenate((x_train, x_valid), axis=0)
        #     self.embeddings.dump_ELMo_token_embeddings(x_train_local)

        self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p
                          )
        trainer.train(x_train, y_train, x_valid, y_valid)
        self._clean_elmo_cache()

    def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, fold_number=10):
        """Train ``fold_number`` models and keep them in ``self.models``.

        The preprocessor is fitted on the training data, extended with the
        validation data when both ``x_valid`` and ``y_valid`` are given.
        """
        self.p = self._fit_preprocessor(x_train, y_train, x_valid, y_valid)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        # One freshly built model per fold; self.model is left untouched here.
        self.models = [
            get_model(self.model_config, self.p, len(self.p.vocab_tag))
            for _ in range(fold_number)
        ]

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p
                          )
        trainer.train_nfold(x_train, y_train, x_valid, y_valid)
        self._clean_elmo_cache()

    def eval(self, x_test, y_test):
        """Evaluate on the test data, per fold when a full set of fold models is available."""
        if self.model_config.fold_number > 1 and self.models and len(self.models) == self.model_config.fold_number:
            self.eval_nfold(x_test, y_test)
        else:
            self.eval_single(x_test, y_test)

    def eval_single(self, x_test, y_test):
        """Evaluate ``self.model`` on the test data.

        Raises:
            OSError: when no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        # Prepare test data(steps, generator)
        test_generator = DataGenerator(x_test, y_test,
          batch_size=self.training_config.batch_size, preprocessor=self.p,
          char_embed_size=self.model_config.char_embedding_size,
          embeddings=self.embeddings, shuffle=False)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)

    def eval_nfold(self, x_test, y_test):
        """Evaluate every fold model, print the averaged scores and keep the best model.

        The model with the highest f1 becomes ``self.model``. Nothing is done
        when ``self.models`` is None.
        """
        if self.models is not None:
            total_f1 = 0
            best_f1 = 0
            best_index = 0
            worst_f1 = 1
            worst_index = 0
            reports = []
            total_precision = 0
            total_recall = 0
            for i in range(0, self.model_config.fold_number):
                print('\n------------------------ fold ' + str(i) + '--------------------------------------')

                # Prepare test data(steps, generator)
                test_generator = DataGenerator(x_test, y_test,
                  batch_size=self.training_config.batch_size, preprocessor=self.p,
                  char_embed_size=self.model_config.char_embedding_size,
                  embeddings=self.embeddings, shuffle=False)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.models[i]
                scorer.on_epoch_end(epoch=-1)
                f1 = scorer.f1
                precision = scorer.precision
                recall = scorer.recall
                reports.append(scorer.report)

                if best_f1 < f1:
                    best_f1 = f1
                    best_index = i
                if worst_f1 > f1:
                    worst_f1 = f1
                    worst_index = i
                total_f1 += f1
                total_precision += precision
                total_recall += recall

            macro_f1 = total_f1 / self.model_config.fold_number
            macro_precision = total_precision / self.model_config.fold_number
            macro_recall = total_recall / self.model_config.fold_number

            print("\naverage over", self.model_config.fold_number, "folds")
            print("\tmacro f1 =", macro_f1)
            print("\tmacro precision =", macro_precision)
            print("\tmacro recall =", macro_recall, "\n")

            print("\n** Worst ** model scores - \n")
            print(reports[worst_index])

            self.model = self.models[best_index]
            print("\n** Best ** model scores - \n")
            print(reports[best_index])

    def tag(self, texts, output_format):
        """Annotate a list of sentences and return the annotations in ``output_format``.

        For the 'json' format the elapsed time is added to the result under
        the "runtime" key.

        Raises:
            OSError: when no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p)
        start_time = time.time()
        annotations = tagger.tag(texts, output_format)
        runtime = round(time.time() - start_time, 3)
        # Compare by value: identity of two equal strings is not guaranteed.
        if output_format == 'json':
            annotations["runtime"] = runtime
        return annotations

    def tag_file(self, file_in, output_format, file_out):
        """Annotate a text file containing one sentence per line.

        The annotations are written to ``file_out`` when it is not None, to
        the standard output otherwise. The input is read in batches of
        ``batch_size * nb_workers`` lines so that huge files can be processed
        without memory issues; reading stops after the first batch that is
        not full.

        When the first batch is already the last one, the JSON returned by
        the tagger is dumped as is, with a "runtime" entry added. Otherwise
        the general attributes are written once, the annotated texts of all
        batches are appended to a single "texts" array, and the "runtime"
        entry closes the document.

        Raises:
            OSError: when no model is available.
        """
        if self.model:
            tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p)
            start_time = time.time()
            if file_out is not None:
                out = open(file_out,'w')
            first = True
            with open(file_in, 'r') as f:
                texts = None
                # A full batch means there may be more lines to read.
                while texts is None or len(texts) == self.model_config.batch_size * self.nb_workers:

                  texts = next_n_lines(f, self.model_config.batch_size * self.nb_workers)
                  annotations = tagger.tag(texts, output_format)
                  # if the following is true, we just output the JSON returned by the tagger without any modification
                  directDump = False
                  if first:
                      first = False
                      if len(texts) < self.model_config.batch_size * self.nb_workers:
                          runtime = round(time.time() - start_time, 3)
                          annotations['runtime'] = runtime
                          jsonString = json.dumps(annotations, sort_keys=False, indent=4, ensure_ascii=False)
                          if file_out is None:
                              print(jsonString)
                          else:
                              out.write(jsonString)
                          directDump = True
                      else:
                          # we need to modify a bit the JSON outputted by the tagger to glue the different batches
                          # output the general information attributes
                          jsonString = '{\n    "software": ' + json.dumps(annotations["software"], ensure_ascii=False) + ",\n"
                          jsonString += '    "date": ' + json.dumps(annotations["date"], ensure_ascii=False) + ",\n"
                          jsonString += '    "model": ' + json.dumps(annotations["model"], ensure_ascii=False) + ",\n"
                          jsonString += '    "texts": ['
                          if file_out is None:
                              print(jsonString, end='', flush=True)
                          else:
                              out.write(jsonString)
                          # `first` is reused to skip the separator before the first text item
                          first = True
                          for jsonStr in annotations["texts"]:
                              jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False)
                              #jsonString = jsonString.replace('\n', '\n\t\t')
                              jsonString = re.sub('\n', '\n        ', jsonString)
                              if file_out is None:
                                  if not first:
                                      print(',\n        '+jsonString, end='', flush=True)
                                  else:
                                      first = False
                                      print('\n        '+jsonString, end='', flush=True)
                              else:
                                  if not first:
                                      out.write(',\n        ')
                                      out.write(jsonString)
                                  else:
                                      first = False
                                      out.write('\n        ')
                                      out.write(jsonString)
                  else:
                      for jsonStr in annotations["texts"]:
                          jsonString = json.dumps(jsonStr, sort_keys=False, indent=4, ensure_ascii=False)
                          jsonString = re.sub('\n', '\n        ', jsonString)
                          if file_out is None:
                              print(',\n        '+jsonString, end='', flush=True)
                          else:
                              out.write(',\n        ')
                              out.write(jsonString)

            runtime = round(time.time() - start_time, 3)
            if not directDump:
                # close the "texts" array and the JSON document opened for the first batch
                jsonString = "\n    ],\n"
                jsonString += '    "runtime": ' + str(runtime)
                jsonString += "\n}\n"
                if file_out is None:
                    print(jsonString)
                else:
                    out.write(jsonString)

            if file_out is not None:
                out.close()
            #print("runtime: %s seconds " % (runtime))
        else:
            raise OSError('Could not find a model.')

    def save(self, dir_path='data/models/sequenceLabelling/'):
        """Save the preprocessor, the model configuration and the model under ``dir_path/<model_name>``."""
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the preprocessor, configuration, embeddings and model weights from ``dir_path``."""
        self.p = WordPreprocessor.load(os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))

        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))

Example #10

Show file

class Sequence(object):

    # File names used by save() and load() inside the model's own
    # sub-directory (<dir_path>/<model_name>/).
    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.json'
    #preprocessor_file_new = 'preprocessor.json'

    # number of parallel worker for the data generator when not using ELMo;
    # tag_file() also reads its input in chunks of batch_size * nb_workers lines
    nb_workers = 6

    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=False,
                 use_BERT=False,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None):
        """Set up the model and training configuration of a sequence labeller.

        No model is built here: ``model``, ``models`` and the preprocessor
        ``p`` start as ``None``.

        Args:
            model_name, model_type, char_emb_size, max_char_length,
            char_lstm_units, word_lstm_units, max_sequence_length, dropout,
            recurrent_dropout, use_char_feature, use_crf, fold_number,
            features_indices: stored in the ``ModelConfig``.
            embeddings_name: when not ``None`` the embeddings are loaded and
                their size becomes the config's word embedding size;
                otherwise ``embeddings`` is ``None`` and the size is 0.
            use_ELMo, use_BERT: passed to both ``Embeddings`` and
                ``ModelConfig``.
            batch_size: passed to both ``ModelConfig`` and ``TrainingConfig``.
            optimizer, learning_rate, lr_decay, clip_gradients, max_epoch,
            early_stop, patience, max_checkpoints_to_keep, multiprocessing:
                passed to ``TrainingConfig``.
            log_dir: kept as ``self.log_dir`` and later used as the trainer's
                checkpoint path.
        """
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        self.model = None
        self.models = None
        self.p = None

        if embeddings_name is None:
            self.embeddings = None
            word_emb_size = 0
        else:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        model_settings = {
            'model_name': model_name,
            'model_type': model_type,
            'embeddings_name': embeddings_name,
            'word_embedding_size': word_emb_size,
            'char_emb_size': char_emb_size,
            'char_lstm_units': char_lstm_units,
            'max_char_length': max_char_length,
            'word_lstm_units': word_lstm_units,
            'max_sequence_length': max_sequence_length,
            'dropout': dropout,
            'recurrent_dropout': recurrent_dropout,
            'use_char_feature': use_char_feature,
            'use_crf': use_crf,
            'fold_number': fold_number,
            'batch_size': batch_size,
            'use_ELMo': use_ELMo,
            'use_BERT': use_BERT,
            'features_indices': features_indices,
        }
        self.model_config = ModelConfig(**model_settings)

        # TrainingConfig takes its settings positionally, in this order
        training_settings = (batch_size, optimizer, learning_rate, lr_decay,
                             clip_gradients, max_epoch, early_stop, patience,
                             max_checkpoints_to_keep, multiprocessing)
        self.training_config = TrainingConfig(*training_settings)

    def train(self,
              x_train,
              y_train,
              f_train: np.array = None,
              x_valid=None,
              y_valid=None,
              f_valid: np.array = None,
              callbacks=None):
        """Train a single model.

        The preprocessor is prepared on the training and validation data
        taken together, a model is built from the model config, and a
        ``Trainer`` runs the training. The ELMo/BERT embedding caches are
        cleaned afterwards when such embeddings are in use.

        Args:
            x_train: training inputs.
            y_train: training labels.
            f_train: optional training features.
            x_valid: optional validation inputs.
            y_valid: optional validation labels.
            f_valid: optional validation features.
            callbacks: passed unchanged to ``Trainer.train``.
        """
        # TBD if valid is None, segment train to get one
        x_all = np.concatenate(
            (x_train, x_valid), axis=0) if x_valid is not None else x_train
        y_all = np.concatenate(
            (y_train, y_valid), axis=0) if y_valid is not None else y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        # NOTE(review): the model is built before max_feature_size is set
        # below -- confirm get_model does not depend on it.
        self.model = get_model(self.model_config, self.p,
                               len(self.p.vocab_tag))
        if self.p.return_features is not False:
            print('x_train.shape: ', x_train.shape)
            print('features_train.shape: ', f_train.shape)
            sample_transformed_features = self.p.transform_features(f_train)
            self.model_config.max_feature_size = np.asarray(
                sample_transformed_features).shape[-1]
            print('max_feature_size: ', self.model_config.max_feature_size)

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      features_train=f_train,
                      features_valid=f_valid,
                      callbacks=callbacks)

        # self.embeddings is None when the object was created without an
        # embeddings_name; there is no cache to clean in that case
        if self.embeddings is not None:
            if self.embeddings.use_ELMo:
                self.embeddings.clean_ELMo_cache()
            if self.embeddings.use_BERT:
                self.embeddings.clean_BERT_cache()

    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    f_train: np.array = None,
                    f_valid: np.array = None,
                    fold_number=10,
                    callbacks=None):
        """Train one model per fold.

        The preprocessor is prepared on the training and validation data
        taken together, ``fold_number`` models are built and handed to a
        ``Trainer``. The ELMo/BERT embedding caches are cleaned afterwards
        when such embeddings are in use, and BERT-type models are saved
        right away.

        Args:
            x_train: training inputs.
            y_train: training labels.
            x_valid: optional validation inputs.
            y_valid: optional validation labels.
            f_train: optional training features.
            f_valid: optional validation features.
            fold_number: number of models to build.
            callbacks: passed unchanged to ``Trainer.train_nfold``.
        """
        x_all = np.concatenate(
            (x_train, x_valid), axis=0) if x_valid is not None else x_train
        y_all = np.concatenate(
            (y_train, y_valid), axis=0) if y_valid is not None else y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        if 'bert' in self.model_config.model_type.lower():
            self.model = get_model(self.model_config, self.p,
                                   len(self.p.vocab_tag))

        # NOTE(review): the number of models follows the fold_number argument
        # while eval()/eval_nfold() read self.model_config.fold_number --
        # confirm callers keep the two in sync.
        self.models = []
        for _ in range(fold_number):
            self.models.append(
                get_model(self.model_config, self.p, len(self.p.vocab_tag)))

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train_nfold(x_train,
                            y_train,
                            x_valid,
                            y_valid,
                            f_train=f_train,
                            f_valid=f_valid,
                            callbacks=callbacks)

        # self.embeddings is None when the object was created without an
        # embeddings_name; there is no cache to clean in that case
        if self.embeddings is not None:
            if self.embeddings.use_ELMo:
                self.embeddings.clean_ELMo_cache()
            if self.embeddings.use_BERT:
                self.embeddings.clean_BERT_cache()
        if 'bert' in self.model_config.model_type.lower():
            self.save()

    def eval(self, x_test, y_test, features=None):
        if self.models and 1 < self.model_config.fold_number == len(
                self.models):
            self.eval_nfold(x_test, y_test, features=features)
        else:
            self.eval_single(x_test, y_test, features=features)

    def eval_single(self, x_test, y_test, features=None):
        if 'bert' not in self.model_config.model_type.lower():
            if self.model:
                # Prepare test data(steps, generator)
                test_generator = DataGenerator(
                    x_test,
                    y_test,
                    batch_size=self.model_config.batch_size,
                    preprocessor=self.p,
                    char_embed_size=self.model_config.char_embedding_size,
                    max_sequence_length=self.model_config.max_sequence_length,
                    embeddings=self.embeddings,
                    shuffle=False,
                    features=features)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.model
                scorer.on_epoch_end(epoch=-1)
            else:
                raise (OSError('Could not find a model.'))
        else:
            # BERT architecture model
            y_pred = self.model.predict(x_test, fold_id=-1)

            nb_alignment_issues = 0
            for i in range(len(y_test)):
                if len(y_test[i]) != len(y_pred[i]):
                    nb_alignment_issues += 1
                    # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                    # but this is normally handled when predicting.
                    # To be very conservative, the following ensure the number of tokens always
                    # match, but it should never be used in practice.
                    if len(y_test[i]) < len(y_pred[i]):
                        y_test[i] = y_test[i] + ["O"] * (len(y_pred[i]) -
                                                         len(y_test[i]))
                    if len(y_test[i]) > len(y_pred[i]):
                        y_pred[i] = y_pred[i] + ["O"] * (len(y_test[i]) -
                                                         len(y_pred[i]))

            if nb_alignment_issues > 0:
                print("number of alignment issues with test set:",
                      nb_alignment_issues)

            report, report_as_map = classification_report(y_test,
                                                          y_pred,
                                                          digits=4)
            print(report)

    def eval_nfold(self, x_test, y_test, features=None):
        if self.models is not None:
            total_f1 = 0
            best_f1 = 0
            best_index = 0
            worst_f1 = 1
            worst_index = 0
            reports = []
            reports_as_map = []
            total_precision = 0
            total_recall = 0
            for i in range(self.model_config.fold_number):
                print('\n------------------------ fold ' + str(i) +
                      ' --------------------------------------')

                if 'bert' not in self.model_config.model_type.lower():
                    # Prepare test data(steps, generator)
                    test_generator = DataGenerator(
                        x_test,
                        y_test,
                        batch_size=self.model_config.batch_size,
                        preprocessor=self.p,
                        char_embed_size=self.model_config.char_embedding_size,
                        max_sequence_length=self.model_config.
                        max_sequence_length,
                        embeddings=self.embeddings,
                        shuffle=False,
                        features=features)

                    # Build the evaluator and evaluate the model
                    scorer = Scorer(test_generator, self.p, evaluation=True)
                    scorer.model = self.models[i]
                    scorer.on_epoch_end(epoch=-1)
                    f1 = scorer.f1
                    precision = scorer.precision
                    recall = scorer.recall
                    reports.append(scorer.report)
                    reports_as_map.append(scorer.report_as_map)

                else:
                    # BERT architecture model
                    dir_path = 'data/models/sequenceLabelling/'
                    self.model_config = ModelConfig.load(
                        os.path.join(dir_path, self.model_config.model_name,
                                     self.config_file))
                    self.p = WordPreprocessor.load(
                        os.path.join(dir_path, self.model_config.model_name,
                                     self.preprocessor_file))
                    self.model = get_model(self.model_config,
                                           self.p,
                                           ntags=len(self.p.vocab_tag))
                    self.model.load_model(i)

                    y_pred = self.model.predict(x_test, fold_id=i)

                    nb_alignment_issues = 0
                    for j in range(len(y_test)):
                        if len(y_test[i]) != len(y_pred[j]):
                            nb_alignment_issues += 1
                            # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                            # but this is normally handled when predicting.
                            # To be very conservative, the following ensure the number of tokens always
                            # match, but it should never be used in practice.
                            if len(y_test[j]) < len(y_pred[j]):
                                y_test[j] = y_test[j] + ["O"] * (
                                    len(y_pred[j]) - len(y_test[j]))
                            if len(y_test[j]) > len(y_pred[j]):
                                y_pred[j] = y_pred[j] + ["O"] * (
                                    len(y_test[j]) - len(y_pred[j]))

                    if nb_alignment_issues > 0:
                        print("number of alignment issues with test set:",
                              nb_alignment_issues)

                    f1 = f1_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred)
                    recall = recall_score(y_test, y_pred)

                    print("\tf1: {:04.2f}".format(f1 * 100))
                    print("\tprecision: {:04.2f}".format(precision * 100))
                    print("\trecall: {:04.2f}".format(recall * 100))

                    report, report_as_map = classification_report(y_test,
                                                                  y_pred,
                                                                  digits=4)
                    reports.append(report)
                    reports_as_map.append(report_as_map)

                if best_f1 < f1:
                    best_f1 = f1
                    best_index = i
                if worst_f1 > f1:
                    worst_f1 = f1
                    worst_index = i
                total_f1 += f1
                total_precision += precision
                total_recall += recall

            fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

            micro_f1 = total_f1 / self.model_config.fold_number
            micro_precision = total_precision / self.model_config.fold_number
            micro_recall = total_recall / self.model_config.fold_number

            micro_eval_block = {
                'f1': micro_f1,
                'precision': micro_precision,
                'recall': micro_recall
            }
            fold_average_evaluation['micro'] = micro_eval_block

            # field-level average over the n folds
            labels = []
            for label in sorted(self.p.vocab_tag):
                if label == 'O' or label == '<PAD>':
                    continue
                if label.startswith("B-") or label.startswith(
                        "S-") or label.startswith("I-") or label.startswith(
                            "E-"):
                    label = label[2:]

                if label in labels:
                    continue
                labels.append(label)

                sum_p = 0
                sum_r = 0
                sum_f1 = 0
                sum_support = 0
                for j in range(0, self.model_config.fold_number):
                    if not label in reports_as_map[j]['labels']:
                        continue
                    report_as_map = reports_as_map[j]['labels'][label]
                    sum_p += report_as_map["precision"]
                    sum_r += report_as_map["recall"]
                    sum_f1 += report_as_map["f1"]
                    sum_support += report_as_map["support"]

                avg_p = sum_p / self.model_config.fold_number
                avg_r = sum_r / self.model_config.fold_number
                avg_f1 = sum_f1 / self.model_config.fold_number
                avg_support = sum_support / self.model_config.fold_number
                avg_support_dec = str(avg_support - int(avg_support))[1:]
                if avg_support_dec != '0':
                    avg_support = math.floor(avg_support)

                block_label = {
                    'precision': avg_p,
                    'recall': avg_r,
                    'support': avg_support,
                    'f1': avg_f1
                }
                fold_average_evaluation['labels'][label] = block_label

            print(
                "----------------------------------------------------------------------"
            )
            print("\n** Worst ** model scores - run", str(worst_index))
            print(reports[worst_index])

            print("\n** Best ** model scores - run", str(best_index))
            print(reports[best_index])

            if 'bert' not in self.model_config.model_type.lower():
                self.model = self.models[best_index]
            else:
                # copy best BERT model fold_number
                best_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name + str(
                    best_index)
                new_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name
                # update new_model_dir if it already exists, keep its existing config content
                merge_folders(best_model_dir, new_model_dir)
                # clean other fold directory
                for i in range(self.model_config.fold_number):
                    shutil.rmtree('data/models/sequenceLabelling/' +
                                  self.model_config.model_name + str(i))

            print(
                "----------------------------------------------------------------------"
            )
            print("\nAverage over", self.model_config.fold_number, "folds")
            print(
                get_report(fold_average_evaluation,
                           digits=4,
                           include_avgs=['micro']))

    def tag(self, texts, output_format, features=None):
        # annotate a list of sentences, return the list of annotations in the
        # specified output_format
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            annotations = tagger.tag(texts, output_format, features=features)
            runtime = round(time.time() - start_time, 3)
            if output_format is 'json':
                annotations["runtime"] = runtime
            #else:
            #    print("runtime: %s seconds " % (runtime))
            return annotations
        else:
            raise (OSError('Could not find a model.' + str(self.model)))

    def tag_file(self, file_in, output_format, file_out):
        # Annotate a text file containing one sentence per line, the annotations are
        # written in the output file if not None, in the standard output otherwise.
        # Processing is streamed by batches so that we can process huge files without
        # memory issues
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            if file_out is not None:
                out = open(file_out, 'w')
            first = True
            with open(file_in, 'r') as f:
                texts = None
                while texts is None or len(
                        texts
                ) == self.model_config.batch_size * self.nb_workers:

                    texts = next_n_lines(
                        f, self.model_config.batch_size * self.nb_workers)
                    annotations = tagger.tag(texts, output_format)
                    # if the following is true, we just output the JSON returned by the tagger without any modification
                    directDump = False
                    if first:
                        first = False
                        if len(
                                texts
                        ) < self.model_config.batch_size * self.nb_workers:
                            runtime = round(time.time() - start_time, 3)
                            annotations['runtime'] = runtime
                            jsonString = json.dumps(annotations,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            if file_out is None:
                                print(jsonString)
                            else:
                                out.write(jsonString)
                            directDump = True
                        else:
                            # we need to modify a bit the JSON outputted by the tagger to glue the different batches
                            # output the general information attributes
                            jsonString = '{\n    "software": ' + json.dumps(
                                annotations["software"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "date": ' + json.dumps(
                                annotations["date"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "model": ' + json.dumps(
                                annotations["model"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "texts": ['
                            if file_out is None:
                                print(jsonString, end='', flush=True)
                            else:
                                out.write(jsonString)
                            first = True
                            for jsonStr in annotations["texts"]:
                                jsonString = json.dumps(jsonStr,
                                                        sort_keys=False,
                                                        indent=4,
                                                        ensure_ascii=False)
                                #jsonString = jsonString.replace('\n', '\n\t\t')
                                jsonString = re.sub('\n', '\n        ',
                                                    jsonString)
                                if file_out is None:
                                    if not first:
                                        print(',\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                    else:
                                        first = False
                                        print('\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                else:
                                    if not first:
                                        out.write(',\n        ')
                                        out.write(jsonString)
                                    else:
                                        first = False
                                        out.write('\n        ')
                                        out.write(jsonString)
                    else:
                        for jsonStr in annotations["texts"]:
                            jsonString = json.dumps(jsonStr,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            jsonString = re.sub('\n', '\n        ', jsonString)
                            if file_out is None:
                                print(',\n        ' + jsonString,
                                      end='',
                                      flush=True)
                            else:
                                out.write(',\n        ')
                                out.write(jsonString)

            runtime = round(time.time() - start_time, 3)
            if not directDump:
                jsonString = "\n    ],\n"
                jsonString += '    "runtime": ' + str(runtime)
                jsonString += "\n}\n"
                if file_out is None:
                    print(jsonString)
                else:
                    out.write(jsonString)

            if file_out is not None:
                out.close()
            #print("runtime: %s seconds " % (runtime))
        else:
            raise (OSError('Could not find a model.'))

    def save(self, dir_path='data/models/sequenceLabelling/'):
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        # bert model are always saved via training process steps as checkpoint
        if self.model_config.model_type.lower().find("bert") == -1:
            if self.model is None and self.model_config.fold_number != 0 and self.model_config.fold_number != 1:
                print(
                    'Error: model not saved. Evaluation need to be called first to select the best fold model to be saved'
                )
            else:
                self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the model config, the preprocessor and the model.

        Files are read from ``<dir_path>/<model_name>/``. BERT-type models
        are rebuilt and loaded through their own ``load_model``; for the
        other models the embeddings are loaded (without cache) and the
        weights are read from the weight file.
        """
        def stored(file_name):
            # resolved on every call: the config (and so model_name) is
            # replaced by the first load below
            return os.path.join(dir_path, self.model_config.model_name,
                                file_name)

        self.model_config = ModelConfig.load(stored(self.config_file))
        self.p = WordPreprocessor.load(stored(self.preprocessor_file))

        if 'bert' in self.model_config.model_type.lower():
            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag),
                                   dir_path=dir_path)
            self.model.load_model()
        else:
            # load embeddings
            # Do not use cache in 'production' mode
            embeddings = Embeddings(self.model_config.embeddings_name,
                                    use_ELMo=self.model_config.use_ELMo,
                                    use_BERT=self.model_config.use_BERT,
                                    use_cache=False)
            self.embeddings = embeddings
            self.model_config.word_embedding_size = embeddings.embed_size

            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag))
            self.model.load(filepath=stored(self.weight_file))

Example #11

Show file

class Classifier(object):
    """Wrapper around a text classification model.

    Covers training, prediction, evaluation and persistence, either for a
    single model (``fold_number == 1``) or for a list of n-fold models.
    Model types whose name contains "bert" follow dedicated code paths that
    do not use the static embeddings nor the data generators.
    """

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None,
                 multiprocessing=True):
        """Build the model and training configurations.

        Static embeddings are only instantiated when ``embeddings_name`` is
        given and ``model_type`` does not contain "bert"; otherwise
        ``self.embeddings`` is None. ``list_classes`` defaults to a new empty
        list. The ``embeddings`` argument is accepted but not used here.
        """
        # a fresh list per instance avoids sharing one mutable default object
        if list_classes is None:
            list_classes = []

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # always define the attribute, even when no embeddings are loaded
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None and model_type.find("bert") == -1:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              patience, use_roc_auc,
                                              class_weights=class_weights, multiprocessing=multiprocessing)

    def _uses_bert(self):
        """Return True when the configured model type contains "bert"."""
        return self.model_config.model_type.find("bert") != -1

    def _inference_generator(self, x):
        """Build an unshuffled, label-free data generator over ``x``."""
        return DataGenerator(x, None, batch_size=self.model_config.batch_size,
                             maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
                             embeddings=self.embeddings, shuffle=False)

    def _clean_embeddings_cache(self):
        """Clean the ELMo/BERT caches of the embeddings when they are in use."""
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def train(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train a single model, holding out 10% of the data for validation.

        For bert model types the whole training set is handed to the model's
        own ``train()`` and no validation split is made here.
        ``vocab_init`` is accepted but not used by this method.
        """
        self.model = getModel(self.model_config, self.training_config)

        # bert models
        if self._uses_bert():
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes, x_train=x_train, y_train=y_train)
            self.model.train()
            return

        # create validation set in case we don't use k-folds
        xtr, val_x, y, val_y = train_test_split(x_train, y_train, test_size=0.1)

        training_generator = DataGenerator(xtr, y, batch_size=self.training_config.batch_size,
                                           maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
                                           embeddings=self.embeddings, shuffle=True)
        validation_generator = DataGenerator(val_x, None, batch_size=self.training_config.batch_size,
                                             maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
                                             embeddings=self.embeddings, shuffle=False)

        # the second value returned by train_model (best ROC AUC) is not used
        self.model, _ = train_model(self.model, self.model_config.list_classes, self.training_config.batch_size,
                                    self.training_config.max_epoch, self.training_config.use_roc_auc, self.training_config.class_weights,
                                    training_generator, validation_generator, val_y, use_ELMo=self.embeddings.use_ELMo,
                                    use_BERT=self.embeddings.use_BERT, multiprocessing=self.training_config.multiprocessing, callbacks=callbacks)
        self._clean_embeddings_cache()

    def train_nfold(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train the n-fold models and store them in ``self.models``.

        For bert model types a single model object is built and trained via
        its own ``train()``. ``vocab_init`` is accepted but not used.
        """
        # bert models
        if self._uses_bert():
            self.model = getModel(self.model_config, self.training_config)
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes, x_train=x_train, y_train=y_train)
            self.model.train()
            return

        self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, callbacks=callbacks)
        self._clean_embeddings_cache()

    # classification
    def predict(self, texts, output_format='json', use_main_thread_only=False):
        """Classify ``texts``.

        Returns:
            With ``output_format == 'json'``, a dict with the keys "software",
            "date", "model" and "classifications" (one entry per text, mapping
            each class name to a float score). Otherwise the raw prediction
            result is returned.

        Raises:
            OSError: when the required model(s) have not been built or loaded.
        """
        if self.model_config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            if self._uses_bert():
                # be sure the input processor is instantiated
                self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
                result = self.model.predict(texts)
            else:
                predict_generator = self._inference_generator(texts)
                result = predict(self.model, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        elif self._uses_bert():
            # we don't support n classifiers for BERT for prediction currently
            # (it would be too large and too slow if loaded 10 times from file for each batch)
            # (however it is done for eval, models are loaded 1 time for the complete dataset,
            # not each time per batch, and we should do the same here)
            # be sure the input processor is instantiated
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
            result = self.model.predict(texts)
        else:
            if self.models is None:
                raise OSError('Could not find nfolds models.')
            predict_generator = self._inference_generator(texts)
            result = predict_folds(self.models, predict_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)

        # compare by value: identity of string literals is not guaranteed
        if output_format != 'json':
            return result

        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "classifications": []
        }
        for i, text in enumerate(texts):
            classification = {
                "text": text
            }
            the_res = result[i]
            for j, cl in enumerate(self.model_config.list_classes):
                classification[cl] = float(the_res[j])
            res["classifications"].append(classification)
        return res

    def eval(self, x_test, y_test, use_main_thread_only=False):
        """Predict on ``x_test`` and print a per-class precision/recall/f-score table.

        For n-fold bert models, the per-fold predictions are combined with a
        geometric mean. Macro- and micro-averaged accuracy, f1, log-loss and
        ROC AUC are also computed.

        NOTE(review): the averaged scores are neither printed nor returned
        (the method returns None); confirm whether they should be exposed.

        Raises:
            OSError: when the required model(s) have not been built or loaded.
        """
        if self.model_config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            if self._uses_bert():
                result = self.model.predict(x_test)
            else:
                test_generator = self._inference_generator(x_test)
                result = predict(self.model, test_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        else:
            if self.models is None and not (self._uses_bert() and self.model is not None):
                raise OSError('Could not find nfolds models.')
            print(self.model_config.model_type)
            if self._uses_bert():
                result_list = [self.model.predict(x_test, i) for i in range(self.model_config.fold_number)]

                # geometric mean of the fold predictions
                result = np.ones(result_list[0].shape)
                for fold_result in result_list:
                    result *= fold_result
                result **= (1. / len(result_list))
            else:
                test_generator = self._inference_generator(x_test)
                result = predict_folds(self.models, test_generator, use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        list_classes = self.model_config.list_classes
        nb_classes = len(list_classes)

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # keep only the best scoring class of each instance, one-hot encoded
        result_intermediate = np.asarray([np.argmax(line) for line in result])

        def vectorize(index, size):
            one_hot = np.zeros(size)
            if index < size:
                one_hot[index] = 1
            return one_hot
        result_binary = np.array([vectorize(xi, nb_classes) for xi in result_intermediate])

        precision, recall, fscore, support = precision_recall_fscore_support(y_test, result_binary, average=None)
        print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(" ", "precision", "recall", "f-score", "support"))
        for p, the_class in enumerate(list_classes):
            the_class = the_class[:14]
            print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(the_class, "{:10.4f}"
                .format(precision[p]), "{:10.4f}".format(recall[p]), "{:10.4f}".format(fscore[p]), support[p]))

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if nb_classes == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result, labels=[0,1])
            if len(np.unique(y_test)) == 1:
                # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                total_roc_auc = r2_score(y_test, result)
                if total_roc_auc < 0:
                    total_roc_auc = 0
            else:
                total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(0, nb_classes):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j], labels=[0,1])
                total_loss += loss
                if len(np.unique(y_test[:, j])) == 1:
                    # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                    # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                    roc_auc = r2_score(y_test[:, j], result[:, j])
                    if roc_auc < 0:
                        roc_auc = 0
                else:
                    roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc

        total_accuracy /= nb_classes
        total_f1 /= nb_classes
        total_loss /= nb_classes
        total_roc_auc /= nb_classes

        # micro-average (average of scores for each instance)
        # make sense only if we have more than 1 class, otherwise same as
        # macro-average
        if nb_classes != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(0, result.shape[0]):
                accuracy = accuracy_score(y_test[i,:], result_binary[i,:])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i,:], result_binary[i,:], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i,:], result[i,:])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i,:], result[i,:])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

    def save(self, dir_path='data/models/textClassification/'):
        """Save the model configuration and the model weights.

        Files are written under ``dir_path/<model name>/``, which is created
        when missing. For bert model types only the configuration is written
        here. A missing model is reported with a printed error message rather
        than an exception.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        # bert model are always saved via training process steps as checkpoint
        if self._uses_bert():
            print('model saved')
            return

        # compare by value: `is 1` is False for equal but non-identical integers
        if self.model_config.fold_number == 1:
            if self.model is not None:
                self.model.save(os.path.join(directory, self.model_config.model_type+"."+self.weight_file))
                print('model saved')
            else:
                print('Error: model has not been built')
        else:
            if self.models is None:
                print('Error: nfolds models have not been built')
            else:
                for i in range(0, self.model_config.fold_number):
                    self.models[i].save(os.path.join(directory, self.model_config.model_type+".model{0}_weights.hdf5".format(i)))
                print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Load the model configuration, then the model(s) it describes.

        For bert model types the model restores itself via ``load()``.
        Otherwise the embeddings are re-created (cache disabled) and either
        the single weight file or one weight file per fold is loaded.
        """
        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        if self._uses_bert():
            self.model = getModel(self.model_config, self.training_config)
            self.model.load()
            return

        # load embeddings
        # Do not use cache in 'production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo, use_BERT=self.model_config.use_BERT, use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        if self.model_config.fold_number == 1:
            self.model.load_weights(os.path.join(dir_path, self.model_config.model_name, self.model_config.model_type+"."+self.weight_file))
        else:
            self.models = []
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(os.path.join(dir_path, self.model_config.model_name, self.model_config.model_type+".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)

Example #12

Show file

    def __init__(self,
                 model_name=None,
                 architecture=None,
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=0,
                 use_ELMo=False,
                 log_dir=None,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None,
                 transformer_name: str = None):
        """Initialise the wrapper state, the optional static embeddings and
        the model/training configurations.

        When ``model_name`` is None, a placeholder name is derived from
        ``architecture``, suffixed with the embeddings name and the
        transformer name when these are provided.
        """
        if model_name is None:
            # add a dummy name based on the architecture
            model_name = architecture
            for suffix in (embeddings_name, transformer_name):
                if suffix is not None:
                    model_name += "_" + suffix

        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        self.model = None
        self.models = None
        self.p: Preprocessor = None
        self.model_local_path = None
        self.embeddings = None

        self.registry = load_resource_registry("delft/resources-registry.json")

        # embeddings are instantiated only when an embeddings name is given;
        # without them the word embedding size stays at 0
        if self.embeddings_name is None:
            word_emb_size = 0
        else:
            self.embeddings = Embeddings(self.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(
            model_name=model_name,
            architecture=architecture,
            embeddings_name=embeddings_name,
            word_embedding_size=word_emb_size,
            char_emb_size=char_emb_size,
            char_lstm_units=char_lstm_units,
            max_char_length=max_char_length,
            word_lstm_units=word_lstm_units,
            max_sequence_length=max_sequence_length,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            fold_number=fold_number,
            batch_size=batch_size,
            use_ELMo=use_ELMo,
            features_indices=features_indices,
            transformer_name=transformer_name)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep,
                                              multiprocessing)

Example #13

Show file

class Sequence(object):

    # number of parallel worker for the data generator
    nb_workers = 6

    def __init__(self,
                 model_name=None,
                 architecture=None,
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=0,
                 use_ELMo=False,
                 log_dir=None,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None,
                 transformer_name: str = None):
        """Initialise the wrapper state, the optional static embeddings and
        the model/training configurations.

        When ``model_name`` is None, a placeholder name is derived from
        ``architecture``, suffixed with the embeddings name and the
        transformer name when these are provided.
        """
        if model_name is None:
            # add a dummy name based on the architecture
            model_name = architecture
            for suffix in (embeddings_name, transformer_name):
                if suffix is not None:
                    model_name += "_" + suffix

        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        self.model = None
        self.models = None
        self.p: Preprocessor = None
        self.model_local_path = None
        self.embeddings = None

        self.registry = load_resource_registry("delft/resources-registry.json")

        # embeddings are instantiated only when an embeddings name is given;
        # without them the word embedding size stays at 0
        if self.embeddings_name is None:
            word_emb_size = 0
        else:
            self.embeddings = Embeddings(self.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(
            model_name=model_name,
            architecture=architecture,
            embeddings_name=embeddings_name,
            word_embedding_size=word_emb_size,
            char_emb_size=char_emb_size,
            char_lstm_units=char_lstm_units,
            max_char_length=max_char_length,
            word_lstm_units=word_lstm_units,
            max_sequence_length=max_sequence_length,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            fold_number=fold_number,
            batch_size=batch_size,
            use_ELMo=use_ELMo,
            features_indices=features_indices,
            transformer_name=transformer_name)

        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stop, patience,
                                              max_checkpoints_to_keep,
                                              multiprocessing)

    def train(self,
              x_train,
              y_train,
              f_train=None,
              x_valid=None,
              y_valid=None,
              f_valid=None,
              callbacks=None):
        """Build the preprocessor and the model, then train a single model.

        The preprocessor is prepared on the training data concatenated with
        the validation data (when provided). The ELMo cache of the embeddings
        is cleaned once training is done, if ELMo is in use.
        """
        # TBD if valid is None, segment train to get one if early_stop is True

        # training and validation data are merged to create the model vocabulary
        x_all = x_train if x_valid is None else np.concatenate(
            (x_train, x_valid), axis=0)
        y_all = y_train if y_valid is None else np.concatenate(
            (y_train, y_valid), axis=0)
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)

        # vocabulary sizes come from the freshly prepared preprocessor
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        nb_tags = len(self.p.vocab_tag)
        self.model = get_model(self.model_config,
                               self.p,
                               nb_tags,
                               load_pretrained_weights=True)
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        # uncomment to plot graph
        #plot_model(self.model,
        #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

        trainer = Trainer(
            self.model,
            self.models,
            self.embeddings,
            self.model_config,
            self.training_config,
            checkpoint_path=self.log_dir,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      features_train=f_train,
                      features_valid=f_valid,
                      callbacks=callbacks)

        embeddings = self.embeddings
        if embeddings and embeddings.use_ELMo:
            embeddings.clean_ELMo_cache()

    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    f_train=None,
                    f_valid=None,
                    callbacks=None):
        """Build the preprocessor, then run n-fold training.

        ``self.models`` is reset to an empty list which is handed to the
        trainer. The ELMo cache of the embeddings is cleaned once training is
        done, if ELMo is in use.
        """
        # training and validation data are merged when validation data is given
        if x_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
        else:
            x_all = x_train
        if y_valid is not None:
            y_all = np.concatenate((y_train, y_valid), axis=0)
        else:
            y_all = y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)

        # vocabulary sizes come from the freshly prepared preprocessor
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        self.models = []
        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)

        trainer.train_nfold(x_train,
                            y_train,
                            x_valid,
                            y_valid,
                            f_train=f_train,
                            f_valid=f_valid,
                            callbacks=callbacks)

        embeddings = self.embeddings
        if embeddings and embeddings.use_ELMo:
            embeddings.clean_ELMo_cache()

    def eval(self, x_test, y_test, features=None):
        if self.model_config.fold_number > 1:
            self.eval_nfold(x_test, y_test, features=features)
        else:
            self.eval_single(x_test, y_test, features=features)

    def eval_single(self, x_test, y_test, features=None):
        """Evaluate the single model on a test set and print the scores.

        Without a transformer, the evaluation goes through the model's data
        generator and a ``Scorer``. With a transformer, the test set is tagged
        with a ``Tagger`` and a classification report is printed; sequences
        whose gold and predicted lengths differ are padded in place with "O".

        Raises:
            OSError: when no model is available.
        """
        if self.model is None:
            raise OSError('Could not find a model.')
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        config = self.model_config
        if config.transformer_name is None:
            # we can use a data generator for evaluation

            # Prepare test data(steps, generator)
            generator_class = self.model.get_generator()
            test_generator = generator_class(
                x_test,
                y_test,
                batch_size=config.batch_size,
                preprocessor=self.p,
                char_embed_size=config.char_embedding_size,
                max_sequence_length=config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=config.use_crf,
                            use_chain_crf=config.use_chain_crf)
            scorer.model = self.model
            scorer.on_epoch_end(epoch=-1)
            return

        # the architecture model uses a transformer layer
        # note that we could also use the above test_generator, but as an alternative here we check the
        # test/prediction alignment of tokens and the validity of the maximum sequence input length
        # wrt the length of the test sequences
        tagger = Tagger(
            self.model,
            config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        y_pred_pairs = tagger.tag(x_test,
                                  output_format=None,
                                  features=features)

        # keep only labels (second item of each pair)
        y_pred = [[pair[1] for pair in sentence] for sentence in y_pred_pairs]

        nb_alignment_issues = 0
        for idx in range(len(y_test)):
            gold = y_test[idx]
            predicted = y_pred[idx]
            extra = len(predicted) - len(gold)
            if extra == 0:
                continue

            nb_alignment_issues += 1
            # BERT tokenizer appears to introduce some additional tokens without ## prefix,
            # but we normally handled that well when predicting.
            # To be very conservative, the following ensure the number of tokens always
            # match, but it should never be used in practice.
            if extra > 0:
                y_test[idx] = gold + ["O"] * extra
            else:
                y_pred[idx] = predicted + ["O"] * (-extra)

        if nb_alignment_issues > 0:
            print("number of alignment issues with test set:",
                  nb_alignment_issues)
            print(
                "to solve them consider increasing the maximum sequence input length of the model and retrain"
            )

        # only the textual report is printed, the map version is not used
        report, _ = classification_report(y_test, y_pred, digits=4)
        print(report)

    def eval_nfold(self, x_test, y_test, features=None):
        """Evaluate every fold model on the test set and print the scores.

        For each fold a Scorer is run over a data generator built from
        ``x_test``/``y_test``. The worst and best fold reports are printed,
        the best fold model is kept in ``self.model`` (and ``fold_number`` is
        reset to 1), then the scores averaged over the folds are printed.

        Nothing is done when ``self.models`` is None.

        Args:
            x_test: test inputs, handed to the data generator.
            y_test: expected labels, handed to the data generator.
            features: optional additional features handed to the data generator.
        """
        if self.models is None:
            return

        models_dir = 'data/models/sequenceLabelling/'

        fold_reports = []
        fold_report_maps = []
        f1_total = 0
        precision_total = 0
        recall_total = 0
        best_f1, best_index = 0, 0
        worst_f1, worst_index = 1, 0

        for fold in range(self.model_config.fold_number):
            if self.model_config.transformer_name is not None:
                # the architecture uses a transformer layer: the fold model is
                # large and is (re)loaded from disk for each fold
                fold_weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                    ".hdf5", str(fold) + ".hdf5")
                self.model = get_model(
                    self.model_config,
                    self.p,
                    ntags=len(self.p.vocab_tag),
                    load_pretrained_weights=False,
                    local_path=os.path.join(models_dir,
                                            self.model_config.model_name))
                weights_path = os.path.join(models_dir,
                                            self.model_config.model_name,
                                            fold_weight_file)
                self.model.load(filepath=weights_path)
                fold_model = self.model
                fold_bert_preprocessor = self.model.transformer_preprocessor
            else:
                fold_model = self.models[fold]
                fold_bert_preprocessor = None

            # summary and parameters are printed once, with the first fold
            if fold == 0:
                fold_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(fold) +
                  ' --------------------------------------')

            # the test data is served through the model's own data generator
            build_generator = fold_model.get_generator()
            test_generator = build_generator(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                bert_preprocessor=fold_bert_preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # the scorer performs the evaluation when its epoch-end hook runs
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = fold_model
            scorer.on_epoch_end(epoch=-1)

            fold_f1 = scorer.f1
            fold_precision = scorer.precision
            fold_recall = scorer.recall
            fold_reports.append(scorer.report)
            fold_report_maps.append(scorer.report_as_map)

            if fold_f1 > best_f1:
                best_f1, best_index = fold_f1, fold
            if fold_f1 < worst_f1:
                worst_f1, worst_index = fold_f1, fold
            f1_total += fold_f1
            precision_total += fold_precision
            recall_total += fold_recall

        n_folds = self.model_config.fold_number

        # average of the fold-level scores, stored under the 'micro' key
        fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}
        fold_average_evaluation['micro'] = {
            'f1': f1_total / n_folds,
            'precision': precision_total / n_folds,
            'recall': recall_total / n_folds
        }

        # field-level average over the n folds
        seen_labels = set()
        for tag_name in sorted(self.p.vocab_tag):
            if tag_name in ('O', '<PAD>'):
                continue
            # strip the positional prefix so that each field is counted once
            if tag_name.startswith(("B-", "S-", "I-", "E-")):
                label = tag_name[2:]
            else:
                label = tag_name
            if label in seen_labels:
                continue
            seen_labels.add(label)

            totals = {"precision": 0, "recall": 0, "f1": 0, "support": 0}
            for j in range(0, n_folds):
                fold_labels = fold_report_maps[j]['labels']
                if label not in fold_labels:
                    continue
                label_scores = fold_labels[label]
                for metric in totals:
                    totals[metric] += label_scores[metric]

            # folds missing the label still count in the divisor
            mean_support = totals["support"] / n_folds
            # NOTE(review): for a float this text is never '0' (e.g.
            # '0.0'[1:] == '.0'), so the floor looks always applied - confirm intent
            fractional_text = str(mean_support - int(mean_support))[1:]
            if fractional_text != '0':
                mean_support = math.floor(mean_support)

            fold_average_evaluation['labels'][label] = {
                'precision': totals["precision"] / n_folds,
                'recall': totals["recall"] / n_folds,
                'support': mean_support,
                'f1': totals["f1"] / n_folds
            }

        separator = "----------------------------------------------------------------------"
        print(separator)
        print("\n** Worst ** model scores - run", str(worst_index))
        print(fold_reports[worst_index])

        print("\n** Best ** model scores - run", str(best_index))
        print(fold_reports[best_index])

        # keep the best fold as the single current model
        fold_nb = self.model_config.fold_number
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is not None:
            best_weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                ".hdf5", str(best_index) + ".hdf5")
            # saved config file must be updated to single fold
            self.model.load(filepath=os.path.join(
                models_dir, self.model_config.model_name, best_weight_file))
        else:
            self.model = self.models[best_index]

        print(separator)
        print("\nAverage over", str(int(fold_nb)), "folds")
        print(
            get_report(fold_average_evaluation,
                       digits=4,
                       include_avgs=['micro']))

    def tag(self, texts, output_format, features=None, batch_size=None):
        """Annotate a list of sentences and return the annotations.

        Args:
            texts: the sentences to annotate, handed to the tagger.
            output_format: output format handed to the tagger; when it equals
                'json' the elapsed time is added under the "runtime" key.
            features: optional additional features handed to the tagger.
            batch_size: when not None, overrides the prediction batch size
                stored in the model configuration.

        Returns:
            The annotations produced by the tagger in the requested format.

        Raises:
            OSError: if no model is available.
        """
        # identity test: None is a singleton, `!=` would go through __ne__
        if batch_size is not None:
            self.model_config.batch_size = batch_size
            print("---")
            print("batch_size (prediction):", self.model_config.batch_size)
            print("---")

        if not self.model:
            raise OSError('Could not find a model.' + str(self.model))

        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        start_time = time.time()
        annotations = tagger.tag(texts, output_format, features=features)
        runtime = round(time.time() - start_time, 3)
        if output_format == 'json':
            annotations["runtime"] = runtime
        return annotations

    def tag_file(self, file_in, output_format, file_out, batch_size=None):
        """Annotate a text file containing one sentence per line.

        The input is read by chunks of ``batch_size * nb_workers`` lines so
        that huge files can be processed without memory issues. When the whole
        input fits in the first chunk, the JSON returned by the tagger is
        dumped as is (with a "runtime" entry added); otherwise the general
        attributes are written once and the "texts" items of every chunk are
        glued into a single JSON document.

        Args:
            file_in: path of the text file to annotate.
            output_format: output format handed to the tagger. The gluing
                logic indexes the result with the "software", "date", "model"
                and "texts" keys.
            file_out: path of the output file, or None to write on the
                standard output.
            batch_size: when not None, overrides the prediction batch size
                stored in the model configuration.

        Raises:
            OSError: if no model is available.
        """
        if batch_size is not None:
            self.model_config.batch_size = batch_size
            print("---")
            print("batch_size (prediction):", self.model_config.batch_size)
            print("---")

        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        start_time = time.time()
        chunk_size = self.model_config.batch_size * self.nb_workers
        out = None

        def emit(fragment, final=False):
            # Write a fragment to the output file, or to stdout when there is
            # none; a final stdout fragment is printed with a trailing newline.
            if out is not None:
                out.write(fragment)
            elif final:
                print(fragment)
            else:
                print(fragment, end='', flush=True)

        def render_item(item):
            # Serialize one "texts" item, indented to sit inside the array.
            dumped = json.dumps(item,
                                sort_keys=False,
                                indent=4,
                                ensure_ascii=False)
            return dumped.replace('\n', '\n        ')

        # NOTE(review): both files are opened with the platform default
        # encoding while the JSON is dumped with ensure_ascii=False - consider
        # an explicit utf-8 encoding.
        try:
            if file_out is not None:
                out = open(file_out, 'w')

            # True when the tagger JSON was output without any modification
            direct_dump = False
            first_batch = True
            items_written = False
            with open(file_in, 'r') as f:
                while True:
                    texts = next_n_lines(f, chunk_size)
                    annotations = tagger.tag(texts, output_format)

                    if first_batch and len(texts) < chunk_size:
                        # the whole input fits in a single batch
                        annotations['runtime'] = round(
                            time.time() - start_time, 3)
                        emit(json.dumps(annotations,
                                        sort_keys=False,
                                        indent=4,
                                        ensure_ascii=False),
                             final=True)
                        direct_dump = True
                    else:
                        if first_batch:
                            # output the general information attributes once,
                            # then open the array that glues the batches
                            header = '{\n    "software": ' + json.dumps(
                                annotations["software"],
                                ensure_ascii=False) + ",\n"
                            header += '    "date": ' + json.dumps(
                                annotations["date"],
                                ensure_ascii=False) + ",\n"
                            header += '    "model": ' + json.dumps(
                                annotations["model"],
                                ensure_ascii=False) + ",\n"
                            header += '    "texts": ['
                            emit(header)
                        for item in annotations["texts"]:
                            # only the very first item has no leading comma
                            if items_written:
                                opening = ',\n        '
                            else:
                                opening = '\n        '
                            emit(opening + render_item(item))
                            items_written = True

                    first_batch = False
                    # an incomplete chunk means the end of the input
                    if len(texts) != chunk_size:
                        break

            if not direct_dump:
                runtime = round(time.time() - start_time, 3)
                trailer = "\n    ],\n"
                trailer += '    "runtime": ' + str(runtime)
                trailer += "\n}\n"
                emit(trailer, final=True)
        finally:
            # release the output file even when tagging or reading fails
            if out is not None:
                out.close()

    def save(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Save the model configuration, the preprocessor and the model.

        Everything is written in the ``<dir_path>/<model_name>`` subfolder,
        which is created when missing. The configuration and the preprocessor
        are always saved. The model itself is not saved when no model is
        selected while several folds are configured; in that case an error
        message is printed and 'model saved' is not reported.

        Args:
            dir_path: root directory of the saved models.
            weight_file: file name used for the model weights.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, CONFIG_FILE_NAME))
        print('model config file saved')

        self.p.save(os.path.join(directory, PROCESSOR_FILE_NAME))
        print('preprocessor saved')

        if self.model is None and self.model_config.fold_number > 1:
            print(
                'Error: model not saved. Evaluation need to be called first to select the best fold model to be saved'
            )
            # nothing was written for the model: do not claim it was saved
            return

        self.model.save(os.path.join(directory, weight_file))

        # save pretrained transformer config if used in the model
        if self.model.transformer_config is not None:
            self.model.transformer_config.to_json_file(
                os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
            print('transformer config saved')

        # save the transformer tokenizer next to the model when there is one
        if self.model.transformer_preprocessor is not None:
            self.model.transformer_preprocessor.tokenizer.save_pretrained(
                os.path.join(directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR))
            print('transformer tokenizer saved')

        print('model saved')

    def load(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Reload the configuration, embeddings, preprocessor and model weights.

        The configuration is read from the subfolder named after the current
        ``model_config.model_name``; the preprocessor and the weights are then
        read from the subfolder named after the reloaded configuration.

        Args:
            dir_path: root directory of the saved models.
            weight_file: file name of the model weights to load.
        """
        config_path = os.path.join(dir_path, self.model_config.model_name,
                                   CONFIG_FILE_NAME)
        self.model_config = ModelConfig.load(config_path)

        embeddings_name = self.model_config.embeddings_name
        if embeddings_name is None:
            # no static embeddings for this model
            self.embeddings = None
            self.model_config.word_embedding_size = 0
        else:
            # load embeddings
            # Do not use cache in 'prediction/production' mode
            self.embeddings = Embeddings(embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=self.model_config.use_ELMo,
                                         use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        model_dir = os.path.join(dir_path, self.model_config.model_name)
        self.p = Preprocessor.load(
            os.path.join(model_dir, PROCESSOR_FILE_NAME))

        # build the architecture without pretrained weights, they come from
        # the saved weight file below
        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag),
                               load_pretrained_weights=False,
                               local_path=model_dir)

        weights_path = os.path.join(dir_path, self.model_config.model_name,
                                    weight_file)
        print("load weights from", weights_path)
        self.model.load(filepath=weights_path)
        self.model.print_summary()

Example #14

Show file

    def __init__(self, 
                 model_name=None,
                 architecture="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25, 
                 dropout=0.5, 
                 recurrent_dropout=0.25,
                 use_char_feature=False, 
                 batch_size=256, 
                 optimizer='adam', 
                 learning_rate=0.001, 
                 lr_decay=0.9,
                 clip_gradients=5.0, 
                 max_epoch=50, 
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 early_stop=True,
                 class_weights=None,
                 multiprocessing=True,
                 transformer_name: str=None):
        """Set up the model and training configurations.

        When ``model_name`` is None a name is derived from the architecture,
        the embeddings name and the transformer name. When a transformer name
        is given no static embeddings are loaded; otherwise the embeddings
        named ``embeddings_name`` are loaded, if any, and their size is used
        as word embedding size.

        Args:
            list_classes: the classes handed to the model configuration;
                defaults to a new empty list for each instance.
            transformer_name: name of the transformer to use; None means that
                no transformer layer is present in the model.

        The other arguments are stored or forwarded as they are to ModelConfig
        and TrainingConfig.
        """
        # a fresh list per instance: a `[]` default would be shared by all
        # the instances created without this argument
        if list_classes is None:
            list_classes = []

        if model_name is None:
            # add a dummy name based on the architecture
            model_name = architecture
            if embeddings_name is not None:
                model_name += "_" + embeddings_name
            if transformer_name is not None:
                model_name += "_" + transformer_name

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        self.embeddings = None

        # if transformer_name is None, no bert layer is present in the model
        self.transformer_name = None

        self.registry = load_resource_registry("delft/resources-registry.json")

        word_emb_size = 0
        if transformer_name is not None:
            # a transformer replaces the static embeddings
            self.transformer_name = transformer_name
            self.embeddings_name = None
            self.embeddings = None
        elif self.embeddings_name is not None:
            self.embeddings = Embeddings(self.embeddings_name, resource_registry=self.registry)
            word_emb_size = self.embeddings.embed_size

        # NOTE(review): the configuration receives the embeddings_name
        # argument, not self.embeddings_name which is reset when a transformer
        # is used - confirm this is intended
        self.model_config = ModelConfig(model_name=model_name, 
                                        architecture=architecture, 
                                        embeddings_name=embeddings_name, 
                                        list_classes=list_classes, 
                                        char_emb_size=char_emb_size, 
                                        word_emb_size=word_emb_size, 
                                        dropout=dropout, 
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature, 
                                        maxlen=maxlen, 
                                        fold_number=fold_number, 
                                        batch_size=batch_size,
                                        transformer_name=self.transformer_name)

        self.training_config = TrainingConfig(batch_size=batch_size, 
                                              optimizer=optimizer, 
                                              learning_rate=learning_rate,
                                              lr_decay=lr_decay, 
                                              clip_gradients=clip_gradients, 
                                              max_epoch=max_epoch,
                                              patience=patience, 
                                              use_roc_auc=use_roc_auc, 
                                              early_stop=early_stop,
                                              class_weights=class_weights, 
                                              multiprocessing=multiprocessing)

Example #15

Show file

class Classifier(object):

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None):
        """Set up the model and training configurations of the classifier.

        The embeddings named ``embeddings_name`` are loaded when a name is
        given and their size is used as word embedding size; otherwise
        ``self.embeddings`` is None and the word embedding size is 0.

        Args:
            list_classes: the classes handed to the model configuration;
                defaults to a new empty list for each instance.
            embeddings: not used by this constructor.

        The other arguments are stored or forwarded as they are to Embeddings,
        ModelConfig and TrainingConfig.
        """
        # a fresh list per instance: a `[]` default would be shared by all
        # the instances created without this argument
        if list_classes is None:
            list_classes = []

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        # always defined, so that the attribute exists without embeddings name
        self.embeddings = None

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size,
                                              optimizer,
                                              learning_rate,
                                              lr_decay,
                                              clip_gradients,
                                              max_epoch,
                                              patience,
                                              use_roc_auc,
                                              class_weights=class_weights)

    def train(self, x_train, y_train, vocab_init=None):
        """Train a single model, holding out 10% of the data for validation.

        The trained model is stored in ``self.model``; the ELMo/BERT caches of
        the embeddings are cleaned afterwards when they are in use.

        Args:
            x_train: training inputs.
            y_train: training labels.
            vocab_init: not used by this method.
        """
        # create validation set in case we don't use k-folds
        train_x, valid_x, train_y, valid_y = train_test_split(x_train,
                                                              y_train,
                                                              test_size=0.1)

        # settings shared by the training and validation generators
        generator_settings = dict(
            batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings)
        training_generator = DataGenerator(train_x,
                                           train_y,
                                           shuffle=True,
                                           **generator_settings)
        # no labels here: the expected validation labels are given to
        # train_model separately
        validation_generator = DataGenerator(valid_x,
                                             None,
                                             shuffle=False,
                                             **generator_settings)

        self.model = getModel(self.model_config, self.training_config)
        # to plot the graph, call plot_model(self.model, to_file=...) here with
        # 'data/models/textClassification/' + model_name + '_' + model_type + '.png'
        self.model, _best_roc_auc = train_model(
            self.model,
            self.model_config.list_classes,
            self.training_config.batch_size,
            self.training_config.max_epoch,
            self.training_config.use_roc_auc,
            self.training_config.class_weights,
            training_generator,
            validation_generator,
            valid_y,
            use_ELMo=self.embeddings.use_ELMo,
            use_BERT=self.embeddings.use_BERT)

        # contextual embedding caches are only needed during training
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def train_nfold(self, x_train, y_train, vocab_init=None):
        """Train one model per fold and keep them in ``self.models``.

        The ELMo/BERT caches of the embeddings are cleaned afterwards when
        they are in use. ``vocab_init`` is not used by this method.
        """
        self.models = train_folds(x_train,
                                  y_train,
                                  self.model_config,
                                  self.training_config,
                                  self.embeddings)

        # contextual embedding caches are only needed during training
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    # classification
    def predict(self, texts, output_format='json'):
        """Classify the given texts with the single model or the fold models.

        Args:
            texts: the texts to classify.
            output_format: when equal to 'json' the scores are wrapped in a
                dict holding one entry per text under "classifications";
                otherwise the raw prediction result is returned.

        Returns:
            The JSON-like dict described above, or the raw prediction result.

        Raises:
            OSError: if the model (single fold) or the fold models (several
                folds) are not available.
        """
        # value comparison: `is 1` only worked through CPython int caching
        single_fold = self.model_config.fold_number == 1
        if single_fold and self.model is None:
            raise OSError('Could not find a model.')
        if not single_fold and self.models is None:
            raise OSError('Could not find nfolds models.')

        predict_generator = DataGenerator(
            texts,
            None,
            batch_size=self.model_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=False)

        if single_fold:
            # `predict` is the module-level prediction function here
            result = predict(self.model,
                             predict_generator,
                             use_ELMo=self.embeddings.use_ELMo,
                             use_BERT=self.embeddings.use_BERT)
        else:
            result = predict_folds(self.models,
                                   predict_generator,
                                   use_ELMo=self.embeddings.use_ELMo,
                                   use_BERT=self.embeddings.use_BERT)

        # value comparison: `is 'json'` depends on string interning and fails
        # for an equal string built at runtime
        if output_format != 'json':
            return result

        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "classifications": []
        }
        for i, text in enumerate(texts):
            classification = {"text": text}
            scores = result[i]
            # one score per class, in the order of the configured classes
            for j, cl in enumerate(self.model_config.list_classes):
                classification[cl] = float(scores[j])
            res["classifications"].append(classification)
        return res

    def eval(self, x_test, y_test):
        """Evaluate the loaded model(s) on a test set and print the scores.

        Predictions come from the single model when
        ``model_config.fold_number`` equals 1, otherwise from the list of
        per-fold models. Scores are thresholded at 0.5 to obtain binary
        decisions, then accuracy, F1, log-loss and ROC AUC are printed:
        averaged over classes (with per-class details when there are several
        classes) and, for several classes, averaged over instances as well.

        Args:
            x_test: test inputs; must expose ``shape[0]`` (the number of
                instances) and is handed to ``DataGenerator``.
            y_test: expected labels. Indexed as ``y_test[:, j]`` and
                ``y_test[i, :]`` when there are several classes.

        Returns:
            None. All results are written to standard output.

        Raises:
            OSError: if the model (or the list of fold models) required by
                the configuration is ``None``.
        """
        # Compare by value: identity checks against an int literal only work
        # by accident of small-int caching and break for int-like values.
        single_model = self.model_config.fold_number == 1

        # Fail before building the generator when there is nothing to run.
        if single_model:
            if self.model is None:
                raise OSError('Could not find a model.')
        elif self.models is None:
            raise OSError('Could not find nfolds models.')

        # NOTE(review): the second positional argument is None - presumably
        # the labels slot, unused for prediction; confirm in DataGenerator.
        test_generator = DataGenerator(
            x_test,
            None,
            batch_size=self.model_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=False)

        if single_model:
            result = predict(self.model,
                             test_generator,
                             use_ELMo=self.embeddings.use_ELMo,
                             use_BERT=self.embeddings.use_BERT)
        else:
            result = predict_folds(self.models,
                                   test_generator,
                                   use_ELMo=self.embeddings.use_ELMo,
                                   use_BERT=self.embeddings.use_BERT)

        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        # Binary decisions at the 0.5 threshold: 0 below it, 1 otherwise.
        result_binary = np.where(np.asarray(result) < 0.5, 0, 1)

        n_classes = len(self.model_config.list_classes)

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        # NOTE(review): the metric functions are presumably the
        # scikit-learn ones; confirm against the file's imports.
        if n_classes == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result)
            total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(n_classes):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j],
                              result_binary[:, j],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc
                print("\nClass:", self.model_config.list_classes[j])
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)

        total_accuracy /= n_classes
        total_f1 /= n_classes
        total_loss /= n_classes
        total_roc_auc /= n_classes

        if n_classes != 1:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =", "{:10.4f}".format(total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

        # micro-average (average of scores for each instance)
        # makes sense only if we have more than 1 class, otherwise it is the
        # same as the macro-average
        if n_classes != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            n_instances = result.shape[0]
            # NOTE(review): each instance row is scored on its own; the
            # metric functions may reject a row whose expected labels hold a
            # single distinct value - confirm this is acceptable for the
            # datasets in use.
            for i in range(n_instances):
                accuracy = accuracy_score(y_test[i, :], result_binary[i, :])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i, :],
                              result_binary[i, :],
                              average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i, :], result[i, :])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i, :], result[i, :])
                total_roc_auc += roc_auc

            total_accuracy /= n_instances
            total_f1 /= n_instances
            total_loss /= n_instances
            total_roc_auc /= n_instances

            print("\nMicro-average:")
            print("\taverage accuracy at 0.5 =",
                  "{:10.4f}".format(total_accuracy))
            print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

    def save(self, dir_path='data/models/textClassification/'):
        """Write the model configuration and the model weights to disk.

        Everything is stored under ``<dir_path>/<model_config.model_name>/``,
        which is created when missing. The configuration is always saved.
        When ``model_config.fold_number`` equals 1 the single model is saved
        as ``<model_type>.<weight_file>``; otherwise each fold model ``i`` is
        saved as ``<model_type>.model<i>_weights.hdf5``.

        A missing model (or missing fold models) is reported with a printed
        error message; no exception is raised for it.

        Args:
            dir_path: parent directory that holds one sub-folder per model.

        Returns:
            None.
        """
        # create subfolder for the model if not already exists; exist_ok
        # avoids the race between an existence check and the creation
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        weight_prefix = self.model_config.model_type

        # Compare by value: an identity check against the literal 1 only
        # works for the cached small-int object and silently takes the
        # n-folds branch for any other int-like value equal to 1.
        if self.model_config.fold_number == 1:
            if self.model is None:
                print('Error: model has not been built')
            else:
                self.model.save(
                    os.path.join(directory,
                                 weight_prefix + "." + self.weight_file))
                print('model saved')
        elif self.models is None:
            print('Error: nfolds models have not been built')
        else:
            for i in range(self.model_config.fold_number):
                self.models[i].save(
                    os.path.join(
                        directory,
                        weight_prefix + ".model{0}_weights.hdf5".format(i)))
            print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration, embeddings and model weights from disk.

        The configuration file is located with the model name of the
        configuration currently held by this object, so ``self.model_config``
        must already carry the right ``model_name`` before calling. The
        reloaded configuration then replaces it and drives everything else.

        ``self.model`` is always rebuilt with ``getModel``. When
        ``model_config.fold_number`` equals 1 its weights are read from
        ``<model_type>.<weight_file>``; otherwise ``self.models`` is filled
        with one freshly built model per fold, each loaded from
        ``<model_type>.model<i>_weights.hdf5``.

        Args:
            dir_path: parent directory that holds one sub-folder per model.

        Returns:
            None.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        # Folder of the reloaded configuration; all weight files live there.
        model_dir = os.path.join(dir_path, self.model_config.model_name)
        weight_prefix = self.model_config.model_type

        self.model = getModel(self.model_config, self.training_config)
        # Compare by value: an identity check against the literal 1 relies on
        # small-int caching and is flagged by recent interpreters.
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(model_dir,
                             weight_prefix + "." + self.weight_file))
        else:
            self.models = []
            for i in range(self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(
                    os.path.join(
                        model_dir,
                        weight_prefix + ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)