Example #1
0
    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the saved configuration, preprocessor and model.

        The configuration is read first from the folder named after the
        current model, then the preprocessor is read from the folder named
        after the model in the freshly loaded configuration.

        When the model type contains "bert" (case-insensitive), the model is
        built with `dir_path` and restores itself through `load_model()`; no
        embeddings are created. Otherwise embeddings are created without
        cache, their size is recorded in the configuration and the weights
        are read from `self.weight_file`.

        Args:
            dir_path: root folder containing one sub-folder per model name.
        """
        config_path = os.path.join(dir_path, self.model_config.model_name,
                                   self.config_file)
        self.model_config = ModelConfig.load(config_path)

        # Resolved only now, so that the model name comes from the loaded
        # configuration.
        model_dir = os.path.join(dir_path, self.model_config.model_name)
        self.p = WordPreprocessor.load(
            os.path.join(model_dir, self.preprocessor_file))

        is_bert = "bert" in self.model_config.model_type.lower()
        if is_bert:
            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag),
                                   dir_path=dir_path)
            self.model.load_model()
        else:
            # load embeddings
            # Do not use cache in 'production' mode
            self.embeddings = Embeddings(
                self.model_config.embeddings_name,
                use_ELMo=self.model_config.use_ELMo,
                use_BERT=self.model_config.use_BERT,
                use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag))
            weights_path = os.path.join(model_dir, self.weight_file)
            self.model.load(filepath=weights_path)
Example #2
0
    def load(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Restore configuration, embeddings, preprocessor and model weights.

        Args:
            dir_path: root folder containing one sub-folder per model name.
            weight_file: name of the weight file to read inside the model
                folder.
        """
        # Folder derived from the model name known *before* the reload; it is
        # only used to locate the configuration file.
        model_path = os.path.join(dir_path, self.model_config.model_name)
        self.model_config = ModelConfig.load(
            os.path.join(model_path, CONFIG_FILE_NAME))
        config = self.model_config

        if config.embeddings_name is None:
            self.embeddings = None
            config.word_embedding_size = 0
        else:
            # load embeddings
            # Do not use cache in 'prediction/production' mode
            self.embeddings = Embeddings(config.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=config.use_ELMo,
                                         use_cache=False)
            config.word_embedding_size = self.embeddings.embed_size

        # From here on, paths follow the model name of the loaded config.
        loaded_dir = os.path.join(dir_path, config.model_name)
        self.p = Preprocessor.load(
            os.path.join(loaded_dir, PROCESSOR_FILE_NAME))
        self.model = get_model(config,
                               self.p,
                               ntags=len(self.p.vocab_tag),
                               load_pretrained_weights=False,
                               local_path=loaded_dir)

        weights_path = os.path.join(loaded_dir, weight_file)
        print("load weights from", weights_path)
        self.model.load(filepath=weights_path)
        self.model.print_summary()
Example #3
0
    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration, embeddings and model weights from disk.

        With a single fold, the weights are loaded into `self.model` from
        `<model_type>.<weight_file>`. With several folds, one model per fold
        is built and loaded from `<model_type>.model<i>_weights.hdf5`, and the
        loaded models are stored in `self.models`.

        Args:
            dir_path: root folder containing one sub-folder per model name.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))
        model_dir = os.path.join(dir_path, self.model_config.model_name)

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        # Compare by value: `is 1` tests object identity, which only happens
        # to work for small ints on CPython and warns on Python 3.8+.
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(
                    model_dir,
                    self.model_config.model_type + "." + self.weight_file))
        else:
            self.models = []
            for i in range(self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(
                    os.path.join(
                        model_dir,
                        self.model_config.model_type +
                        ".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)
Example #4
0
    def __init__(self, 
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25, 
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100, 
                 dropout=0.5, 
                 recurrent_dropout=0.25,
                 use_char_feature=True, 
                 use_crf=True,
                 batch_size=20, 
                 optimizer='adam', 
                 learning_rate=0.001, 
                 lr_decay=0.9,
                 clip_gradients=5.0, 
                 max_epoch=50, 
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5, 
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):
        """Set up the model and training configurations.

        No model is built here: `self.model`, `self.models` and `self.p`
        start as None. When `embeddings_name` is given, the embeddings are
        created immediately and their size is recorded in the model
        configuration; otherwise `self.embeddings` is None and the word
        embedding size is 0.

        The remaining arguments are forwarded unchanged to `ModelConfig`
        (architecture settings) and `TrainingConfig` (optimisation settings).
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so later reads of `self.embeddings`
        # do not raise AttributeError when no embeddings were requested.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name, 
                                        model_type=model_type, 
                                        embeddings_name=embeddings_name, 
                                        word_embedding_size=word_emb_size, 
                                        char_emb_size=char_emb_size, 
                                        char_lstm_units=char_lstm_units, 
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units, 
                                        dropout=dropout, 
                                        recurrent_dropout=recurrent_dropout, 
                                        use_char_feature=use_char_feature, 
                                        use_crf=use_crf, 
                                        fold_number=fold_number, 
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stop, patience, 
                                              max_checkpoints_to_keep)
Example #5
0
    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the preprocessor, configuration, embeddings and weights.

        The preprocessor and the configuration are both read from the folder
        named after the current model; the weights are read from the folder
        named after the model in the loaded configuration.

        Args:
            dir_path: root folder containing one sub-folder per model name.
        """
        saved_dir = os.path.join(dir_path, self.model_config.model_name)
        self.p = WordPreprocessor.load(
            os.path.join(saved_dir, self.preprocessor_file))
        self.model_config = ModelConfig.load(
            os.path.join(saved_dir, self.config_file))
        config = self.model_config

        # load embeddings
        self.embeddings = Embeddings(config.embeddings_name,
                                     use_ELMo=config.use_ELMo)
        config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(config, self.p, ntags=len(self.p.vocab_tag))
        weights_path = os.path.join(dir_path, config.model_name,
                                    self.weight_file)
        self.model.load(filepath=weights_path)
Example #6
0
    def __init__(self, 
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=[],
                 char_emb_size=25, 
                 dropout=0.5, 
                 recurrent_dropout=0.25,
                 use_char_feature=False, 
                 batch_size=256, 
                 optimizer='adam', 
                 learning_rate=0.001, 
                 lr_decay=0.9,
                 clip_gradients=5.0, 
                 max_epoch=50, 
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None,
                 multiprocessing=True):
        """Set up the model and training configurations.

        No model is built here: `self.model` and `self.models` start as None.
        Embeddings are created only when `embeddings_name` is given and the
        model type does not contain "bert"; otherwise `self.embeddings` is
        None and the word embedding size is 0.

        The `embeddings` argument is accepted but not used by this
        constructor. The remaining arguments are forwarded to `ModelConfig`
        and `TrainingConfig`.
        """
        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # The default `[]` is created once at definition time; swap an empty
        # list for a fresh one so instances never share (and mutate) it.
        if isinstance(list_classes, list) and not list_classes:
            list_classes = []

        # Always define the attribute so later reads of `self.embeddings`
        # do not raise AttributeError when no embeddings are loaded.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None and "bert" not in model_type:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name, 
                                        model_type=model_type, 
                                        embeddings_name=embeddings_name, 
                                        list_classes=list_classes, 
                                        char_emb_size=char_emb_size, 
                                        word_emb_size=word_emb_size, 
                                        dropout=dropout, 
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature, 
                                        maxlen=maxlen, 
                                        fold_number=fold_number, 
                                        batch_size=batch_size, 
                                        use_ELMo=use_ELMo, 
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              patience, use_roc_auc,
                                              class_weights=class_weights, multiprocessing=multiprocessing)
Example #7
0
def preload(embeddings_name, input_path=None):
    """Load an embeddings resource into its LMDB database.

    The embeddings file is taken from `input_path` when given; otherwise it
    is downloaded from the "url" entry of the embedding description found in
    the registry. The function returns without loading anything when no
    embeddings file could be obtained or opened.

    NOTE(review): `map_size` is not defined in this function; it is assumed
    to be a module-level name - confirm.

    Args:
        embeddings_name: name of the embeddings in the registry; also used
            as the LMDB environment folder name.
        input_path: optional path to an embeddings file already on disk.
    """
    registry_path = './embedding-registry.json'
    embeddings = Embeddings(embeddings_name, path=registry_path, load=False)

    description = embeddings.get_description(embeddings_name)
    if description is None:
        # The original message referenced an undefined name and raised
        # NameError instead of reporting the problem.
        print("Error: embedding name", embeddings_name, "is not registered in", registry_path)

    if input_path is None:
        embeddings_path = None
        # download if url is available
        if description is not None and "url" in description and len(description["url"]) > 0:
            url = description["url"]
            download_path = embeddings.registry['embedding-download-path']
            # if the download path does not exist, we create it
            if not os.path.isdir(download_path):
                try:
                    os.mkdir(download_path)
                except OSError:
                    print ("Creation of the download directory", download_path, "failed")

            print("Downloading resource file for", embeddings_name, "...")
            embeddings_path = download_file(url, download_path)
            if embeddings_path is not None and os.path.isfile(embeddings_path):
                print("Download sucessful:", embeddings_path)
        else:
            print("Embeddings resource is not specified in the embeddings registry:", embeddings_name)
    else:
        embeddings_path = input_path

    if embeddings_path is None:
        print("Fail to retrive embedding file for", embeddings_name)
        # Nothing to open: stop here rather than handing None to the loader.
        return

    embedding_file = open_embedding_file(embeddings_path)
    if embedding_file is None:
        print("Error: could not open embeddings file", embeddings_path)
        return

    # create and load the database in write mode
    embedding_lmdb_path = embeddings.registry["embedding-lmdb-path"]
    if not os.path.isdir(embedding_lmdb_path):
        os.makedirs(embedding_lmdb_path)

    envFilePath = os.path.join(embedding_lmdb_path, embeddings_name)
    embeddings.env = lmdb.open(envFilePath, map_size=map_size)
    embeddings.load_embeddings_from_file(embeddings_path)
    embeddings.clean_downloads()
Example #8
0
    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration and rebuild the model(s) from disk.

        Without a transformer name in the loaded configuration, embeddings
        are created (no cache) and their size recorded; with one, the name is
        copied to `self.transformer_name` and no embeddings are created.

        With a single fold the weights are loaded into `self.model`. With
        several folds, `self.models` receives one loaded model per fold when
        no transformer is used, or a single model without loaded weights when
        a transformer is used.

        Args:
            dir_path: root folder containing one sub-folder per model name.
        """
        model_path = os.path.join(dir_path, self.model_config.model_name)
        self.model_config = ModelConfig.load(os.path.join(model_path, self.config_file))
        uses_transformer = self.model_config.transformer_name is not None

        if uses_transformer:
            self.transformer_name = self.model_config.transformer_name
            self.embeddings = None
        else:
            # load embeddings
            # Do not use cache in 'production' mode
            self.embeddings = Embeddings(self.model_config.embeddings_name,
                                         resource_registry=self.registry,
                                         use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        def build_model():
            # Every model of this loader is created with the same settings.
            return getModel(self.model_config,
                            self.training_config,
                            load_pretrained_weights=False,
                            local_path=model_path)

        self.model = build_model()
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.fold_number == 1:
            weights_path = os.path.join(model_path, self.weight_file)
            print("load weights from", weights_path)
            self.model.load(weights_path)
            return

        self.models = []
        if uses_transformer:
            # only init first fold one, the other will be init at prediction time, all weights will be loaded at prediction time
            self.models.append(build_model())
            return

        for fold in range(self.model_config.fold_number):
            fold_model = build_model()
            fold_model.load(os.path.join(model_path, "model{0}_weights.hdf5".format(fold)))
            self.models.append(fold_model)
Example #9
0
class Sequence(object):
    """Wrapper around a sequence labelling model.

    Groups the model configuration, training configuration, embeddings and
    preprocessor, and exposes training (single model or n-fold), evaluation,
    tagging of texts or files, and saving/loading to a model folder.
    """

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.pkl'

    # number of parallel worker for the data generator when not using ELMo
    nb_workers = 6

    def __init__(self, 
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25, 
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100, 
                 dropout=0.5, 
                 recurrent_dropout=0.25,
                 use_char_feature=True, 
                 use_crf=True,
                 batch_size=20, 
                 optimizer='adam', 
                 learning_rate=0.001, 
                 lr_decay=0.9,
                 clip_gradients=5.0, 
                 max_epoch=50, 
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5, 
                 log_dir=None,
                 use_ELMo=True,
                 fold_number=1):
        """Set up the model and training configurations.

        No model is built here. When `embeddings_name` is given, the
        embeddings are created immediately and their size is recorded in the
        model configuration; otherwise `self.embeddings` is None and the word
        embedding size is 0. Other arguments are forwarded to `ModelConfig`
        and `TrainingConfig`.
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # Always define the attribute so later reads of `self.embeddings`
        # do not raise AttributeError when no embeddings were requested.
        self.embeddings = None
        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name, 
                                        model_type=model_type, 
                                        embeddings_name=embeddings_name, 
                                        word_embedding_size=word_emb_size, 
                                        char_emb_size=char_emb_size, 
                                        char_lstm_units=char_lstm_units, 
                                        max_char_length=max_char_length,
                                        word_lstm_units=word_lstm_units, 
                                        dropout=dropout, 
                                        recurrent_dropout=recurrent_dropout, 
                                        use_char_feature=use_char_feature, 
                                        use_crf=use_crf, 
                                        fold_number=fold_number, 
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              early_stop, patience, 
                                              max_checkpoints_to_keep)

    def _fit_preprocessor(self, x_train, y_train, x_valid=None, y_valid=None):
        """Build the preprocessor on the training data plus, if given, the validation data."""
        if x_valid is not None and y_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
            y_all = np.concatenate((y_train, y_valid), axis=0)
            return prepare_preprocessor(x_all, y_all, self.model_config)
        return prepare_preprocessor(x_train, y_train, self.model_config)

    def _build_trainer(self):
        """Create a Trainer bound to the current model(s), embeddings and configurations."""
        return Trainer(self.model,
                       self.models,
                       self.embeddings,
                       self.model_config,
                       self.training_config,
                       checkpoint_path=self.log_dir,
                       preprocessor=self.p)

    def _clean_elmo_cache(self):
        """Clear the ELMo cache when embeddings are present and use ELMo."""
        if self.embeddings is not None and self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()

    def _build_test_generator(self, x_test, y_test):
        """Create the non-shuffled data generator used for evaluation."""
        return DataGenerator(x_test, y_test,
                             batch_size=self.training_config.batch_size,
                             preprocessor=self.p,
                             char_embed_size=self.model_config.char_embedding_size,
                             embeddings=self.embeddings, shuffle=False)

    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        """Fit the preprocessor, build a single model and train it.

        When no validation data is given, the preprocessor is fitted on the
        training data alone (previously this case crashed while
        concatenating with None) and None is forwarded to the trainer.
        NOTE(review): confirm that Trainer.train accepts a missing
        validation set.
        """
        # TBD if valid is None, segment train to get one
        self.p = self._fit_preprocessor(x_train, y_train, x_valid, y_valid)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        trainer = self._build_trainer()
        trainer.train(x_train, y_train, x_valid, y_valid)
        self._clean_elmo_cache()

    def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, fold_number=10):
        """Fit the preprocessor, build `fold_number` models and train them n-fold.

        The models are stored in `self.models`; `self.model` is left as is.
        """
        self.p = self._fit_preprocessor(x_train, y_train, x_valid, y_valid)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        self.models = []
        for _ in range(fold_number):
            self.models.append(
                get_model(self.model_config, self.p, len(self.p.vocab_tag)))

        trainer = self._build_trainer()
        trainer.train_nfold(x_train, y_train, x_valid, y_valid)
        self._clean_elmo_cache()

    def eval(self, x_test, y_test):
        """Evaluate all fold models when they are all available, else the single model."""
        if self.model_config.fold_number > 1 and self.models and len(self.models) == self.model_config.fold_number:
            self.eval_nfold(x_test, y_test)
        else:
            self.eval_single(x_test, y_test)

    def eval_single(self, x_test, y_test):
        """Score `self.model` on the test data.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        # Prepare test data(steps, generator)
        test_generator = self._build_test_generator(x_test, y_test)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)

    def eval_nfold(self, x_test, y_test):
        """Score every fold model, print averaged and best/worst reports.

        The model with the highest f1 becomes `self.model`. Does nothing
        when `self.models` is None.
        """
        if self.models is None:
            return

        fold_count = self.model_config.fold_number
        total_f1 = 0
        total_precision = 0
        total_recall = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []

        for i in range(fold_count):
            print('\n------------------------ fold ' + str(i) + '--------------------------------------')

            # Prepare test data(steps, generator)
            test_generator = self._build_test_generator(x_test, y_test)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.models[i]
            scorer.on_epoch_end(epoch=-1)
            f1 = scorer.f1
            reports.append(scorer.report)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += scorer.precision
            total_recall += scorer.recall

        macro_f1 = total_f1 / fold_count
        macro_precision = total_precision / fold_count
        macro_recall = total_recall / fold_count

        print("\naverage over", fold_count, "folds")
        print("\tmacro f1 =", macro_f1)
        print("\tmacro precision =", macro_precision)
        print("\tmacro recall =", macro_recall, "\n")

        print("\n** Worst ** model scores - \n")
        print(reports[worst_index])

        self.model = self.models[best_index]
        print("\n** Best ** model scores - \n")
        print(reports[best_index])

    def tag(self, texts, output_format):
        """Annotate a list of sentences and return the annotations.

        For the 'json' output format the elapsed time is added under the
        "runtime" key.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p)
        start_time = time.time()
        annotations = tagger.tag(texts, output_format)
        runtime = round(time.time() - start_time, 3)
        # Compare by value: `is 'json'` tests identity and depends on string
        # interning, so equal strings could be missed.
        if output_format == 'json':
            annotations["runtime"] = runtime
        return annotations

    def tag_file(self, file_in, output_format, file_out):
        """Annotate a text file containing one sentence per line.

        The annotations are written to `file_out` if not None, to the
        standard output otherwise. Processing is streamed by batches of
        `batch_size * nb_workers` lines so that huge files can be processed
        without memory issues. When the whole input fits in the first batch,
        the JSON returned by the tagger is dumped as is; otherwise the
        batches are glued together into one JSON document.

        Raises:
            OSError: if no model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(self.model, self.model_config, self.embeddings, preprocessor=self.p)
        start_time = time.time()
        chunk_size = self.model_config.batch_size * self.nb_workers
        item_indent = '\n        '

        out = open(file_out, 'w') if file_out is not None else None

        def emit(text, final=False):
            # Files get the raw text; stdout gets a trailing newline only for
            # complete documents / the closing footer.
            if out is not None:
                out.write(text)
            elif final:
                print(text)
            else:
                print(text, end='', flush=True)

        # The try/finally guarantees the output file is closed even when
        # reading or tagging fails part-way through.
        try:
            direct_dump = False
            first_batch = True
            need_comma = False
            with open(file_in, 'r') as f:
                texts = None
                while texts is None or len(texts) == chunk_size:
                    texts = next_n_lines(f, chunk_size)
                    annotations = tagger.tag(texts, output_format)

                    if first_batch:
                        first_batch = False
                        if len(texts) < chunk_size:
                            # single batch: output the tagger JSON without any modification
                            annotations['runtime'] = round(time.time() - start_time, 3)
                            emit(json.dumps(annotations, sort_keys=False, indent=4, ensure_ascii=False),
                                 final=True)
                            direct_dump = True
                            break
                        # several batches: output the general information
                        # attributes once, then glue the texts of each batch
                        emit(''.join([
                            '{\n    "software": ', json.dumps(annotations["software"], ensure_ascii=False), ",\n",
                            '    "date": ', json.dumps(annotations["date"], ensure_ascii=False), ",\n",
                            '    "model": ', json.dumps(annotations["model"], ensure_ascii=False), ",\n",
                            '    "texts": [',
                        ]))

                    for item in annotations["texts"]:
                        item_json = json.dumps(item, sort_keys=False, indent=4, ensure_ascii=False)
                        item_json = item_json.replace('\n', item_indent)
                        emit((',' if need_comma else '') + item_indent + item_json)
                        need_comma = True

            if not direct_dump:
                runtime = round(time.time() - start_time, 3)
                emit("\n    ],\n" + '    "runtime": ' + str(runtime) + "\n}\n", final=True)
        finally:
            if out is not None:
                out.close()

    def save(self, dir_path='data/models/sequenceLabelling/'):
        """Write the preprocessor, configuration and weights to the model folder."""
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore the preprocessor, configuration, embeddings and weights from the model folder."""
        self.p = WordPreprocessor.load(os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))

        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))
Example #10
0
class Sequence(object):
    """Wrapper around a sequence labelling model.

    Groups the model, its preprocessor, its embeddings and its configuration,
    and exposes training, evaluation, tagging and save/load operations.
    """

    # file names used inside the model sub-folder by save() and load()
    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'
    preprocessor_file = 'preprocessor.json'
    #preprocessor_file_new = 'preprocessor.json'

    # number of parallel worker for the data generator when not using ELMo;
    # tag_file() also multiplies it by the batch size to size the chunks of
    # lines it reads
    nb_workers = 6

    def __init__(self,
                 model_name,
                 model_type="BidLSTM_CRF",
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=True,
                 use_crf=True,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 use_ELMo=False,
                 use_BERT=False,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None):
        """Set up the model and training configurations; no model is built.

        Architecture-related arguments are stored in a ``ModelConfig`` and
        optimisation-related ones in a ``TrainingConfig``. When
        ``embeddings_name`` is given the corresponding ``Embeddings`` are
        loaded right away, otherwise ``self.embeddings`` is ``None`` and the
        word embedding size is recorded as 0.

        Args:
            model_name: name of the model, also used as its folder name when
                saving and loading.
            log_dir: handed to the trainer as checkpoint path.
        """
        self.model = None
        self.models = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        if embeddings_name is None:
            self.embeddings = None
            word_emb_size = 0
        else:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        architecture = dict(
            model_name=model_name,
            model_type=model_type,
            embeddings_name=embeddings_name,
            word_embedding_size=word_emb_size,
            char_emb_size=char_emb_size,
            char_lstm_units=char_lstm_units,
            max_char_length=max_char_length,
            word_lstm_units=word_lstm_units,
            max_sequence_length=max_sequence_length,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            use_char_feature=use_char_feature,
            use_crf=use_crf,
            fold_number=fold_number,
            batch_size=batch_size,
            use_ELMo=use_ELMo,
            use_BERT=use_BERT,
            features_indices=features_indices,
        )
        self.model_config = ModelConfig(**architecture)

        self.training_config = TrainingConfig(
            batch_size, optimizer, learning_rate, lr_decay, clip_gradients,
            max_epoch, early_stop, patience, max_checkpoints_to_keep,
            multiprocessing)

    def train(self,
              x_train,
              y_train,
              f_train: np.array = None,
              x_valid=None,
              y_valid=None,
              f_valid: np.array = None,
              callbacks=None):
        """Build the preprocessor and a single model, then train it.

        The preprocessor is fitted on the training data concatenated with
        the validation data, when the latter is provided.

        Args:
            x_train: training sequences.
            y_train: training labels.
            f_train: optional additional features for the training set.
            x_valid: optional validation sequences.
            y_valid: optional validation labels.
            f_valid: optional additional features for the validation set.
            callbacks: passed unchanged to the trainer.
        """
        # TBD if valid is None, segment train to get one
        x_all = np.concatenate(
            (x_train, x_valid), axis=0) if x_valid is not None else x_train
        y_all = np.concatenate(
            (y_train, y_valid), axis=0) if y_valid is not None else y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        self.model = get_model(self.model_config, self.p,
                               len(self.p.vocab_tag))
        if self.p.return_features is not False:
            print('x_train.shape: ', x_train.shape)
            print('features_train.shape: ', f_train.shape)
            sample_transformed_features = self.p.transform_features(f_train)
            self.model_config.max_feature_size = np.asarray(
                sample_transformed_features).shape[-1]
            print('max_feature_size: ', self.model_config.max_feature_size)

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train(x_train,
                      y_train,
                      x_valid,
                      y_valid,
                      features_train=f_train,
                      features_valid=f_valid,
                      callbacks=callbacks)

        # self.embeddings is None when the instance was created without an
        # embeddings name, so the cache clean-up must be guarded
        if self.embeddings is not None:
            if self.embeddings.use_ELMo:
                self.embeddings.clean_ELMo_cache()
            if self.embeddings.use_BERT:
                self.embeddings.clean_BERT_cache()
    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    f_train: np.array = None,
                    f_valid: np.array = None,
                    fold_number=10,
                    callbacks=None):
        """Build the preprocessor and one model per fold, then train them.

        The preprocessor is fitted on the training data concatenated with
        the validation data, when the latter is provided. For BERT model
        types the instance is saved once training is done.

        Args:
            x_train: training sequences.
            y_train: training labels.
            x_valid: optional validation sequences.
            y_valid: optional validation labels.
            f_train: optional additional features for the training set.
            f_valid: optional additional features for the validation set.
            fold_number: number of fold models to create.
            callbacks: passed unchanged to the trainer.
        """
        x_all = np.concatenate(
            (x_train, x_valid), axis=0) if x_valid is not None else x_train
        y_all = np.concatenate(
            (y_train, y_valid), axis=0) if y_valid is not None else y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)
        self.p.return_lengths = True

        is_bert = 'bert' in self.model_config.model_type.lower()
        if is_bert:
            self.model = get_model(self.model_config, self.p,
                                   len(self.p.vocab_tag))

        self.models = [
            get_model(self.model_config, self.p, len(self.p.vocab_tag))
            for _ in range(fold_number)
        ]

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          self.model_config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        trainer.train_nfold(x_train,
                            y_train,
                            x_valid,
                            y_valid,
                            f_train=f_train,
                            f_valid=f_valid,
                            callbacks=callbacks)

        # self.embeddings is None when the instance was created without an
        # embeddings name, so the cache clean-up must be guarded
        if self.embeddings is not None:
            if self.embeddings.use_ELMo:
                self.embeddings.clean_ELMo_cache()
            if self.embeddings.use_BERT:
                self.embeddings.clean_BERT_cache()

        if 'bert' in self.model_config.model_type.lower():
            self.save()

    def eval(self, x_test, y_test, features=None):
        if self.models and 1 < self.model_config.fold_number == len(
                self.models):
            self.eval_nfold(x_test, y_test, features=features)
        else:
            self.eval_single(x_test, y_test, features=features)

    def eval_single(self, x_test, y_test, features=None):
        if 'bert' not in self.model_config.model_type.lower():
            if self.model:
                # Prepare test data(steps, generator)
                test_generator = DataGenerator(
                    x_test,
                    y_test,
                    batch_size=self.model_config.batch_size,
                    preprocessor=self.p,
                    char_embed_size=self.model_config.char_embedding_size,
                    max_sequence_length=self.model_config.max_sequence_length,
                    embeddings=self.embeddings,
                    shuffle=False,
                    features=features)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.model
                scorer.on_epoch_end(epoch=-1)
            else:
                raise (OSError('Could not find a model.'))
        else:
            # BERT architecture model
            y_pred = self.model.predict(x_test, fold_id=-1)

            nb_alignment_issues = 0
            for i in range(len(y_test)):
                if len(y_test[i]) != len(y_pred[i]):
                    nb_alignment_issues += 1
                    # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                    # but this is normally handled when predicting.
                    # To be very conservative, the following ensure the number of tokens always
                    # match, but it should never be used in practice.
                    if len(y_test[i]) < len(y_pred[i]):
                        y_test[i] = y_test[i] + ["O"] * (len(y_pred[i]) -
                                                         len(y_test[i]))
                    if len(y_test[i]) > len(y_pred[i]):
                        y_pred[i] = y_pred[i] + ["O"] * (len(y_test[i]) -
                                                         len(y_pred[i]))

            if nb_alignment_issues > 0:
                print("number of alignment issues with test set:",
                      nb_alignment_issues)

            report, report_as_map = classification_report(y_test,
                                                          y_pred,
                                                          digits=4)
            print(report)

    def eval_nfold(self, x_test, y_test, features=None):
        if self.models is not None:
            total_f1 = 0
            best_f1 = 0
            best_index = 0
            worst_f1 = 1
            worst_index = 0
            reports = []
            reports_as_map = []
            total_precision = 0
            total_recall = 0
            for i in range(self.model_config.fold_number):
                print('\n------------------------ fold ' + str(i) +
                      ' --------------------------------------')

                if 'bert' not in self.model_config.model_type.lower():
                    # Prepare test data(steps, generator)
                    test_generator = DataGenerator(
                        x_test,
                        y_test,
                        batch_size=self.model_config.batch_size,
                        preprocessor=self.p,
                        char_embed_size=self.model_config.char_embedding_size,
                        max_sequence_length=self.model_config.
                        max_sequence_length,
                        embeddings=self.embeddings,
                        shuffle=False,
                        features=features)

                    # Build the evaluator and evaluate the model
                    scorer = Scorer(test_generator, self.p, evaluation=True)
                    scorer.model = self.models[i]
                    scorer.on_epoch_end(epoch=-1)
                    f1 = scorer.f1
                    precision = scorer.precision
                    recall = scorer.recall
                    reports.append(scorer.report)
                    reports_as_map.append(scorer.report_as_map)

                else:
                    # BERT architecture model
                    dir_path = 'data/models/sequenceLabelling/'
                    self.model_config = ModelConfig.load(
                        os.path.join(dir_path, self.model_config.model_name,
                                     self.config_file))
                    self.p = WordPreprocessor.load(
                        os.path.join(dir_path, self.model_config.model_name,
                                     self.preprocessor_file))
                    self.model = get_model(self.model_config,
                                           self.p,
                                           ntags=len(self.p.vocab_tag))
                    self.model.load_model(i)

                    y_pred = self.model.predict(x_test, fold_id=i)

                    nb_alignment_issues = 0
                    for j in range(len(y_test)):
                        if len(y_test[i]) != len(y_pred[j]):
                            nb_alignment_issues += 1
                            # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                            # but this is normally handled when predicting.
                            # To be very conservative, the following ensure the number of tokens always
                            # match, but it should never be used in practice.
                            if len(y_test[j]) < len(y_pred[j]):
                                y_test[j] = y_test[j] + ["O"] * (
                                    len(y_pred[j]) - len(y_test[j]))
                            if len(y_test[j]) > len(y_pred[j]):
                                y_pred[j] = y_pred[j] + ["O"] * (
                                    len(y_test[j]) - len(y_pred[j]))

                    if nb_alignment_issues > 0:
                        print("number of alignment issues with test set:",
                              nb_alignment_issues)

                    f1 = f1_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred)
                    recall = recall_score(y_test, y_pred)

                    print("\tf1: {:04.2f}".format(f1 * 100))
                    print("\tprecision: {:04.2f}".format(precision * 100))
                    print("\trecall: {:04.2f}".format(recall * 100))

                    report, report_as_map = classification_report(y_test,
                                                                  y_pred,
                                                                  digits=4)
                    reports.append(report)
                    reports_as_map.append(report_as_map)

                if best_f1 < f1:
                    best_f1 = f1
                    best_index = i
                if worst_f1 > f1:
                    worst_f1 = f1
                    worst_index = i
                total_f1 += f1
                total_precision += precision
                total_recall += recall

            fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

            micro_f1 = total_f1 / self.model_config.fold_number
            micro_precision = total_precision / self.model_config.fold_number
            micro_recall = total_recall / self.model_config.fold_number

            micro_eval_block = {
                'f1': micro_f1,
                'precision': micro_precision,
                'recall': micro_recall
            }
            fold_average_evaluation['micro'] = micro_eval_block

            # field-level average over the n folds
            labels = []
            for label in sorted(self.p.vocab_tag):
                if label == 'O' or label == '<PAD>':
                    continue
                if label.startswith("B-") or label.startswith(
                        "S-") or label.startswith("I-") or label.startswith(
                            "E-"):
                    label = label[2:]

                if label in labels:
                    continue
                labels.append(label)

                sum_p = 0
                sum_r = 0
                sum_f1 = 0
                sum_support = 0
                for j in range(0, self.model_config.fold_number):
                    if not label in reports_as_map[j]['labels']:
                        continue
                    report_as_map = reports_as_map[j]['labels'][label]
                    sum_p += report_as_map["precision"]
                    sum_r += report_as_map["recall"]
                    sum_f1 += report_as_map["f1"]
                    sum_support += report_as_map["support"]

                avg_p = sum_p / self.model_config.fold_number
                avg_r = sum_r / self.model_config.fold_number
                avg_f1 = sum_f1 / self.model_config.fold_number
                avg_support = sum_support / self.model_config.fold_number
                avg_support_dec = str(avg_support - int(avg_support))[1:]
                if avg_support_dec != '0':
                    avg_support = math.floor(avg_support)

                block_label = {
                    'precision': avg_p,
                    'recall': avg_r,
                    'support': avg_support,
                    'f1': avg_f1
                }
                fold_average_evaluation['labels'][label] = block_label

            print(
                "----------------------------------------------------------------------"
            )
            print("\n** Worst ** model scores - run", str(worst_index))
            print(reports[worst_index])

            print("\n** Best ** model scores - run", str(best_index))
            print(reports[best_index])

            if 'bert' not in self.model_config.model_type.lower():
                self.model = self.models[best_index]
            else:
                # copy best BERT model fold_number
                best_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name + str(
                    best_index)
                new_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name
                # update new_model_dir if it already exists, keep its existing config content
                merge_folders(best_model_dir, new_model_dir)
                # clean other fold directory
                for i in range(self.model_config.fold_number):
                    shutil.rmtree('data/models/sequenceLabelling/' +
                                  self.model_config.model_name + str(i))

            print(
                "----------------------------------------------------------------------"
            )
            print("\nAverage over", self.model_config.fold_number, "folds")
            print(
                get_report(fold_average_evaluation,
                           digits=4,
                           include_avgs=['micro']))

    def tag(self, texts, output_format, features=None):
        # annotate a list of sentences, return the list of annotations in the
        # specified output_format
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            annotations = tagger.tag(texts, output_format, features=features)
            runtime = round(time.time() - start_time, 3)
            if output_format is 'json':
                annotations["runtime"] = runtime
            #else:
            #    print("runtime: %s seconds " % (runtime))
            return annotations
        else:
            raise (OSError('Could not find a model.' + str(self.model)))

    def tag_file(self, file_in, output_format, file_out):
        # Annotate a text file containing one sentence per line, the annotations are
        # written in the output file if not None, in the standard output otherwise.
        # Processing is streamed by batches so that we can process huge files without
        # memory issues
        if self.model:
            tagger = Tagger(self.model,
                            self.model_config,
                            self.embeddings,
                            preprocessor=self.p)
            start_time = time.time()
            if file_out is not None:
                out = open(file_out, 'w')
            first = True
            with open(file_in, 'r') as f:
                texts = None
                while texts is None or len(
                        texts
                ) == self.model_config.batch_size * self.nb_workers:

                    texts = next_n_lines(
                        f, self.model_config.batch_size * self.nb_workers)
                    annotations = tagger.tag(texts, output_format)
                    # if the following is true, we just output the JSON returned by the tagger without any modification
                    directDump = False
                    if first:
                        first = False
                        if len(
                                texts
                        ) < self.model_config.batch_size * self.nb_workers:
                            runtime = round(time.time() - start_time, 3)
                            annotations['runtime'] = runtime
                            jsonString = json.dumps(annotations,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            if file_out is None:
                                print(jsonString)
                            else:
                                out.write(jsonString)
                            directDump = True
                        else:
                            # we need to modify a bit the JSON outputted by the tagger to glue the different batches
                            # output the general information attributes
                            jsonString = '{\n    "software": ' + json.dumps(
                                annotations["software"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "date": ' + json.dumps(
                                annotations["date"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "model": ' + json.dumps(
                                annotations["model"],
                                ensure_ascii=False) + ",\n"
                            jsonString += '    "texts": ['
                            if file_out is None:
                                print(jsonString, end='', flush=True)
                            else:
                                out.write(jsonString)
                            first = True
                            for jsonStr in annotations["texts"]:
                                jsonString = json.dumps(jsonStr,
                                                        sort_keys=False,
                                                        indent=4,
                                                        ensure_ascii=False)
                                #jsonString = jsonString.replace('\n', '\n\t\t')
                                jsonString = re.sub('\n', '\n        ',
                                                    jsonString)
                                if file_out is None:
                                    if not first:
                                        print(',\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                    else:
                                        first = False
                                        print('\n        ' + jsonString,
                                              end='',
                                              flush=True)
                                else:
                                    if not first:
                                        out.write(',\n        ')
                                        out.write(jsonString)
                                    else:
                                        first = False
                                        out.write('\n        ')
                                        out.write(jsonString)
                    else:
                        for jsonStr in annotations["texts"]:
                            jsonString = json.dumps(jsonStr,
                                                    sort_keys=False,
                                                    indent=4,
                                                    ensure_ascii=False)
                            jsonString = re.sub('\n', '\n        ', jsonString)
                            if file_out is None:
                                print(',\n        ' + jsonString,
                                      end='',
                                      flush=True)
                            else:
                                out.write(',\n        ')
                                out.write(jsonString)

            runtime = round(time.time() - start_time, 3)
            if not directDump:
                jsonString = "\n    ],\n"
                jsonString += '    "runtime": ' + str(runtime)
                jsonString += "\n}\n"
                if file_out is None:
                    print(jsonString)
                else:
                    out.write(jsonString)

            if file_out is not None:
                out.close()
            #print("runtime: %s seconds " % (runtime))
        else:
            raise (OSError('Could not find a model.'))

    def save(self, dir_path='data/models/sequenceLabelling/'):
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        if not os.path.exists(directory):
            os.makedirs(directory)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        self.p.save(os.path.join(directory, self.preprocessor_file))
        print('preprocessor saved')

        # bert model are always saved via training process steps as checkpoint
        if self.model_config.model_type.lower().find("bert") == -1:
            if self.model is None and self.model_config.fold_number != 0 and self.model_config.fold_number != 1:
                print(
                    'Error: model not saved. Evaluation need to be called first to select the best fold model to be saved'
                )
            else:
                self.model.save(os.path.join(directory, self.weight_file))
        print('model saved')

    def load(self, dir_path='data/models/sequenceLabelling/'):
        """Restore configuration, preprocessor and model from disk.

        BERT models are loaded through their own ``load_model()`` and no
        embeddings are loaded for them. For the other models the embeddings
        are loaded (without cache) before the weights are read.

        Args:
            dir_path: parent directory holding one sub-folder per model.
        """
        config_path = os.path.join(dir_path, self.model_config.model_name,
                                   self.config_file)
        self.model_config = ModelConfig.load(config_path)

        # from here on the model name is the one of the reloaded config
        preprocessor_path = os.path.join(dir_path,
                                         self.model_config.model_name,
                                         self.preprocessor_file)
        self.p = WordPreprocessor.load(preprocessor_path)

        if "bert" in self.model_config.model_type.lower():
            self.model = get_model(self.model_config,
                                   self.p,
                                   ntags=len(self.p.vocab_tag),
                                   dir_path=dir_path)
            self.model.load_model()
            return

        # load embeddings
        # Do not use cache in 'production' mode
        config = self.model_config
        self.embeddings = Embeddings(config.embeddings_name,
                                     use_ELMo=config.use_ELMo,
                                     use_BERT=config.use_BERT,
                                     use_cache=False)
        config.word_embedding_size = self.embeddings.embed_size

        self.model = get_model(config, self.p, ntags=len(self.p.vocab_tag))
        weights_path = os.path.join(dir_path, self.model_config.model_name,
                                    self.weight_file)
        self.model.load(filepath=weights_path)
Example #11
0
class Classifier(object):
    """Wrapper around a text classification model.

    Holds the model configuration, the training configuration and (for
    non-BERT architectures) the word embeddings, and exposes training,
    prediction, evaluation and persistence of either a single model
    (``fold_number == 1``) or a list of fold models.
    """

    # file names used inside the model sub-directory by save() and load()
    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None,
                 multiprocessing=True):
        """Create the model and training configurations.

        Embeddings are only instantiated when ``embeddings_name`` is given and
        ``model_type`` does not contain ``"bert"``. ``list_classes`` defaults
        to an empty list (a fresh one per instance). The ``embeddings``
        parameter is accepted but not used by this constructor.
        """
        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        # always define the attribute, even when no embeddings are loaded
        self.embeddings = None

        # avoid sharing one mutable default list between instances
        if list_classes is None:
            list_classes = []

        word_emb_size = 0
        if embeddings_name is not None and "bert" not in model_type:
            self.embeddings = Embeddings(embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size, optimizer, learning_rate,
                                              lr_decay, clip_gradients, max_epoch,
                                              patience, use_roc_auc,
                                              class_weights=class_weights, multiprocessing=multiprocessing)

    def _is_bert(self):
        """Return True when the configured model type contains ``"bert"``."""
        return "bert" in self.model_config.model_type

    def _unlabelled_generator(self, texts):
        """Build a non-shuffling data generator over ``texts`` without labels."""
        return DataGenerator(texts, None, batch_size=self.model_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=False)

    def _clean_embedding_caches(self):
        """Clean the ELMo / BERT embedding caches that are flagged as in use."""
        if self.embeddings.use_ELMo:
            self.embeddings.clean_ELMo_cache()
        if self.embeddings.use_BERT:
            self.embeddings.clean_BERT_cache()

    def train(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train a single model on ``x_train`` / ``y_train``.

        BERT architectures delegate to the model's own ``train()``. For other
        architectures 10% of the data is split off as validation set and the
        model returned by ``train_model`` is kept. ``vocab_init`` is accepted
        but not used here.
        """
        self.model = getModel(self.model_config, self.training_config)

        # bert models
        if self._is_bert():
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes, x_train=x_train, y_train=y_train)
            self.model.train()
            return

        # create validation set in case we don't use k-folds
        xtr, val_x, y, val_y = train_test_split(x_train, y_train, test_size=0.1)

        training_generator = DataGenerator(xtr, y, batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=True)
        # validation labels are handed to train_model separately (val_y)
        validation_generator = DataGenerator(val_x, None, batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=False)

        # uncomment to plot graph
        #plot_model(self.model,
        #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.model_type+'.png')
        self.model, _ = train_model(self.model, self.model_config.list_classes, self.training_config.batch_size,
            self.training_config.max_epoch, self.training_config.use_roc_auc, self.training_config.class_weights,
            training_generator, validation_generator, val_y, use_ELMo=self.embeddings.use_ELMo,
            use_BERT=self.embeddings.use_BERT, multiprocessing=self.training_config.multiprocessing, callbacks=callbacks)
        self._clean_embedding_caches()

    def train_nfold(self, x_train, y_train, vocab_init=None, callbacks=None):
        """Train the fold models on ``x_train`` / ``y_train``.

        BERT architectures delegate to the model's own ``train()``; otherwise
        the models returned by ``train_folds`` are stored in ``self.models``.
        ``vocab_init`` is accepted but not used here.
        """
        # bert models
        if self._is_bert():
            self.model = getModel(self.model_config, self.training_config)
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes, x_train=x_train, y_train=y_train)
            self.model.train()
            return

        self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, callbacks=callbacks)
        self._clean_embedding_caches()

    # classification
    def predict(self, texts, output_format='json', use_main_thread_only=False):
        """Classify ``texts``.

        Returns a JSON-like dict (one entry per text with one float score per
        class) when ``output_format == 'json'``, otherwise the raw prediction
        result.

        Raises:
            OSError: when the required model (or fold models) is not available.
        """
        if self._is_bert():
            # we don't support n classifiers for BERT for prediction currently
            # (it would be too large and too slow if loaded 10 times from file for each batch)
            # (however it is done for eval, models are loaded 1 time for the complete dataset,
            # not each time per batch, and we should do the same here),
            # so the single model is used whatever the fold number is
            if self.model is None:
                raise OSError('Could not find a model.')
            # be sure the input processor is instantiated
            self.model.processor = BERT_classifier_processor(labels=self.model_config.list_classes)
            result = self.model.predict(texts)
        elif self.model_config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            result = predict(self.model, self._unlabelled_generator(texts), use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        else:
            if self.models is None:
                raise OSError('Could not find nfolds models.')
            result = predict_folds(self.models, self._unlabelled_generator(texts), use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)

        if output_format != 'json':
            return result

        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "classifications": []
        }
        for i, text in enumerate(texts):
            classification = {
                "text": text
            }
            the_res = result[i]
            for j, cl in enumerate(self.model_config.list_classes):
                classification[cl] = float(the_res[j])
            res["classifications"].append(classification)
        return res

    def eval(self, x_test, y_test, use_main_thread_only=False):
        """Predict on ``x_test`` and print a per-class evaluation table.

        The table lists precision, recall, f-score and support for each class.
        Macro- and micro-averaged accuracy, f1, log-loss and ROC AUC are
        computed as well, but their printing is currently disabled and nothing
        is returned.

        Raises:
            OSError: when the required model (or fold models) is not available.
        """
        if self.model_config.fold_number == 1:
            if self.model is None:
                raise OSError('Could not find a model.')
            # bert model?
            if self._is_bert():
                #self.model.eval(x_test, y_test)
                result = self.model.predict(x_test)
            else:
                result = predict(self.model, self._unlabelled_generator(x_test), use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        else:
            if self.models is None and not (self._is_bert() and self.model is not None):
                raise OSError('Could not find nfolds models.')
            print(self.model_config.model_type)
            # bert model?
            if self._is_bert():
                result_list = [self.model.predict(x_test, i) for i in range(self.model_config.fold_number)]

                # geometric mean of the fold predictions
                result = np.ones(result_list[0].shape)
                for fold_result in result_list:
                    result *= fold_result
                result **= (1. / len(result_list))
            else:
                result = predict_folds(self.models, self._unlabelled_generator(x_test), use_ELMo=self.embeddings.use_ELMo, use_BERT=self.embeddings.use_BERT, use_main_thread_only=use_main_thread_only)
        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        nb_classes = len(self.model_config.list_classes)

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # disabled alternative: binarize each score with a 0.5 threshold
        # def normer(t):
        #     if t < 0.5:
        #         return 0
        #     else:
        #         return 1
        # vfunc = np.vectorize(normer)
        # result_binary = vfunc(result)
        result_intermediate = np.asarray([np.argmax(line) for line in result])

        def vectorize(index, size):
            # one-hot vector of length `size` with a 1 at `index` (all zeros if out of range)
            one_hot = np.zeros(size)
            if index < size:
                one_hot[index] = 1
            return one_hot
        result_binary = np.array([vectorize(xi, nb_classes) for xi in result_intermediate])

        precision, recall, fscore, support = precision_recall_fscore_support(y_test, result_binary, average=None)
        print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(" ", "precision", "recall", "f-score", "support"))
        for p, the_class in enumerate(self.model_config.list_classes):
            the_class = the_class[:14]
            print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(the_class, "{:10.4f}"
                .format(precision[p]), "{:10.4f}".format(recall[p]), "{:10.4f}".format(fscore[p]), support[p]))

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if nb_classes == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result, labels=[0,1])
            if len(np.unique(y_test)) == 1:
                # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                total_roc_auc = r2_score(y_test, result)
                if total_roc_auc < 0:
                    total_roc_auc = 0
            else:
                total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(0, nb_classes):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j], labels=[0,1])
                total_loss += loss
                if len(np.unique(y_test[:, j])) == 1:
                    # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                    # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                    roc_auc = r2_score(y_test[:, j], result[:, j])
                    if roc_auc < 0:
                        roc_auc = 0
                else:
                    roc_auc = roc_auc_score(y_test[:, j], result[:, j])
                total_roc_auc += roc_auc
                # disabled per-class report:
                # print("\nClass:", self.model_config.list_classes[j])
                # print("\taccuracy at 0.5 =", accuracy)
                # print("\tf-1 at 0.5 =", f1)
                # print("\tlog-loss =", loss)
                # print("\troc auc =", roc_auc)

        total_accuracy /= nb_classes
        total_f1 /= nb_classes
        total_loss /= nb_classes
        total_roc_auc /= nb_classes

        # disabled macro-average report:
        # if nb_classes != 1:
        #     print("\nMacro-average:")
        # print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        # print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        # print("\taverage log-loss =","{:10.4f}".format( total_loss))
        # print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

        # micro-average (average of scores for each instance)
        # make sense only if we have more than 1 class, otherwise same as
        # macro-average
        if nb_classes != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(0, result.shape[0]):
                accuracy = accuracy_score(y_test[i,:], result_binary[i,:])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i,:], result_binary[i,:], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i,:], result[i,:])
                total_loss += loss
                roc_auc = roc_auc_score(y_test[i,:], result[i,:])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]

            # disabled micro-average report:
            # print("\nMicro-average:")
            # print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
            # print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            # print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            # print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

    def save(self, dir_path='data/models/textClassification/'):
        """Save the configuration and the model weights under ``dir_path/<model name>/``.

        Only the configuration is written for BERT architectures. A missing
        model (or missing fold models) is reported with a printed error
        message rather than an exception.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        # bert model are always saved via training process steps as checkpoint
        if self._is_bert():
            print('model saved')
            return

        if self.model_config.fold_number == 1:
            if self.model is not None:
                self.model.save(os.path.join(directory, self.model_config.model_type+"."+self.weight_file))
                print('model saved')
            else:
                print('Error: model has not been built')
        else:
            if self.models is None:
                print('Error: nfolds models have not been built')
            else:
                for i in range(0, self.model_config.fold_number):
                    self.models[i].save(os.path.join(directory, self.model_config.model_type+".model{0}_weights.hdf5".format(i)))
                print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Load the configuration and the model(s) from ``dir_path/<model name>/``.

        The configuration file is located with the model name currently held
        by ``self.model_config``; weight files are located with the model name
        of the loaded configuration. BERT architectures load themselves via
        the model's ``load()``. Otherwise the embeddings are instantiated
        without cache, and either the single model weights or one weight file
        per fold are loaded.
        """
        self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

        if self._is_bert():
            self.model = getModel(self.model_config, self.training_config)
            self.model.load()
            return

        # load embeddings
        # Do not use cache in 'production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo, use_BERT=self.model_config.use_BERT, use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        self.model = getModel(self.model_config, self.training_config)
        if self.model_config.fold_number == 1:
            self.model.load_weights(os.path.join(dir_path, self.model_config.model_name, self.model_config.model_type+"."+self.weight_file))
        else:
            self.models = []
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config)
                local_model.load_weights(os.path.join(dir_path, self.model_config.model_name, self.model_config.model_type+".model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)
Example #12
0
    def __init__(self,
                 model_name=None,
                 architecture=None,
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=0,
                 use_ELMo=False,
                 log_dir=None,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None,
                 transformer_name: str = None):
        """Set up the resource registry, optional embeddings and configurations.

        When ``model_name`` is omitted, a name is derived from the
        architecture, followed by the embeddings name and the transformer
        name (each prefixed with ``_``) when they are given. Embeddings are
        only instantiated when ``embeddings_name`` is provided.
        """
        if model_name is None:
            # add a dummy name based on the architecture
            name_suffix = "".join("_" + part
                                  for part in (embeddings_name, transformer_name)
                                  if part is not None)
            model_name = architecture + name_suffix if name_suffix else architecture

        self.model = None
        self.models = None
        self.p: Preprocessor = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # defaults used when no embeddings are requested
        self.embeddings = None
        embedding_size = 0
        self.model_local_path = None

        self.registry = load_resource_registry("delft/resources-registry.json")

        if self.embeddings_name is not None:
            self.embeddings = Embeddings(self.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=use_ELMo)
            embedding_size = self.embeddings.embed_size

        self.model_config = ModelConfig(
            model_name=model_name,
            architecture=architecture,
            embeddings_name=embeddings_name,
            word_embedding_size=embedding_size,
            char_emb_size=char_emb_size,
            char_lstm_units=char_lstm_units,
            max_char_length=max_char_length,
            word_lstm_units=word_lstm_units,
            max_sequence_length=max_sequence_length,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            fold_number=fold_number,
            batch_size=batch_size,
            use_ELMo=use_ELMo,
            features_indices=features_indices,
            transformer_name=transformer_name)

        self.training_config = TrainingConfig(
            batch_size, optimizer, learning_rate, lr_decay, clip_gradients,
            max_epoch, early_stop, patience, max_checkpoints_to_keep,
            multiprocessing)
Example #13
0
class Sequence(object):

    # number of parallel worker for the data generator
    nb_workers = 6

    def __init__(self,
                 model_name=None,
                 architecture=None,
                 embeddings_name=None,
                 char_emb_size=25,
                 max_char_length=30,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 max_sequence_length=300,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 batch_size=20,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 early_stop=True,
                 patience=5,
                 max_checkpoints_to_keep=0,
                 use_ELMo=False,
                 log_dir=None,
                 fold_number=1,
                 multiprocessing=True,
                 features_indices=None,
                 transformer_name: str = None):
        """Create the sequence labelling wrapper.

        Loads the resource registry, instantiates the embeddings when
        ``embeddings_name`` is given, and builds the model and training
        configurations. Without an explicit ``model_name``, the name is the
        architecture followed by ``_<embeddings_name>`` and
        ``_<transformer_name>`` for whichever of the two is provided.
        """
        if model_name is None:
            # add a dummy name based on the architecture
            model_name = architecture
            for extra in (embeddings_name, transformer_name):
                if extra is not None:
                    model_name += "_" + extra

        self.model = None
        self.models = None
        self.p: Preprocessor = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name

        # no embeddings (and a zero word embedding size) unless a name is given
        self.embeddings = None
        self.model_local_path = None
        size_of_word_embeddings = 0

        self.registry = load_resource_registry("delft/resources-registry.json")

        if self.embeddings_name is not None:
            self.embeddings = Embeddings(self.embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=use_ELMo)
            size_of_word_embeddings = self.embeddings.embed_size

        self.model_config = ModelConfig(
            model_name=model_name,
            architecture=architecture,
            embeddings_name=embeddings_name,
            word_embedding_size=size_of_word_embeddings,
            char_emb_size=char_emb_size,
            char_lstm_units=char_lstm_units,
            max_char_length=max_char_length,
            word_lstm_units=word_lstm_units,
            max_sequence_length=max_sequence_length,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            fold_number=fold_number,
            batch_size=batch_size,
            use_ELMo=use_ELMo,
            features_indices=features_indices,
            transformer_name=transformer_name)

        self.training_config = TrainingConfig(
            batch_size,
            optimizer,
            learning_rate,
            lr_decay,
            clip_gradients,
            max_epoch,
            early_stop,
            patience,
            max_checkpoints_to_keep,
            multiprocessing)

    def train(self,
              x_train,
              y_train,
              f_train=None,
              x_valid=None,
              y_valid=None,
              f_valid=None,
              callbacks=None):
        """Train a single model.

        The preprocessor (and thus the vocabulary) is prepared on the
        concatenation of the training and validation data, the model is built
        with pretrained weights, and training is delegated to a ``Trainer``.
        The ELMo cache is cleaned afterwards when ELMo embeddings are in use.
        """
        # TBD if valid is None, segment train to get one if early_stop is True

        # we concatenate all the training+validation data to create the model vocabulary
        x_all = x_train if x_valid is None else np.concatenate((x_train, x_valid), axis=0)
        y_all = y_train if y_valid is None else np.concatenate((y_train, y_valid), axis=0)
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)

        # vocabulary sizes come from the freshly prepared preprocessor
        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        self.model = get_model(self.model_config,
                               self.p,
                               len(self.p.vocab_tag),
                               load_pretrained_weights=True)
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        # uncomment to plot graph
        #plot_model(self.model,
        #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

        model_trainer = Trainer(
            self.model,
            self.models,
            self.embeddings,
            self.model_config,
            self.training_config,
            checkpoint_path=self.log_dir,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        model_trainer.train(x_train,
                            y_train,
                            x_valid,
                            y_valid,
                            features_train=f_train,
                            features_valid=f_valid,
                            callbacks=callbacks)

        elmo_in_use = self.embeddings and self.embeddings.use_ELMo
        if elmo_in_use:
            self.embeddings.clean_ELMo_cache()

    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    f_train=None,
                    f_valid=None,
                    callbacks=None):
        """Train the fold models.

        The preprocessor is prepared on the concatenation of the training and
        validation data, ``self.models`` is reset to an empty list that is
        handed to a ``Trainer``, and the n-fold training is delegated to it.
        The ELMo cache is cleaned afterwards when ELMo embeddings are in use.
        """
        # vocabulary is built from training and validation data together
        if x_valid is not None:
            x_all = np.concatenate((x_train, x_valid), axis=0)
        else:
            x_all = x_train
        if y_valid is not None:
            y_all = np.concatenate((y_train, y_valid), axis=0)
        else:
            y_all = y_train
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        self.p = prepare_preprocessor(x_all,
                                      y_all,
                                      features=features_all,
                                      model_config=self.model_config)

        self.model_config.char_vocab_size = len(self.p.vocab_char)
        self.model_config.case_vocab_size = len(self.p.vocab_case)

        # the trainer receives this (initially empty) list of fold models
        self.models = []
        fold_trainer = Trainer(self.model,
                               self.models,
                               self.embeddings,
                               self.model_config,
                               self.training_config,
                               checkpoint_path=self.log_dir,
                               preprocessor=self.p)

        fold_trainer.train_nfold(x_train,
                                 y_train,
                                 x_valid,
                                 y_valid,
                                 f_train=f_train,
                                 f_valid=f_valid,
                                 callbacks=callbacks)

        elmo_in_use = self.embeddings and self.embeddings.use_ELMo
        if elmo_in_use:
            self.embeddings.clean_ELMo_cache()

    def eval(self, x_test, y_test, features=None):
        if self.model_config.fold_number > 1:
            self.eval_nfold(x_test, y_test, features=features)
        else:
            self.eval_single(x_test, y_test, features=features)

    def eval_single(self, x_test, y_test, features=None):
        """Evaluate the single model on the test data and print the scores.

        Without a transformer layer, a data generator and a ``Scorer`` are
        used. With a transformer layer, the test data is tagged with a
        ``Tagger``, the predicted labels are aligned with the expected ones
        and a classification report is printed.

        The caller's ``y_test`` is never modified: when a sequence has to be
        padded for alignment, the padding is applied to a local copy.

        Raises:
            OSError: when no model is available.
        """
        if self.model is None:
            raise OSError('Could not find a model.')
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.transformer_name is None:
            # we can use a data generator for evaluation

            # Prepare test data(steps, generator)
            generator = self.model.get_generator()
            test_generator = generator(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = self.model
            scorer.on_epoch_end(epoch=-1)
            return

        # the architecture model uses a transformer layer
        # note that we could also use the above test_generator, but as an alternative here we check the
        # test/prediction alignment of tokens and the validity of the maximum sequence input length
        # wrt the length of the test sequences
        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        y_pred_pairs = tagger.tag(x_test,
                                  output_format=None,
                                  features=features)

        # keep only labels (second item of each predicted pair)
        y_pred = [[pair[1] for pair in result] for result in y_pred_pairs]

        # shallow copy: padded sequences replace entries of this list only,
        # so the gold labels passed by the caller stay untouched
        y_true = list(y_test)

        nb_alignment_issues = 0
        for i in range(len(y_true)):
            missing_in_gold = len(y_pred[i]) - len(y_true[i])
            if missing_in_gold == 0:
                continue

            nb_alignment_issues += 1
            # BERT tokenizer appears to introduce some additional tokens without ## prefix,
            # but we normally handled that well when predicting.
            # To be very conservative, the following ensure the number of tokens always
            # match, but it should never be used in practice.
            if missing_in_gold > 0:
                y_true[i] = list(y_true[i]) + ["O"] * missing_in_gold
            else:
                y_pred[i] = y_pred[i] + ["O"] * (-missing_in_gold)

        if nb_alignment_issues > 0:
            print("number of alignment issues with test set:",
                  nb_alignment_issues)
            print(
                "to solve them consider increasing the maximum sequence input length of the model and retrain"
            )

        report, report_as_map = classification_report(y_true,
                                                      y_pred,
                                                      digits=4)
        print(report)

    def eval_nfold(self, x_test, y_test, features=None):
        """Evaluate every fold model on a test set and print a summary.

        For each fold the model is scored with a ``Scorer`` fed by the model's
        own data generator. The method then prints the report of the worst and
        of the best fold (by f1), followed by the scores averaged over all
        folds. As a side effect the best fold model becomes ``self.model`` and
        ``self.model_config.fold_number`` is set to 1.

        Nothing happens when ``self.models`` is None.

        Args:
            x_test: test inputs, handed to the data generator.
            y_test: expected labels, handed to the data generator.
            features: optional additional features, handed to the generator.
        """
        if self.models is None:
            return

        nb_folds = self.model_config.fold_number
        separator_line = "----------------------------------------------------------------------"

        f1_sum = 0
        precision_sum = 0
        recall_sum = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        reports_as_map = []

        for fold in range(nb_folds):
            if self.model_config.transformer_name is not None:
                # the architecture model uses a transformer layer, it is large
                # and needs to be loaded from disk
                models_dir = 'data/models/sequenceLabelling/'
                fold_weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                    ".hdf5",
                    str(fold) + ".hdf5")
                self.model = get_model(
                    self.model_config,
                    self.p,
                    ntags=len(self.p.vocab_tag),
                    load_pretrained_weights=False,
                    local_path=os.path.join(models_dir,
                                            self.model_config.model_name))
                self.model.load(filepath=os.path.join(
                    models_dir, self.model_config.model_name,
                    fold_weight_file))
                fold_model = self.model
                transformer_preprocessor = self.model.transformer_preprocessor
            else:
                fold_model = self.models[fold]
                transformer_preprocessor = None

            # architecture and parameters are only displayed once
            if fold == 0:
                fold_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(fold) +
                  ' --------------------------------------')

            # the evaluation data are served by the model's own generator
            make_generator = fold_model.get_generator()
            test_generator = make_generator(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                bert_preprocessor=transformer_preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # the scorer computes its metrics when the epoch-end hook is called
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = fold_model
            scorer.on_epoch_end(epoch=-1)

            fold_f1 = scorer.f1
            fold_precision = scorer.precision
            fold_recall = scorer.recall
            reports.append(scorer.report)
            reports_as_map.append(scorer.report_as_map)

            if fold_f1 > best_f1:
                best_f1, best_index = fold_f1, fold
            if fold_f1 < worst_f1:
                worst_f1, worst_index = fold_f1, fold

            f1_sum += fold_f1
            precision_sum += fold_precision
            recall_sum += fold_recall

        # mean over the folds of the scores reported by each scorer
        fold_average_evaluation = {
            'labels': {},
            'micro': {
                'f1': f1_sum / nb_folds,
                'precision': precision_sum / nb_folds,
                'recall': recall_sum / nb_folds
            },
            'macro': {}
        }

        # field-level average over the n folds
        seen_labels = []
        for tag_name in sorted(self.p.vocab_tag):
            if tag_name in ('O', '<PAD>'):
                continue
            # strip the two-character position prefix to get the field name
            if tag_name.startswith(("B-", "S-", "I-", "E-")):
                tag_name = tag_name[2:]
            if tag_name in seen_labels:
                continue
            seen_labels.append(tag_name)

            p_total = 0
            r_total = 0
            f1_total = 0
            support_total = 0
            for fold_report in reports_as_map:
                per_label = fold_report['labels']
                if tag_name not in per_label:
                    # a fold without this label adds nothing to the sums, but
                    # the divisor below stays the total number of folds
                    continue
                label_scores = per_label[tag_name]
                p_total += label_scores["precision"]
                r_total += label_scores["recall"]
                f1_total += label_scores["f1"]
                support_total += label_scores["support"]

            mean_support = support_total / nb_folds
            # NOTE(review): for a float value this string looks like ".0" or
            # ".5", so it never seems to equal '0' and the floor presumably
            # always applies - confirm the intent
            fractional_part = str(mean_support - int(mean_support))[1:]
            if fractional_part != '0':
                mean_support = math.floor(mean_support)

            fold_average_evaluation['labels'][tag_name] = {
                'precision': p_total / nb_folds,
                'recall': r_total / nb_folds,
                'support': mean_support,
                'f1': f1_total / nb_folds
            }

        print(separator_line)
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])

        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        # from now on the wrapper holds a single model: the best fold
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is not None:
            models_dir = 'data/models/sequenceLabelling/'
            best_weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                ".hdf5",
                str(best_index) + ".hdf5")
            # saved config file must be updated to single fold
            self.model.load(filepath=os.path.join(
                models_dir, self.model_config.model_name, best_weight_file))
        else:
            self.model = self.models[best_index]

        print(separator_line)
        print("\nAverage over", str(int(nb_folds)), "folds")
        print(
            get_report(fold_average_evaluation,
                       digits=4,
                       include_avgs=['micro']))

    def tag(self, texts, output_format, features=None, batch_size=None):
        """Annotate a list of sentences with the current model.

        Args:
            texts: the sentences to annotate, passed as is to the tagger.
            output_format: format requested from the tagger; with ``'json'``
                the measured runtime is added to the result under the
                ``"runtime"`` key.
            features: optional additional features passed to the tagger.
            batch_size: when not None, replaces
                ``self.model_config.batch_size`` before tagging (the new value
                is kept on the config afterwards).

        Returns:
            The annotations produced by ``Tagger.tag``.

        Raises:
            OSError: if no model is available on this instance.
        """
        if batch_size is not None:
            self.model_config.batch_size = batch_size
            print("---")
            print("batch_size (prediction):", self.model_config.batch_size)
            print("---")

        if not self.model:
            raise OSError('Could not find a model.' + str(self.model))

        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)

        start_time = time.time()
        annotations = tagger.tag(texts, output_format, features=features)
        runtime = round(time.time() - start_time, 3)

        # only the JSON output has a slot for the runtime
        if output_format == 'json':
            annotations["runtime"] = runtime
        return annotations

    def tag_file(self, file_in, output_format, file_out, batch_size=None):
        """Annotate a text file holding one sentence per line.

        The input is consumed in chunks of ``batch_size * nb_workers`` lines so
        that huge files can be processed without loading them in memory. When
        the whole input fits in the first chunk, the JSON returned by the
        tagger is dumped as is (with a ``runtime`` entry added). Otherwise the
        JSON document is assembled by hand: a header with the ``software``,
        ``date`` and ``model`` attributes, the ``texts`` items of every chunk
        glued together, and a footer carrying the total runtime.

        Args:
            file_in: path of the input text file.
            output_format: format requested from the tagger; the stitching
                logic indexes the result as a JSON-like dict.
            file_out: path of the output file, or None to print on the
                standard output.
            batch_size: when not None, replaces
                ``self.model_config.batch_size`` before tagging.

        Raises:
            OSError: if no model is available on this instance.
        """
        if batch_size is not None:
            self.model_config.batch_size = batch_size
            print("---")
            print("batch_size (prediction):", self.model_config.batch_size)
            print("---")

        if not self.model:
            raise OSError('Could not find a model.')

        tagger = Tagger(
            self.model,
            self.model_config,
            self.embeddings,
            preprocessor=self.p,
            transformer_preprocessor=self.model.transformer_preprocessor)
        start_time = time.time()
        chunk_size = self.model_config.batch_size * self.nb_workers
        item_indent = '\n        '

        out = open(file_out, 'w') if file_out is not None else None

        def emit(fragment, end_line=False):
            # Send a piece of output to the file or to the standard output.
            # On the standard output, intermediate fragments are flushed
            # without newline while complete documents end with one.
            if out is not None:
                out.write(fragment)
            elif end_line:
                print(fragment)
            else:
                print(fragment, end='', flush=True)

        try:
            direct_dump = False
            first_batch = True
            with open(file_in, 'r') as f:
                texts = None
                # a chunk shorter than requested means the input is exhausted
                while texts is None or len(texts) == chunk_size:
                    texts = next_n_lines(f, chunk_size)
                    annotations = tagger.tag(texts, output_format)

                    # every item is preceded by a comma, except the very first
                    # item of the document
                    item_prefix = ',' + item_indent
                    if first_batch:
                        first_batch = False
                        if len(texts) < chunk_size:
                            # single chunk: output the JSON returned by the
                            # tagger without any modification
                            annotations['runtime'] = round(
                                time.time() - start_time, 3)
                            emit(json.dumps(annotations,
                                            sort_keys=False,
                                            indent=4,
                                            ensure_ascii=False),
                                 end_line=True)
                            direct_dump = True
                            break

                        # several chunks: write the general information
                        # attributes once, the items follow chunk after chunk
                        header = '{\n    "software": ' + json.dumps(
                            annotations["software"],
                            ensure_ascii=False) + ",\n"
                        header += '    "date": ' + json.dumps(
                            annotations["date"], ensure_ascii=False) + ",\n"
                        header += '    "model": ' + json.dumps(
                            annotations["model"], ensure_ascii=False) + ",\n"
                        header += '    "texts": ['
                        emit(header)
                        item_prefix = item_indent

                    for entry in annotations["texts"]:
                        entry_json = json.dumps(entry,
                                                sort_keys=False,
                                                indent=4,
                                                ensure_ascii=False)
                        # shift the item to the depth of the "texts" array
                        entry_json = entry_json.replace('\n', item_indent)
                        emit(item_prefix + entry_json)
                        item_prefix = ',' + item_indent

            runtime = round(time.time() - start_time, 3)
            if not direct_dump:
                footer = "\n    ],\n"
                footer += '    "runtime": ' + str(runtime)
                footer += "\n}\n"
                emit(footer, end_line=True)
        finally:
            # release the output file even when tagging or reading failed
            if out is not None:
                out.close()

    def save(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Save the model config, the preprocessor and the model weights.

        Everything is written in the subfolder ``<dir_path>/<model_name>``,
        which is created when missing. When the model has a transformer config
        or a transformer preprocessor, these are saved in the same subfolder.

        With several folds and no selected model, only the config and the
        preprocessor are written and an error message is printed.

        Args:
            dir_path: parent directory of the model subfolder.
            weight_file: file name used for the model weights.
        """
        # create subfolder for the model if not already exists
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, CONFIG_FILE_NAME))
        print('model config file saved')

        self.p.save(os.path.join(directory, PROCESSOR_FILE_NAME))
        print('preprocessor saved')

        if self.model is None and self.model_config.fold_number > 1:
            print(
                'Error: model not saved. Evaluation need to be called first to select the best fold model to be saved'
            )
            # nothing was written for the model: do not claim it was saved
            return

        self.model.save(os.path.join(directory, weight_file))

        # save pretrained transformer config if used in the model
        if self.model.transformer_config is not None:
            self.model.transformer_config.to_json_file(
                os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
            print('transformer config saved')

        if self.model.transformer_preprocessor is not None:
            self.model.transformer_preprocessor.tokenizer.save_pretrained(
                os.path.join(directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR))
            print('transformer tokenizer saved')

        print('model saved')

    def load(self,
             dir_path='data/models/sequenceLabelling/',
             weight_file=DEFAULT_WEIGHT_FILE_NAME):
        """Restore the config, embeddings, preprocessor and model from disk.

        The config is read from the subfolder named after the model name
        currently held by ``self.model_config``; all the other resources are
        then located with the model name of the freshly loaded config.

        Args:
            dir_path: parent directory of the model subfolder.
            weight_file: file name of the model weights to load.
        """
        config_path = os.path.join(dir_path, self.model_config.model_name,
                                   CONFIG_FILE_NAME)
        self.model_config = ModelConfig.load(config_path)

        embeddings_name = self.model_config.embeddings_name
        if embeddings_name is None:
            self.embeddings = None
            self.model_config.word_embedding_size = 0
        else:
            # load embeddings
            # Do not use cache in 'prediction/production' mode
            self.embeddings = Embeddings(embeddings_name,
                                         resource_registry=self.registry,
                                         use_ELMo=self.model_config.use_ELMo,
                                         use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        # from here on, paths follow the model name of the loaded config
        loaded_model_dir = os.path.join(dir_path,
                                        self.model_config.model_name)
        self.p = Preprocessor.load(
            os.path.join(loaded_model_dir, PROCESSOR_FILE_NAME))

        # the architecture is rebuilt without pretrained weights, the saved
        # weights are loaded just after
        self.model = get_model(self.model_config,
                               self.p,
                               ntags=len(self.p.vocab_tag),
                               load_pretrained_weights=False,
                               local_path=loaded_model_dir)

        weights_path = os.path.join(loaded_model_dir, weight_file)
        print("load weights from", weights_path)
        self.model.load(filepath=weights_path)
        self.model.print_summary()
Example #14
0
    def __init__(self,
                 model_name=None,
                 architecture="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 early_stop=True,
                 class_weights=None,
                 multiprocessing=True,
                 transformer_name: str=None):
        """Set up the model and training configurations of the wrapper.

        When ``model_name`` is None, a name is derived from the architecture,
        followed by the embeddings name and the transformer name when given.
        When ``transformer_name`` is given no static embeddings are loaded;
        otherwise the embeddings named ``embeddings_name`` (if any) are loaded
        and define the word embedding size.

        Args:
            list_classes: the classes to predict; None (the default) stands
                for an empty list, created anew for every instance.
            transformer_name: name of the transformer to use, None when the
                model has no transformer layer.

        The remaining arguments are forwarded unchanged to ``ModelConfig`` and
        ``TrainingConfig``.
        """
        # a fresh list per instance: a list default would be shared by all
        # the instances created without an explicit value
        if list_classes is None:
            list_classes = []

        if model_name is None:
            # add a dummy name based on the architecture
            model_name = architecture
            if embeddings_name is not None:
                model_name += "_" + embeddings_name
            if transformer_name is not None:
                model_name += "_" + transformer_name

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        self.embeddings = None

        # if transformer_name is None, no bert layer is present in the model
        self.transformer_name = None

        self.registry = load_resource_registry("delft/resources-registry.json")

        word_emb_size = 0
        if transformer_name is not None:
            # a transformer replaces the static embeddings
            self.transformer_name = transformer_name
            self.embeddings_name = None
            self.embeddings = None
        elif self.embeddings_name is not None:
            self.embeddings = Embeddings(self.embeddings_name,
                                         resource_registry=self.registry)
            word_emb_size = self.embeddings.embed_size

        # NOTE(review): the config receives the embeddings_name argument, not
        # self.embeddings_name which is reset above when a transformer is
        # used - confirm this is intended
        self.model_config = ModelConfig(model_name=model_name,
                                        architecture=architecture,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        transformer_name=self.transformer_name)

        self.training_config = TrainingConfig(batch_size=batch_size,
                                              optimizer=optimizer,
                                              learning_rate=learning_rate,
                                              lr_decay=lr_decay,
                                              clip_gradients=clip_gradients,
                                              max_epoch=max_epoch,
                                              patience=patience,
                                              use_roc_auc=use_roc_auc,
                                              early_stop=early_stop,
                                              class_weights=class_weights,
                                              multiprocessing=multiprocessing)
Example #15
0
class Classifier(object):

    config_file = 'config.json'
    weight_file = 'model_weights.hdf5'

    def __init__(self,
                 model_name="",
                 model_type="gru",
                 embeddings_name=None,
                 list_classes=None,
                 char_emb_size=25,
                 dropout=0.5,
                 recurrent_dropout=0.25,
                 use_char_feature=False,
                 batch_size=256,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=50,
                 patience=5,
                 log_dir=None,
                 maxlen=300,
                 fold_number=1,
                 use_roc_auc=True,
                 use_ELMo=False,
                 use_BERT=False,
                 embeddings=(),
                 class_weights=None):
        """Set up the model and training configurations of the classifier.

        When ``embeddings_name`` is given, the corresponding embeddings are
        loaded and define the word embedding size; otherwise
        ``self.embeddings`` is None and the word embedding size is 0.

        Args:
            list_classes: the classes to predict; None (the default) stands
                for an empty list, created anew for every instance.
            embeddings: not used by this constructor.

        The remaining arguments are forwarded unchanged to ``Embeddings``,
        ``ModelConfig`` and ``TrainingConfig``.
        """
        # a fresh list per instance: a list default would be shared by all
        # the instances created without an explicit value
        if list_classes is None:
            list_classes = []

        self.model = None
        self.models = None
        self.log_dir = log_dir
        self.embeddings_name = embeddings_name
        # always defined, so that the attribute exists without embeddings
        self.embeddings = None

        word_emb_size = 0
        if embeddings_name is not None:
            self.embeddings = Embeddings(embeddings_name,
                                         use_ELMo=use_ELMo,
                                         use_BERT=use_BERT)
            word_emb_size = self.embeddings.embed_size

        self.model_config = ModelConfig(model_name=model_name,
                                        model_type=model_type,
                                        embeddings_name=embeddings_name,
                                        list_classes=list_classes,
                                        char_emb_size=char_emb_size,
                                        word_emb_size=word_emb_size,
                                        dropout=dropout,
                                        recurrent_dropout=recurrent_dropout,
                                        use_char_feature=use_char_feature,
                                        maxlen=maxlen,
                                        fold_number=fold_number,
                                        batch_size=batch_size,
                                        use_ELMo=use_ELMo,
                                        use_BERT=use_BERT)

        self.training_config = TrainingConfig(batch_size,
                                              optimizer,
                                              learning_rate,
                                              lr_decay,
                                              clip_gradients,
                                              max_epoch,
                                              patience,
                                              use_roc_auc,
                                              class_weights=class_weights)

    def train(self, x_train, y_train, vocab_init=None):
        """Train a single model, holding out 10% of the data for validation.

        The trained model is stored in ``self.model``. The ELMo and BERT
        caches of the embeddings are cleaned afterwards when those embeddings
        are in use.

        Args:
            x_train: training inputs.
            y_train: training labels.
            vocab_init: not used by this method.
        """
        # create validation set in case we don't use k-folds
        x_fit, x_val, y_fit, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.1)

        embeddings = self.embeddings
        train_settings = self.training_config

        # options shared by the training and the validation generators
        generator_options = dict(
            batch_size=train_settings.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=embeddings)

        training_generator = DataGenerator(x_fit,
                                           y_fit,
                                           shuffle=True,
                                           **generator_options)
        # the validation labels are given separately to train_model below
        validation_generator = DataGenerator(x_val,
                                             None,
                                             shuffle=False,
                                             **generator_options)

        self.model = getModel(self.model_config, train_settings)
        # the second returned value (best roc auc) is not used here
        self.model, _best_roc_auc = train_model(
            self.model,
            self.model_config.list_classes,
            train_settings.batch_size,
            train_settings.max_epoch,
            train_settings.use_roc_auc,
            train_settings.class_weights,
            training_generator,
            validation_generator,
            y_val,
            use_ELMo=embeddings.use_ELMo,
            use_BERT=embeddings.use_BERT)

        if embeddings.use_ELMo:
            embeddings.clean_ELMo_cache()
        if embeddings.use_BERT:
            embeddings.clean_BERT_cache()

    def train_nfold(self, x_train, y_train, vocab_init=None):
        """Train one model per fold and keep the result in ``self.models``.

        The ELMo and BERT caches of the embeddings are cleaned afterwards when
        those embeddings are in use. ``vocab_init`` is not used.
        """
        embeddings = self.embeddings
        self.models = train_folds(x_train, y_train, self.model_config,
                                  self.training_config, embeddings)

        if embeddings.use_ELMo:
            embeddings.clean_ELMo_cache()
        if embeddings.use_BERT:
            embeddings.clean_BERT_cache()

    # classification
    def predict(self, texts, output_format='json'):
        if self.model_config.fold_number is 1:
            if self.model is not None:
                predict_generator = DataGenerator(
                    texts,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict(self.model,
                                 predict_generator,
                                 use_ELMo=self.embeddings.use_ELMo,
                                 use_BERT=self.embeddings.use_BERT)
            else:
                raise (OSError('Could not find a model.'))
        else:
            if self.models is not None:
                predict_generator = DataGenerator(
                    texts,
                    None,
                    batch_size=self.model_config.batch_size,
                    maxlen=self.model_config.maxlen,
                    list_classes=self.model_config.list_classes,
                    embeddings=self.embeddings,
                    shuffle=False)

                result = predict_folds(self.models,
                                       predict_generator,
                                       use_ELMo=self.embeddings.use_ELMo,
                                       use_BERT=self.embeddings.use_BERT)
            else:
                raise (OSError('Could not find nfolds models.'))
        if output_format is 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model_config.model_name,
                "classifications": []
            }
            i = 0
            for text in texts:
                classification = {"text": text}
                the_res = result[i]
                j = 0
                for cl in self.model_config.list_classes:
                    classification[cl] = float(the_res[j])
                    j += 1
                res["classifications"].append(classification)
                i += 1
            return res
        else:
            return result

    def eval(self, x_test, y_test):
        """Run the classifier(s) on a test set and print evaluation scores.

        Predictions come from ``self.model`` when the configuration declares a
        single fold, otherwise from the fold models in ``self.models``.
        Accuracy and F1 (both at a 0.5 decision threshold), log-loss and
        ROC AUC are printed per class and macro-averaged; when more than one
        class is configured, per-instance (micro) averages are printed too.

        Args:
            x_test: Test inputs; must expose ``shape[0]`` (number of
                instances).
            y_test: Gold labels. Indexed as ``y_test[:, j]`` / ``y_test[i, :]``
                when several classes are configured.

        Raises:
            OSError: If the model (single fold) or the fold models (n-folds)
                have not been built or loaded.
        """
        # Compare by value: ``is 1`` only works through CPython's small-int
        # cache and triggers a SyntaxWarning on Python >= 3.8.
        single_model = self.model_config.fold_number == 1
        if single_model:
            if self.model is None:
                raise OSError('Could not find a model.')
        elif self.models is None:
            raise OSError('Could not find nfolds models.')

        # No labels are handed to the generator: it is only used to predict.
        test_generator = DataGenerator(
            x_test,
            None,
            batch_size=self.model_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            shuffle=False)

        if single_model:
            result = predict(self.model,
                             test_generator,
                             use_ELMo=self.embeddings.use_ELMo,
                             use_BERT=self.embeddings.use_BERT)
        else:
            result = predict_folds(self.models,
                                   test_generator,
                                   use_ELMo=self.embeddings.use_ELMo,
                                   use_BERT=self.embeddings.use_BERT)

        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        classes = self.model_config.list_classes
        nb_classes = len(classes)
        multiclass = nb_classes != 1

        # Binarize the scores at the 0.5 threshold (score < 0.5 -> 0, else 1).
        result_binary = np.where(np.asarray(result) < 0.5, 0, 1)

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems
        if not multiclass:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)
            total_loss = log_loss(y_test, result)
            total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j, class_name in enumerate(classes):
                gold = y_test[:, j]
                accuracy = accuracy_score(gold, result_binary[:, j])
                # NOTE(review): presumably the scikit-learn metric; with
                # average='micro' on a binary column it would equal the
                # accuracy - confirm this is intended.
                f1 = f1_score(gold, result_binary[:, j], average='micro')
                loss = log_loss(gold, result[:, j])
                roc_auc = roc_auc_score(gold, result[:, j])

                total_accuracy += accuracy
                total_f1 += f1
                total_loss += loss
                total_roc_auc += roc_auc

                print("\nClass:", class_name)
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)

        total_accuracy /= nb_classes
        total_f1 /= nb_classes
        total_loss /= nb_classes
        total_roc_auc /= nb_classes

        if multiclass:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =", "{:10.4f}".format(total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

        # micro-average (average of scores for each instance)
        # makes sense only if we have more than 1 class, otherwise same as
        # macro-average
        if multiclass:
            nb_instances = result.shape[0]
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            # NOTE(review): the ranking/loss metrics are computed on a single
            # instance's label row here; verify they accept rows where every
            # gold label has the same value.
            for i in range(nb_instances):
                gold = y_test[i, :]
                total_accuracy += accuracy_score(gold, result_binary[i, :])
                total_f1 += f1_score(gold,
                                     result_binary[i, :],
                                     average='micro')
                total_loss += log_loss(gold, result[i, :])
                total_roc_auc += roc_auc_score(gold, result[i, :])

            total_accuracy /= nb_instances
            total_f1 /= nb_instances
            total_loss /= nb_instances
            total_roc_auc /= nb_instances

            print("\nMicro-average:")
            print("\taverage accuracy at 0.5 =",
                  "{:10.4f}".format(total_accuracy))
            print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
            print("\taverage log-loss =", "{:10.4f}".format(total_loss))
            print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))

    def save(self, dir_path='data/models/textClassification/'):
        """Write the model configuration and model(s) under ``dir_path``.

        Everything is stored in the sub-directory
        ``<dir_path>/<model_config.model_name>``, which is created if needed.
        With a single fold, ``self.model`` is saved as
        ``<model_type>.<weight_file>``; otherwise each fold model ``i`` of
        ``self.models`` is saved as ``<model_type>.model<i>_weights.hdf5``.
        When the model(s) have not been built, an error message is printed
        and only the configuration is written.

        Args:
            dir_path: Root directory holding one sub-directory per model name.
        """
        # create subfolder for the model if not already exists; exist_ok
        # avoids the race between an existence check and the creation
        directory = os.path.join(dir_path, self.model_config.model_name)
        os.makedirs(directory, exist_ok=True)

        self.model_config.save(os.path.join(directory, self.config_file))
        print('model config file saved')

        # Compare by value: ``is 1`` only works through CPython's small-int
        # cache and triggers a SyntaxWarning on Python >= 3.8.
        if self.model_config.fold_number == 1:
            if self.model is None:
                print('Error: model has not been built')
                return
            weights_name = self.model_config.model_type + "." + self.weight_file
            self.model.save(os.path.join(directory, weights_name))
            print('model saved')
            return

        if self.models is None:
            print('Error: nfolds models have not been built')
            return

        for i in range(self.model_config.fold_number):
            weights_name = (self.model_config.model_type +
                            ".model{0}_weights.hdf5".format(i))
            self.models[i].save(os.path.join(directory, weights_name))
        print('nfolds model saved')

    def load(self, dir_path='data/models/textClassification/'):
        """Restore the configuration, embeddings and model weights from disk.

        The configuration is read from
        ``<dir_path>/<model_config.model_name>/<config_file>`` and replaces
        ``self.model_config``. Embeddings are then re-created from that
        configuration and a model is built with ``getModel``. With a single
        fold the weights are loaded into ``self.model``; otherwise one model
        per fold is built, loaded from
        ``<model_type>.model<i>_weights.hdf5`` and collected in
        ``self.models``.

        Args:
            dir_path: Root directory holding one sub-directory per model name.
        """
        self.model_config = ModelConfig.load(
            os.path.join(dir_path, self.model_config.model_name,
                         self.config_file))

        # load embeddings
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_BERT=self.model_config.use_BERT)
        self.model_config.word_embedding_size = self.embeddings.embed_size

        # Resolved after the configuration reload on purpose: the weights are
        # looked up under the model name stored in the loaded configuration.
        model_dir = os.path.join(dir_path, self.model_config.model_name)
        model_type = self.model_config.model_type

        self.model = getModel(self.model_config, self.training_config)
        # Compare by value: ``is 1`` only works through CPython's small-int
        # cache and triggers a SyntaxWarning on Python >= 3.8.
        if self.model_config.fold_number == 1:
            self.model.load_weights(
                os.path.join(model_dir, model_type + "." + self.weight_file))
            return

        self.models = []
        for i in range(self.model_config.fold_number):
            fold_model = getModel(self.model_config, self.training_config)
            fold_model.load_weights(
                os.path.join(
                    model_dir,
                    model_type + ".model{0}_weights.hdf5".format(i)))
            self.models.append(fold_model)