Example no. 1
0
    def train_model(self,
                    local_model,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    max_epoch=50):
        """Train `local_model` on the given data and return it.

        With early stopping enabled, (x_valid, y_valid) feed a separate
        validation generator; otherwise any provided validation data is
        folded back into the training set.

        Args:
            local_model: the Keras model to train.
            x_train, y_train: training sequences and labels.
            x_valid, y_valid: optional validation sequences and labels.
            max_epoch: maximum number of training epochs.

        Returns:
            The trained model.
        """
        # todo: if valid set is None, create it as a random segment of the shuffled train set

        if self.training_config.early_stop:
            training_generator = DataGenerator(
                x_train,
                y_train,
                batch_size=self.training_config.batch_size,
                preprocessor=self.preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=True)

            validation_generator = DataGenerator(
                x_valid,
                y_valid,
                batch_size=self.training_config.batch_size,
                preprocessor=self.preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=False)

            # NOTE: 'eary_stopping' is the actual (misspelled) parameter
            # name of get_callbacks, defined elsewhere — do not "fix" it here
            callbacks = get_callbacks(log_dir=self.checkpoint_path,
                                      eary_stopping=True,
                                      valid=(validation_generator,
                                             self.preprocessor))
        else:
            # fold the validation data back into the training set; guard
            # against absent validation data (the defaults are None, and
            # np.concatenate would raise on a None operand)
            if x_valid is not None and y_valid is not None:
                x_train = np.concatenate((x_train, x_valid), axis=0)
                y_train = np.concatenate((y_train, y_valid), axis=0)
            training_generator = DataGenerator(
                x_train,
                y_train,
                batch_size=self.training_config.batch_size,
                preprocessor=self.preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=True)

            callbacks = get_callbacks(log_dir=self.checkpoint_path,
                                      eary_stopping=False)
        nb_workers = 6
        if self.embeddings.use_ELMo:
            # ELMo cannot be used with multiple workers (GPU memory limit);
            # 0 means generation runs in the main thread
            nb_workers = 0
            # dump token context independent data for train set, done once for the training

        local_model.fit_generator(generator=training_generator,
                                  epochs=max_epoch,
                                  use_multiprocessing=True,
                                  workers=nb_workers,
                                  callbacks=callbacks)

        return local_model
Example no. 2
0
    def tag(self, texts, output_format):
        """Tag each text and return the results in the requested format.

        Args:
            texts: list of raw strings to tokenize and tag.
            output_format: 'json' for a structured response; any other
                value yields a list of (token, tag) pair lists per text.

        Returns:
            A dict (json format) or a list of per-text (token, tag) lists.
        """
        assert isinstance(texts, list)

        # bug fix: 'is' compared string identity, which is
        # implementation-dependent for literals — use equality
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model.config.model_name,
                "texts": []
            }
        else:
            list_of_tags = []

        predict_generator = DataGenerator(
            texts,
            None,
            batch_size=self.model_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            tokenize=True,
            shuffle=False)

        nb_workers = 6
        multiprocessing = True
        # multiple workers will not work with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
        if self.embeddings.use_ELMo:
            # worker at 0 means the prediction will be executed in the main thread
            nb_workers = 0
            multiprocessing = False

        preds = self.model.predict_generator(
            generator=predict_generator,
            use_multiprocessing=multiprocessing,
            workers=nb_workers)

        for i in range(0, len(preds)):
            pred = [preds[i]]
            text = texts[i]
            tokens, offsets = tokenizeAndFilter(text)

            tags = self._get_tags(pred)
            prob = self._get_prob(pred)

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)

        if output_format == 'json':
            return res
        else:
            return list_of_tags
Example no. 3
0
    def eval_nfold(self, x_test, y_test):
        """Evaluate every fold model on the test data and report scores.

        Prints per-fold results, the macro-averaged f1/precision/recall,
        the worst and best fold reports, and keeps the best fold's model
        as `self.model`.

        Raises:
            OSError: if no fold models are available (consistent with
                eval_single).
        """
        if self.models is None:
            # previously this silently did nothing; fail loudly instead,
            # matching eval_single's behavior
            raise OSError('Could not find a model.')

        total_f1 = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        total_precision = 0
        total_recall = 0
        for i in range(0, self.model_config.fold_number):
            # fix: missing space before the trailing dashes in the banner
            print('\n------------------------ fold ' + str(i) +
                  ' --------------------------------------')

            # Prepare test data (steps, generator)
            test_generator = DataGenerator(
                x_test,
                y_test,
                batch_size=self.training_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                embeddings=self.embeddings,
                shuffle=False)

            # Build the evaluator and evaluate the fold model
            scorer = Scorer(test_generator, self.p, evaluation=True)
            scorer.model = self.models[i]
            scorer.on_epoch_end(epoch=-1)
            f1 = scorer.f1
            precision = scorer.precision
            recall = scorer.recall
            reports.append(scorer.report)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += precision
            total_recall += recall

        macro_f1 = total_f1 / self.model_config.fold_number
        macro_precision = total_precision / self.model_config.fold_number
        macro_recall = total_recall / self.model_config.fold_number

        print("\naverage over", self.model_config.fold_number, "folds")
        print("\tmacro f1 =", macro_f1)
        print("\tmacro precision =", macro_precision)
        print("\tmacro recall =", macro_recall, "\n")

        print("\n** Worst ** model scores - \n")
        print(reports[worst_index])

        # keep the best fold's model as the active model
        self.model = self.models[best_index]
        print("\n** Best ** model scores - \n")
        print(reports[best_index])
Example no. 4
0
    def tag(self, texts, output_format):
        """Tag each text and return the results in the requested format.

        Args:
            texts: list of raw strings to tokenize and tag.
            output_format: 'json' for a structured response; any other
                value yields a list of (token, tag) pair lists per text.

        Returns:
            A dict (json format) or a list of per-text (token, tag) lists.
        """
        assert isinstance(texts, list)

        # bug fix: 'is' compared string identity, which is
        # implementation-dependent for literals — use equality
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model.config.model_name,
                "texts": []
            }
        else:
            list_of_tags = []

        predict_generator = DataGenerator(
            texts,
            None,
            batch_size=self.model_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            tokenize=True,
            shuffle=False)

        nb_workers = 6
        multiprocessing = True
        # consistency fix: multiple workers / multiprocessing do not work
        # with ELMo due to the GPU memory limit (as in the sibling tag())
        if self.embeddings.use_ELMo:
            # worker at 0 means the prediction will be executed in the main thread
            nb_workers = 0
            multiprocessing = False
        preds = self.model.predict_generator(generator=predict_generator,
                                             use_multiprocessing=multiprocessing,
                                             workers=nb_workers)

        for i in range(0, len(preds)):
            pred = [preds[i]]
            text = texts[i]
            tokens, offsets = tokenizeAndFilter(text)

            tags = self._get_tags(pred)
            prob = self._get_prob(pred)

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)

        if output_format == 'json':
            return res
        else:
            return list_of_tags
Example no. 5
0
    def eval_single(self, x_test, y_test):
        """Evaluate the current model on the given test data.

        Builds a non-shuffled test generator, wires it into a Scorer and
        triggers a single evaluation pass.

        Raises:
            OSError: if no trained model is available.
        """
        if not self.model:
            raise OSError('Could not find a model.')

        # generator serving the test batches, in fixed order
        test_generator = DataGenerator(
            x_test,
            y_test,
            batch_size=self.training_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=False)

        # the Scorer callback performs the actual evaluation
        scorer = Scorer(test_generator, self.p, evaluation=True)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)
Example no. 6
0
    def tag(self, texts, output_format, features=None):
        """Tag each text batch-by-batch and return the requested format.

        Args:
            texts: list of raw strings (tokenized here) or of token lists
                (already tokenized).
            output_format: 'json' for a structured response; any other
                value yields a list of (token, tag) pair lists per text.
            features: optional extra features forwarded to the generator.

        Returns:
            A dict (json format) or a list of per-text (token, tag) lists.
            Note: for already-tokenized input no offsets exist, so json
            output is not possible.
        """
        assert isinstance(texts, list)

        # bug fix: 'is' compared string identity, which is
        # implementation-dependent for literals — use equality
        if output_format == 'json':
            res = {
                "software": "DeLFT",
                "date": datetime.datetime.now().isoformat(),
                "model": self.model.config.model_name,
                "texts": []
            }
        else:
            list_of_tags = []

        # raw strings still need tokenization; token lists do not
        tokeniz = len(texts) > 0 and isinstance(texts[0], str)

        predict_generator = DataGenerator(
            texts,
            None,
            batch_size=self.model_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            tokenize=tokeniz,
            shuffle=False,
            # bug fix: the features argument was silently dropped before
            # (features=None was hard-coded)
            features=features)

        # prediction runs batch-by-batch in the main thread via
        # predict_on_batch, so no worker/multiprocessing setup is needed
        # (the ELMo GPU-memory workaround used elsewhere does not apply)
        steps = len(predict_generator)
        steps_done = 0
        for generator_output in predict_generator:
            if steps_done == steps:
                break
            preds = self.model.predict_on_batch(generator_output[0])

            for i in range(0, len(preds)):
                pred = [preds[i]]
                text = texts[i + (steps_done * self.model_config.batch_size)]

                if isinstance(text, str):
                    tokens, offsets = tokenizeAndFilter(text)
                else:
                    # it is a list of strings, i.e. already tokenized;
                    # offsets are not available, so json output is impossible
                    tokens = text
                    offsets = []

                tags = self._get_tags(pred)
                prob = self._get_prob(pred)

                if output_format == 'json':
                    piece = {}
                    piece["text"] = text
                    piece["entities"] = self._build_json_response(
                        tokens, tags, prob, offsets)["entities"]
                    res["texts"].append(piece)
                else:
                    the_tags = list(zip(tokens, tags))
                    list_of_tags.append(the_tags)
            steps_done += 1

        if output_format == 'json':
            return res
        else:
            return list_of_tags