def ignor_test_online_training(self):
        """
        test online training.

        Simulates the online loop: train on a small user-labeled slice,
        then predict the remaining texts and keep the low-confidence
        predictions as candidates for ranking/relabeling.
        """
        test_config = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(test_config)
        # init trainer first
        trainer = Trainer(config)

        # load all data for the test; in production this comes from user labels
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            raw = simplejson.loads(f.read())
        validate_local_data(raw)

        data_set = raw.get("data_set", list())

        # fake user-labeled data: pretend the user has labeled 50 texts
        faker_user_labeled_data = data_set[:50]
        # the remaining texts are to be predicted and ranked
        unlabeled_data = data_set[50:]

        # now test online training:
        # everything except "text" rides along as per-message metadata.
        # BUG FIX: the per-example copy used to clobber the outer `data`
        # variable holding the parsed json payload.
        examples = []
        for e in faker_user_labeled_data:
            meta = e.copy()
            if "text" in meta:
                del meta["text"]
            examples.append(Message(e["text"], meta))

        new_labeled_data = TrainingData(examples)

        # full amount train and persist model
        interpreter = trainer.train(new_labeled_data)
        trainer.persist(config['path'],
                        config['project'],
                        config['fixed_model_name'])

        # predict unlabeled dataset and rank.
        # BUG FIX: the loop variable used to shadow the list it iterates.
        predicted_results = []
        for item in unlabeled_data:
            predicted_results.append(interpreter.parse(item["text"]))

        # each predicted result is formatted as
        # {
        #   'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
        #   'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
        #                             {'name': 'notspam', 'confidence': 0.42980562223735524}],
        #   'text': '我是一个垃圾邮件'
        # }
        # keep only predictions below the confidence threshold for ranking
        confidence_threshold = config["confidence_threshold"]
        ranking_candidates = [
            result for result in predicted_results
            if result.get("classifylabel").get("confidence") < confidence_threshold
        ]
        for candidate in ranking_candidates:
            assert candidate.get("classifylabel").get("confidence") < confidence_threshold
    def ignore_test_train_with_empty_data(self):
        """
        test train with empty train data
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # an empty training set must still survive the full train/persist cycle
        empty_set = TrainingData([])
        trainer.train(empty_set)

        # persist the model and load it back
        model_dir = trainer.persist(config['path'],
                                    config['project'],
                                    config['fixed_model_name'])
        loaded = Interpreter.load(model_dir, config)

        assert loaded.pipeline
        for sentence in ("hello", "Hello today is Monday, again!"):
            assert loaded.parse(sentence) is not None

        # remove tmp models
        shutil.rmtree(config['path'], ignore_errors=False)
    def ignore_test_pipeline_flow(self):
        """
        test trainer's train func for pipeline
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # build a throwaway train set on disk, load it, then delete the file
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        interpreter = trainer.train(train_data)
        assert interpreter is not None
        before = interpreter.parse("点连接拿红包啦")

        # persist, reload, and check the prediction survives the round trip
        model_dir = trainer.persist(config['path'],
                                    config['project'],
                                    config['fixed_model_name'])
        reloaded = Interpreter.load(model_dir, config)
        after = reloaded.parse("点连接拿红包啦")
        assert before.get("classifylabel").get("name") == \
            after.get("classifylabel").get("name")

        # remove tmp models
        shutil.rmtree(config['path'], ignore_errors=True)
    def ignore_test_load_and_persist_without_train(self):
        """
        test save and load model without train
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # build a throwaway train set on disk, load it, then delete the file
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        # deliberately skip trainer.train(train_data): persist an untrained model
        model_dir = trainer.persist(config['path'],
                                    config['project'],
                                    config['fixed_model_name'])

        loaded = Interpreter.load(model_dir, config)
        assert loaded.pipeline
        for sentence in ("hello", "Hello today is Monday, again!"):
            assert loaded.parse(sentence) is not None

        # remove tmp models
        shutil.rmtree(config['path'], ignore_errors=False)
    def ignore_test_trainer_persist(self):
        """
        test pipeline persist, metadata will be saved
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0
        # the first component (char_tokenizer) must have been instantiated
        assert trainer.pipeline[0] is not None

        # build a throwaway train set on disk, load it, then delete the file
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        trainer.train(train_data)
        model_dir = trainer.persist(config['path'],
                                    config['project'],
                                    config['fixed_model_name'])

        # the persisted model dir must carry metadata.json with a timestamp
        with io.open(os.path.join(model_dir, 'metadata.json')) as f:
            metadata = json.load(f)
        assert 'trained_at' in metadata

        # rm tmp files and dirs
        shutil.rmtree(config['path'], ignore_errors=False)
    def test_load_and_persist_without_train(self):
        """
        test save and load model without train
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # build a throwaway train set on disk, load it, then delete the file
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        # deliberately skip trainer.train(train_data): persist an untrained model
        model_dir = trainer.persist(config['path'], config['project'],
                                    config['fixed_model_name'])

        loaded = Interpreter.load(model_dir, config)
        assert loaded.pipeline
        for sentence in ("hello", "Hello today is Monday, again!"):
            assert loaded.parse(sentence) is not None

        # remove tmp models
        shutil.rmtree(config['path'], ignore_errors=True)
    def test_trainer_persist(self):
        """
        test pipeline persist, metadata will be saved
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0
        # the first component (char_tokenizer) must have been instantiated
        assert trainer.pipeline[0] is not None

        # build a throwaway train set on disk, load it, then delete the file
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        trainer.train(train_data)
        model_dir = trainer.persist(config['path'], config['project'],
                                    config['fixed_model_name'])

        # the persisted model dir must carry metadata.json with a timestamp
        with io.open(os.path.join(model_dir, 'metadata.json')) as f:
            metadata = json.load(f)
        assert 'trained_at' in metadata

        # rm tmp files and dirs
        shutil.rmtree(config['path'], ignore_errors=True)
# Example #8 (score: 0)
def do_train(
    config,  # type: AnnotatorConfig
    component_builder=None  # type: Optional[ComponentBuilder]
):
    # type: (...) -> Tuple[Trainer, Interpreter, Text]
    """Loads the trainer and the data and runs the training of the model."""

    # Ensure the model we train can actually be saved afterwards.
    # WARN: a race remains if another subprocess trains a model with the
    # same name concurrently.
    trainer = Trainer(config, component_builder)
    interpreter = trainer.train(load_local_data(config['org_data']))
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    return trainer, interpreter, persisted_path
class ActiveLearner(object):
    """
    implement of active learning core, this class is mainly the wrapper of trainer and interpreter.
    you can use ActiveLearner as follow:
        active_leaner = ActiveLearner(config)
        data_to_train = get_data_from_user_label()
        active_leaner.train(data_to_train)

        data_to_rank = get_data_from_db()
        low_confidence_data = active_leaner.process_batch(data_to_rank)
    """
    def __init__(self, config):
        """
        init of ActiveLearner

        :param config: AnnotatorConfig-like mapping; train() reads the keys
            'path', 'project' and 'fixed_model_name'; 'batch_num' is
            optional (defaults to 20).
        """
        self.config = config
        self.trainer = Trainer(config)
        self.train_data = TrainingData([])
        self.new_data_count = 0
        self.batch_num = config.get("batch_num", 20)
        self.db = DBManager(config)
        # set on the first successful train(); None means "not trained yet"
        self.interpreter = None

    def train(self, data_set):
        """
        train on a user-labeled data set and persist the resulting model.

        :param data_set: format as [{"id": 1, "text": "我是测试", "label": "spam"}, .....]
        :return: None
        """
        config = self.config

        # every key except "text" is kept as metadata on the Message
        examples = []
        for e in data_set:
            meta = e.copy()
            if "text" in meta:
                del meta["text"]
            examples.append(Message(e["text"], meta))
        train_data = TrainingData(examples)

        self.interpreter = self.trainer.train(train_data)
        # overwrite save model TODO
        self.trainer.persist(config['path'], config['project'],
                             config['fixed_model_name'])

    def process_one(self, id):
        """
        predict one according id

        :param id: primary key of the row to fetch
        :return: the parse result dict, or None if no model is trained yet
        """
        # FIX: guard against an untrained model instead of raising
        # AttributeError on None — consistent with process_texts.
        if self.interpreter is None:
            logger.warning(
                "model has not been trained, nothing will be predicted.")
            return None
        data = self.db.get_row({"id": id})
        return self.interpreter.parse(data["text"])

    def process_batch(self, ids):
        """
        process batch text according ids

        :param ids: iterable of row ids
        :return: list of row dicts, each updated in place with its prediction
        """
        # FIX: guard against an untrained model instead of raising
        # AttributeError on None — consistent with process_texts.
        if self.interpreter is None:
            logger.warning(
                "model has not been trained, nothing will be predicted.")
            return []
        rows = self.db.get_row_by_ids(ids)
        predicted_results = []
        for row in rows:
            predict = self.interpreter.parse(row["text"])
            if predict:
                row.update(predict)
            predicted_results.append(row)
        return predicted_results

    def process_texts(self, texts):
        """
        process texts
        :param texts: format as [{"id": 1, "text": "我是测试"}, {"id": 2, "text": "我是测试2"}, ...]
        :return: format as [{'id':-, 'text':-, 'classifylabel':-, 'classifylabel_ranking':-}, ...]
        """
        if self.interpreter is None:
            logger.warning(
                "model has not been trained, nothing will be predicted.")
            return []
        predicted_results = []
        for unlabeled_data in texts:
            predict = self.interpreter.parse(unlabeled_data["text"])
            if predict:
                unlabeled_data.update(predict)
            predicted_results.append(unlabeled_data)
        return predicted_results
class ActiveLearner(object):
    """
    implement of active learning core, this class is mainly the wrapper of trainer and interpreter.
    you can use ActiveLearner as follow:
        active_leaner = ActiveLearner(config)
        data_to_train = get_data_from_user_label()
        active_leaner.train(data_to_train)

        data_to_rank = get_data_from_db()
        low_confidence_data = active_leaner.process_batch(data_to_rank)
    """

    def __init__(self, config):
        """
        init of ActiveLearner
        """
        self.config = config
        self.trainer = Trainer(config)
        self.train_data = TrainingData([])
        self.new_data_count = 0
        self.batch_num = config.get("batch_num", 20)
        self.db = DBManager(config)
        self.interpreter = None

    def train(self, data_set):
        """
        train data set
        :param data_set: format as [{"id": 1, "text": "我是测试", "label": "spam"}, .....]
        :return:
        """
        cfg = self.config

        # everything but "text" becomes metadata on the Message
        messages = []
        for record in data_set:
            extras = {k: v for k, v in record.items() if k != "text"}
            messages.append(Message(record["text"], extras))

        self.interpreter = self.trainer.train(TrainingData(messages))
        # overwrite save model TODO
        self.trainer.persist(cfg['path'],
                             cfg['project'],
                             cfg['fixed_model_name'])

    def process_one(self, id):
        """
        predict one according id
        :param id:
        :return:
        """
        row = self.db.get_row({"id": id})
        return self.interpreter.parse(row["text"])

    def process_batch(self, ids):
        """
        process batch text according ids
        :param ids:
        :return:
        """
        rows = self.db.get_row_by_ids(ids)
        ranked = []
        for row in rows:
            parsed = self.interpreter.parse(row["text"])
            if parsed:
                row.update(parsed)
            ranked.append(row)
        return ranked

    def process_texts(self, texts):
        """
        process texts
        :param texts: format as [{"id": 1, "text": "我是测试"}, {"id": 2, "text": "我是测试2"}, ...]
        :return: format as [{'id':-, 'text':-, 'classifylabel':-, 'classifylabel_ranking':-}, ...]
        """
        if self.interpreter is None:
            logger.warning("model has not been trained, nothing will be predicted.")
            return []
        ranked = []
        for item in texts:
            parsed = self.interpreter.parse(item["text"])
            if parsed:
                item.update(parsed)
            ranked.append(item)
        return ranked