コード例 #1
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
    def ignore_test_trainer_persist(self):
        """
        Train a pipeline, persist it and inspect the saved metadata.

        The persisted model directory is expected to hold a
        ``metadata.json`` file containing a ``trained_at`` entry.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")
        trainer = Trainer(config)

        pipeline = trainer.pipeline
        assert len(pipeline) > 0
        # the first pipeline component must have been instantiated
        assert pipeline[0] is not None

        # build the temporary training set, load it, then drop the file
        train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
        rm_tmp_file("tmp.json")

        trainer.train(train_data)
        model_dir = trainer.persist(
            config['path'], config['project'], config['fixed_model_name'])

        # read back the metadata stored inside the persisted model directory
        with io.open(os.path.join(model_dir, 'metadata.json')) as meta_file:
            metadata = json.load(meta_file)
        assert 'trained_at' in metadata

        # remove everything written below the configured model path
        shutil.rmtree(config['path'], ignore_errors=False)
コード例 #2
0
    def ignore_test_pipeline_flow(self):
        """
        Exercise the train -> persist -> load cycle of a pipeline.

        The freshly trained interpreter and the one loaded back from the
        persisted model must yield the same ``classifylabel`` name for the
        same input text.
        :return:
        """
        sample_text = "点连接拿红包啦"
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # the temporary training file is only needed while loading
        train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
        rm_tmp_file("tmp.json")

        interpreter = trainer.train(train_data)
        assert interpreter is not None
        trained_out = interpreter.parse(sample_text)

        # persist the trained model and load it back
        model_dir = trainer.persist(
            config['path'], config['project'], config['fixed_model_name'])
        interpreter_loaded = Interpreter.load(model_dir, config)
        loaded_out = interpreter_loaded.parse(sample_text)

        trained_name = trained_out.get("classifylabel").get("name")
        loaded_name = loaded_out.get("classifylabel").get("name")
        assert trained_name == loaded_name

        # best-effort removal of the persisted models
        shutil.rmtree(config['path'], ignore_errors=True)
コード例 #3
0
    def ignore_test_load_and_persist_without_train(self):
        """
        Persist an untrained pipeline and load it back.

        The loaded interpreter must expose a non-empty pipeline and return
        a result for every parsed text.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # the training set is created and loaded, but training is skipped
        tmp_path = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(tmp_path)
        rm_tmp_file("tmp.json")

        # persist straight away, then restore from the persisted directory
        model_dir = trainer.persist(
            config['path'], config['project'], config['fixed_model_name'])
        interpreter_loaded = Interpreter.load(model_dir, config)

        assert interpreter_loaded.pipeline
        for text in ("hello", "Hello today is Monday, again!"):
            assert interpreter_loaded.parse(text) is not None

        # remove the persisted models; errors are not suppressed here
        shutil.rmtree(config['path'], ignore_errors=False)
コード例 #4
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
    def ignore_test_load_and_persist_without_train(self):
        """
        Check that a pipeline can be saved and reloaded without training.

        After loading, the interpreter must have a pipeline and must be
        able to parse text.
        :return:
        """
        config_file = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(config_file)
        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # create and load the temporary train set, then delete the file;
        # the loaded data is intentionally not passed to trainer.train()
        train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
        rm_tmp_file("tmp.json")

        # save the untrained pipeline and read it back
        saved_to = trainer.persist(config['path'],
                                   config['project'],
                                   config['fixed_model_name'])
        interpreter_loaded = Interpreter.load(saved_to, config)

        assert interpreter_loaded.pipeline
        short_result = interpreter_loaded.parse("hello")
        assert short_result is not None
        long_result = interpreter_loaded.parse("Hello today is Monday, again!")
        assert long_result is not None

        # drop the temporary model directory
        shutil.rmtree(config['path'], ignore_errors=False)
コード例 #5
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
    def ignore_test_pipeline_flow(self):
        """
        Train a pipeline, then verify that persisting and loading it keeps
        the predicted ``classifylabel`` name unchanged.
        :return:
        """
        config_file = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(config_file)
        trainer = Trainer(config)
        assert len(trainer.pipeline) > 0

        # temporary train set: create, load, delete
        train_file = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(train_file)
        rm_tmp_file("tmp.json")

        interpreter = trainer.train(train_data)
        assert interpreter is not None
        before = interpreter.parse("点连接拿红包啦")

        # round-trip the trained model through disk
        saved_to = trainer.persist(config['path'],
                                   config['project'],
                                   config['fixed_model_name'])
        interpreter_loaded = Interpreter.load(saved_to, config)
        after = interpreter_loaded.parse("点连接拿红包啦")

        label_before = before.get("classifylabel").get("name")
        label_after = after.get("classifylabel").get("name")
        assert label_before == label_after

        # clean up persisted models, ignoring removal errors
        shutil.rmtree(config['path'], ignore_errors=True)
コード例 #6
0
    def ignore_test_trainer_persist(self):
        """
        Verify that persisting a trained pipeline writes its metadata.

        ``metadata.json`` inside the persisted directory must carry a
        ``trained_at`` key.
        :return:
        """
        config_file = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(config_file)
        trainer = Trainer(config)

        assert len(trainer.pipeline) > 0
        # the leading pipeline component has to exist
        assert trainer.pipeline[0] is not None

        # temporary train set: create, load, delete
        train_file = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(train_file)
        rm_tmp_file("tmp.json")

        trainer.train(train_data)
        saved_to = trainer.persist(
            config['path'], config['project'], config['fixed_model_name'])

        # parse the metadata file stored with the persisted model
        metadata_file = os.path.join(saved_to, 'metadata.json')
        with io.open(metadata_file) as handle:
            metadata = json.load(handle)
        assert 'trained_at' in metadata

        # delete the temporary model directory; failures are not ignored
        shutil.rmtree(config['path'], ignore_errors=False)
コード例 #7
0
    def teardown_class(cls):
        """Undo the state created by ``setup_class``.

        Deletes the temporary data file and the model directory named by
        the test configuration.
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        rm_tmp_file("test_data.json")
        # best-effort: removal errors are ignored
        shutil.rmtree(config['path'], ignore_errors=True)
コード例 #8
0
    def teardown_class(cls):
        """Class-level cleanup, the counterpart of ``setup_class``.

        Removes the temporary data file and the directory stored under the
        ``path`` key of the test configuration.
        """
        config_file = "tests/data/test_config.json"
        model_root = AnnotatorConfig(config_file)['path']

        rm_tmp_file("test_data.json")
        # errors while deleting the directory are deliberately ignored
        shutil.rmtree(model_root, ignore_errors=True)
コード例 #9
0
 def test_char2vec_standalone(self):
     """
     test char2vec_standalone training

     Runs the ``char2vec_standalone`` module through the shell on a
     temporary corpus and checks that the vector file was written to
     ``tests/data/test_vec.txt``.
     """
     create_tmp_test_textfile("spam_email_text_1000")
     os.system(
         "python -m chi_annotator.algo_factory.preprocess.char2vec_standalone -train tests/data/spam_email_text_1000 -output tests/data/test_vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3"
     )
     # os.path.isfile() returns a bool, so comparing it with None would
     # always pass; record the real result and assert on it instead
     vec_file_created = os.path.isfile("tests/data/test_vec.txt")
     # clean up before asserting so a failure does not leave tmp files behind
     rm_tmp_file("spam_email_text_1000")
     rm_tmp_file("test_vec.txt")
     assert vec_file_created
コード例 #10
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
 def ignore_test_load_local_data(self):
     """
     Load a temporary JSON training file and check the parsed examples.
     :return:
     """
     train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
     # the file is only needed while loading
     rm_tmp_file("tmp.json")

     assert train_data is not None
     examples = train_data.training_examples
     assert len(examples) == 1000
     # the first example must carry a "label" entry but no "text" entry
     first_data = examples[0].data
     assert "text" not in first_data
     assert "label" in first_data
コード例 #11
0
 def ignore_test_load_local_data(self):
     """
     Check that local JSON training data is loaded as expected:
     1000 examples, the first one keyed by "label" and not by "text".
     :return:
     """
     json_file = create_tmp_test_jsonfile("tmp.json")
     loaded = load_local_data(json_file)
     # tmp file is no longer needed once the data is loaded
     rm_tmp_file("tmp.json")

     assert loaded is not None
     assert len(loaded.training_examples) == 1000
     head = loaded.training_examples[0]
     assert "text" not in head.data
     assert "label" in head.data
コード例 #12
0
 def ingor_test_char2vec_standalone(self):
     """
     test char2vec_standalone training

     Runs the ``char2vec_standalone`` module through the shell, using
     corpus and output paths inside the ``data`` directory that sits next
     to this file's directory, and checks that the vector file was written.
     """
     abs_path = os.path.dirname(os.path.abspath(__file__))
     # os.path.join() drops every earlier component when a later one starts
     # with "/", so "/../data/..." silently discarded abs_path; join the
     # relative pieces instead
     data_dir = os.path.join(abs_path, "..", "data")
     corpus_path = os.path.join(data_dir, "spam_email_text_1000")
     vec_path = os.path.join(data_dir, "test_vec.txt")

     # NOTE(review): how the tmp-file helpers resolve their argument is not
     # visible here - confirm they accept these paths
     create_tmp_test_textfile(os.path.join(abs_path, "spam_email_text_1000"))
     os.system("python -m chi_annotator.algo_factory.preprocess.char2vec_standalone -train " +
               corpus_path +
               " -output " +
               vec_path +
               " -size 200 -sample 1e-4 -binary 0 -iter 3")
     # isfile() returns a bool; "is not None" would always be true
     vec_file_created = os.path.isfile(vec_path)
     # clean up before asserting so a failure does not leave tmp files behind
     rm_tmp_file(os.path.join(abs_path, "spam_email_text_1000"))
     rm_tmp_file(vec_path)
     assert vec_file_created
コード例 #13
0
    def ignore_test_train_model_empty_pipeline(self):
        """
        Training with an empty pipeline must raise ``ValueError``.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")
        # strip every component from the configured pipeline
        config['pipeline'] = []

        train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
        rm_tmp_file("tmp.json")

        # the error may come from building the trainer or from training
        with pytest.raises(ValueError):
            Trainer(config).train(train_data)
コード例 #14
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
    def ignore_test_train_model_empty_pipeline(self):
        """
        Expect a ``ValueError`` when the pipeline has no component.
        :return:
        """
        config_file = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(config_file)
        config['pipeline'] = []  # no components at all

        # temporary train set: create, load, delete
        json_file = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(json_file)
        rm_tmp_file("tmp.json")

        with pytest.raises(ValueError):
            empty_trainer = Trainer(config)
            empty_trainer.train(train_data)
コード例 #15
0
 def ingor_test_char2vec_standalone(self):
     """
     test char2vec_standalone training

     Invokes the ``char2vec_standalone`` module through the shell with
     corpus and output files located in the ``data`` directory beside this
     file's directory, then checks that the output vector file exists.
     """
     abs_path = os.path.dirname(os.path.abspath(__file__))
     tmp_corpus = os.path.join(abs_path, "spam_email_text_1000")
     # A component starting with "/" makes os.path.join() throw away
     # abs_path, so the old "/../data/..." arguments resolved from the
     # filesystem root. Build the path from relative pieces instead.
     train_file = os.path.join(abs_path, "..", "data",
                               "spam_email_text_1000")
     output_file = os.path.join(abs_path, "..", "data", "test_vec.txt")

     # NOTE(review): the tmp-file helpers are defined elsewhere; confirm
     # they accept these paths
     create_tmp_test_textfile(tmp_corpus)
     os.system(
         "python -m chi_annotator.algo_factory.preprocess.char2vec_standalone -train "
         + train_file +
         " -output " + output_file +
         " -size 200 -sample 1e-4 -binary 0 -iter 3")
     # os.path.isfile() yields True/False, never None, so assert the bool
     output_exists = os.path.isfile(output_file)
     # remove tmp files first so a failing assert leaves nothing behind
     rm_tmp_file(tmp_corpus)
     rm_tmp_file(output_file)
     assert output_exists
コード例 #16
0
    def ignore_test_handles_pipeline_with_non_existing_component(self):
        """
        An unknown component name in the pipeline must fail with a
        "Failed to find component" error.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")
        # add a component name that the test expects not to be found
        config['pipeline'].append("unknown_component")

        train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
        rm_tmp_file("tmp.json")

        with pytest.raises(Exception) as raised:
            Trainer(config).train(train_data)

        error_text = str(raised.value)
        assert "Failed to find component" in error_text
コード例 #17
0
ファイル: test_trainer.py プロジェクト: JohnnySorry/learnGit
    def ignore_test_handles_pipeline_with_non_existing_component(self):
        """
        Check the error reported for a pipeline entry that does not exist.
        :return:
        """
        config_file = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(config_file)
        pipeline_names = config['pipeline']
        pipeline_names.append("unknown_component")

        # temporary train set: create, load, delete
        json_file = create_tmp_test_jsonfile("tmp.json")
        train_data = load_local_data(json_file)
        rm_tmp_file("tmp.json")

        # creating the trainer or training it is expected to raise
        with pytest.raises(Exception) as failure:
            bad_trainer = Trainer(config)
            bad_trainer.train(train_data)
        assert "Failed to find component" in str(failure.value)
コード例 #18
0
    def test_pipeline_flow(self):
        """
        Build a trainer from the test config and train it on a temporary
        data set; training must return an interpreter.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config.json")
        trainer = Trainer(config)

        pipeline = trainer.pipeline
        # exactly one component is expected, and it must be instantiated
        assert len(pipeline) == 1
        assert pipeline[0] is not None

        # temporary train set: create, load, delete
        train_data = load_local_data(create_tmp_test_file("tmp.json"))
        rm_tmp_file("tmp.json")

        interpreter = trainer.train(train_data)
        assert interpreter is not None
コード例 #19
0
    def ignor_test_embedding(self):
        """
        Train, persist and incrementally re-train the embedding component.

        After the initial training and again after the second training
        pass, an ``embedding_extractor`` component must produce a
        ``sentence_embedding`` whose sum is non-zero for a sample message.
        """
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message
        from chi_annotator.task_center.config import AnnotatorConfig
        from chi_annotator.algo_factory.common import TrainingData
        from gensim.models.word2vec import LineSentence
        text_dir = create_tmp_test_textfile("spam_email_text_1000")

        # put the data into TrainingData: one Message per line with all
        # whitespace removed (this also drops the trailing newline)
        with open(text_dir, 'r') as f:
            res = [Message(re.sub(r'\s', '', line)) for line in f]
        res = TrainingData(res)

        cfg = AnnotatorConfig(
            filename="tests/data/test_config/test_config_embedding.json")
        cb = ComponentBuilder()

        # char_tokenize; not needed for the embedding training for now
        char_tokenize = cb.create_component("char_tokenizer", cfg)
        char_tokenize.train(res, cfg)

        # create the embedding component and train the model; the input
        # data is LineSentence(data_path)
        embedding = cb.create_component("embedding", cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # create sent_embedding and get the sentence vector from the model
        # produced by the embedding training
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

        # load the base model and train again on the corpus, i.e.
        # incremental learning on top of the base model
        embedding = embedding.load(model_metadata=cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # run the EmbeddingExtractor check against the new model produced
        # by the incremental learning
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

        rm_tmp_file("word2vec.model")
        rm_tmp_file("word2vec.model.vector")
        rm_tmp_file("spam_email_text_1000")
コード例 #20
0
    def ignor_test_embedding(self):
        """
        Exercise the embedding component end to end.

        Steps: train a char tokenizer, train and persist an embedding
        model, extract a sentence embedding, then reload the model, train
        it a second time and extract again. Both extractions must give a
        ``sentence_embedding`` with a non-zero sum.
        """
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message
        from chi_annotator.task_center.config import AnnotatorConfig
        from chi_annotator.algo_factory.common import TrainingData
        from gensim.models.word2vec import LineSentence
        text_dir = create_tmp_test_textfile("spam_email_text_1000")

        # wrap every corpus line in a Message; the regex removes all
        # whitespace, including the line's trailing newline
        with open(text_dir, 'r') as corpus:
            messages = [Message(re.sub(r'\s', '', line)) for line in corpus]
        res = TrainingData(messages)

        cfg = AnnotatorConfig(filename="tests/data/test_config/test_config_embedding.json")
        cb = ComponentBuilder()

        # char_tokenize; the embedding training does not use it for now
        char_tokenize = cb.create_component("char_tokenizer", cfg)
        char_tokenize.train(res, cfg)

        # build the embedding component and train it; the training input
        # is LineSentence(data_path)
        embedding = cb.create_component("embedding", cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # build sent_embedding and obtain the sentence vector from the
        # model that the embedding training produced
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

        # load the base model and continue training on the corpus
        # (incremental learning on top of the base model)
        embedding = embedding.load(model_metadata=cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # check EmbeddingExtractor against the model generated by the
        # incremental learning
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

        rm_tmp_file("word2vec.model")
        rm_tmp_file("word2vec.model.vector")
        rm_tmp_file("spam_email_text_1000")