Ejemplo n.º 1
0
    def test_kmeans(self):
        """Fit tokenizer, sentence embedding and clustering on four messages,
        then check that a processed message is given a cluster center."""
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message, TrainingData
        from chi_annotator.task_center.config import AnnotatorConfig

        # Work on a copy so the embedding overrides do not touch the shared
        # task configuration.
        overrides = dict(config.CLASSIFY_TASK_CONFIG)
        here = os.path.dirname(os.path.abspath(__file__))
        overrides["embedding_path"] = here + "/../data/test_embedding/vec.txt"
        overrides["embedding_type"] = "w2v"
        cfg = AnnotatorConfig(overrides)

        greeting_a = Message(u"你好,我是一个demo!!!!")
        greeting_b = Message(u"你好,你好,你好")
        upload_a = Message(u"如果发现有文件漏提或注释有误")
        upload_b = Message(u"增加一个需要上传的文件")
        samples = TrainingData([upload_a, upload_b, greeting_a, greeting_b])

        builder = ComponentBuilder()
        stage_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "cluster_sklearn",
        )
        # All stages are created first, then trained in pipeline order.
        stages = [builder.create_component(name, cfg) for name in stage_names]
        for stage in stages:
            stage.train(samples, cfg)

        # Run an unseen message through the same stages.
        probe = Message(u"增加一个需要上传的文件")
        for stage in stages:
            stage.process(probe)
        assert probe.get("cluster_center").get("center") is not None
Ejemplo n.º 2
0
    def test_randomforest_classify(self):
        """Train a random-forest classifier on two "bad" and two "good"
        messages and expect the probe text to be labeled "bad"."""
        # Copy the shared task config before pointing it at the test
        # word2vec vectors.
        overrides = dict(config.CLASSIFY_TASK_CONFIG)
        here = os.path.dirname(os.path.abspath(__file__))
        overrides["embedding_path"] = here + "/../data/test_embedding/vec.txt"
        overrides["embedding_type"] = "w2v"
        cfg = AnnotatorConfig(overrides)

        samples = TrainingData(
            [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2])

        builder = ComponentBuilder()
        stage_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "RandomForest_Classifier",
        )
        stages = [builder.create_component(name, cfg) for name in stage_names]
        for stage in stages:
            stage.train(samples, cfg)

        # Classify a fresh message with the trained stages.
        probe = Message(u"增加一个需要上传的文件")
        for stage in stages:
            stage.process(probe)

        predicted = probe.get("classifylabel")
        assert predicted.get("name") == "bad"
Ejemplo n.º 3
0
 def test_char_tokenizer(self):
     """Create the char tokenizer and check it yields at least one token."""
     from chi_annotator.algo_factory.components import ComponentBuilder
     from chi_annotator.algo_factory.common import Message
     from chi_annotator.task_center.config import AnnotatorConfig

     sample = Message(u"你好,我是一个demo!!!!")
     builder = ComponentBuilder()
     cfg = AnnotatorConfig(config.CLASSIFY_TASK_CONFIG)

     tokenizer = builder.create_component("char_tokenizer", cfg)
     assert tokenizer is not None

     tokenizer.process(sample)
     tokens = sample.get("tokens")
     assert len(tokens) > 0
Ejemplo n.º 4
0
def test_tokenizer_main():
    """Build a char tokenizer from a default config and print the tokens it
    produces for a sample message; do nothing if no component is returned."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.config import AnnotatorConfig

    sample = Message("你好,我是一个demo!!!!")
    builder = ComponentBuilder()
    cfg = AnnotatorConfig()
    tokenizer = builder.create_component("char_tokenizer", cfg)

    # Nothing to demonstrate when the builder returns no component.
    if tokenizer is None:
        return

    tokenizer.process(sample)
    print(sample.get("tokens"))
Ejemplo n.º 5
0
 def ignor_test_senten_embedding_extractor(self):
     """Check the sentence embedding of a fixed message against a reference sum.

     The message is tokenized per character, embedded, and the sum of the
     resulting "sentence_embedding" vector is compared with a recorded value.
     The name lacks the ``test_`` prefix, so pytest's default discovery is
     not expected to collect it.
     """
     from chi_annotator.algo_factory.components import ComponentBuilder
     from chi_annotator.algo_factory.common import Message
     from chi_annotator.task_center.config import AnnotatorConfig
     cfg = AnnotatorConfig()
     msg = Message("你好,我是一个demo!!!!")
     cb = ComponentBuilder()
     char_tokenize = cb.create_component("char_tokenizer", cfg)
     sent_embedding = cb.create_component("sentence_embedding_extractor", cfg)
     char_tokenize.process(msg)
     sent_embedding.process(msg, **{})
     # Compare with abs(): without it, any sum smaller than the reference
     # value would satisfy the inequality and the check would pass wrongly.
     expected_sum = -7.30032945834
     assert abs(msg.get("sentence_embedding").sum() - expected_sum) < 1e-6
Ejemplo n.º 6
0
 def ignor_test_senten_embedding_extractor(self):
     """Verify that a fixed message embeds to a vector with a known sum.

     Runs the char tokenizer and the sentence embedding extractor on one
     message, then requires the sum of "sentence_embedding" to lie within
     1e-6 of the recorded reference. The ``ignor_`` prefix keeps the method
     out of pytest's default ``test_*`` collection.
     """
     from chi_annotator.algo_factory.components import ComponentBuilder
     from chi_annotator.algo_factory.common import Message
     from chi_annotator.task_center.config import AnnotatorConfig
     cfg = AnnotatorConfig()
     msg = Message("你好,我是一个demo!!!!")
     cb = ComponentBuilder()
     char_tokenize = cb.create_component("char_tokenizer", cfg)
     sent_embedding = cb.create_component("sentence_embedding_extractor",
                                          cfg)
     char_tokenize.process(msg)
     sent_embedding.process(msg, **{})
     # Two-sided tolerance: the one-sided form accepted every sum that was
     # more negative than the reference value.
     reference_sum = -7.30032945834
     embedding_sum = msg.get("sentence_embedding").sum()
     assert abs(embedding_sum - reference_sum) < 1e-6
Ejemplo n.º 7
0
    def ignor_test_embedding(self):
        """Train, persist, reload and incrementally retrain the embedding model.

        Steps: build TrainingData from a temporary text file, train the
        "embedding" component on it, check that "embedding_extractor" yields
        a non-zero sentence vector, reload the persisted model, train again on
        the same corpus, re-check the extractor, and remove the temporary
        files. The name lacks the ``test_`` prefix, so pytest's default
        discovery is not expected to collect it.
        """
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message
        from chi_annotator.task_center.config import AnnotatorConfig
        from chi_annotator.algo_factory.common import TrainingData
        from gensim.models.word2vec import LineSentence
        text_dir = create_tmp_test_textfile("spam_email_text_1000")

        # Put the data into TrainingData: one Message per line. The regex
        # removes every whitespace character, trailing newline included, so
        # no separate strip is needed (the old strip result was discarded).
        with open(text_dir, 'r') as f:
            messages = [Message(re.sub(r'\s', '', line)) for line in f]
        res = TrainingData(messages)

        cfg = AnnotatorConfig(filename="tests/data/test_config/test_config_embedding.json")
        cb = ComponentBuilder()

        # char_tokenize; not needed for the embedding training itself for now
        char_tokenize = cb.create_component("char_tokenizer", cfg)
        char_tokenize.train(res, cfg)

        # Create the embedding component and train the model; the input is
        # LineSentence(data_path)
        embedding = cb.create_component("embedding", cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # Create sent_embedding and obtain the sentence vector from the model
        # produced by the embedding training above
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg, **{})
        assert msg.get("sentence_embedding").sum() != 0

        # Load the base model, feed it a new corpus, and continue training
        # incrementally on top of the base model
        embedding = embedding.load(model_metadata=cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # Check the EmbeddingExtractor against the model produced by the
        # incremental training
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg, **{})
        assert msg.get("sentence_embedding").sum() != 0

        rm_tmp_file("word2vec.model")
        rm_tmp_file("word2vec.model.vector")
        rm_tmp_file("spam_email_text_1000")
Ejemplo n.º 8
0
 def test_words_jieba_tokenizer(self):
     """
     Check that the "tokenizer_jieba" component can be created from a
     default config and produces at least one token for a sample message.
     :return:
     """
     from chi_annotator.algo_factory.components import ComponentBuilder
     from chi_annotator.algo_factory.common import Message
     from chi_annotator.task_center.config import AnnotatorConfig

     sample = Message(u"你好,我是一个demo!!!!")
     builder = ComponentBuilder()
     cfg = AnnotatorConfig()

     tokenizer = builder.create_component("tokenizer_jieba", cfg)
     assert tokenizer is not None

     tokenizer.process(sample)
     tokens = sample.get("tokens")
     assert len(tokens) > 0
Ejemplo n.º 9
0
 def ignor_test_words_jieba_tokenizer(self):
     """
     #TODO: jieba will add later
     Check that the "tokenizer_jieba" component, built from the classify
     task config, produces at least one token for a sample message.
     :return:
     """
     from chi_annotator.algo_factory.components import ComponentBuilder
     from chi_annotator.algo_factory.common import Message
     from chi_annotator.task_center.config import AnnotatorConfig

     sample = Message(u"你好,我是一个demo!!!!")
     builder = ComponentBuilder()
     cfg = AnnotatorConfig(config.CLASSIFY_TASK_CONFIG)

     tokenizer = builder.create_component("tokenizer_jieba", cfg)
     assert tokenizer is not None

     tokenizer.process(sample)
     tokens = sample.get("tokens")
     assert len(tokens) > 0
    def test_kmeans(self):
        """Cluster four unlabeled messages with the w2v test embedding and
        verify that a processed message is assigned a cluster center."""
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message, TrainingData
        from chi_annotator.task_center.config import AnnotatorConfig

        # Local copy of the classify task config with the embedding settings
        # replaced by the test vectors.
        settings = dict(config.CLASSIFY_TASK_CONFIG)
        test_dir = os.path.dirname(os.path.abspath(__file__))
        settings["embedding_path"] = (
            test_dir + "/../data/test_embedding/vec.txt")
        settings["embedding_type"] = "w2v"
        cfg = AnnotatorConfig(settings)

        first_pos = Message(u"你好,我是一个demo!!!!")
        second_pos = Message(u"你好,你好,你好")
        first_neg = Message(u"如果发现有文件漏提或注释有误")
        second_neg = Message(u"增加一个需要上传的文件")
        corpus = TrainingData([first_neg, second_neg, first_pos, second_pos])

        builder = ComponentBuilder()
        tokenizer = builder.create_component("char_tokenizer", cfg)
        embedder = builder.create_component(
            "sentence_embedding_extractor", cfg)
        clusterer = builder.create_component("cluster_sklearn", cfg)
        pipeline = (tokenizer, embedder, clusterer)

        for component in pipeline:
            component.train(corpus, cfg)

        # Push a new message through the trained pipeline.
        query = Message(u"增加一个需要上传的文件")
        for component in pipeline:
            component.process(query)

        center_info = query.get("cluster_center")
        assert center_info.get("center") is not None
Ejemplo n.º 11
0
    def test_kmeans(self):
        """Cluster four unlabeled messages under the default config and
        verify that a processed message is assigned a cluster center."""
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message, TrainingData
        from chi_annotator.task_center.config import AnnotatorConfig

        cfg = AnnotatorConfig()

        first_pos = Message(u"你好,我是一个demo!!!!")
        second_pos = Message(u"你好,你好,你好")
        first_neg = Message(u"如果发现有文件漏提或注释有误")
        second_neg = Message(u"增加一个需要上传的文件")
        corpus = TrainingData([first_neg, second_neg, first_pos, second_pos])

        builder = ComponentBuilder()
        tokenizer = builder.create_component("char_tokenizer", cfg)
        embedder = builder.create_component(
            "sentence_embedding_extractor", cfg)
        clusterer = builder.create_component("cluster_sklearn", cfg)
        pipeline = (tokenizer, embedder, clusterer)

        for component in pipeline:
            component.train(corpus, cfg)

        # Push a new message through the trained pipeline.
        query = Message(u"增加一个需要上传的文件")
        for component in pipeline:
            component.process(query)

        center_info = query.get("cluster_center")
        assert center_info.get("center") is not None
    def test_online_training(self):
        """
        Test online training: train on the first 50 examples of the data set,
        persist the model, predict every remaining example and pick out the
        predictions whose confidence is below the configured threshold.
        :return:
        """
        test_config = "tests/data/test_config.json"
        config = AnnotatorConfig(test_config)
        # init trainer first
        trainer = Trainer(config)

        # load all data for test, in actual data should get from user label
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            data = simplejson.loads(f.read())
        validate_local_data(data)

        data_set = data.get("data_set", list())

        # fake user labeled data: pretend the user has labeled the first 50
        faker_user_labeled_data = data_set[:50]
        # everything else is predicted and ranked
        unlabeled_data = data_set[50:]

        # now test online training
        examples = []
        for e in faker_user_labeled_data:
            # Separate name so the loaded payload in `data` is not clobbered.
            attributes = e.copy()
            attributes.pop("text", None)
            examples.append(Message(e["text"], attributes))

        new_labeled_data = TrainingData(examples)

        # full amount train and persist model
        interpreter = trainer.train(new_labeled_data)
        trainer.persist(config['path'], config['project'],
                        config['fixed_model_name'])

        # predict unlabeled dataset and ranking; the loop variable no longer
        # rebinds the list that is being iterated
        predicted_results = [
            interpreter.parse(item["text"]) for item in unlabeled_data
        ]

        # sort predict result
        # predicted result format as
        # {
        #   'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
        #   'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
        #                             {'name': 'notspam', 'confidence': 0.42980562223735524}],
        #   'text': '<input text>'
        # }
        confidence_threshold = config["confidence_threshold"]
        ranking_candidates = [
            result for result in predicted_results
            if result.get("classifylabel").get("confidence") < confidence_threshold
        ]
        # NOTE(review): the candidates were selected with this very predicate,
        # so the assertion below cannot fail; it only exercises the result
        # structure. Consider asserting something independent.
        for candidate in ranking_candidates:
            assert candidate.get("classifylabel").get(
                "confidence") < confidence_threshold
Ejemplo n.º 13
0
    def test_sgd_classify(self):
        """Train the SGD classifier on two "bad" and two "good" messages
        and expect the probe text to be labeled "bad"."""
        cfg = AnnotatorConfig()
        samples = TrainingData(
            [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2])

        builder = ComponentBuilder()
        stage_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "SGD_Classifier",
        )
        stages = [builder.create_component(name, cfg) for name in stage_names]
        for stage in stages:
            stage.train(samples, cfg)

        # Classify a fresh message with the trained stages.
        probe = Message(u"增加一个需要上传的文件")
        for stage in stages:
            stage.process(probe)

        predicted = probe.get("classifylabel")
        assert predicted.get("name") == "bad"
Ejemplo n.º 14
0
 def _train_batch(self, batch_result):
     """Train on one batch of labeled items and persist the trained model.

     :param batch_result: iterable of items indexed by "text" and "label"
     :return: always True
     """
     # Wrap every item in a Message that carries only its label.
     examples = [
         Message(item["text"], {"label": item["label"]})
         for item in batch_result
     ]
     train_data = TrainingData(examples)

     # A new trainer is built from the task config on every call.
     trainer = Trainer(self.task_config)
     trainer.train(train_data)

     # Persist under the save-path prefix given by the task config.
     save_prefix = self.task_config.get_save_path_prefix()
     trainer.persist(save_prefix)
     return True
    def test_randomforest_classify(self):
        """Random-forest classification with the w2v test embedding: the
        probe text must be labeled "bad"."""
        settings = dict(config.CLASSIFY_TASK_CONFIG)
        test_dir = os.path.dirname(os.path.abspath(__file__))
        settings["embedding_path"] = (
            test_dir + "/../data/test_embedding/vec.txt")
        settings["embedding_type"] = "w2v"
        cfg = AnnotatorConfig(settings)

        corpus = TrainingData(
            [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2])

        builder = ComponentBuilder()
        tokenizer = builder.create_component("char_tokenizer", cfg)
        embedder = builder.create_component(
            "sentence_embedding_extractor", cfg)
        forest = builder.create_component("RandomForest_Classifier", cfg)
        pipeline = (tokenizer, embedder, forest)

        for component in pipeline:
            component.train(corpus, cfg)

        # Run a new message through the trained pipeline.
        query = Message(u"增加一个需要上传的文件")
        for component in pipeline:
            component.process(query)

        label = query.get("classifylabel")
        assert label.get("name") == "bad"
Ejemplo n.º 16
0
    def parse(self, text, time=None):
        # type: (Text, Any) -> Dict[Text, Any]
        """Parse the input text, classify it and return pipeline result.

        :param text: text to run through the pipeline; a falsy value skips
            the pipeline entirely.
        :param time: passed unchanged to ``Message`` as its ``time`` argument.
        :return: the default output attributes, updated with the output
            properties of the processed message. For falsy ``text`` only the
            defaults plus ``"text": ""`` are returned.
        """

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one should
            # pass an empty string in the first place.
            output = self.default_output_attributes()
            output["text"] = ""
            return output

        message = Message(text, self.default_output_attributes(), time=time)

        # Each component sees the same message object and the shared context.
        for component in self.pipeline:
            component.process(message, **self.context)

        output = self.default_output_attributes()
        output.update(message.as_dict(only_output_properties=True))
        return output
Ejemplo n.º 17
0
    def parse(self, text, time=None):
        # type: (Text, Any) -> Dict[Text, Any]
        """Run ``text`` through every pipeline component and return the result.

        :param text: input text. When it is falsy no component is invoked.
        :param time: forwarded as the ``time`` keyword of ``Message``.
        :return: a dict that starts from ``default_output_attributes()`` and
            is updated with the message's output properties; for falsy
            ``text`` it is the defaults with ``"text"`` set to ``""``.
        """

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one should
            # pass an empty string in the first place.
            output = self.default_output_attributes()
            output["text"] = ""
            return output

        message = Message(text, self.default_output_attributes(), time=time)

        # Components process the message in pipeline order.
        for component in self.pipeline:
            component.process(message, **self.context)

        # Start from fresh defaults so attributes no component set are present.
        output = self.default_output_attributes()
        output.update(message.as_dict(only_output_properties=True))
        return output
Ejemplo n.º 18
0
    def ignor_test_embedding(self):
        """Exercise embedding training, persistence and incremental retraining.

        The corpus comes from a temporary text file. After each training
        round the "embedding_extractor" component must produce a sentence
        vector whose sum is non-zero. Temporary files are removed at the end.
        The ``ignor_`` prefix keeps the method out of pytest's default
        ``test_*`` collection.
        """
        from chi_annotator.algo_factory.components import ComponentBuilder
        from chi_annotator.algo_factory.common import Message
        from chi_annotator.task_center.config import AnnotatorConfig
        from chi_annotator.algo_factory.common import TrainingData
        from gensim.models.word2vec import LineSentence
        text_dir = create_tmp_test_textfile("spam_email_text_1000")

        # Put the data into TrainingData. Stripping all whitespace with the
        # regex already drops the trailing newline; the previous
        # `line.strip('\n')` threw its result away and has been removed.
        with open(text_dir, 'r') as f:
            messages = [Message(re.sub(r'\s', '', line)) for line in f]
        res = TrainingData(messages)

        cfg = AnnotatorConfig(
            filename="tests/data/test_config/test_config_embedding.json")
        cb = ComponentBuilder()

        # char_tokenize; the embedding training does not need it for now
        char_tokenize = cb.create_component("char_tokenizer", cfg)
        char_tokenize.train(res, cfg)

        # Create the embedding component and train the model on
        # LineSentence(data_path)
        embedding = cb.create_component("embedding", cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # Create sent_embedding and get the sentence vector from the model
        # that the embedding component just trained
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg, **{})
        assert msg.get("sentence_embedding").sum() != 0

        # Load the base model, add a new corpus and train incrementally on
        # top of the base model
        embedding = embedding.load(model_metadata=cfg)
        embedding.train(LineSentence(text_dir), cfg)
        embedding.persist(cfg.wv_model_path)

        # Test the EmbeddingExtractor on the model produced by the
        # incremental training
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg, **{})
        assert msg.get("sentence_embedding").sum() != 0

        rm_tmp_file("word2vec.model")
        rm_tmp_file("word2vec.model.vector")
        rm_tmp_file("spam_email_text_1000")
Ejemplo n.º 19
0
def load_local_data(filename):
    # type: (Text) -> TrainingData
    """Read a JSON file, validate it and build TrainingData from it.

    Every entry of the file's "data_set" list becomes one Message: the
    entry's "text" is the message text and the remaining keys are passed
    along as the message data. A missing "data_set" gives empty
    TrainingData.
    """
    # utf-8-sig so that a leading BOM, if present, is dropped on read.
    with io.open(filename, encoding="utf-8-sig") as handle:
        payload = simplejson.loads(handle.read())
    validate_local_data(payload)

    training_examples = []
    for entry in payload.get("data_set", list()):
        # Copy first so the loaded entry keeps its "text" key.
        extras = entry.copy()
        extras.pop("text", None)
        training_examples.append(Message(entry["text"], extras))

    return TrainingData(training_examples)
Ejemplo n.º 20
0
    def train(self, data_set):
        """
        Train on a labeled data set, keep the resulting interpreter on
        ``self.interpreter`` and persist the model.
        :param data_set: format as [{"id": 1, "text": "<text>", "label": "spam"}, .....]
        :return:
        """
        cfg = self.config

        # One Message per entry: "text" is the message text, every other key
        # is passed as message data.
        examples = []
        for entry in data_set:
            extras = entry.copy()
            extras.pop("text", None)
            examples.append(Message(entry["text"], extras))

        self.interpreter = self.trainer.train(TrainingData(examples))

        # overwrite save model TODO
        self.trainer.persist(
            cfg['path'], cfg['project'], cfg['fixed_model_name'])
Ejemplo n.º 21
0
class TestClassify(object):
    """Classifier smoke tests under the default AnnotatorConfig.

    Each test trains char tokenizer -> sentence embedding -> one classifier
    on the labeled messages below and expects the probe text to be labeled
    "bad".
    """

    # Labeled fixtures shared by all tests: three "good" and three "bad".
    pos_msg1 = Message(u"你好,我是一个demo!!!!", {"label": "good"})
    pos_msg2 = Message(u"你好,你好,你好", {"label": "good"})
    pos_msg3 = Message(u"好的呀,不错", {"label": "good"})
    neg_msg1 = Message(u"如果发现有文件漏提或注释有误", {"label": "bad"})
    neg_msg2 = Message(u"增加一个需要上传的文件", {"label": "bad"})
    neg_msg3 = Message(u"有一个上传的文件", {"label": "bad"})

    def _four_samples(self):
        """Return two "bad" then two "good" fixtures, in training order."""
        return [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2]

    def _predicted_label(self, classifier_name, samples):
        """Train the pipeline ending in ``classifier_name`` on ``samples``
        and return the label name predicted for the fixed probe text."""
        cfg = AnnotatorConfig()
        train_data = TrainingData(samples)
        builder = ComponentBuilder()
        stage_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            classifier_name,
        )
        # All stages are created first, then trained in pipeline order.
        stages = [builder.create_component(name, cfg) for name in stage_names]
        for stage in stages:
            stage.train(train_data, cfg)

        # test
        probe = Message(u"增加一个需要上传的文件")
        for stage in stages:
            stage.process(probe)
        return probe.get("classifylabel").get("name")

    def test_svm_classify(self):
        label = self._predicted_label("SVM_classifier", self._four_samples())
        assert label == "bad"

    def test_sgd_classify(self):
        label = self._predicted_label("SGD_Classifier", self._four_samples())
        assert label == "bad"

    def test_knn_classify(self):
        # KNN is the only test that trains on all six fixtures.
        samples = [
            self.neg_msg1, self.neg_msg2, self.neg_msg3, self.pos_msg1,
            self.pos_msg2, self.pos_msg3
        ]
        label = self._predicted_label("Knn_Classifier", samples)
        assert label == "bad"

    def test_randomforest_classify(self):
        label = self._predicted_label(
            "RandomForest_Classifier", self._four_samples())
        assert label == "bad"

    def test_adaboost_classify(self):
        label = self._predicted_label(
            "AdaBoost_Classifier", self._four_samples())
        assert label == "bad"
Ejemplo n.º 22
0
class TestClassify(object):
    """Classifier smoke tests using the w2v test embedding.

    Each test trains char tokenizer -> sentence embedding -> one classifier
    on the labeled messages below, with the classify task config pointed at
    the test vectors, and expects the probe text to be labeled "bad".
    """

    # Labeled fixtures shared by all tests: three "good" and three "bad".
    pos_msg1 = Message(u"你好,我是一个demo!!!!", {"label": "good"})
    pos_msg2 = Message(u"你好,你好,你好", {"label": "good"})
    pos_msg3 = Message(u"好的呀,不错", {"label": "good"})
    neg_msg1 = Message(u"如果发现有文件漏提或注释有误", {"label": "bad"})
    neg_msg2 = Message(u"增加一个需要上传的文件", {"label": "bad"})
    neg_msg3 = Message(u"有一个上传的文件", {"label": "bad"})

    def _w2v_config(self):
        """Build an AnnotatorConfig from a copy of the classify task config
        with the embedding settings replaced by the test word2vec vectors."""
        task_config = dict(config.CLASSIFY_TASK_CONFIG)
        dir_name = os.path.dirname(os.path.abspath(__file__))
        task_config["embedding_path"] = (
            dir_name + "/../data/test_embedding/vec.txt")
        task_config["embedding_type"] = "w2v"
        return AnnotatorConfig(task_config)

    def _four_samples(self):
        """Return two "bad" then two "good" fixtures, in training order."""
        return [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2]

    def _predicted_label(self, classifier_name, samples):
        """Train the pipeline ending in ``classifier_name`` on ``samples``
        and return the label name predicted for the fixed probe text."""
        cfg = self._w2v_config()
        train_data = TrainingData(samples)
        builder = ComponentBuilder()
        stage_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            classifier_name,
        )
        # All stages are created first, then trained in pipeline order.
        stages = [builder.create_component(name, cfg) for name in stage_names]
        for stage in stages:
            stage.train(train_data, cfg)

        # test
        probe = Message(u"增加一个需要上传的文件")
        for stage in stages:
            stage.process(probe)
        return probe.get("classifylabel").get("name")

    def test_svm_classify(self):
        label = self._predicted_label("SVM_classifier", self._four_samples())
        assert label == "bad"

    def test_sgd_classify(self):
        label = self._predicted_label("SGD_Classifier", self._four_samples())
        assert label == "bad"

    def test_knn_classify(self):
        # KNN is the only test that trains on all six fixtures.
        samples = [
            self.neg_msg1, self.neg_msg2, self.neg_msg3, self.pos_msg1,
            self.pos_msg2, self.pos_msg3
        ]
        label = self._predicted_label("Knn_Classifier", samples)
        assert label == "bad"

    def test_randomforest_classify(self):
        label = self._predicted_label(
            "RandomForest_Classifier", self._four_samples())
        assert label == "bad"

    def test_adaboost_classify(self):
        label = self._predicted_label(
            "AdaBoost_Classifier", self._four_samples())
        assert label == "bad"