def test_kmeans(self):
    """Train tokenizer -> sentence embedding -> ``cluster_sklearn`` on four
    messages and check that a processed message gets a cluster center."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message, TrainingData
    from chi_annotator.task_center.config import AnnotatorConfig

    # Start from the classify task config and point it at a vec.txt file
    # located relative to this test file, declared as "w2v" embeddings.
    settings = dict(config.CLASSIFY_TASK_CONFIG)
    here = os.path.dirname(os.path.abspath(__file__))
    settings["embedding_path"] = here + "/../data/test_embedding/vec.txt"
    settings["embedding_type"] = "w2v"
    cfg = AnnotatorConfig(settings)

    greeting_a = Message(u"你好,我是一个demo!!!!")
    greeting_b = Message(u"你好,你好,你好")
    upload_a = Message(u"如果发现有文件漏提或注释有误")
    upload_b = Message(u"增加一个需要上传的文件")
    corpus = TrainingData([upload_a, upload_b, greeting_a, greeting_b])

    # Build every stage first, then train and process them in pipeline order.
    builder = ComponentBuilder()
    stage_names = (
        "char_tokenizer",
        "sentence_embedding_extractor",
        "cluster_sklearn",
    )
    stages = [builder.create_component(name, cfg) for name in stage_names]
    for stage in stages:
        stage.train(corpus, cfg)

    # test
    probe = Message(u"增加一个需要上传的文件")
    for stage in stages:
        stage.process(probe)
    assert probe.get("cluster_center").get("center") is not None
def test_randomforest_classify(self):
    """Train ``RandomForest_Classifier`` behind the char tokenizer and the
    sentence-embedding extractor; a known "bad" sentence must be labelled
    ``"bad"``."""
    # Classify task config, pointed at a vec.txt file relative to this file.
    settings = dict(config.CLASSIFY_TASK_CONFIG)
    here = os.path.dirname(os.path.abspath(__file__))
    settings["embedding_path"] = here + "/../data/test_embedding/vec.txt"
    settings["embedding_type"] = "w2v"
    cfg = AnnotatorConfig(settings)

    examples = [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2]
    corpus = TrainingData(examples)

    builder = ComponentBuilder()
    tokenizer = builder.create_component("char_tokenizer", cfg)
    embedder = builder.create_component("sentence_embedding_extractor", cfg)
    forest = builder.create_component("RandomForest_Classifier", cfg)
    for stage in (tokenizer, embedder, forest):
        stage.train(corpus, cfg)

    # test
    probe = Message(u"增加一个需要上传的文件")
    for stage in (tokenizer, embedder, forest):
        stage.process(probe)
    predicted = probe.get("classifylabel").get("name")
    assert predicted == "bad"
def test_char_tokenizer(self):
    """The ``char_tokenizer`` component can be created and produces at least
    one token for a sample message."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig

    sample = Message(u"你好,我是一个demo!!!!")
    builder = ComponentBuilder()
    tokenizer = builder.create_component(
        "char_tokenizer", AnnotatorConfig(config.CLASSIFY_TASK_CONFIG))
    assert tokenizer is not None

    tokenizer.process(sample)
    tokens = sample.get("tokens")
    assert len(tokens) > 0
def test_tokenizer_main():
    """Build the ``char_tokenizer`` component with a default config and print
    the tokens it produces; does nothing if the component is ``None``."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.config import AnnotatorConfig

    sample = Message("你好,我是一个demo!!!!")
    builder = ComponentBuilder()
    tokenizer = builder.create_component("char_tokenizer", AnnotatorConfig())
    if tokenizer is None:
        return
    tokenizer.process(sample)
    print(sample.get("tokens"))
def ignor_test_senten_embedding_extractor(self):
    """Tokenize a sample message, extract its sentence embedding and compare
    the sum of the embedding against a recorded reference value.

    NOTE(review): the ``ignor_`` prefix presumably keeps pytest from
    collecting this test -- confirm that this is intentional.
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig

    cfg = AnnotatorConfig()
    msg = Message("你好,我是一个demo!!!!")
    cb = ComponentBuilder()
    char_tokenize = cb.create_component("char_tokenizer", cfg)
    sent_embedding = cb.create_component("sentence_embedding_extractor", cfg)
    char_tokenize.process(msg)
    sent_embedding.process(msg, **{})

    # The reference sum is -7.30032945834. Compare the absolute difference:
    # without abs() any sum far below the reference would also pass.
    total = msg.get("sentence_embedding").sum()
    assert abs(total + 7.30032945834) < 1e-6
def ignor_test_embedding(self):
    """Train an embedding model on a temporary text file, extract a sentence
    embedding, then reload the persisted model, train it again and extract
    a second time. Both extracted embeddings must have a non-zero sum.

    NOTE(review): the ``ignor_`` prefix presumably keeps pytest from
    collecting this test -- confirm that this is intentional.
    NOTE(review): the temporary files are only removed when every assertion
    passes; a failure leaves them behind.
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig
    from chi_annotator.algo_factory.common import TrainingData
    from gensim.models.word2vec import LineSentence

    text_dir = create_tmp_test_textfile("spam_email_text_1000")

    # Put the data into TrainingData, one Message per line. The regex removes
    # every whitespace character (including the trailing newline), so no
    # separate strip() is needed; the raw string avoids the invalid "\s"
    # escape of a plain literal.
    with open(text_dir, 'r') as f:
        messages = [Message(re.sub(r'\s', '', line)) for line in f]
    train_data = TrainingData(messages)

    cfg = AnnotatorConfig(
        filename="tests/data/test_config/test_config_embedding.json")
    cb = ComponentBuilder()

    # char_tokenize: not needed for training the embedding for now.
    char_tokenize = cb.create_component("char_tokenizer", cfg)
    char_tokenize.train(train_data, cfg)

    # Create the embedding component and train the model; the input is
    # LineSentence(data_path).
    embedding = cb.create_component("embedding", cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # Create the sentence-embedding extractor and get the sentence vector
    # from the model the embedding component just trained.
    sent_embedding = cb.create_component("embedding_extractor", cfg)
    msg = Message("你好,我是一个demo!!!!")
    char_tokenize.process(msg)
    sent_embedding.sentence_process(msg, **{})
    assert msg.get("sentence_embedding").sum() != 0

    # Load the base model, add a new corpus and do incremental learning on
    # top of the base model.
    embedding = embedding.load(model_metadata=cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # Check the extractor against the new model produced by the incremental
    # learning step.
    sent_embedding = cb.create_component("embedding_extractor", cfg)
    msg = Message("你好,我是一个demo!!!!")
    char_tokenize.process(msg)
    sent_embedding.sentence_process(msg, **{})
    assert msg.get("sentence_embedding").sum() != 0

    rm_tmp_file("word2vec.model")
    rm_tmp_file("word2vec.model.vector")
    rm_tmp_file("spam_email_text_1000")
def test_words_jieba_tokenizer(self):
    """
    The ``tokenizer_jieba`` component can be created with a default config
    and produces at least one token for a sample message.
    :return:
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig

    sample = Message(u"你好,我是一个demo!!!!")
    builder = ComponentBuilder()
    cfg = AnnotatorConfig()
    tokenizer = builder.create_component("tokenizer_jieba", cfg)
    assert tokenizer is not None

    tokenizer.process(sample)
    tokens = sample.get("tokens")
    assert len(tokens) > 0
def ignor_test_words_jieba_tokenizer(self):
    """
    #TODO: jieba will add later
    The ``tokenizer_jieba`` component, built from the classify task config,
    produces at least one token for a sample message.
    :return:
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig

    sample = Message(u"你好,我是一个demo!!!!")
    builder = ComponentBuilder()
    tokenizer = builder.create_component(
        "tokenizer_jieba", AnnotatorConfig(config.CLASSIFY_TASK_CONFIG))
    assert tokenizer is not None

    tokenizer.process(sample)
    tokens = sample.get("tokens")
    assert len(tokens) > 0
def test_kmeans(self):
    """Run the tokenizer / sentence-embedding / ``cluster_sklearn`` pipeline
    end to end and require a cluster center on the processed message."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message, TrainingData
    from chi_annotator.task_center.config import AnnotatorConfig

    # Copy the classify task config and add the embedding file settings.
    overrides = dict(config.CLASSIFY_TASK_CONFIG)
    test_dir = os.path.dirname(os.path.abspath(__file__))
    overrides["embedding_path"] = test_dir + "/../data/test_embedding/vec.txt"
    overrides["embedding_type"] = "w2v"
    cfg = AnnotatorConfig(overrides)

    pos_a = Message(u"你好,我是一个demo!!!!")
    pos_b = Message(u"你好,你好,你好")
    neg_a = Message(u"如果发现有文件漏提或注释有误")
    neg_b = Message(u"增加一个需要上传的文件")
    samples = TrainingData([neg_a, neg_b, pos_a, pos_b])

    factory = ComponentBuilder()
    tokenizer = factory.create_component("char_tokenizer", cfg)
    embedder = factory.create_component("sentence_embedding_extractor", cfg)
    clusterer = factory.create_component("cluster_sklearn", cfg)
    pipeline = (tokenizer, embedder, clusterer)

    for component in pipeline:
        component.train(samples, cfg)

    # test
    query = Message(u"增加一个需要上传的文件")
    for component in pipeline:
        component.process(query)
    center = query.get("cluster_center").get("center")
    assert center is not None
def test_kmeans(self):
    """With a default ``AnnotatorConfig``, train the tokenizer, the
    sentence-embedding extractor and ``cluster_sklearn`` on four messages
    and check that a processed message is given a cluster center."""
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message, TrainingData
    from chi_annotator.task_center.config import AnnotatorConfig

    cfg = AnnotatorConfig()

    pos_a = Message(u"你好,我是一个demo!!!!")
    pos_b = Message(u"你好,你好,你好")
    neg_a = Message(u"如果发现有文件漏提或注释有误")
    neg_b = Message(u"增加一个需要上传的文件")
    samples = TrainingData([neg_a, neg_b, pos_a, pos_b])

    # Create all components before training any of them, then train and
    # process in pipeline order.
    factory = ComponentBuilder()
    pipeline = [
        factory.create_component(component_name, cfg)
        for component_name in (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "cluster_sklearn",
        )
    ]
    for component in pipeline:
        component.train(samples, cfg)

    # test
    query = Message(u"增加一个需要上传的文件")
    for component in pipeline:
        component.process(query)
    assert query.get("cluster_center").get("center") is not None
def test_online_training(self):
    """
    Train on the first 50 records of the configured data set, persist the
    model, predict the remaining records and check the low-confidence ones.
    :return:
    """
    cfg = AnnotatorConfig("tests/data/test_config.json")
    # the trainer has to exist before any data is loaded
    trainer = Trainer(cfg)

    # load the whole data set; in real use it would come from user labels
    with io.open(cfg["org_data"], encoding="utf-8-sig") as handle:
        payload = simplejson.loads(handle.read())
    validate_local_data(payload)
    records = payload.get("data_set", list())

    # pretend the user labelled the first 50 texts; the rest get predicted
    labelled_records = records[:50]
    pending_records = records[50:]

    # build training messages: the text is the message body, every other
    # field of the record travels as message data
    examples = []
    for record in labelled_records:
        extra = record.copy()
        extra.pop("text", None)
        examples.append(Message(record["text"], extra))

    # full train, then persist the model
    interpreter = trainer.train(TrainingData(examples))
    trainer.persist(cfg['path'], cfg['project'], cfg['fixed_model_name'])

    # predict every unlabelled text
    predictions = [
        interpreter.parse(record["text"]) for record in pending_records
    ]

    # each prediction is expected to look like
    # {
    #   'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
    #   'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
    #                             {'name': 'notspam', 'confidence': 0.42980562223735524}],
    #   'text': '我是一个垃圾邮件'
    # }
    threshold = cfg["confidence_threshold"]
    low_confidence = [
        prediction for prediction in predictions
        if prediction.get("classifylabel").get("confidence") < threshold
    ]
    # NOTE(review): the filter above and this assertion use the same
    # predicate, so the loop mainly confirms that every prediction exposes
    # classifylabel.confidence.
    for prediction in low_confidence:
        assert prediction.get("classifylabel").get("confidence") < threshold
def test_sgd_classify(self):
    """Train ``SGD_Classifier`` behind the char tokenizer and the
    sentence-embedding extractor with a default config; a known "bad"
    sentence must be labelled ``"bad"``."""
    cfg = AnnotatorConfig()
    corpus = TrainingData(
        [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2])

    builder = ComponentBuilder()
    stages = [
        builder.create_component(name, cfg)
        for name in (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "SGD_Classifier",
        )
    ]
    for stage in stages:
        stage.train(corpus, cfg)

    # test
    probe = Message(u"增加一个需要上传的文件")
    for stage in stages:
        stage.process(probe)
    assert probe.get("classifylabel").get("name") == "bad"
def _train_batch(self, batch_result):
    """Train and persist a model from one batch of labelled items.

    :param batch_result: iterable of items indexable by ``"text"`` and
        ``"label"``.
    :return: always ``True``.
    """
    # one Message per item: the text as body, the label as message data
    examples = [
        Message(item["text"], {"label": item["label"]})
        for item in batch_result
    ]

    trainer = Trainer(self.task_config)
    trainer.train(TrainingData(examples))

    # store the model under the save-path prefix given by the task config
    trainer.persist(self.task_config.get_save_path_prefix())
    return True
def test_randomforest_classify(self):
    """A random-forest classifier trained on two "bad" and two "good"
    messages must label a known "bad" sentence as ``"bad"``."""
    # Copy the classify task config and add the embedding file settings.
    overrides = dict(config.CLASSIFY_TASK_CONFIG)
    test_dir = os.path.dirname(os.path.abspath(__file__))
    overrides["embedding_path"] = test_dir + "/../data/test_embedding/vec.txt"
    overrides["embedding_type"] = "w2v"
    cfg = AnnotatorConfig(overrides)

    samples = TrainingData(
        [self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2])

    factory = ComponentBuilder()
    pipeline = [
        factory.create_component(component_name, cfg)
        for component_name in (
            "char_tokenizer",
            "sentence_embedding_extractor",
            "RandomForest_Classifier",
        )
    ]
    for component in pipeline:
        component.train(samples, cfg)

    # test
    query = Message(u"增加一个需要上传的文件")
    for component in pipeline:
        component.process(query)
    label = query.get("classifylabel").get("name")
    assert label == "bad"
def parse(self, text, time=None):
    # type: (Text, Any) -> Dict[Text, Any]
    """Run ``text`` through every pipeline component and return the result.

    :param text: the input text. A falsy value (``None`` or ``""``) skips
        the pipeline entirely.
    :param time: forwarded unchanged to ``Message`` as its ``time`` keyword.
    :return: a dict that starts from ``self.default_output_attributes()``.
        For falsy input it only gains ``"text": ""``; otherwise it is
        updated with the output properties of the processed message.
    """
    if not text:
        # Not every component can handle an empty string, so the pipeline is
        # bypassed. This default result does not carry the output attributes
        # that components would normally add.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)

    # Components run in pipeline order on the same message object.
    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(message.as_dict(only_output_properties=True))
    return output
def ignor_test_embedding(self):
    """Train an embedding model on a temporary text file, extract a sentence
    embedding, then reload the persisted model, train it again and extract
    a second time. Both extracted embeddings must have a non-zero sum.

    NOTE(review): the ``ignor_`` prefix presumably keeps pytest from
    collecting this test -- confirm that this is intentional.
    NOTE(review): the temporary files are only removed when every assertion
    passes; a failure leaves them behind.
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig
    from chi_annotator.algo_factory.common import TrainingData
    from gensim.models.word2vec import LineSentence

    text_dir = create_tmp_test_textfile("spam_email_text_1000")

    # Put the data into TrainingData, one Message per line. The regex removes
    # every whitespace character (including the trailing newline), so no
    # separate strip() is needed; the raw string avoids the invalid "\s"
    # escape of a plain literal.
    with open(text_dir, 'r') as f:
        messages = [Message(re.sub(r'\s', '', line)) for line in f]
    train_data = TrainingData(messages)

    cfg = AnnotatorConfig(
        filename="tests/data/test_config/test_config_embedding.json")
    cb = ComponentBuilder()

    # char_tokenize: not needed for training the embedding for now.
    char_tokenize = cb.create_component("char_tokenizer", cfg)
    char_tokenize.train(train_data, cfg)

    # Create the embedding component and train the model; the input is
    # LineSentence(data_path).
    embedding = cb.create_component("embedding", cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # Create the sentence-embedding extractor and get the sentence vector
    # from the model the embedding component just trained.
    sent_embedding = cb.create_component("embedding_extractor", cfg)
    msg = Message("你好,我是一个demo!!!!")
    char_tokenize.process(msg)
    sent_embedding.sentence_process(msg, **{})
    assert msg.get("sentence_embedding").sum() != 0

    # Load the base model, add a new corpus and do incremental learning on
    # top of the base model.
    embedding = embedding.load(model_metadata=cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # Check the extractor against the new model produced by the incremental
    # learning step.
    sent_embedding = cb.create_component("embedding_extractor", cfg)
    msg = Message("你好,我是一个demo!!!!")
    char_tokenize.process(msg)
    sent_embedding.sentence_process(msg, **{})
    assert msg.get("sentence_embedding").sum() != 0

    rm_tmp_file("word2vec.model")
    rm_tmp_file("word2vec.model.vector")
    rm_tmp_file("spam_email_text_1000")
def load_local_data(filename):
    # type: (Text) -> TrainingData
    """Read a JSON file and turn its ``"data_set"`` entries into
    ``TrainingData``.

    The file is decoded as ``utf-8-sig`` and passed to
    ``validate_local_data`` before use. Each entry's ``"text"`` becomes the
    message body; all remaining fields are passed as the message data.
    """
    with io.open(filename, encoding="utf-8-sig") as handle:
        payload = simplejson.loads(handle.read())
    validate_local_data(payload)

    messages = []
    for entry in payload.get("data_set", list()):
        extra = entry.copy()
        extra.pop("text", None)
        messages.append(Message(entry["text"], extra))
    return TrainingData(messages)
def train(self, data_set):
    """
    Train on a labelled data set, keep the resulting interpreter on
    ``self.interpreter`` and persist the model.
    :param data_set: format as [{"id": 1, "text": "我是测试", "label": "spam"}, .....]
    :return:
    """
    settings = self.config

    # the text is the message body; every other field travels as data
    messages = []
    for record in data_set:
        extra = record.copy()
        extra.pop("text", None)
        messages.append(Message(record["text"], extra))

    self.interpreter = self.trainer.train(TrainingData(messages))

    # overwrite save model TODO
    self.trainer.persist(
        settings['path'], settings['project'], settings['fixed_model_name'])
class TestClassify(object):
    """Smoke tests for the classifier components.

    Every test trains one classifier behind the char tokenizer and the
    sentence-embedding extractor, using a default ``AnnotatorConfig``, and
    requires that a known "bad" sentence is labelled ``"bad"``.

    NOTE(review): the example messages are class attributes shared by all
    tests; presumably training annotates them in place -- confirm that reuse
    across tests is safe.
    """

    pos_msg1 = Message(u"你好,我是一个demo!!!!", {"label": "good"})
    pos_msg2 = Message(u"你好,你好,你好", {"label": "good"})
    pos_msg3 = Message(u"好的呀,不错", {"label": "good"})
    neg_msg1 = Message(u"如果发现有文件漏提或注释有误", {"label": "bad"})
    neg_msg2 = Message(u"增加一个需要上传的文件", {"label": "bad"})
    neg_msg3 = Message(u"有一个上传的文件", {"label": "bad"})

    def _predicted_label(self, classifier_name, examples=None):
        """Train ``classifier_name`` behind the shared pipeline and return
        the label name predicted for a fixed probe sentence.

        :param classifier_name: component name handed to
            ``ComponentBuilder.create_component``.
        :param examples: messages to train on; defaults to two "bad" and
            two "good" messages.
        :return: the ``name`` of the ``classifylabel`` set on the probe.
        """
        if examples is None:
            examples = [
                self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2
            ]
        cfg = AnnotatorConfig()
        train_data = TrainingData(examples)

        # Create every component first, then train and process in order.
        builder = ComponentBuilder()
        component_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            classifier_name,
        )
        pipeline = [
            builder.create_component(name, cfg) for name in component_names
        ]
        for component in pipeline:
            component.train(train_data, cfg)

        probe = Message(u"增加一个需要上传的文件")
        for component in pipeline:
            component.process(probe)
        return probe.get("classifylabel").get("name")

    def test_svm_classify(self):
        assert self._predicted_label("SVM_classifier") == "bad"

    def test_sgd_classify(self):
        assert self._predicted_label("SGD_Classifier") == "bad"

    def test_knn_classify(self):
        # KNN is trained on all six examples rather than the default four.
        examples = [
            self.neg_msg1, self.neg_msg2, self.neg_msg3,
            self.pos_msg1, self.pos_msg2, self.pos_msg3
        ]
        assert self._predicted_label("Knn_Classifier", examples) == "bad"

    def test_randomforest_classify(self):
        assert self._predicted_label("RandomForest_Classifier") == "bad"

    def test_adaboost_classify(self):
        assert self._predicted_label("AdaBoost_Classifier") == "bad"
class TestClassify(object):
    """Smoke tests for the classifier components.

    Every test trains one classifier behind the char tokenizer and the
    sentence-embedding extractor, using the classify task config extended
    with embedding file settings, and requires that a known "bad" sentence
    is labelled ``"bad"``.

    NOTE(review): the example messages are class attributes shared by all
    tests; presumably training annotates them in place -- confirm that reuse
    across tests is safe.
    """

    pos_msg1 = Message(u"你好,我是一个demo!!!!", {"label": "good"})
    pos_msg2 = Message(u"你好,你好,你好", {"label": "good"})
    pos_msg3 = Message(u"好的呀,不错", {"label": "good"})
    neg_msg1 = Message(u"如果发现有文件漏提或注释有误", {"label": "bad"})
    neg_msg2 = Message(u"增加一个需要上传的文件", {"label": "bad"})
    neg_msg3 = Message(u"有一个上传的文件", {"label": "bad"})

    @staticmethod
    def _build_config():
        """Return an ``AnnotatorConfig`` made from a copy of the classify
        task config plus the embedding path and type used by these tests."""
        task_config = dict(config.CLASSIFY_TASK_CONFIG)
        dir_name = os.path.dirname(os.path.abspath(__file__))
        # vec.txt is addressed relative to this test file.
        task_config["embedding_path"] = (
            dir_name + "/../data/test_embedding/vec.txt")
        task_config["embedding_type"] = "w2v"
        return AnnotatorConfig(task_config)

    def _predicted_label(self, classifier_name, examples=None):
        """Train ``classifier_name`` behind the shared pipeline and return
        the label name predicted for a fixed probe sentence.

        :param classifier_name: component name handed to
            ``ComponentBuilder.create_component``.
        :param examples: messages to train on; defaults to two "bad" and
            two "good" messages.
        :return: the ``name`` of the ``classifylabel`` set on the probe.
        """
        if examples is None:
            examples = [
                self.neg_msg1, self.neg_msg2, self.pos_msg1, self.pos_msg2
            ]
        cfg = self._build_config()
        train_data = TrainingData(examples)

        # Create every component first, then train and process in order.
        builder = ComponentBuilder()
        component_names = (
            "char_tokenizer",
            "sentence_embedding_extractor",
            classifier_name,
        )
        pipeline = [
            builder.create_component(name, cfg) for name in component_names
        ]
        for component in pipeline:
            component.train(train_data, cfg)

        probe = Message(u"增加一个需要上传的文件")
        for component in pipeline:
            component.process(probe)
        return probe.get("classifylabel").get("name")

    def test_svm_classify(self):
        assert self._predicted_label("SVM_classifier") == "bad"

    def test_sgd_classify(self):
        assert self._predicted_label("SGD_Classifier") == "bad"

    def test_knn_classify(self):
        # KNN is trained on all six examples rather than the default four.
        examples = [
            self.neg_msg1, self.neg_msg2, self.neg_msg3,
            self.pos_msg1, self.pos_msg2, self.pos_msg3
        ]
        assert self._predicted_label("Knn_Classifier", examples) == "bad"

    def test_randomforest_classify(self):
        assert self._predicted_label("RandomForest_Classifier") == "bad"

    def test_adaboost_classify(self):
        assert self._predicted_label("AdaBoost_Classifier") == "bad"