コード例 #1
0
    def test_active_leaner_process_texts(self):
        """Train an ActiveLearner on a small labeled subset, then verify it
        produces one prediction per raw input text and that each prediction
        carries a classify label.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config.json")

        # Load the full dataset from disk; in real use these labels would
        # come from the user instead of a fixture file.
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            data = simplejson.loads(f.read())
        validate_local_data(data)

        # Pretend the user has already labeled the first 50 records.
        labeled_subset = data.get("data_set", list())[:50]

        # Raw texts the learner must classify.
        texts = [{"uuid": 1, "text": "我是测试"}, {"uuid": 2, "text": "我是测试2"}]

        learner = ActiveLearner(config)
        learner.train(labeled_subset)
        predictions = learner.process_texts(texts)

        assert len(predictions) == 2
        assert "classifylabel" in predictions[0]
コード例 #2
0
    def ignor_test_active_leaner_process_texts(self):
        """Disabled variant of the ActiveLearner smoke test (prefixed with
        ``ignor`` so pytest skips it): train on 50 labeled records and check
        that raw texts each get a classify-label prediction.
        :return:
        """
        config = AnnotatorConfig("tests/data/test_config/test_config.json")

        # In production the labeled data would arrive from the user; here we
        # read the whole fixture dataset and validate its schema first.
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            data = simplejson.loads(f.read())
        validate_local_data(data)

        dataset = data.get("data_set", list())
        # Simulate a user who has labeled the first 50 texts.
        user_labeled = dataset[:50]

        # Texts the learner is asked to predict on.
        texts = [{"uuid": 1, "text": "我是测试"}, {"uuid": 2, "text": "我是测试2"}]

        learner = ActiveLearner(config)
        learner.train(user_labeled)
        results = learner.process_texts(texts)

        assert len(results) == 2
        assert "classifylabel" in results[0]
コード例 #3
0
    def test_online_training(self):
        """End-to-end online-training round trip.

        Trains on 50 user-labeled records, persists the model, predicts the
        remaining texts, and asserts that every ranking candidate is below
        the configured confidence threshold.
        :return:
        """
        test_config = "tests/data/test_config.json"
        config = AnnotatorConfig(test_config)
        # init trainer first
        trainer = Trainer(config)

        # load all data for test, in actual data should get from user label
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            data = simplejson.loads(f.read())
        validate_local_data(data)

        data_set = data.get("data_set", list())

        # faker user labeled data, user has labeled 50 texts.
        faker_user_labeled_data = data_set[:50]
        # 950 text to predict and rank
        unlabeled_data = data_set[50:]

        # Build training examples: the Message takes the raw text as its
        # first argument and every remaining field (labels etc.) as metadata.
        examples = []
        for record in faker_user_labeled_data:
            metadata = record.copy()
            # dict.pop with a default replaces the "check then del" dance
            metadata.pop("text", None)
            examples.append(Message(record["text"], metadata))

        new_labeled_data = TrainingData(examples)

        # full amount train and persist model
        interpreter = trainer.train(new_labeled_data)
        trainer.persist(config['path'], config['project'],
                        config['fixed_model_name'])

        # predict unlabeled dataset and ranking.
        # FIX: the original wrote `for unlabeled_data in unlabeled_data:`,
        # rebinding the loop variable over the list being iterated; use a
        # distinct name so the list stays intact for any later use.
        predicted_results = []
        for item in unlabeled_data:
            predicted_results.append(interpreter.parse(item["text"]))

        # predicted result format as
        # {
        #   'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
        #   'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
        #                             {'name': 'notspam', 'confidence': 0.42980562223735524}],
        #   'text': '我是一个垃圾邮件'
        # }
        confidence_threshold = config["confidence_threshold"]
        # candidates for ranking are the low-confidence predictions
        ranking_candidates = [
            result for result in predicted_results
            if result.get("classifylabel").get("confidence") < confidence_threshold
        ]
        for candidate in ranking_candidates:
            assert candidate.get("classifylabel").get(
                "confidence") < confidence_threshold
コード例 #4
0
    def ignor_test_online_training(self):
        """Disabled variant of the online-training test (``ignor`` prefix
        keeps pytest from collecting it).

        Trains on 50 labeled records, persists the model, predicts the rest,
        and checks ranking candidates fall below the confidence threshold.
        """
        test_config = "tests/data/test_config/test_config.json"
        config = AnnotatorConfig(test_config)
        # init trainer first
        trainer = Trainer(config)

        # load all data for test, in actual data should get from user label
        with io.open(config["org_data"], encoding="utf-8-sig") as f:
            data = simplejson.loads(f.read())
        validate_local_data(data)

        data_set = data.get("data_set", list())

        # faker user labeled data, user has labeled 50 texts.
        faker_user_labeled_data = data_set[:50]
        # 950 text to predict and rank
        unlabeled_data = data_set[50:]

        # Build Message examples: raw text first, remaining fields (labels
        # etc.) travel as metadata with the "text" key stripped out.
        examples = []
        for record in faker_user_labeled_data:
            metadata = record.copy()
            # dict.pop with a default replaces the "check then del" dance
            metadata.pop("text", None)
            examples.append(Message(record["text"], metadata))

        new_labeled_data = TrainingData(examples)

        # full amount train and persist model
        interpreter = trainer.train(new_labeled_data)
        trainer.persist(config['path'],
                        config['project'],
                        config['fixed_model_name'])

        # predict unlabeled dataset and ranking.
        # FIX: the original wrote `for unlabeled_data in unlabeled_data:`,
        # rebinding the loop variable over the list being iterated; use a
        # distinct name so the list stays intact for any later use.
        predicted_results = []
        for item in unlabeled_data:
            predicted_results.append(interpreter.parse(item["text"]))

        # predicted result format as
        # {
        #   'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
        #   'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
        #                             {'name': 'notspam', 'confidence': 0.42980562223735524}],
        #   'text': '我是一个垃圾邮件'
        # }
        confidence_threshold = config["confidence_threshold"]
        # candidates for ranking are the low-confidence predictions
        ranking_candidates = [
            result for result in predicted_results
            if result.get("classifylabel").get("confidence") < confidence_threshold
        ]
        for candidate in ranking_candidates:
            assert candidate.get("classifylabel").get("confidence") < confidence_threshold