def ignore_test_online_training(self):
    """ test online training. """
    test_config = "tests/data/test_config/test_config.json"
    config = AnnotatorConfig(test_config)

    # init trainer first
    trainer = Trainer(config)

    # load all data for the test; in practice the data would come from user labeling
    with io.open(config["org_data"], encoding="utf-8-sig") as f:
        data = simplejson.loads(f.read())
    validate_local_data(data)
    data_set = data.get("data_set", list())

    # fake user-labeled data: assume the user has labeled 50 texts
    fake_user_labeled_data = data_set[:50]
    # the remaining 950 texts are to be predicted and ranked
    unlabeled_data = data_set[50:]

    # now test online training
    examples = []
    for e in fake_user_labeled_data:
        data = e.copy()
        if "text" in data:
            del data["text"]
        examples.append(Message(e["text"], data))
    new_labeled_data = TrainingData(examples)

    # full retrain and persist the model
    interpreter = trainer.train(new_labeled_data)
    trainer.persist(config['path'],
                    config['project'],
                    config['fixed_model_name'])

    # predict the unlabeled dataset and rank the results
    predicted_results = []
    for item in unlabeled_data:
        predict = interpreter.parse(item["text"])
        predicted_results.append(predict)

    # rank predicted results by confidence
    # a predicted result is formatted as
    # {
    #     'classifylabel': {'name': 'spam', 'confidence': 0.5701943777626447},
    #     'classifylabel_ranking': [{'name': 'spam', 'confidence': 0.5701943777626447},
    #                               {'name': 'notspam', 'confidence': 0.42980562223735524}],
    #     'text': '我是一个垃圾邮件'
    # }
    confidence_threshold = config["confidence_threshold"]
    ranking_candidates = [text for text in predicted_results
                          if text.get("classifylabel").get("confidence") < confidence_threshold]
    for candidate in ranking_candidates:
        assert candidate.get("classifylabel").get("confidence") < confidence_threshold
def ignore_test_train_with_empty_data(self):
    """ test train with empty train data
    :return:
    """
    test_config = "tests/data/test_config/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # create an empty train set
    train_data = TrainingData([])
    # train with the empty set
    trainer.train(train_data)

    # test persist and load
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    interpreter_loaded = Interpreter.load(persisted_path, config)
    assert interpreter_loaded.pipeline
    assert interpreter_loaded.parse("hello") is not None
    assert interpreter_loaded.parse("Hello today is Monday, again!") is not None

    # remove tmp models
    shutil.rmtree(config['path'], ignore_errors=False)
def ignore_test_pipeline_flow(self):
    """ test trainer's train func for pipeline
    :return:
    """
    test_config = "tests/data/test_config/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # create tmp train set
    tmp_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(tmp_path)
    # rm tmp train set
    rm_tmp_file("tmp.json")

    interpreter = trainer.train(train_data)
    assert interpreter is not None
    out1 = interpreter.parse("点连接拿红包啦")

    # test persist and load
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    interpreter_loaded = Interpreter.load(persisted_path, config)
    out2 = interpreter_loaded.parse("点连接拿红包啦")
    assert out1.get("classifylabel").get("name") == out2.get("classifylabel").get("name")

    # remove tmp models
    shutil.rmtree(config['path'], ignore_errors=True)
def ignore_test_load_and_persist_without_train(self):
    """ test save and load model without train
    :return:
    """
    test_config = "tests/data/test_config/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # create tmp train set
    tmp_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(tmp_path)
    # rm tmp train set
    rm_tmp_file("tmp.json")

    # interpreter = trainer.train(train_data)
    # test persist and load
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    interpreter_loaded = Interpreter.load(persisted_path, config)
    assert interpreter_loaded.pipeline
    assert interpreter_loaded.parse("hello") is not None
    assert interpreter_loaded.parse("Hello today is Monday, again!") is not None

    # remove tmp models
    shutil.rmtree(config['path'], ignore_errors=False)
def ignore_test_trainer_persist(self):
    """ test pipeline persist, metadata will be saved
    :return:
    """
    test_config = "tests/data/test_config/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0
    # char_tokenizer component should have been created
    assert trainer.pipeline[0] is not None

    # create tmp train set
    tmp_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(tmp_path)
    # rm tmp train set
    rm_tmp_file("tmp.json")

    trainer.train(train_data)
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])

    # load persisted metadata
    metadata_path = os.path.join(persisted_path, 'metadata.json')
    with io.open(metadata_path) as f:
        metadata = json.load(f)
    assert 'trained_at' in metadata

    # rm tmp files and dirs
    shutil.rmtree(config['path'], ignore_errors=False)
def test_load_and_persist_without_train(self):
    """ test save and load model without train
    :return:
    """
    test_config = "tests/data/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # create tmp train set
    tmp_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(tmp_path)
    # rm tmp train set
    rm_tmp_file("tmp.json")

    # interpreter = trainer.train(train_data)
    # test persist and load
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    interpreter_loaded = Interpreter.load(persisted_path, config)
    assert interpreter_loaded.pipeline
    assert interpreter_loaded.parse("hello") is not None
    assert interpreter_loaded.parse("Hello today is Monday, again!") is not None

    # remove tmp models
    shutil.rmtree(config['path'], ignore_errors=True)
def test_trainer_persist(self):
    """ test pipeline persist, metadata will be saved
    :return:
    """
    test_config = "tests/data/test_config.json"
    config = AnnotatorConfig(test_config)

    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0
    # char_tokenizer component should have been created
    assert trainer.pipeline[0] is not None

    # create tmp train set
    tmp_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(tmp_path)
    # rm tmp train set
    rm_tmp_file("tmp.json")

    trainer.train(train_data)
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])

    # load persisted metadata
    metadata_path = os.path.join(persisted_path, 'metadata.json')
    with io.open(metadata_path) as f:
        metadata = json.load(f)
    assert 'trained_at' in metadata

    # rm tmp files and dirs
    shutil.rmtree(config['path'], ignore_errors=True)
def do_train(config,  # type: AnnotatorConfig
             component_builder=None  # type: Optional[ComponentBuilder]
             ):
    # type: (...) -> Tuple[Trainer, Interpreter, Text]
    """Loads the trainer and the data and runs the training of the model."""

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(config, component_builder)
    training_data = load_local_data(config['org_data'])
    interpreter = trainer.train(training_data)
    persisted_path = trainer.persist(config['path'],
                                     config['project'],
                                     config['fixed_model_name'])
    return trainer, interpreter, persisted_path
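# A minimal sketch of how do_train might be invoked, assuming a config file
# exists at "tests/data/test_config.json" with the keys used above
# ('org_data', 'path', 'project', 'fixed_model_name'); the path and the
# sample text are illustrative assumptions, not part of the library.
if __name__ == "__main__":
    config = AnnotatorConfig("tests/data/test_config.json")
    trainer, interpreter, persisted_path = do_train(config)
    # parse a sample text with the freshly trained interpreter
    print(interpreter.parse("点连接拿红包啦"))
    print("model persisted to: %s" % persisted_path)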
class ActiveLearner(object):
    """Implementation of the active learning core; this class is mainly a
    wrapper around the trainer and the interpreter.

    You can use ActiveLearner as follows:
        active_learner = ActiveLearner(config)
        data_to_train = get_data_from_user_label()
        active_learner.train(data_to_train)
        data_to_rank = get_data_from_db()
        low_confidence_data = active_learner.process_batch(data_to_rank)
    """

    def __init__(self, config):
        """ init of ActiveLearner """
        self.config = config
        self.trainer = Trainer(config)
        self.train_data = TrainingData([])
        self.new_data_count = 0
        self.batch_num = config.get("batch_num", 20)
        self.db = DBManager(config)
        self.interpreter = None

    def train(self, data_set):
        """ train on a data set
        :param data_set: format as [{"id": 1, "text": "我是测试", "label": "spam"}, .....]
        :return:
        """
        config = self.config
        examples = []
        for e in data_set:
            data = e.copy()
            if "text" in data:
                del data["text"]
            examples.append(Message(e["text"], data))
        train_data = TrainingData(examples)
        self.interpreter = self.trainer.train(train_data)
        # overwrite the saved model  TODO
        self.trainer.persist(config['path'],
                             config['project'],
                             config['fixed_model_name'])

    def process_one(self, id):
        """ predict one record according to its id
        :param id:
        :return:
        """
        data = self.db.get_row({"id": id})
        predict = self.interpreter.parse(data["text"])
        return predict

    def process_batch(self, ids):
        """ process a batch of texts according to their ids
        :param ids:
        :return:
        """
        datas = self.db.get_row_by_ids(ids)
        predicted_results = []
        for unlabeled_data in datas:
            predict = self.interpreter.parse(unlabeled_data["text"])
            if predict:
                unlabeled_data.update(predict)
                predicted_results.append(unlabeled_data)
        return predicted_results

    def process_texts(self, texts):
        """ process texts
        :param texts: format as [{"id": 1, "text": "我是测试"},
                                 {"id": 2, "text": "我是测试2"}, ...]
        :return: format as [{'id':-, 'text':-, 'classifylabel':-, 'classifylabel_ranking':-}, ...]
        """
        if self.interpreter is None:
            logger.warning(
                "model has not been trained, nothing will be predicted.")
            return []
        predicted_results = []
        for unlabeled_data in texts:
            predict = self.interpreter.parse(unlabeled_data["text"])
            if predict:
                unlabeled_data.update(predict)
                predicted_results.append(unlabeled_data)
        return predicted_results
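# A minimal sketch of one active-learning round built on ActiveLearner,
# assuming a valid AnnotatorConfig and a reachable DBManager backend; the
# helper name run_active_learning_round and the 'confidence_threshold'
# default below are illustrative assumptions, not part of the class.
def run_active_learning_round(config, labeled_batch, unlabeled_texts):
    learner = ActiveLearner(config)
    # retrain on the latest user labels and persist the model
    learner.train(labeled_batch)
    # predict the unlabeled texts and keep the low-confidence ones
    # so they can be handed back to the annotator for the next round
    predictions = learner.process_texts(unlabeled_texts)
    threshold = config.get("confidence_threshold", 0.9)
    return [p for p in predictions
            if p["classifylabel"]["confidence"] < threshold]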