def mock_cws(): os.makedirs("mock", exist_ok=True) text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"] word2id = Vocabulary() word_list = [ch for ch in "".join(text)] word2id.update(word_list) save_pickle(word2id, "./mock/", "word2id.pkl") class2id = Vocabulary(need_default=False) label_list = ['B', 'M', 'E', 'S'] class2id.update(label_list) save_pickle(class2id, "./mock/", "label2id.pkl") model_args = { "vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(class2id) } config_file = """ [test_section] vocab_size = {} word_emb_dim = 50 rnn_hidden_units = 50 num_classes = {} """.format(len(word2id), len(class2id)) with open("mock/test.cfg", "w", encoding="utf-8") as f: f.write(config_file) model = AdvSeqLabel(model_args) ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
def mock_pos_tag(): os.makedirs("mock", exist_ok=True) text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"] vocab = Vocabulary() word_list = [ch for ch in "".join(text)] vocab.update(word_list) save_pickle(vocab, "./mock/", "word2id.pkl") idx2label = Vocabulary(need_default=False) label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv'] idx2label.update(label_list) save_pickle(idx2label, "./mock/", "label2id.pkl") model_args = { "vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label) } config_file = """ [test_section] vocab_size = {} word_emb_dim = 50 rnn_hidden_units = 50 num_classes = {} """.format(len(vocab), len(idx2label)) with open("mock/test.cfg", "w", encoding="utf-8") as f: f.write(config_file) model = AdvSeqLabel(model_args) ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
def train_test(): # Config Loader train_args = ConfigSection() ConfigLoader().load_config(config_path, {"POS_infer": train_args}) # define dataset data_train = TokenizeDataSetLoader().load(cws_data_path) word_vocab = Vocabulary() label_vocab = Vocabulary() data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab) data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab) data_train.set_origin_len("word_seq") data_train.rename_field("label_seq", "truth").set_target(truth=False) train_args["vocab_size"] = len(word_vocab) train_args["num_classes"] = len(label_vocab) save_pickle(word_vocab, pickle_path, "word2id.pkl") save_pickle(label_vocab, pickle_path, "label2id.pkl") # Trainer trainer = SeqLabelTrainer(**train_args.data) # Model model = SeqLabeling(train_args) # Start training trainer.train(model, data_train) # Saver saver = ModelSaver("./save/saved_model.pkl") saver.save_pytorch(model) del model, trainer # Define the same model model = SeqLabeling(train_args) # Dump trained parameters into the model ModelLoader.load_pytorch(model, "./save/saved_model.pkl") # Load test configuration test_args = ConfigSection() ConfigLoader().load_config(config_path, {"POS_infer": test_args}) test_args["evaluator"] = SeqLabelEvaluator() # Tester tester = SeqLabelTester(**test_args.data) # Start testing data_train.set_target(truth=True) tester.test(model, data_train)
def train_test(): # Config Loader train_args = ConfigSection() ConfigLoader().load_config(config_path, {"POS_infer": train_args}) # define dataset data_train = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load) data_train.load(cws_data_path) train_args["vocab_size"] = len(data_train.word_vocab) train_args["num_classes"] = len(data_train.label_vocab) save_pickle(data_train.word_vocab, pickle_path, "word2id.pkl") save_pickle(data_train.label_vocab, pickle_path, "label2id.pkl") # Trainer trainer = SeqLabelTrainer(**train_args.data) # Model model = SeqLabeling(train_args) # Start training trainer.train(model, data_train) # Saver saver = ModelSaver("./save/saved_model.pkl") saver.save_pytorch(model) del model, trainer # Define the same model model = SeqLabeling(train_args) # Dump trained parameters into the model ModelLoader.load_pytorch(model, "./save/saved_model.pkl") # Load test configuration test_args = ConfigSection() ConfigLoader().load_config(config_path, {"POS_infer": test_args}) test_args["evaluator"] = SeqLabelEvaluator() # Tester tester = SeqLabelTester(**test_args.data) # Start testing change_field_is_target(data_train, "truth", True) tester.test(model, data_train)
def test_seq_label(self): model_args = { "vocab_size": 10, "word_emb_dim": 100, "rnn_hidden_units": 100, "num_classes": 5 } infer_data = [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e'], ['a', 'b', '#', 'd', 'e'], ['a', 'b', 'c', '?', 'e'], ['a', 'b', 'c', 'd', '$'], ['!', 'b', 'c', 'd', 'e']] vocab = Vocabulary() vocab.word2idx = { 'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9 } class_vocab = Vocabulary() class_vocab.word2idx = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} os.system("mkdir save") save_pickle(class_vocab, "./save/", "class2id.pkl") save_pickle(vocab, "./save/", "word2id.pkl") model = SeqLabeling(model_args) predictor = Predictor("./save/", task="seq_label") results = predictor.predict(network=model, data=infer_data) self.assertTrue(isinstance(results, list)) self.assertGreater(len(results), 0) for res in results: self.assertTrue(isinstance(res, list)) self.assertEqual(len(res), 5) self.assertTrue(isinstance(res[0], str)) os.system("rm -rf save") print("pickle path deleted")
def train(): train_args, model_args = ConfigSection(), ConfigSection() ConfigLoader.load_config(config_dir, {"text_class": train_args}) # load dataset print("Loading data...") data = TextClassifyDataSet(load_func=ClassDataSetLoader.load) data.load(train_data_dir) print("vocabulary size:", len(data.word_vocab)) print("number of classes:", len(data.label_vocab)) save_pickle(data.word_vocab, save_dir, "word2id.pkl") save_pickle(data.label_vocab, save_dir, "label2id.pkl") model_args["num_classes"] = len(data.label_vocab) model_args["vocab_size"] = len(data.word_vocab) # construct model print("Building model...") model = CNNText(model_args) # train print("Training...") trainer = ClassificationTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"], validate=train_args["validate"], use_cuda=train_args["use_cuda"], pickle_path=save_dir, save_best_dev=train_args["save_best_dev"], model_name=model_name, loss=Loss("cross_entropy"), optimizer=Optimizer("SGD", lr=0.001, momentum=0.9)) trainer.train(model, data) print("Training finished!") saver = ModelSaver(os.path.join(save_dir, model_name)) saver.save_pytorch(model) print("Model saved!")
def mock_text_classify(): os.makedirs("mock", exist_ok=True) text = [ "世界物联网大会明日在京召开龙头股启动在即", "乌鲁木齐市新增一处城市中心旅游目的地", "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”" ] vocab = Vocabulary() word_list = [ch for ch in "".join(text)] vocab.update(word_list) save_pickle(vocab, "./mock/", "word2id.pkl") idx2label = Vocabulary(need_default=False) label_list = [ 'class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F' ] idx2label.update(label_list) save_pickle(idx2label, "./mock/", "label2id.pkl") model_args = { "vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50, "num_classes": len(idx2label) } config_file = """ [test_section] vocab_size = {} word_emb_dim = 50 rnn_hidden_units = 50 num_classes = {} """.format(len(vocab), len(idx2label)) with open("mock/test.cfg", "w", encoding="utf-8") as f: f.write(config_file) model = CNNText(model_args) ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model)
def train(): # Config Loader train_args = ConfigSection() test_args = ConfigSection() ConfigLoader().load_config(cfgfile, { "train": train_args, "test": test_args }) print("loading data set...") data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load) data.load(cws_data_path) data_train, data_dev = data.split(ratio=0.3) train_args["vocab_size"] = len(data.word_vocab) train_args["num_classes"] = len(data.label_vocab) print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab))) change_field_is_target(data_dev, "truth", True) save_pickle(data_dev, "./save/", "data_dev.pkl") save_pickle(data.word_vocab, "./save/", "word2id.pkl") save_pickle(data.label_vocab, "./save/", "label2id.pkl") # Trainer trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"], validate=train_args["validate"], use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"], save_best_dev=True, print_every_step=10, model_name="trained_model.pkl", evaluator=SeqLabelEvaluator()) # Model model = AdvSeqLabel(train_args) try: ModelLoader.load_pytorch(model, "./save/saved_model.pkl") print('model parameter loaded!') except Exception as e: print("No saved model. Continue.") pass # Start training trainer.train(model, data_train, data_dev) print("Training finished!") # Saver saver = ModelSaver("./save/trained_model.pkl") saver.save_pytorch(model) print("Model saved!")
def test_training(): # Config Loader trainer_args = ConfigSection() model_args = ConfigSection() ConfigLoader().load_config(config_dir, { "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args}) data_set = TokenizeDataSetLoader().load(data_path) word_vocab = Vocabulary() label_vocab = Vocabulary() data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab) data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab) data_set.set_origin_len("word_seq") data_set.rename_field("label_seq", "truth").set_target(truth=False) data_train, data_dev = data_set.split(0.3, shuffle=True) model_args["vocab_size"] = len(word_vocab) model_args["num_classes"] = len(label_vocab) save_pickle(word_vocab, pickle_path, "word2id.pkl") save_pickle(label_vocab, pickle_path, "label2id.pkl") trainer = SeqLabelTrainer( epochs=trainer_args["epochs"], batch_size=trainer_args["batch_size"], validate=False, use_cuda=False, pickle_path=pickle_path, save_best_dev=trainer_args["save_best_dev"], model_name=model_name, optimizer=Optimizer("SGD", lr=0.01, momentum=0.9), ) # Model model = SeqLabeling(model_args) # Start training trainer.train(model, data_train, data_dev) # Saver saver = ModelSaver(os.path.join(pickle_path, model_name)) saver.save_pytorch(model) del model, trainer # Define the same model model = SeqLabeling(model_args) # Dump trained parameters into the model ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name)) # Load test configuration tester_args = ConfigSection() ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args}) # Tester tester = SeqLabelTester(batch_size=4, use_cuda=False, pickle_path=pickle_path, model_name="seq_label_in_test.pkl", evaluator=SeqLabelEvaluator() ) # Start testing with validation data data_dev.set_target(truth=True) tester.test(model, data_dev)
def test_seq_label(self): model_args = { "vocab_size": 10, "word_emb_dim": 100, "rnn_hidden_units": 100, "num_classes": 5 } infer_data = [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e'], ['a', 'b', '#', 'd', 'e'], ['a', 'b', 'c', '?', 'e'], ['a', 'b', 'c', 'd', '$'], ['!', 'b', 'c', 'd', 'e']] vocab = Vocabulary() vocab.word2idx = { 'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9 } class_vocab = Vocabulary() class_vocab.word2idx = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} os.system("mkdir save") save_pickle(class_vocab, "./save/", "label2id.pkl") save_pickle(vocab, "./save/", "word2id.pkl") model = CNNText(model_args) import fastNLP.core.predictor as pre predictor = Predictor("./save/", pre.text_classify_post_processor) # Load infer data infer_data_set = convert_seq_dataset(infer_data) infer_data_set.index_field("word_seq", vocab) results = predictor.predict(network=model, data=infer_data_set) self.assertTrue(isinstance(results, list)) self.assertGreater(len(results), 0) self.assertEqual(len(results), len(infer_data)) for res in results: self.assertTrue(isinstance(res, str)) self.assertTrue(res in class_vocab.word2idx) del model, predictor infer_data_set.set_origin_len("word_seq") model = SeqLabeling(model_args) predictor = Predictor("./save/", pre.seq_label_post_processor) results = predictor.predict(network=model, data=infer_data_set) self.assertTrue(isinstance(results, list)) self.assertEqual(len(results), len(infer_data)) for i in range(len(infer_data)): res = results[i] self.assertTrue(isinstance(res, list)) self.assertEqual(len(res), len(infer_data[i])) os.system("rm -rf save") print("pickle path deleted")
def train_and_test(): # Config Loader trainer_args = ConfigSection() model_args = ConfigSection() ConfigLoader().load_config(config_dir, { "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args }) data_set = SeqLabelDataSet() data_set.load(data_path) train_set, dev_set = data_set.split(0.3, shuffle=True) model_args["vocab_size"] = len(data_set.word_vocab) model_args["num_classes"] = len(data_set.label_vocab) save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl") save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl") trainer = SeqLabelTrainer( epochs=trainer_args["epochs"], batch_size=trainer_args["batch_size"], validate=False, use_cuda=trainer_args["use_cuda"], pickle_path=pickle_path, save_best_dev=trainer_args["save_best_dev"], model_name=model_name, optimizer=Optimizer("SGD", lr=0.01, momentum=0.9), ) # Model model = SeqLabeling(model_args) # Start training trainer.train(model, train_set, dev_set) print("Training finished!") # Saver saver = ModelSaver(os.path.join(pickle_path, model_name)) saver.save_pytorch(model) print("Model saved!") del model, trainer change_field_is_target(dev_set, "truth", True) # Define the same model model = SeqLabeling(model_args) # Dump trained parameters into the model ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name)) print("model loaded!") # Load test configuration tester_args = ConfigSection() ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args}) # Tester tester = SeqLabelTester(batch_size=4, use_cuda=False, pickle_path=pickle_path, model_name="seq_label_in_test.pkl", evaluator=SeqLabelEvaluator()) # Start testing with validation data tester.test(model, dev_set) print("model tested!")