def predict():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
    print("model loaded!")

    # Tester
    test_args["evaluator"] = SeqLabelEvaluator()
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)
def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
        print("model loaded!")
    except Exception as e:
        print("cannot load model!")
        raise

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
    print("data loaded")

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)
    print("Inference finished!")
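# --------------------------------------------------------------------------------------
# predict() and infer() above read a few module-level names (cfgfile, pickle_path,
# data_infer_path) that are defined elsewhere in the original script. The block below is
# a minimal, hypothetical sketch of how they could be wired up and invoked; every path
# and value here is an illustrative assumption, not the original configuration.
# --------------------------------------------------------------------------------------
if __name__ == "__main__":
    cfgfile = "./pos_tag.cfg"                  # assumed: config file containing a [POS_test] section
    pickle_path = "./save/"                    # assumed: directory with word2id.pkl / label2id.pkl / data_dev.pkl
    data_infer_path = "./data/infer_raw.txt"   # assumed: raw text for inference, one sentence per line

    predict()   # evaluate the trained model on the saved dev set
    infer()     # run the trained model on unlabeled text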
def mock_cws():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。",
            "大王叫我来巡山。",
            "我党多年来致力于改善人民生活水平。"]

    word2id = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    word2id.update(word_list)
    save_pickle(word2id, "./mock/", "word2id.pkl")

    class2id = Vocabulary(need_default=False)
    label_list = ['B', 'M', 'E', 'S']
    class2id.update(label_list)
    save_pickle(class2id, "./mock/", "label2id.pkl")

    model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50,
                  "num_classes": len(class2id)}

    config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(word2id), len(class2id))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
def mock_pos_tag():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。",
            "大王叫我来巡山。",
            "我党多年来致力于改善人民生活水平。"]

    vocab = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50,
                  "num_classes": len(idx2label)}

    config_file = """
[test_section]
vocab_size = {}
word_emb_dim = 50
rnn_hidden_units = 50
num_classes = {}
""".format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
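# --------------------------------------------------------------------------------------
# A small, hypothetical usage sketch for the two mock builders above: build the mock
# artifacts, then reload the pickled vocabularies to confirm their sizes match what was
# written into mock/test.cfg. It uses the same load_pickle helper as predict()/infer();
# nothing beyond the "./mock/" directory the builders already create is assumed.
# --------------------------------------------------------------------------------------
if __name__ == "__main__":
    mock_cws()
    mock_pos_tag()   # note: overwrites word2id.pkl / label2id.pkl / test.cfg written by mock_cws()

    word2id = load_pickle("./mock/", "word2id.pkl")
    label2id = load_pickle("./mock/", "label2id.pkl")
    print("mock vocab_size={}, mock num_classes={}".format(len(word2id), len(label2id)))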
def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

    print("loading data set...")
    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data.load(cws_data_path)
    data_train, data_dev = data.split(ratio=0.3)
    train_args["vocab_size"] = len(data.word_vocab)
    train_args["num_classes"] = len(data.label_vocab)
    print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

    change_field_is_target(data_dev, "truth", True)
    save_pickle(data_dev, "./save/", "data_dev.pkl")
    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
    save_pickle(data.label_vocab, "./save/", "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
                              validate=train_args["validate"], use_cuda=train_args["use_cuda"],
                              pickle_path=train_args["pickle_path"], save_best_dev=True,
                              print_every_step=10, model_name="trained_model.pkl",
                              evaluator=SeqLabelEvaluator())

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print("model parameter loaded!")
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/trained_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")
def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader("good_path").load_config(cfgfile, {"train": train_args, "test": test_args})

    # Data Loader
    loader = TokenizeDatasetLoader(cws_data_path)
    train_data = loader.load_pku()

    # Preprocessor
    preprocessor = SeqLabelPreprocess()
    data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
    train_args["vocab_size"] = preprocessor.vocab_size
    train_args["num_classes"] = preprocessor.num_classes

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print("model parameter loaded!")
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")
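# --------------------------------------------------------------------------------------
# Both train() variants above pull their hyperparameters from a [train] section of
# cfgfile via ConfigLoader/ConfigSection. A minimal sketch of what that section might
# contain, based only on the keys actually read above (epochs, batch_size, validate,
# use_cuda, pickle_path); the concrete values are illustrative assumptions, and the
# [test] section loaded alongside it is left empty here.
# --------------------------------------------------------------------------------------
# [train]
# epochs = 10
# batch_size = 32
# validate = true
# use_cuda = true
# pickle_path = "./save/"
#
# [test]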
def train(checkpoint=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
    print("config loaded")

    # Data Loader
    dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
    print(dataset)
    print("dataset transformed")

    dataset.rename_field("tag", "truth")

    vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth")
    seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len",
                                   is_input=True)

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")

    print("processors defined")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word, emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset, model, loss=None,
                      metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict", target="truth",
                                                seq_lens="word_seq_origin_len"),
                      dev_data=dataset, metric_key="f",
                      use_tqdm=True, use_cuda=True, print_every=5, n_epochs=6, save_path="./save")
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, "model_pp.pkl")
    print("pipeline saved")

    torch.save(model, "./save/best_model.pkl")
def train(train_data_path, dev_data_path, checkpoint=None, save=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
    print("config loaded")

    # Data Loader
    print("loading training set...")
    dataset = ConllxDataLoader().load(train_data_path, return_dataset=True)
    print("loading dev set...")
    dev_data = ConllxDataLoader().load(dev_data_path, return_dataset=True)
    print(dataset)
    print("================= dataset ready =====================")

    dataset.rename_field("tag", "truth")
    dev_data.rename_field("tag", "truth")

    vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth", is_input=True)
    seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len",
                                   is_input=True)
    set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len")

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    # index dev set
    word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
    dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]], new_field_name="word_seq")
    dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]], new_field_name="truth")
    dev_data.apply(lambda ins: len(ins["word_seq"]), new_field_name="word_seq_origin_len")

    # set input & target
    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")
    dev_data.set_target("truth", "word_seq_origin_len")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset, model, loss=None,
                      metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict", target="truth",
                                                seq_lens="word_seq_origin_len"),
                      dev_data=dev_data, metric_key="f",
                      use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save)
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline([vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, os.path.join(save, "model_pp.pkl"))
    print("pipeline saved")
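# --------------------------------------------------------------------------------------
# A hypothetical sketch of how the pipeline saved by train() above might be reused for
# inference. tag_new_sentences, save_dir, and tokenized_sentences are illustrative names;
# it assumes fastNLP's DataSet / Instance classes and that the first processor in the
# pipeline expects a "words" field. DataSet construction and field access differ between
# fastNLP versions, so treat this as an outline rather than a tested recipe.
# --------------------------------------------------------------------------------------
def tag_new_sentences(save_dir, tokenized_sentences):
    # tokenized_sentences: list of token lists, e.g. [["我", "爱", "北京"], ...]
    from fastNLP.core.dataset import DataSet
    from fastNLP.core.instance import Instance

    save_dict = torch.load(os.path.join(save_dir, "model_pp.pkl"))
    pp = save_dict["pipeline"]

    ds = DataSet()
    for tokens in tokenized_sentences:
        ds.append(Instance(words=tokens))   # "words" is the field the VocabIndexerProcessor reads

    pp(ds)      # runs indexing, the model forward pass, and index-to-tag conversion
    return ds   # predictions are written to the "tag" field by Index2WordProcessor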