def getRest(sentence):
    # note: parameter renamed from `input` to avoid shadowing the builtin
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        demo_sent = sentence
        if demo_sent == '' or demo_sent.isspace():
            return {'status': 'fail'}
        demo_sent = list(demo_sent.strip())
        demo_data = [(demo_sent, ['O'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
        PER, LOC, ORG = get_entity(tag, demo_sent)
        return {'status': 'success', 'PER': PER, 'LOC': LOC, 'ORG': ORG}
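# `get_entity` is external to these snippets. A minimal sketch of what it is
# assumed to do, given per-character BIO tags and the character list: collect
# PER/LOC/ORG surface strings. The B-PER/I-PER-style tag names are an
# assumption inferred from how the callers unpack its result.
def get_entity_sketch(tag, chars):
    entities = {'PER': [], 'LOC': [], 'ORG': []}
    current, current_type = [], None
    for ch, t in zip(chars, tag):
        if t.startswith('B-'):
            if current:
                entities[current_type].append(''.join(current))
            current, current_type = [ch], t[2:]
        elif t.startswith('I-') and current_type == t[2:]:
            current.append(ch)
        else:
            if current:
                entities[current_type].append(''.join(current))
            current, current_type = [], None
    if current:
        entities[current_type].append(''.join(current))
    return entities['PER'], entities['LOC'], entities['ORG']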
def demotest(sentence):
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        # the original wrapped this in `while (1):`, but both branches return
        # on the first pass, so the loop was dead and is dropped here
        demo_sent = sentence
        if demo_sent == '' or demo_sent.isspace():
            print('The sentence is empty')
            return ([''], [''], [''])
        demo_sent = list(demo_sent.strip())
        demo_data = [(demo_sent, ['O'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
        PER, LOC, ORG = get_entity(tag, demo_sent)
        print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
        return (PER, LOC, ORG)
def _main():
    data_manager = DataManager()
    vocab_size = len(data_manager.word2ix)
    model = BiLSTM_CRF(device, vocab_size, data_manager.tag2ix, EMBEDDING_DIM, HIDDEN_DIM)
    model = model.to(device)
    train_set = NerDataset(data_manager.train_sents, data_manager.train_tags)
    dev_set = NerDataset(data_manager.dev_sents, data_manager.dev_tags)
    train_loader = DataLoader(train_set, batch_size=BATCH_SZ, shuffle=True)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SZ, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    epoch_loss = []
    # with torch.no_grad():
    #     precheck_sent = to_tensor(train_loader[0])
    #     precheck_tag = to_tensor(dataset.train_tags[0])
    #     print(precheck_tag)
    #     print(model(precheck_sent))
    for epoch in range(EPOCH_NUM):
        for sents, tags, lengths in tqdm(train_loader):
            sents = sents.to(device)
            tags = tags.to(device)
            lengths = lengths.to(device)
            # print(lengths, sents.size(), tags.size())
            loss = model.neg_log_likelihood(sents, tags, lengths)
            epoch_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(epoch, ' epoch loss: ', sum(epoch_loss) / len(epoch_loss))
        save_model(model, epoch)
        eval(model, dev_loader)  # project-level evaluation helper, not the builtin
def demo_one(self, model_path):
    '''
    Interactively tag input sentences.
    :param model_path: e.g. input: 武三思與韋後日夜譖敬暉等不已
    :return: [[0, 2, 'PER'], [4, 5, 'PER'], [9, 10, 'PER']]
    '''
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    self.paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, self.embedding, self.tag2id, self.word2id,
                       self.paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('begin to demo one sentence!')
        saver.restore(sess, ckpt_file)
        while True:
            print('Please input your sentence:')
            demo_sent = input()
            if demo_sent == '' or demo_sent.isspace() or demo_sent == 'end':
                print('See you next time!')
                break
            demo_sent = list(demo_sent.strip())
            demo_data = [(demo_sent, ['O'] * len(demo_sent))]
            tag = model.demo_one(sess, demo_data)
            print(get_ner_demo(tag))
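# `get_ner_demo` is not shown in this snippet. Judging from the docstring
# above, it turns a tag sequence into [start, end, type] triples with an
# inclusive end index. A minimal sketch under that assumption, for BIO tags:
def get_ner_demo_sketch(tag):
    spans, start, ent_type = [], None, None
    for i, t in enumerate(tag):
        if t.startswith('B-'):
            if start is not None:
                spans.append([start, i - 1, ent_type])
            start, ent_type = i, t[2:]
        elif not (t.startswith('I-') and ent_type == t[2:]):
            if start is not None:
                spans.append([start, i - 1, ent_type])
            start, ent_type = None, None
    if start is not None:
        spans.append([start, len(tag) - 1, ent_type])
    return spans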
def evaluate_words(lines):
    print("start evaluate_words")
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        demo_sent = lines
        print(demo_sent)
        demo_sent = list(demo_sent.strip())
        print(demo_sent)
        demo_data = [(demo_sent, ['O'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
        PER, LOC, ORG = get_entity(tag, demo_sent)
        print('PER: {}\nLOC: {}\nORG: {}'.format(PER, LOC, ORG))
def predict_random(demo_sent):
    word2id, embeddings = getDicEmbed()
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer,
                       lr=args.lr, clip_grad=args.clip, tag2label=tag2label,
                       vocab=word2id, shuffle=args.shuffle, model_path=ckpt_file,
                       summary_path=summary_path, log_path=log_path,
                       result_path=result_path, CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        print('============= demo =============')
        saver.restore(sess, ckpt_file)
        demo_sent = list(demo_sent.strip())
        demo_data = [(demo_sent, ['M'] * len(demo_sent))]
        tag = model.demo_one(sess, demo_data)
    # the `with` block closes the session; the explicit sess.close() was redundant
    res = segment(demo_sent, tag)  # was `segment(sent, tag)`: `sent` was undefined
    print(res)
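# `segment` is external. Given the dummy ['M'] tags above, this model appears
# to do word segmentation with a B/M/E/S-style tag scheme; that scheme is an
# assumption. A minimal sketch of joining characters into words:
def segment_sketch(chars, tags):
    words, word = [], ''
    for ch, t in zip(chars, tags):
        word += ch
        if t in ('E', 'S'):  # end of a multi-char word, or a single-char word
            words.append(word)
            word = ''
    if word:
        words.append(word)
    return words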
class NER_DEMO(object):
    def __init__(self, args):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        paths, model_path = get_paths(args)
        ckpt_file = tf.train.latest_checkpoint(model_path)
        paths['model_path'] = ckpt_file
        word2id = read_dictionary(os.path.join('.', args.train_data, 'word2id.pkl'))
        embeddings = random_embedding(word2id, args.embedding_dim)
        self.model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
        self.model.build_graph()
        self.saver = tf.train.Saver()
        self.sess = tf.Session(config=config)
        self.saver.restore(self.sess, ckpt_file)

    def predict(self, demo_sent):
        if demo_sent == '' or demo_sent.isspace():
            print('See you next time!')
            return {}
        demo_sent = list(demo_sent.strip())
        demo_data = [(demo_sent, ['O'] * len(demo_sent))]
        tag = self.model.demo_one(self.sess, demo_data)
        entities = get_entity(tag, demo_sent)
        return entities
def main(args):
    labels = ['O', 'B-LOC', 'B-ORG', 'B-T', 'I-LOC', 'I-PER', 'B-PER', 'I-ORG', 'I-T']
    # labels = ['O', 'I-PER', 'B-PER', 'I-LOC', 'I-ORG', 'B-ORG', 'B-LOC']
    args.num_labels = len(labels)
    tokenizer = None
    word2id = None
    if args.model == 'bert':
        is_BERT = True
        # use the 'bert-base-chinese' pretrained model
        pretrained_model_name = 'bert-base-chinese'
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
        config = BertConfig.from_pretrained(pretrained_model_name,
                                            num_labels=args.num_labels,
                                            hidden_dropout_prob=args.hidden_dropout_prob)
        model = BERTforNER_CRF.from_pretrained(pretrained_model_name,
                                               config=config, use_crf=args.crf)
    else:
        is_BERT = False
        word2id = json.load(open(args.word2id_file, "r", encoding="utf8"))
        model = BiLSTM_CRF(len(word2id), args.embedding_dim, args.hidden_dim,
                           args.num_labels, args.hidden_dropout_prob, args.crf)
    framework = Framework(args)
    if args.mode == "train":
        print("loading training dataset...")
        train_dataset = NERDataset(file_path=args.train_file, labels=labels,
                                   word2id=word2id, tokenizer=tokenizer,
                                   max_len=args.max_len, is_BERT=is_BERT)
        print("loading dev dataset...")
        dev_dataset = NERDataset(file_path=args.dev_file, labels=labels,
                                 word2id=word2id, tokenizer=tokenizer,
                                 max_len=args.max_len, is_BERT=is_BERT)
        framework.train(train_dataset, dev_dataset, model, labels)
    print("\nTesting ...")
    print("loading test dataset...")
    test_dataset = NERDataset(file_path=args.test_file, labels=labels,
                              word2id=word2id, tokenizer=tokenizer,
                              max_len=args.max_len, is_BERT=is_BERT)
    model.load_state_dict(torch.load(args.save_model))
    framework.test(test_dataset, model, labels)
def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'
    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {
        'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4,
        'i-gend': 5, 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9,
        'b-nati': 10, 'i-nati': 11, 'b-live': 12, 'i-live': 13, 'b-poli': 14,
        'i-poli': 15, 'b-unv': 16, 'i-unv': 17, 'b-comp': 18, 'i-comp': 19,
        'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23, 'b-proj': 24,
        'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
        'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34,
        'i-prti': 35, 'o': 36, '<start>': 37, '<stop>': 38
    }
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)
    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch:", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('save best model successfully.')
        else:
            break
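# `prepare_sequence` (used above and in train_all_data below) is the usual
# PyTorch helper that maps tokens to an index tensor; it is not shown here.
# A minimal sketch; the '<unk>' fallback is an assumption about how this
# vocabulary handles out-of-vocabulary words:
import torch

def prepare_sequence_sketch(seq, to_ix):
    idxs = [to_ix.get(w, to_ix.get('<unk>', 0)) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)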
def run():
    embedding_mat = np.random.uniform(
        -0.25, 0.25,
        (len(read_dictionary(params.vocab_path)), params.embedding_dim))
    embeddings = np.float32(embedding_mat)
    num_tags = len(params.tag2label)
    summary_path = "logs"
    model = BiLSTM_CRF(embeddings, params.update_embedding, params.hidden_dim,
                       num_tags, params.clip, summary_path, params.optimizer)
    model.build_graph()
    predict(model, params.batch_size, read_dictionary(params.vocab_path),
            params.tag2label)
def run(demo_sent, flag=False):
    embedding_mat = np.random.uniform(
        -0.25, 0.25,
        (len(read_dictionary(params.vocab_path)), params.embedding_dim))
    embeddings = np.float32(embedding_mat)
    num_tags = len(params.tag2label)
    summary_path = "logs"
    model = BiLSTM_CRF(embeddings, params.update_embedding, params.hidden_dim,
                       num_tags, params.clip, summary_path, params.optimizer)
    model.build_graph()
    PER_mess, LOC_mess, ORG_mess = predict(model, params.batch_size,
                                           read_dictionary(params.vocab_path),
                                           params.tag2label, demo_sent)
    if flag:
        return PER_mess, LOC_mess, ORG_mess

# run('我在北京上北京大学,周恩来是中国总理,我喜欢北京。我在清华大学,毛泽东是中国主席,他去过苏联。')
def test():
    """Model testing."""
    # model
    bilstm_crf = BiLSTM_CRF(opt.vocab_size, opt.emb_dim, opt.emb_dim // 2,
                            opt.tag_num, dropout=opt.dropout)
    if opt.load_model_path:
        bilstm_crf.load(opt.load_model_path)
    # data
    test_dataset = RmrbDataset(train=False)
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset))
    for i, (x_batch, y_batch) in enumerate(test_dataloader):
        y_hat = bilstm_crf(x_batch)
        print(classification_report(t.flatten(y_batch), t.flatten(y_hat)))
def test(data, file):
    """
    created by jma
    Model testing.
    :param data: test data
    :param file: model checkpoint
    """
    model = BiLSTM_CRF(embeddings, args.update_embedding, args.hidden_dim,
                       len(tag2label), args.clip, params.summary_path, args.optimizer)
    model.build_graph()
    testsaver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        testsaver.restore(sess, file)
        label_list, seq_len_list = dev_one_epoch(model, sess, data)
        evaluate(label_list, data)
def run(word_train, label_train, word_dev, label_dev, vocab, device, kf_index=0):
    # build datasets
    train_dataset = SegDataset(word_train, label_train, vocab, config.label2id)
    dev_dataset = SegDataset(word_dev, label_dev, vocab, config.label2id)
    # build data loaders (a padding collate_fn of this shape is sketched below)
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True, collate_fn=train_dataset.collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size,
                            shuffle=True, collate_fn=dev_dataset.collate_fn)
    # model
    model = BiLSTM_CRF(embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       vocab_size=vocab.vocab_size(),
                       target_size=vocab.label_size(),
                       num_layers=config.lstm_layers,
                       lstm_drop_out=config.lstm_drop_out,
                       nn_drop_out=config.nn_drop_out)
    model.to(device)
    # optimizer and scheduler
    optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=config.betas)
    scheduler = StepLR(optimizer, step_size=config.lr_step, gamma=config.lr_gamma)
    # how to initialize these parameters elegantly
    for p in model.crf.parameters():
        _ = torch.nn.init.uniform_(p, -1, 1)
    # train and test
    # train(train_loader, dev_loader, vocab, model, optimizer, scheduler, device, kf_index)
    with torch.no_grad():
        # test on the final test set
        test_loss, f1 = test(config.test_dir, vocab, device, kf_index)
    return test_loss, f1
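# SegDataset.collate_fn is not shown. A minimal sketch of what such a collate
# function typically does for this kind of loader: pad variable-length id
# sequences within a batch. The pad id 0 and the (sents, labels, lengths)
# return layout are assumptions:
import torch

def collate_fn_sketch(batch):
    # batch: list of (word_ids, label_ids) pairs, each a list of ints
    batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    lengths = torch.tensor([len(words) for words, _ in batch])
    max_len = int(lengths.max())
    sents = torch.zeros(len(batch), max_len, dtype=torch.long)
    labels = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, (words, tags) in enumerate(batch):
        sents[i, :len(words)] = torch.tensor(words, dtype=torch.long)
        labels[i, :len(tags)] = torch.tensor(tags, dtype=torch.long)
    return sents, labels, lengths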
def train(args):
    train_data = args.train_data
    train_eval_split = args.train_eval_split
    min_count = args.min_count
    vocab_file = args.vocab_file
    max_step = args.max_step
    model_path = args.model_path
    data_fold = args.data_fold
    train_sents, train_sent_labels, eval_sents, eval_sent_labels = preprocess(
        train_data, train_eval_split, data_fold)
    vocab = build_vocab(train_sents, min_count, vocab_file)
    bilstm_crf = BiLSTM_CRF(args.batch_size, args.embedding_size,
                            args.hidden_size, args.lr, len(vocab), tag_num=7)
    print("bilstm_crf object created.")
    sess = tf.InteractiveSession()
    saver = tf.train.Saver(max_to_keep=3)
    sess.run(tf.global_variables_initializer())
    batched_train_sents = batch_generate(train_sents, args.batch_size)
    batched_train_labels = batch_generate(train_sent_labels, args.batch_size)
    print("Batch generator created.")
    while True:  # train until interrupted; batch_preprocess is sketched below
        for batch in zip(batched_train_sents, batched_train_labels):
            batch_sents, batch_labels, seq_len = batch_preprocess(batch, vocab)
            loss_value, _, global_step_value = sess.run(
                (bilstm_crf.loss, bilstm_crf.train_step, bilstm_crf.global_step),
                feed_dict={
                    bilstm_crf.input_sents: batch_sents,
                    bilstm_crf.input_labels: batch_labels,
                    bilstm_crf.sequence_lengths: seq_len
                })
            print("%d step finished." % global_step_value)
            if global_step_value % 10 == 0 or global_step_value == 1:
                f_score, right_ner, recog_ner, all_ner, shape = sess.run(
                    (bilstm_crf.f_score, bilstm_crf.right_ner,
                     bilstm_crf.recog_ner, bilstm_crf.all_ner, bilstm_crf.shape),
                    feed_dict={
                        bilstm_crf.input_sents: batch_sents,
                        bilstm_crf.input_labels: batch_labels,
                        bilstm_crf.sequence_lengths: seq_len
                    })
                print("%d step, loss is %s, f_score is %s"
                      % (global_step_value, str(loss_value), str(f_score)))
                print("length is %s, right_ner is %d, recog_ner is %d, all_ner is %d"
                      % (str(shape), right_ner, recog_ner, all_ner))
                saver.save(sess, os.path.join(model_path, "model.ckpt"))
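# `batch_preprocess` is external. A minimal sketch of the usual preparation
# for feed_dict-based BiLSTM-CRF training: map tokens to vocabulary ids and
# pad to the longest sequence in the batch. The pad id 0 and '<UNK>' key are
# assumptions:
def batch_preprocess_sketch(batch, vocab):
    sents, labels = batch
    seq_len = [len(s) for s in sents]
    max_len = max(seq_len)
    batch_sents = [[vocab.get(w, vocab.get('<UNK>', 0)) for w in s]
                   + [0] * (max_len - len(s)) for s in sents]
    batch_labels = [list(l) + [0] * (max_len - len(l)) for l in labels]
    return batch_sents, batch_labels, seq_len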
def Train(trainfile):
    word2id, embeddings = getDicEmbed()
    traindata = getTrainData(trainfile)
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer,
                       lr=args.lr, clip_grad=args.clip, tag2label=tag2label,
                       vocab=word2id, shuffle=args.shuffle, model_path=ckpt_prefix,
                       summary_path=summary_path, log_path=log_path,
                       result_path=result_path, CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()
    dev_data = traindata[:5000]
    dev_size = len(dev_data)
    train_data = traindata[5000:]
    train_size = len(train_data)
    print("train data: {0}\ndev data: {1}".format(train_size, dev_size))
    # train on the split (the original passed the full `traindata` here,
    # which would leak the dev split into training)
    model.train(train_data, dev_data)
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'
    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {
        'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4,
        'i-gend': 5, 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9,
        'b-nati': 10, 'i-nati': 11, 'b-live': 12, 'i-live': 13, 'b-poli': 14,
        'i-poli': 15, 'b-unv': 16, 'i-unv': 17, 'b-comp': 18, 'i-comp': 19,
        'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23, 'b-proj': 24,
        'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
        'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34,
        'i-prti': 35, 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39,
        'c-proj': 40, 'c-woti': 41, 'c-post': 42, 'c-unv': 43, 'c-nati': 44,
        'c-poli': 45, 'c-prti': 46, 'c-comp': 47
    }
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(stop_epoch):  # normally you would NOT do 300 epochs, it is toy data
        print("---------------------")
        print("running epoch:", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()
        if epoch == stop_epoch - 1:  # was `epoch == stop_epoch`, which never triggers
            torch.save({'model_state_dict': model.state_dict()}, model_1_epoch)
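# `get_word_to_ix` is external. A minimal sketch of vocabulary building with a
# minimum-frequency cutoff; the '<unk>' entry at index 0 is an assumption:
from collections import Counter

def get_word_to_ix_sketch(training_data, min_word_freq=1):
    counts = Counter(w for sentence, _ in training_data for w in sentence)
    word_to_ix = {'<unk>': 0}
    for word, freq in counts.items():
        if freq >= min_word_freq:
            word_to_ix[word] = len(word_to_ix)
    return word_to_ix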
def init_model(char_to_ix, tag_to_ix, START_CHAR_ID, STOP_CHAR_ID, START_TAG_ID, STOP_TAG_ID):
    if args.old_model is not None:
        model = torch.load(args.old_model)
    else:
        if args.char_embeddings is not None:
            char_embeddings = utils.read_pretrained_embeddings(args.char_embeddings, char_to_ix)
            EMBEDDING_DIM = char_embeddings.shape[1]
        else:
            char_embeddings = None
            EMBEDDING_DIM = args.char_embeddings_dim
        model = BiLSTM_CRF(len(char_to_ix), len(tag_to_ix), START_CHAR_ID,
                           STOP_CHAR_ID, START_TAG_ID, STOP_TAG_ID,
                           args.use_bigram, args.hidden_dim, args.dropout,
                           EMBEDDING_DIM, char_embeddings)
    return processor.to_cuda_if_available(model)
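# utils.read_pretrained_embeddings is not shown. A minimal sketch under the
# assumption that it reads a word2vec-style text file and returns a matrix
# aligned with char_to_ix, leaving rows for unseen characters randomly
# initialized:
import numpy as np

def read_pretrained_embeddings_sketch(path, char_to_ix):
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) > 2:
                vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    dim = len(next(iter(vectors.values())))
    emb = np.random.uniform(-0.25, 0.25, (len(char_to_ix), dim)).astype(np.float32)
    for ch, ix in char_to_ix.items():
        if ch in vectors:
            emb[ix] = vectors[ch]
    return emb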
def train_model(self):
    '''
    Start training.
    :return:
    '''
    model = BiLSTM_CRF(args, self.embedding, self.tag2id, self.word2id,
                       self.paths, config=config)
    model.build_graph()
    print("train data: {}".format(len(self.train_data)))
    print("dev data: {}".format(len(self.dev_data)))
    model.train(self.train_data, self.dev_data, args)
def test_model(self, model_path):
    '''
    Start testing; a model path must be passed in.
    :param model_path:
    :return:
    '''
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    self.paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, self.embedding, self.tag2id, self.word2id,
                       self.paths, config=config)
    model.build_graph()
    print("test data size: {}".format(len(self.test_data)))
    model.test(self.test_data, args)
def train(train_corpus, test_corpus):
    """
    created by ljx
    Run model training.
    :param train_corpus: training data
    :param test_corpus: test data
    :return:
    """
    model = BiLSTM_CRF(embeddings, args.update_embedding, args.hidden_dim,
                       len(tag2label), args.clip, params.summary_path, args.optimizer)
    model.build_graph()
    saver = tf.train.Saver(tf.global_variables())
    with tf.Session(config=config) as sess:
        # initialize model parameters (tf.global_variables_initializer)
        sess.run(model.init_op)
        model.add_summary(sess)
        for epoch in range(args.epoch):
            run_one_epoch(model, sess, train_corpus, test_corpus, tag2label, epoch, saver)
def predict(sentence, print_entity=False):
    """Model prediction."""
    # model
    bilstm_crf = BiLSTM_CRF(opt.vocab_size, opt.emb_dim, opt.emb_dim // 2,
                            opt.tag_num, dropout=opt.dropout)
    if opt.load_model_path:
        bilstm_crf.load(opt.load_model_path)
    bilstm_crf.eval()
    # data
    x = word2idx(sentence)
    x = t.LongTensor(x).unsqueeze(dim=0)
    tag_idx = bilstm_crf(x).squeeze(dim=0)
    tag_idx = tag_idx.numpy().tolist()
    length = min(opt.max_length, len(sentence))
    # collect entities: tag id 1 starts an entity, tag id 2 continues it
    entity_list = []
    i = 0
    while i < length:
        if tag_idx[i] == 1:
            entity = sentence[i]
            j = i + 1
            for j in range(i + 1, length):
                if tag_idx[j] == 2:
                    entity += sentence[j]
                else:
                    break
            i = j
            entity_list.append(entity)
        else:
            i += 1
    if print_entity:
        print(entity_list)
        print('\n')
    return idx2tag(tag_idx)
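# `word2idx` and `idx2tag` are external helpers. Minimal sketches under the
# assumption of module-level `word_to_id` / `id_to_tag` mappings (hypothetical
# names) and pad/unknown id 0:
def word2idx_sketch(sentence, word_to_id, max_length):
    ids = [word_to_id.get(w, 0) for w in sentence[:max_length]]
    return ids + [0] * (max_length - len(ids))

def idx2tag_sketch(tag_idx, id_to_tag):
    return [id_to_tag[i] for i in tag_idx]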
def Test(testfile):
    word2id, embeddings = getDicEmbed()
    testdata = getTrainData(testfile)
    ckpt_file = tf.train.latest_checkpoint(model_path)
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer,
                       lr=args.lr, clip_grad=args.clip, tag2label=tag2label,
                       vocab=word2id, shuffle=args.shuffle, model_path=ckpt_file,
                       summary_path=summary_path, log_path=log_path,
                       result_path=result_path, CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()
    model.test(testdata)
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

## training model
if args.mode == 'train':
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    ## hyperparameters-tuning, split train/dev
    # dev_data = train_data[:5000]; dev_size = len(dev_data)
    # train_data = train_data[5000:]; train_size = len(train_data)
    # print("train data: {0}\ndev data: {1}".format(train_size, dev_size))
    # model.train(train=train_data, dev=dev_data)

    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    # use test_data as the dev_data to watch for overfitting
    model.train(train=train_data, dev=test_data)

## testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
import torch
import torch.optim as optim
from dataset import Dataset
from model import BiLSTM_CRF

# torch.set_default_tensor_type('torch.cuda.FloatTensor')
epochs = 100
dataset = Dataset()
train_loader = dataset.get_train_loader(1)
model = BiLSTM_CRF(dataset.get_vocab_size(), dataset.get_label_index_dict(), 128, 128)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
model.train()
for epoch in range(epochs):
    for step, batch in enumerate(train_loader):  # `step` was `iter`, which shadows the builtin
        sentence_in, targets = batch.line, batch.label
        sentence_in = sentence_in.permute([1, 0]).reshape(-1).contiguous()
        targets = targets.permute([1, 0]).reshape(-1).contiguous()
        model.zero_grad()
        loss = model.neg_log_likelihood(sentence_in.squeeze(-1),
                                        targets.squeeze(-1)) / len(sentence_in)
        loss.backward()
        optimizer.step()
        print("{}-{}: {:.5f}".format(epoch, step, loss.item()))
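# Several snippets here call `model.neg_log_likelihood`. A minimal sketch of
# what a BiLSTM-CRF loss of that name conventionally computes (following the
# classic PyTorch tutorial formulation): the log partition function via the
# forward algorithm, minus the score of the gold tag path. The emission
# features, transition matrix, and START/STOP tag ids are assumed inputs, not
# this repository's actual internals.
import torch

def log_sum_exp(vec):
    # numerically stable log(sum(exp(vec))) over a (1, tagset_size) row vector
    max_score = vec.max()
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score)))

def forward_alg(feats, transitions, start_id, stop_id):
    # feats: (seq_len, tagset_size) emission scores from the BiLSTM;
    # transitions[i][j]: score of transitioning to tag i from tag j
    tagset_size = transitions.size(0)
    forward_var = torch.full((1, tagset_size), -10000.0)
    forward_var[0][start_id] = 0.0
    for feat in feats:
        alphas = []
        for next_tag in range(tagset_size):
            emit = feat[next_tag].view(1, -1).expand(1, tagset_size)
            trans = transitions[next_tag].view(1, -1)
            alphas.append(log_sum_exp(forward_var + trans + emit).view(1))
        forward_var = torch.cat(alphas).view(1, -1)
    return log_sum_exp(forward_var + transitions[stop_id].view(1, -1))

def score_sentence(feats, tags, transitions, start_id, stop_id):
    # score of the provided gold tag sequence
    score = torch.zeros(1)
    tags = torch.cat([torch.tensor([start_id], dtype=torch.long), tags])
    for i, feat in enumerate(feats):
        score = score + transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
    return score + transitions[stop_id, tags[-1]]

def neg_log_likelihood_sketch(feats, tags, transitions, start_id, stop_id):
    return (forward_alg(feats, transitions, start_id, stop_id)
            - score_sentence(feats, tags, transitions, start_id, stop_id))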
# mappings = {
#     'word_to_id': word_to_id,
#     'tag_to_id': tag_to_id,
#     'char_to_id': char_to_id,
#     'parameters': parameters,
#     'word_embeds': word_embeds
# }
# cPickle.dump(mappings, f)
print('word_to_id: ', len(word_to_id))
model = BiLSTM_CRF(vocab_size=len(word_to_id),
                   tag_to_ix=tag_to_id,
                   embedding_dim=parameters['word_dim'],
                   hidden_dim=parameters['word_lstm_dim'],
                   use_gpu=use_gpu,
                   char_to_ix=char_to_id,
                   pre_word_embeds=word_embeds,
                   use_crf=parameters['crf'],
                   char_mode=parameters['char_mode'],
                   char_embedding_dim=parameters['char_dim'],
                   char_lstm_dim=parameters['char_lstm_dim'],
                   alpha=parameters['alpha'])
                   # n_cap=4,
                   # cap_embedding_dim=10)
if use_gpu:
    model.cuda()
learning_rate = 0.015
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
losses = []
best_dev_F = -1.0
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

## Pretrain language model
if args.mode == 'pre_train':
    args.CRF = False
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.num_tags = len(word2id)
    model.build_graph()
    ## hyperparameters-tuning, split train/dev
    # dev_data = train_data[:5000]; dev_size = len(dev_data)
    # train_data = train_data[5000:]; train_size = len(train_data)
    # print("train data: {0}\ndev data: {1}".format(train_size, dev_size))
    # model.train(train=train_data, dev=dev_data)

    ## train model on the whole training data
    print("train data: {}".format(len(pre_train_data)))
    # use test_data as the dev_data to watch for overfitting
    model.train(train=pre_train_data, dev=test_data)
    train_data = read_train_corpus(file_path=train_path, maxlen=args.max_len)
    print(args.data_augment)
    if args.data_augment:
        train_data = data_augmentation(train_data, maxlen=args.max_len)
    print("loading valid data...")
    valid_path = os.path.join('./data', args.valid_data)
    valid_data = read_train_corpus(file_path=valid_path, maxlen=args.max_len)
    print("building model...")
    result_path = os.path.join('./result', args.result_path)
    valid_result_path = os.path.join('./result', args.valid_result)
    model_path = os.path.join(args.model_path, 'model.ckpt')
    model = BiLSTM_CRF(args, embeddings, model_path=model_path,
                       result_path=result_path, valid_result=valid_result_path,
                       config=config)
    model.build_graph()
    ## train model on the whole training data
    print("train data: {}".format(len(train_data)))
    print("start training...")
    # use the validation data as dev to watch for overfitting
    model.train(train=train_data, dev=valid_data)
## testing model
elif args.mode == 'test':
    print("loading testing data...")
    test_path = os.path.join('./data', args.test_data)
    test_data = read_test_corpus(file_path=test_path, maxlen=args.max_len)
def sort_batch_data(sentences, lengths):
    # sort by length (descending) as pack_padded_sequence expects, and keep
    # the inverse permutation so the original order can be restored afterwards
    lengths_sort, idx_sort = lengths.sort(0, descending=True)
    sentences_sort = sentences[idx_sort]
    _, idx_unsort = idx_sort.sort(0, descending=False)
    return sentences_sort, lengths_sort, idx_unsort

char2idx = pickle.load(open('char2idx.pkl', 'rb'))
data = pickle.load(open('predict_data.pkl', 'rb'))
predict_data = PredData(data, char2idx)
dataloader = DataLoader(predict_data, batch_size=32, drop_last=False)
model = BiLSTM_CRF(len(char2idx), len(Config.tagert2idx),
                   Config.embedding_dim, Config.hidden_dim)
model.load_state_dict(torch.load('model_best.pth'))
if Config.use_gpu:
    model.to('cuda')
model.eval()
predict_result = []
with torch.no_grad():
    for batch_sentences, batch_lengths in dataloader:
        sentences, lengths, idx_unsort = sort_batch_data(batch_sentences, batch_lengths)
        if Config.use_gpu:
            sentences = sentences.cuda()
        pred = model(sentences, lengths)
        pred = pred[idx_unsort]
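# A tiny usage check for sort_batch_data above (the tensor values are
# hypothetical): sorting by descending length satisfies pack_padded_sequence,
# and indexing with idx_unsort restores the original batch order afterwards.
import torch
sents = torch.tensor([[1, 2, 0], [3, 4, 5]])
lens = torch.tensor([2, 3])
sents_sorted, lens_sorted, idx_unsort = sort_batch_data(sents, lens)
assert lens_sorted.tolist() == [3, 2]
assert torch.equal(sents_sorted[idx_unsort], sents)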
if not os.path.exists(model_path):
    os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
result_path = os.path.join(output_path, "results")
if not os.path.exists(result_path):
    os.makedirs(result_path)
log_path = os.path.join(result_path, "log.txt")
get_logger(log_path).info(str(args))

# training model
if args.mode == 'train':
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer,
                       lr=args.lr, clip_grad=args.clip, tag2label=tag2label,
                       vocab=word2id, shuffle=args.shuffle, model_path=ckpt_prefix,
                       summary_path=summary_path, log_path=log_path,
                       result_path=result_path, CRF=args.CRF,
                       update_embedding=args.update_embedding)
    model.build_graph()
    # hyperparameters-tuning, split train/dev
    # train model on the whole training raw_data
    print("train raw_data: {}".format(len(train_data)))
    # we could use test_data as the dev_data to see the overfitting phenomena
    model.train(train_data, test_data)

# testing model
elif args.mode == 'test':
    ckpt_file = tf.train.latest_checkpoint(model_path)
    print(ckpt_file)
    model = BiLSTM_CRF(batch_size=args.batch_size, epoch_num=args.epoch,
                       hidden_dim=args.hidden_dim, embeddings=embeddings,
                       dropout_keep=args.dropout, optimizer=args.optimizer,
                       lr=args.lr, clip_grad=args.clip, tag2label=tag2label,
                       vocab=word2id, shuffle=args.shuffle,