import json
import time

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Project-level helpers (get_data_from_data_txt, get_score_by_model, BiLSTM_CRF,
# TRAIN_WORD_TO_TAG_PATH, ...) are assumed to be importable from the surrounding package.


def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0  # counter for patience-based early stopping (reset below, never incremented)
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'

    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1

    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)

    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch:", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('save best model successfully.')
        else:
            break  # stop as soon as the validation score fails to improve
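# `prepare_sequence` is called in both training loops but not defined in this
# snippet. A minimal sketch, matching the helper from the official PyTorch
# BiLSTM-CRF tutorial that this code appears to follow (an assumption about
# the original implementation):
def prepare_sequence(seq, to_ix):
    # map each word to its vocabulary index and wrap the result as a LongTensor
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)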
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'

    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti': 46, 'c-comp': 47}

    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(stop_epoch):
        print("---------------------")
        print("running epoch:", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()

        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()

        # range(stop_epoch) ends at stop_epoch - 1; the original compared against
        # stop_epoch, so the checkpoint was never written
        if epoch == stop_epoch - 1:
            torch.save({
                'model_state_dict': model.state_dict()
            }, model_1_epoch)
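# `get_word_to_ix` builds the vocabulary that train_all_data() trains over. A
# minimal sketch under assumptions: words occurring fewer than `min_word_freq`
# times are dropped, and an '<unk>' entry catches out-of-vocabulary words
# (the body is hypothetical; only the call signature appears in the code above):
from collections import Counter

def get_word_to_ix(training_data, min_word_freq=1):
    # count word frequencies over all (sentence, tags) pairs
    counts = Counter(w for sentence, _tags in training_data for w in sentence)
    word_to_ix = {'<unk>': 0}
    for word, freq in counts.items():
        if freq >= min_word_freq:
            word_to_ix[word] = len(word_to_ix)
    return word_to_ix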
import torch.optim as optim

from dataset import Dataset
from model import BiLSTM_CRF

# torch.set_default_tensor_type('torch.cuda.FloatTensor')

epochs = 100
dataset = Dataset()
train_loader = dataset.get_train_loader(1)  # batch size 1
model = BiLSTM_CRF(dataset.get_vocab_size(), dataset.get_label_index_dict(), 128, 128)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

model.train()
for epoch in range(epochs):
    for batch_idx, batch in enumerate(train_loader):
        sentence_in, targets = batch.line, batch.label
        # torchtext-style batches are (seq_len, batch); flatten to one 1-D sequence
        sentence_in = sentence_in.permute([1, 0]).reshape(-1).contiguous()
        targets = targets.permute([1, 0]).reshape(-1).contiguous()
        model.zero_grad()
        # normalize by sequence length so the loss is comparable across sentences
        loss = model.neg_log_likelihood(sentence_in, targets) / len(sentence_in)
        loss.backward()
        optimizer.step()
        print("{}-{}: {:.5f}".format(epoch, batch_idx, loss.item()))
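# After training, a tutorial-style BiLSTM_CRF typically decodes through its
# forward pass. A hedged usage sketch, assuming forward() runs Viterbi decoding
# and returns (score, best_tag_sequence) as in the PyTorch tutorial this model
# mirrors (an assumption; check model.py for the actual interface):
import torch

model.eval()
with torch.no_grad():
    sample = next(iter(train_loader))
    tokens = sample.line.permute([1, 0]).reshape(-1).contiguous()
    score, predicted_tags = model(tokens)
    print(score, predicted_tags)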
import codecs
import itertools

import numpy as np
import torch


def train(conf):
    train_sentences = load_sentences(conf.train_file, conf.zeros)
    dev_sentences = load_sentences(conf.dev_file, conf.zeros)
    test_sentences = load_sentences(conf.test_file, conf.zeros)

    dico_chars_train = char_mapping(train_sentences, conf.lower)[0]
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        dico_chars_train.copy(), conf.emb_file,
        list(itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences])))
    _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    # prepare data: turn each sentence into lists of character and tag indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, conf.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, conf.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, conf.lower)

    # load pretrained word embeddings
    all_word_embeds = {}
    for i, line in enumerate(codecs.open(conf.emb_file, 'r', 'utf-8')):
        s = line.strip().split()
        if len(s) == conf.embedding_dim + 1:
            all_word_embeds[s[0]] = np.array([float(x) for x in s[1:]])

    # random init, overwritten with a pretrained vector where one exists
    word_embeds_dict = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06),
                                         (len(char_to_id), conf.embedding_dim))
    for w in char_to_id:
        if w in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w]
        elif w.lower() in all_word_embeds:
            word_embeds_dict[char_to_id[w]] = all_word_embeds[w.lower()]
    print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

    train_manager = BatchManager(train_data, conf.batch_size)
    model = BiLSTM_CRF(conf, tag_to_id, char_to_id, word_embeds_dict)
    optimizer = torch.optim.SGD(model.parameters(), lr=conf.learning_rate, weight_decay=1e-4)

    best_dev_f1 = 0.0
    for epoch in range(1, conf.epochs + 1):
        print(f'train on epoch {epoch}')
        j = 1
        for batch in train_manager.iter_batch(shuffle=True):
            batch_loss = 0.0
            sentences = batch[1]
            tags = batch[-1]
            for index in np.random.permutation(len(sentences)):
                model.zero_grad()
                sentence_in = sentences[index]
                tags_in = tags[index]
                loss = model.neg_log_likelihood(sentence_in, tags_in)
                loss.backward()
                optimizer.step()
                batch_loss += loss.data
            print(f'[batch {j}, batch size: {conf.batch_size}] loss on this batch: {batch_loss}')
            j = j + 1

        print(f'Begin validating on [epoch {epoch}] dev dataset ...')
        dev_results = get_predictions(model, dev_data, id_to_tag)
        dev_f1 = evaluate_ner(dev_results, conf)
        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1  # the original never updated this, so every epoch overwrote the checkpoint
            torch.save(model, conf.model_file)
            print('saved model successfully.')

        test_results = get_predictions(model, test_data, id_to_tag)
        test_f1 = evaluate_ner(test_results, conf)
        print(f'[epoch {epoch}] f1 on test dataset: {test_f1:.3f}')
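# train() reads all of its settings from a single `conf` object. A minimal
# sketch of the fields the function actually touches, using argparse.Namespace
# as a stand-in; every path and value below is a placeholder, not from the
# original project:
from argparse import Namespace

conf = Namespace(
    train_file='data/train.txt',      # CoNLL-style training file
    dev_file='data/dev.txt',
    test_file='data/test.txt',
    emb_file='data/embeddings.txt',   # pretrained embeddings, one vector per line
    embedding_dim=100,
    zeros=False,                      # replace digits with zeros when loading sentences
    lower=False,                      # lowercase when building the character mapping
    batch_size=32,
    learning_rate=0.01,
    epochs=10,
    model_file='model/best_model.pth',
)
train(conf)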