def evaluate():
    wordss, tagss, lengths = next(test_helper.gen_batch())
    sentence_in = prepare_sequence(wordss, word_to_ix)
    target_tag_seqs = prepare_sequence(tagss, tag_to_ix)
    predict_scores, predict_tag_seqs = model(sentence_in, lengths)
    for tag in ['a', 'b', 'c']:
        f1_score(target_tag_seqs, predict_tag_seqs, tag, tag_to_ix, lengths)
def train():
    # Restore model parameters
    model_file = model_path + 'params.pkl'
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file))
        print("restore model from {}".format(model_file))

    # Select optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(epoch_num):
        index = 0
        for batch_samples in train_helper.gen_batch():
            starttime = time.time()

            # Step 0. Generate batch samples.
            wordss, tagss, lengths = batch_samples

            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance.
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = prepare_sequence(wordss, word_to_ix)
            if use_cuda:
                sentence_in = sentence_in.cuda()
            label_out = prepare_sequence(tagss, tag_to_ix)
            if use_cuda:
                label_out = label_out.cuda()

            # Step 3. Run the forward pass and compute the loss.
            loss = model.neg_log_likelihood(sentence_in, label_out, lengths)

            # Step 4. Compute the gradients and update the parameters
            # by calling optimizer.step().
            loss.backward()
            optimizer.step()

            # Print progress info.
            print("epoch:{}, batch:{}, loss:{}, timecost:{}".format(
                epoch, index, loss.cpu().tolist()[0], (time.time() - starttime)))

            # Step 5. Evaluate and periodically save the model.
            evaluate()
            if index and index % 10 == 0:
                torch.save(model.state_dict(), model_file)
            index += 1

    torch.save(model.state_dict(), model_file)
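# train() and evaluate() above (and predict() further down) call prepare_sequence on a whole
# batch of sentences and pass the result to the model together with `lengths`. That batched
# helper is not shown in this file, so the sketch below is only an assumption about its shape
# (pad every sentence to the longest one in the batch, with index 0 reserved for padding),
# not the project's actual implementation.
import torch


def prepare_sequence_batch(wordss, word_to_ix, pad_ix=0):
    """Hypothetical batched variant: list of token lists -> LongTensor [batch, max_len]."""
    max_len = max(len(words) for words in wordss)
    batch = []
    for words in wordss:
        idxs = [word_to_ix[w] for w in words]
        idxs += [pad_ix] * (max_len - len(idxs))  # right-pad to the batch maximum
        batch.append(idxs)
    return torch.tensor(batch, dtype=torch.long)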
def eval(tag_path, corpus_path):
    correct = 0
    total = 0
    acc_list = []
    model_name = MODEL_NAME
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    word_to_ix = WORD_TO_IX
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoint = torch.load(model_name)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    tag_to_ix = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
    sentences, tags = load_train_data(tag_path, corpus_path)
    labels = torch.tensor([[tag_to_ix[tag]] for tag in tags])
    with torch.no_grad():
        for i, sen in enumerate(tqdm(sentences)):
            input = prepare_sequence(sen, word_to_ix)
            output = model(input)
            _, predicted = torch.max(output.data, 1)
            label = labels[i]
            total += label.size(0)
            correct += (predicted == label).sum().item()
            acc = round(100 * correct / total, 2)
            acc_list.append(acc)
    assert len(acc_list) == len(sentences)
    final_acc = acc
    plt.plot(list(range(len(tags))), acc_list)
    plt.xlabel('pred_num')
    plt.ylabel('accuracy / %')
    plt.show()
    return final_acc
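# eval() above and the single-sentence helpers below all rely on prepare_sequence turning one
# tokenized sentence into a LongTensor of vocabulary indices. A minimal sketch in the spirit of
# the PyTorch sequence-model tutorial; the '<unk>' fallback is an assumption (suggested by the
# *_add_unk_* vocabulary files used later), not a confirmed detail of this codebase.
import torch


def prepare_sequence(seq, to_ix):
    """Map a list of tokens to a 1-D LongTensor of indices, falling back to '<unk>' / 0."""
    idxs = [to_ix.get(w, to_ix.get('<unk>', 0)) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)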
def train_and_val():
    embedding_dim = 100
    hidden_dim = 100
    model_load_path = None
    best_model_save_path = 'model/model_100_best_0223.pth'
    max_score = 0
    stop_epoch = 30
    unimprove_time = 0
    val_json_path = '/home/agwave/Data/resume/val_0222.json'
    val_pdf_dir = '/home/agwave/Data/resume/val_0222/'

    training_data = get_data_from_data_txt(TRAIN_WORD_TO_TAG_PATH)
    with open('supporting_document/train_word_to_tag_0223.json', 'r') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    start_epoch = 0
    if model_load_path is not None:
        print('load model...')
        checkpoint = torch.load(model_load_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1

    preliminary_score = get_score_by_model(model, val_json_path, val_pdf_dir)
    print('preliminary score:', preliminary_score)
    for epoch in range(start_epoch, stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, val_json_path, val_pdf_dir)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        if cur_epoch_score > max_score:
            unimprove_time = 0
            max_score = cur_epoch_score
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch
            }, best_model_save_path)
            print('save best model successfully.')
        else:
            break
def predict():
    word_to_ix = load_dict(word_dict_file)
    tag_to_ix = load_dict(tag_dict_file)
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    model_file = model_path + 'params.pkl'
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file))
    for wordss, tagss, lengths in pred_helper.gen_batch():
        sentence_in = prepare_sequence(wordss, word_to_ix)
        predict_scores, predict_ix_seqs = model(sentence_in, lengths)
        for word, ix in zip(wordss[0], predict_ix_seqs[0]):
            print(word, ix_to_tag[ix])
        print()
def predict(sentence):
    sentence = sentence.split()
    model_name = BEST_NAME
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    word_to_ix = WORD_TO_IX
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoint = torch.load(model_name)
    model.load_state_dict(checkpoint['model_state_dict'])
    input = prepare_sequence(sentence, word_to_ix)
    with torch.no_grad():
        output = model(input)
        print(output)
        _, predicted = torch.max(output.data, 1)
        print(predicted)
def get_time_to_score(tsv_path, thing, model_path):
    time_to_count = {}
    time_to_scoresum = {}
    if thing == 'hair_dryer':
        id = '732252283'
    elif thing == 'microwave':
        id = '423421857'
    else:
        id = '246038397'
    with open('train_' + thing + '_word_to_ix.json', 'r') as j:
        word_to_ix = json.load(j)
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoints = torch.load(model_path)
    model.load_state_dict(checkpoints['model_state_dict'])
    model.eval()
    with open(tsv_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for i, r in enumerate(reader):
            if i == 0 or r[4] != id:
                continue
            month, _, year = r[14].split('/')
            if year not in {'2014', '2015'}:
                continue
            time = get_idx_by_year_month(int(year), int(month))
            if time < 8:
                continue
            sen = (r[12] + ' ' + r[13]).lower()
            sen = re.sub(r'[^A-Za-z0-9,.!]+', ' ', sen)
            input = prepare_sequence(sen.split(), word_to_ix)
            with torch.no_grad():
                output = model(input)
            _, predicted = torch.max(output.data, 1)
            pred_score = predicted.item()
            if time not in time_to_count:
                time_to_count[time] = 0
                time_to_scoresum[time] = 0.
            time_to_count[time] += 1
            time_to_scoresum[time] += pred_score
    time_to_scoremean = {}
    for time in time_to_count.keys():
        time_to_scoremean[time] = time_to_scoresum[time] / time_to_count[time]
    print(time_to_count)
    return time_to_scoremean
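# Hypothetical usage sketch for get_time_to_score(): the tsv file and checkpoint path below are
# placeholders, not files that ship with this code. It plots the mean predicted review score per
# month index for one product.
import matplotlib.pyplot as plt

time_to_scoremean = get_time_to_score('data/hair_dryer.tsv', 'hair_dryer', 'model/bilstm_best.pth')
months = sorted(time_to_scoremean)
plt.plot(months, [time_to_scoremean[m] for m in months])
plt.xlabel('month index')
plt.ylabel('mean predicted score')
plt.show()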
def trainning():
    """
    Train the model.
    :return:
    """
    train_obj = Train()
    training_data, word_to_ix, tag_to_ix = util.data_prepare(
        config.TRAIN_DATA_PATH)
    training_data.char = training_data.char.apply(
        lambda c: util.prepare_sequence(c, word_to_ix))
    training_data.tag = training_data.tag.apply(
        lambda t: torch.tensor([tag_to_ix[t_] for t_ in t], dtype=torch.long))
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, config.EMBEDDING_DIM,
                       config.HIDDEN_DIM)
    # Use the Adam optimizer
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    best_val_loss = float("inf")
    best_model = None
    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(config.EPOCHES):
        rmrb_loader = training_data.iloc[:-10000].sample(config.BATCH_SIZE)
        rmrb_loader_test = training_data.iloc[-10000:].sample(1000)
        epoch_start_time = time.time()
        train_obj.train(model, epoch, optimizer, rmrb_loader)
        val_loss = train_obj.evaluate(model, rmrb_loader_test)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '.format(
            epoch, (time.time() - epoch_start_time), val_loss))
        print('-' * 89)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model
        if epoch % 20 == 0:
            torch.save(best_model.state_dict(), config.MODEL_PATH)
def train_all_data():
    embedding_dim = 100
    hidden_dim = 100
    stop_epoch = 1
    model_1_epoch = 'model/model_1_epoch_lr0001.pth'

    training_data = get_data_from_data_txt(DATA_PERFECT_PATH)
    word_to_ix = get_word_to_ix(training_data, min_word_freq=1)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti': 46, 'c-comp': 47}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(stop_epoch):
        print("---------------------")
        print("running epoch : ", epoch + 1)
        start_time = time.time()
        for sentence, tags in tqdm(training_data):
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        cur_epoch_score = get_score_by_model(model, TRAIN_JSON_PATH, TRAIN_PDF_DIR)
        print('score', cur_epoch_score)
        print('running time:', time.time() - start_time)
        print()
        if epoch == stop_epoch - 1:
            # Save the checkpoint after the last epoch.
            torch.save({
                'model_state_dict': model.state_dict()
            }, model_1_epoch)
def get_score_by_model(model, train_json_path, pdf_root_dir):
    pdf_path = os.listdir(pdf_root_dir)
    random.shuffle(pdf_path)
    path = pdf_path[:len(pdf_path)]
    with open('supporting_document/word_to_ix_add_unk_0219.json') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti': 46, 'c-comp': 47}
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    pred_pdf_info = {}
    print('predicting...')
    for p in tqdm(path):
        if p.endswith('.pdf'):
            file_name = p[:-4]
            try:
                content = get_str_from_pdf(os.path.join(pdf_root_dir, p))
                char_list = list(content)
                with torch.no_grad():
                    precheck_sent = prepare_sequence(char_list, word_to_ix)
                    _, ix = model(precheck_sent)
                info = write_info_by_ix_plus(ix, content, ix_to_tag)
                pred_pdf_info[file_name] = info
            except Exception as e:
                if file_name not in pred_pdf_info:
                    pred_pdf_info[file_name] = {}
                print(e)
    print('predict OK!')
    with open(train_json_path, 'r') as j:
        label_pdf_info = json.load(j)
    score = get_score_by_label_pred(label_pdf_info, pred_pdf_info)
    return score
def evaluate(model, sentence, word_to_idx):
    """Return the predicted label index as an int."""
    inputs = prepare_sequence(sentence, word_to_idx)
    model.hidden = model.init_hidden()
    labels = model(inputs)
    return labels.data.max(1)[1][0]
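# Hypothetical call sketch for evaluate(); `model` and `word_to_idx` are assumed to come from
# the surrounding training script, and the example sentence is made up.
pred = evaluate(model, "the results are promising".split(), word_to_idx)
print(pred)  # predicted label index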
def eval_one_sample():
    sample = list(get_str_from_pdf(SAMPLE_PDF_FILE))
    with open('supporting_document/word_to_ix_add_unk_0219.json') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38, 'c-live': 39, 'c-proj': 40, 'c-woti': 41,
                 'c-post': 42, 'c-unv': 43, 'c-nati': 44, 'c-poli': 45, 'c-prti': 46, 'c-comp': 47}
    ix_to_word = {}
    for k, v in tag_to_ix.items():
        ix_to_word[v] = k
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    checkpoint = torch.load('model_100_all_data_0226.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    with torch.no_grad():
        precheck_sent = prepare_sequence(sample, word_to_ix)
        score, ix = model(precheck_sent)
    print(score)
    predict = []
    for i in ix:
        predict.append(ix_to_word[i])
    for i in range(len(ix)):
        print(sample[i], predict[i])
def get_score_from_model_path(model_path, tag_file, pdf_root_dir, pred_json_dir=None):
    path = os.listdir(pdf_root_dir)
    with open('supporting_document/train_word_to_tag_0223.json') as j:
        word_to_ix = json.load(j)
    tag_to_ix = {'b-name': 0, 'i-name': 1, 'b-bir': 2, 'i-bir': 3, 'b-gend': 4, 'i-gend': 5,
                 'b-tel': 6, 'i-tel': 7, 'b-acad': 8, 'i-acad': 9, 'b-nati': 10, 'i-nati': 11,
                 'b-live': 12, 'i-live': 13, 'b-poli': 14, 'i-poli': 15, 'b-unv': 16, 'i-unv': 17,
                 'b-comp': 18, 'i-comp': 19, 'b-work': 20, 'i-work': 21, 'b-post': 22, 'i-post': 23,
                 'b-proj': 24, 'i-proj': 25, 'b-resp': 26, 'i-resp': 27, 'b-degr': 28, 'i-degr': 29,
                 'b-grti': 30, 'i-grti': 31, 'b-woti': 32, 'i-woti': 33, 'b-prti': 34, 'i-prti': 35,
                 'o': 36, '<start>': 37, '<stop>': 38}
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    pred_pdf_info = {}
    print('predicting...')
    for p in tqdm(path):
        if p.endswith('.pdf'):
            file_name = p[:-4]
            try:
                content = get_str_from_pdf(os.path.join(pdf_root_dir, p))
                char_list = list(content)
                with torch.no_grad():
                    precheck_sent = prepare_sequence(char_list, word_to_ix)
                    _, ix = model(precheck_sent)
                info = write_info_by_ix(ix, content, ix_to_tag)
                pred_pdf_info[file_name] = info
            except Exception as e:
                if file_name not in pred_pdf_info:
                    pred_pdf_info[file_name] = {}
                print(e)
    print('predict OK!')
    if pred_json_dir is not None:
        # Name the dump after the checkpoint file, minus its extension.
        pred_json_path = os.path.join(pred_json_dir,
                                      os.path.basename(model_path)[:-4] + '.json')
        with open(pred_json_path, 'w') as j:
            json.dump(pred_pdf_info, j, ensure_ascii=False)
    with open(tag_file, 'r') as j:
        label_pdf_info = json.load(j)
    score = get_score_by_label_pred(label_pdf_info, pred_pdf_info)
    return score
def train():
    logging.basicConfig(level=logging.INFO, filename='log.txt', format='%(message)s')
    tag_path = TRAIN_TAG_PATH
    corpus_path = TRAIN_CORPUS_PATH
    save_model_name = MODEL_NAME
    best_model_name = BEST_NAME
    load_model_path = None
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    train_epoch = TRAIN_EPOCH
    word_to_ix = WORD_TO_IX
    start_epoch = 0
    best_score = 0.
    loss_info, train_avg_info, test_avg_info = [], [], []

    sentences, tags = load_train_data(tag_path, corpus_path)
    tag_to_ix = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
    label = torch.tensor([[tag_to_ix[tag]] for tag in tags])
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim, dropout=0.3)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    if load_model_path is not None:
        checkpoints = torch.load(load_model_path)
        model.load_state_dict(checkpoints['model_state_dict'])
        optimizer.load_state_dict(checkpoints['optim_state_dict'])
        start_epoch = checkpoints['epoch']

    start_time = time.time()
    logging.info('----------------------')
    for epoch in range(start_epoch, train_epoch):
        running_loss = 0.0
        for i, sen in enumerate(tqdm(sentences)):
            optimizer.zero_grad()
            input = prepare_sequence(sen, word_to_ix)
            output = model(input)
            loss = criterion(output, label[i])
            running_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        torch.save({
            'model_state_dict': model.state_dict(),
            'optim_state_dict': optimizer.state_dict(),
            'epoch': epoch + 1
        }, save_model_name)
        train_avg = eval(TRAIN_TAG_PATH, TRAIN_CORPUS_PATH)
        test_avg = eval(TEST_TAG_PATH, TEST_CORPUS_PATH)
        loss_info.append(running_loss)
        train_avg_info.append(train_avg)
        test_avg_info.append(test_avg)
        logging.info('********')
        logging.info('epoch: {}'.format(epoch + 1))
        logging.info('loss: {}'.format(running_loss))
        logging.info('train avg: {}'.format(train_avg))
        logging.info('test avg: {}'.format(test_avg))
        if test_avg > best_score:
            torch.save({
                'model_state_dict': model.state_dict(),
            }, best_model_name)
            best_score = test_avg
            print('save best')
    print('training time:', time.time() - start_time)
# # Note that element i,j of the output is the score for tag j for word i.
# inputs = prepare_sequence(citing_sentences[0], word_to_idx)
# print(inputs)
# labels = model(inputs)
# print(labels)
# label_to_text(labels, polarity_to_idx)

since = time.time()
training_data = list(zip(citing_sentences, polarities))
for epoch in range(EPOCHS):
    total_loss = torch.Tensor([0])
    i = 0
    random.shuffle(training_data)
    for sentence, target in training_data:
        # Step 1. Prepare the inputs to be passed to the model (i.e., turn the words
        # into integer indices and wrap them in variables).
        sentence_in = prepare_sequence(sentence, word_to_idx)
        target = autograd.Variable(torch.LongTensor([target]))

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old instance.
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 3. Run the forward pass.
        labels = model(sentence_in)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
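        # --- Assumed completion (the excerpt above stops at the Step 4 comment). A minimal
        # way to finish the iteration, assuming a `loss_function` criterion (e.g. nn.NLLLoss())
        # and an `optimizer` were created alongside the model; neither name appears in the
        # excerpt above.
        loss = loss_function(labels, target)
        total_loss += loss.data

        # Step 5. Do the backward pass and update the parameters.
        loss.backward()
        optimizer.step()
        i += 1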
for batch in range(num_batches):
    # if (batch + 1) % 50 == 0 or (batch + 1) == num_batches:
    #     print("Training on batch: {}/{}".format(batch + 1, num_batches))
    batch_begin = batch * size_batch
    batch_end = (batch + 1) * size_batch * seq_len
    if batch_end > len(labels):
        batch_end = len(labels)
    x_data = []
    y_data = []
    for i in range(batch_begin, batch_end):
        # x_data.append(util.convert_to_one_hot(words[i], words2int, vocab_size))  # one-hot variant
        x_data.append(util.get_index(words[i], words2int))  # one index per word
        y_data.append(labels[i])
    x_tensor = util.prepare_sequence(x_data, seq_len)
    y_tensor = util.prepare_sequence(y_data, seq_len)
    # x_tensor = Variable(torch.FloatTensor(seq))
    # y_tensor = Variable(torch.FloatTensor(y_data))

    optimizer.zero_grad()
    y_prob, (hn, cn) = model(x_tensor, (h0, c0))
    # util.tensor_desc(y_prob)
    # print(y_tensor.shape)
    y_prob = torch.squeeze(y_prob)
    y_prob = y_prob.view(-1, seq_len)
    loss = criterion(y_prob, y_tensor)
    loss = torch.mul(class_weights, loss)
    loss = torch.mean(loss)
    loss.backward()