def load_data(opt):
    if args.raw_data:
        test = load_origin_data(collection_test, args, single_answer=False)
    else:
        if args.tokenizer == 'ltp':
            tokenizer = LtpTokenizer('ltp_data')
        if args.tokenizer == 'jieba':
            tokenizer = JiebaTokenizer()
        if args.tokenizer == 'jieba_origin':
            tokenizer = JiebaOriginTokenizer()
        test = load_data_tokenize(collection_test, tokenizer, args, single_answer=False)
    log.info("[test data length:{}]".format(len(test)))
    with open('data/meta.msgpack', 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    opt['pos_size'] = len(meta['vocab_tag'])
    opt['ner_size'] = len(meta['vocab_ent'])
    vocab = meta['vocab']
    vocab_tag = meta['vocab_tag']
    vocab_ent = meta['vocab_ent']
    embedding = meta['embedding']
    embedding = torch.Tensor(embedding)
    question_tokens = list(test.query_words)
    context_tokens = list(test.words)
    context_tags = list(test.postags)
    context_ents = list(test.netags)
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    embedding[1] = torch.normal(means=torch.zeros(opt['embedding_dim']), std=1.)
    question_ids = token2id(question_tokens, vocab, unk_id=1)
    context_ids = token2id(context_tokens, vocab, unk_id=1)
    context_features = get_context_features(question_tokens, context_tokens)
    context_tag_ids = token2id(context_tags, vocab_tag)
    context_ent_ids = token2id(context_ents, vocab_ent)
    test_batches = list(
        zip(
            context_ids,
            context_features,
            context_tag_ids,
            context_ent_ids,
            question_ids,
            context_tokens,
        ))
    test_y = test['answers'].tolist()[:len(test)]
    return test_batches, test_y, embedding, opt, test
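# The token2id helper called throughout this file is assumed, not defined here. A minimal
# sketch under that assumption: it maps a batch of token lists to index lists through a
# vocabulary, falling back to unk_id for out-of-vocabulary tokens. Handling of list- vs
# dict-style vocabularies is a guess, and the RNN demos below appear to call a flat-list
# variant of the same idea.
def token2id(docs, vocab, unk_id=0):
    w2id = vocab if isinstance(vocab, dict) else {w: i for i, w in enumerate(vocab)}
    return [[w2id.get(w, unk_id) for w in doc] for doc in docs]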
def sim_test():
    samples = generate_dataset(100)
    word2id, id2word = build_vocab(samples)
    vocab_size = len(word2id)
    inputs = []
    targets = []
    rnn = RNN(input_dim=vocab_size, output_dim=vocab_size, hidden_dim=256)
    test_input = text2token(samples[0])[:-1]
    test_target = text2token(samples[0])[1:]
    print("Test Input:", test_input)
    print("Test Target:", test_target)
    inputs = one_hot_seq(token2id(test_input, word2id), vocab_size)
    outputs, hidden_states = rnn.forward(inputs)
    test_output = [id2word[np.argmax(out)] for out in outputs]
    print("Test Output:", test_output)
    for epoch in range(150):
        losses = []
        for sample in samples:
            ids = token2id(text2token(sample), word2id)
            inputs = one_hot_seq(ids[:-1], vocab_size)
            targets = one_hot_seq(ids[1:], vocab_size)
            #print(inputs[0].shape)
            outputs, hidden_states = rnn.forward(inputs)
            #print(rnn.grads['d_W_x'])
            rnn.zero_grad()
            loss = rnn.backward(outputs, targets)
            rnn.update_params(lr=2e-4)
            losses.append(loss)
            #print(loss)
        print(np.array(losses).mean())
    print("Test Input:", test_input)
    print("Test Target:", test_target)
    inputs = one_hot_seq(token2id(test_input, word2id), vocab_size)
    outputs, hidden_states = rnn.forward(inputs)
    test_output = [id2word[np.argmax(out)] for out in outputs]
    print("Test Output:", test_output)
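# one_hot_seq is assumed above but not defined in this file. A minimal sketch, under the
# assumption that the from-scratch RNN expects one column vector of shape (vocab_size, 1)
# per time step; the exact shape convention is a guess.
import numpy as np

def one_hot_seq(ids, vocab_size):
    vectors = []
    for idx in ids:
        vec = np.zeros((vocab_size, 1))  # one column vector per token id
        vec[idx] = 1.0
        vectors.append(vec)
    return vectors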
def evaluate(model, dev_data, batch_size, max_sent_length, device):
    was_training = model.training
    model.eval()
    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']
    data_loader = DataLoader(dev_data, batch_size=batch_size)
    val_loss = 0
    cum_cnt = 0
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch, pad_token, max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch, pad_token, max_len=max_sent_length)
            input_chars = torch.tensor(token2id(chars_batch, tok2idx, unk_id=tok2idx['<UNK>']), device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx), device=device)
            tags_mask = torch.tensor(tags_mask, dtype=torch.uint8, device=device)
            # output_emission = conv_seg(input_chars.to(device))
            # loss = -crf_model(output_emission.transpose(0, 1), target_tags.transpose(0, 1).to(device), tags_mask.transpose(0, 1).to(device))
            loss = -model(input_chars, target_tags, tags_mask)
            val_loss += loss
            cum_cnt = cum_cnt + input_chars.shape[0]
    val_loss = val_loss / cum_cnt
    if was_training:
        model.train()
    return val_loss
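# pad_sents is assumed by evaluate/test/train/predict. A minimal sketch, assuming it
# right-pads (and truncates) every sequence to max_len and returns the padded batch
# together with a 0/1 mask of the same shape; the exact truncation rule is a guess.
def pad_sents(sents, pad_token, max_len=None):
    max_len = max_len or max(len(s) for s in sents)
    padded, masks = [], []
    for sent in sents:
        sent = list(sent)[:max_len]
        num_pad = max_len - len(sent)
        padded.append(sent + [pad_token] * num_pad)
        masks.append([1] * len(sent) + [0] * num_pad)
    return padded, masks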
def test(model_path, data_path, split, batch_size, max_sent_length, cuda):
    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    model = CharWordSeg.load(model_path)
    model = model.to(device)
    model.eval()
    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']
    data_loader = DataLoader(dataset, batch_size=batch_size)
    val_loss = 0
    cum_cnt = 0
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch, pad_token, max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch, pad_token, max_len=max_sent_length)
            input_chars = torch.tensor(token2id(chars_batch, tok2idx, unk_id=tok2idx['<UNK>']), device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx), device=device)
            tags_mask = torch.tensor(tags_mask, dtype=torch.uint8, device=device)
            loss = -model(input_chars, target_tags, tags_mask)
            val_loss += loss
            cum_cnt = cum_cnt + input_chars.shape[0]
    val_loss = val_loss / cum_cnt
    return val_loss
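# get_feature is also assumed. A hypothetical sketch, assuming each SIGHAN sample is a
# (characters, BMES-tags) pair and that at prediction time only characters are available;
# the actual field layout of the dataset and of the DataLoader batches is not shown here.
def get_feature(data, stage='train'):
    if stage == 'predict':
        return [list(chars) for chars in data]
    chars = [list(sample[0]) for sample in data]
    tags = [list(sample[1]) for sample in data]
    return chars, tags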
def real_test():
    if not os.path.exists('train.csv'):
        prepare_data()
    data = pd.read_csv('train.csv')
    texts = data['text'].tolist()
    labels = data['target'].tolist()
    word2id, id2word = build_vocab(texts)
    vocab_size = len(word2id)
    ids_list = []
    for text in texts:
        ids_list.append(token2id(text2token(text), word2id))
    num_class = len(set(labels))
    rnn = RNN(input_dim=vocab_size, output_dim=num_class, hidden_dim=256)
    accs = []
    ids_list = ids_list[:100]
    labels = labels[:100]
    for epoch in range(10):
        losses = []
        for inputs, label in zip(ids_list, labels):
            inputs = one_hot_seq(inputs, vocab_size)
            label = one_hot_seq([label], num_class)[0]
            outputs, hidden_states = rnn.forward(inputs)
            #print(rnn.grads['d_W_x'])
            pred = outputs[-1]
            rnn.zero_grad()
            loss = rnn.backward(pred, label)
            accs.append(np.argmax(label) == np.argmax(pred))
            rnn.update_params(lr=2e-4)
            losses.append(loss)
            #print(loss)
        print("Epoch", epoch, "Loss:", np.array(losses).mean(), "Acc:", np.array(accs).mean())
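# text2token is assumed by sim_test/real_test. A minimal sketch, assuming simple
# whitespace tokenization; the real project may lower-case or strip punctuation.
def text2token(text):
    return text.strip().split()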
def predict(model_path, data_path, split, output_path, batch_size, max_sent_length, cuda):
    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    output = []
    model = CharWordSeg.load(model_path)
    model = model.to(device)
    model.eval()
    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars = get_feature(data, stage='predict')
            chars_batch, chars_mask = pad_sents(chars, pad_token, max_len=max_sent_length)
            input_chars = torch.tensor(token2id(chars_batch, tok2idx, unk_id=tok2idx['<UNK>']), device=device)
            chars_mask = torch.tensor(chars_mask, dtype=torch.uint8, device=device)
            pred_tags = id2token(model.decode(input_chars, chars_mask), idx2tag)
            output.extend(create_output(chars, pred_tags))
    with codecs.open(output_path, 'w', 'utf8') as f:
        for sent in output:
            print(sent, file=f)
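# create_output is assumed by predict. A hypothetical sketch, assuming the usual SIGHAN
# convention: characters tagged E or S close a word, and words are joined with spaces.
def create_output(chars_batch, tags_batch):
    sents = []
    for chars, tags in zip(chars_batch, tags_batch):
        words, word = [], ''
        for ch, tag in zip(chars, tags):
            word += ch
            if tag in ('E', 'S'):
                words.append(word)
                word = ''
        if word:  # flush a trailing partial word, if any
            words.append(word)
        sents.append(' '.join(words))
    return sents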
context_tokens = list(train.words) + list(dev.words)
# load the vocabulary from the word-vector file
wv_vocab = load_wv_vocab(os.path.join('embedding', args.wv_file))
log.info('wv_vocab loaded. vocab_size: {}'.format(len(wv_vocab)))
# build the vocabulary for the training and dev sets
vocab, counter = build_vocab(question_tokens, context_tokens, wv_vocab)
# load the word vectors from the word-vector file
embedding = build_embedding(os.path.join('embedding', args.wv_file), vocab, args.embedding_dim)
log.info('got embedding matrix.')
# map tokens to indices
question_ids = token2id(question_tokens, vocab, unk_id=1)
context_ids = token2id(context_tokens, vocab, unk_id=1)
# extract features for the context tokens
context_features = get_context_features(question_tokens, context_tokens)
# POS and named-entity tag categories in the dataset
vocab_tag = get_vocab(context_tags)
vocab_ent = get_vocab(context_ents)
log.info('Found {} POS tags: {}'.format(len(vocab_tag), vocab_tag))
log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))
# map POS and entity tags to indices
context_tag_ids = token2id(context_tags, vocab_tag)
context_ent_ids = token2id(context_ents, vocab_ent)
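# get_context_features is assumed above. A hypothetical sketch in the spirit of DrQA-style
# features: per context token, an exact-match flag against its question plus a normalized
# term frequency. The feature set actually used by this project may differ.
from collections import Counter

def get_context_features(question_tokens, context_tokens):
    features = []
    for q_toks, c_toks in zip(question_tokens, context_tokens):
        q_set = set(q_toks)
        counts = Counter(c_toks)
        features.append([[float(w in q_set), counts[w] / len(c_toks)] for w in c_toks])
    return features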
def train(split, data_path, save_to, batch_size, max_sent_length, char_embed_size,
          num_hidden_layer, channel_size, kernel_size, learning_rate, max_epoch,
          log_every, patience, max_num_trial, lr_decay, last_model_path, cuda):
    if not Path(save_to).exists():
        Path(save_to).mkdir()
    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    val_dataset = SIGHAN(split='dev', root_path=data_path)

    # build vocab
    train_chars, train_tags = get_feature(dataset.data)
    vocab = build_vocab(chain.from_iterable(train_chars))
    tok2idx, idx2tok = add_id(vocab)
    tags = build_vocab(["B", "M", "E", "S"], add_unk=False, add_pad=True)
    tag2idx, idx2tag = add_id(tags)

    # build model
    # conv_seg = CharWordSeg(len(tags), len(vocab), char_embed_size, num_hidden_layer, channel_size, kernel_size).to(device)
    # crf_model = CRF(num_tags=5).to(device)
    # conv_seg.train()
    # crf_model.train()
    # optimizer = torch.optim.Adam(list(conv_seg.parameters()) + list(crf_model.parameters()), lr=learning_rate)
    vocab_tag = {
        "token_to_index": tok2idx,
        "index_to_token": idx2tok,
        "tag_to_index": tag2idx,
        "index_to_tag": idx2tag
    }
    data_loader = DataLoader(dataset, batch_size=batch_size)
    model = CharWordSeg(vocab_tag, char_embed_size, num_hidden_layer, channel_size, kernel_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if last_model_path is not None:
        # load model
        logging.info(f'load model from {last_model_path}')
        params = torch.load(last_model_path, map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()
    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        for iter_idx, data in enumerate(data_loader):
            optimizer.zero_grad()
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch, pad_token, max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch, pad_token, max_len=max_sent_length)
            input_chars = torch.tensor(token2id(chars_batch, tok2idx, unk_id=tok2idx['<UNK>']), device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx), device=device)
            tags_mask = torch.tensor(tags_mask, dtype=torch.uint8, device=device)
            # output_emission = conv_seg(input_chars.to(device))
            # loss = -crf_model(output_emission.transpose(0, 1), target_tags.transpose(0, 1).to(device), tags_mask.transpose(0, 1).to(device))
            loss = -model(input_chars, target_tags, tags_mask)
            train_loss += loss
            cum_cnt = cum_cnt + input_chars.shape[0]
            loss.backward()
            optimizer.step()
        train_loss = train_loss / cum_cnt
        val_loss = evaluate(model, val_dataset, batch_size, max_sent_length, device)
        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss: {val_loss}\t '
            f'speed: {time.time() - train_time:.2f}s/epoch\t time elapsed {time.time() - begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % log_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(), f"{save_to}/model_step_{epoch}.optim")

        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)
            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)
                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(f'load previously best model and decay learning rate to {lr}')
                # load model
                params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)
                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                # reset patience
                cur_patience = 0
        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
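# Example invocation of train(), with hypothetical paths and hyperparameters; the real
# project presumably fills these in from a command-line parser.
if __name__ == '__main__':
    train(split='train', data_path='data/', save_to='models',
          batch_size=64, max_sent_length=128, char_embed_size=128,
          num_hidden_layer=3, channel_size=256, kernel_size=3,
          learning_rate=1e-3, max_epoch=50, log_every=5, patience=5,
          max_num_trial=3, lr_decay=0.5, last_model_path=None, cuda=False)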