def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        with open(small_vocab_file) as f:
            small_vocab = json.load(f)
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(small_vocab), len(small_vocab), max_src_len,
                        d_word_vec=300, d_model=300, d_inner=1200,
                        n_layers=1, n_head=6, d_k=50, d_v=50, dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)

    model.eval()
    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Load model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
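# Hedged sketch (not part of the original script): main() above expects the
# checkpoint to be a dict with a 'state_dict' key, so a compatible checkpoint
# could have been written during training like this:
def save_checkpoint(model, ckpt_file):
    # store exactly what main() reads back
    torch.save({'state_dict': model.state_dict()}, ckpt_file)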
def evaluate(model: Transformer, criterion, device):
    model.eval()
    epoch_loss = 0
    print('evaluate')
    with torch.no_grad():
        for index, batch in enumerate(dataset_pro.valid_iter):
            # shang_lian: the first line of the couplet (source sequence)
            shang_lian, shang_lian_length = batch.shang_lian
            shang_lian = shang_lian.permute(1, 0).to(device)
            # shang_lian_length = shang_lian_length.permute(1, 0).to(device)
            # shang_lian_length = shang_lian_length.numpy()
            # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device)

            # xia_lian: the second line of the couplet (target sequence)
            xia_lian, xia_lian_length = batch.xia_lian
            xia_lian = xia_lian.permute(1, 0).to(device)
            # xia_lian_length = xia_lian_length.numpy()
            # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device)

            # teacher forcing: feed the target shifted right, score against
            # the target shifted left
            outputs = model(shang_lian, xia_lian[:, :-1])
            outputs = outputs.contiguous().view(-1, outputs.shape[-1])
            xia_lian = xia_lian[:, 1:].contiguous().view(-1)

            loss = criterion(outputs, xia_lian)
            epoch_loss += loss.item()

    return epoch_loss / len(dataset_pro.valid_iter)
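# Hedged usage sketch: evaluate() takes any criterion; for sequence models the
# padding index is usually ignored. PAD_IDX is an assumption about the
# vocabulary here, not a name defined in this file:
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
val_loss = evaluate(model, criterion, torch.device('cuda'))
print('validation loss: %.4f' % val_loss)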
def main():
    device = torch.device("cuda:0" if USE_CUDA else "cpu")
    env = Environment()
    END_TAG_IDX = env.lang.word2idx[END_TAG]
    SAY_HI = "hello"
    targ_lang = env.lang
    vocab_inp_size = len(env.lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)
    print("vocab_inp_size", vocab_inp_size)
    print("vocab_tar_size", vocab_tar_size)

    model = Transformer(
        vocab_inp_size,
        vocab_tar_size,
        MAX_TARGET_LEN,
        d_word_vec=32,
        d_model=32,
        d_inner=32,
        n_layers=3,
        n_head=4,
        d_k=32,
        d_v=32,
        dropout=0.1,
    ).to(device)
    # baseline = Baseline(UNITS)

    history = []
    l_optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    batch = None

    def maybe_pad_sentence(s):
        # padding helper retained from the TensorFlow version of this script
        return tf.keras.preprocessing.sequence.pad_sequences(
            s, maxlen=MAX_TARGET_LEN, padding='post')

    def get_returns(r: float, seq_len: int):
        # discounted returns; the earliest step gets the largest discount
        return list(reversed([r * (GAMMA ** t) for t in range(seq_len)]))

    def sentence_to_idxs(sentence: str):
        return [env.lang.word2idx[token]
                for token in tokenize_sentence(sentence)]

    for episode in range(EPISODES):
        # --- Start of episode: roll out the current policy ---
        env.reset()
        model.eval()

        # get first state from the env
        state, _, done = env.step(SAY_HI)

        while not done:
            src_seq = [env.lang.word2idx[token]
                       for token in tokenize_sentence(state)]
            src_seq, src_pos = collate_fn([src_seq])
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output, *_ = model.encoder(src_seq, src_pos)

            actions_t = []    # sampled token tensors (kept for log-probs later)
            actions = []      # sampled words
            actions_idx = []  # sampled token ids
            # stop on the end tag (compare ids, not words) or at max length
            while len(actions) == 0 or (actions_idx[-1] != END_TAG_IDX
                                        and len(actions) < MAX_TARGET_LEN):
                # construct a new tgt_seq from what has been output so far
                if len(actions_t) == 0:
                    tgt_seq = [env.lang.word2idx[Constants.UNK_WORD]]
                else:
                    tgt_seq = actions_idx
                tgt_seq, tgt_pos = collate_fn([tgt_seq])
                tgt_seq, tgt_pos = tgt_seq.to(device), tgt_pos.to(device)

                # dec_output dims: [1, pos, hidden]
                dec_output, *_ = model.decoder(tgt_seq, tgt_pos,
                                               src_seq, enc_output)
                dec_output = dec_output[:, -1, :]  # pick last step

                # w_logits dims: [1, vocab_size]
                w_logits = model.tgt_word_prj(dec_output)
                # w_probs dims: [1, vocab_size]
                w_probs = torch.nn.functional.softmax(w_logits, dim=1)
                w_dist = torch.distributions.categorical.Categorical(
                    probs=w_probs)
                w_idx_t = w_dist.sample()
                w_idx = w_idx_t.cpu().numpy()[0]
                actions_t.append(w_idx_t)
                actions_idx.append(w_idx)
                actions.append(env.lang.idx2word[w_idx])

            # the action is a whole sentence (string)
            action_str = ' '.join(actions)
            next_state, reward, done = env.step(action_str)
            # print(reward)
            # record history (used for the gradient update once the episode is done)
            history.append((state, actions_t, action_str, reward))
            state = next_state
        # --- End of episode ---

        # Update policy
        model.train()
        while len(history) >= BATCH_SIZE:
            batch = history[:BATCH_SIZE]
            state_inp_b, action_inp_b, reward_b, ret_seq_b = zip(*[[
                sentence_to_idxs(state), actions_b, reward,
                get_returns(reward, MAX_TARGET_LEN)
            ] for state, actions_b, _, reward in batch])
            action_inp_b = [torch.stack(sent) for sent in action_inp_b]
            action_inp_b = torch.stack(action_inp_b)
            ret_seq_b = np.asarray(ret_seq_b)
            # ret_mean = np.mean(ret_seq_b)
            # ret_std = np.std(ret_seq_b)
            # ret_seq_b = (ret_seq_b - ret_mean) / ret_std
            ret_seq_b = np.exp((ret_seq_b - 0.5) * 5)
            ret_seq_b = torch.tensor(ret_seq_b, dtype=torch.float32).to(device)

            loss = 0
            # loss_bl = 0
            l_optimizer.zero_grad()

            # accumulate policy-gradient loss over decoding steps
            src_seq, src_pos = collate_fn(list(state_inp_b))
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output_b, *_ = model.encoder(src_seq, src_pos)

            max_sentence_len = action_inp_b.shape[1]
            tgt_seq = [[Constants.BOS] for _ in range(BATCH_SIZE)]
            for t in range(max_sentence_len):
                # _b stands for batch
                prev_w_idx_b, tgt_pos = collate_fn(tgt_seq)
                prev_w_idx_b, tgt_pos = (prev_w_idx_b.to(device),
                                         tgt_pos.to(device))

                # dec_output_b dims: [batch, pos, hidden]
                dec_output_b, *_ = model.decoder(prev_w_idx_b, tgt_pos,
                                                 src_seq, enc_output_b)
                dec_output_b = dec_output_b[:, -1, :]  # pick last step

                # w_logits_b dims: [batch, vocab_size]
                w_logits_b = model.tgt_word_prj(dec_output_b)
                # w_probs_b dims: [batch, vocab_size]
                w_probs_b = torch.nn.functional.softmax(w_logits_b, dim=1)
                dist_b = torch.distributions.categorical.Categorical(
                    probs=w_probs_b)

                curr_w_idx_b = action_inp_b[:, t, :]
                log_probs_b = torch.transpose(
                    dist_b.log_prob(torch.transpose(curr_w_idx_b, 0, 1)), 0, 1)
                # bl_val_b = baseline(tf.cast(dec_hidden_b, 'float32'))
                # delta_b = ret_b - bl_val_b
                # cost_b = -tf.math.multiply(log_probs_b, delta_b)

                # REINFORCE: weight each step's log-prob by its return
                ret_b = torch.reshape(ret_seq_b[:, t],
                                      (BATCH_SIZE, 1)).to(device)
                cost_b = -torch.mul(log_probs_b, ret_b)
                loss += cost_b
                # loss_bl += -tf.math.multiply(delta_b, bl_val_b)

                prev_w_idx_b = curr_w_idx_b
                tgt_seq = np.append(tgt_seq,
                                    prev_w_idx_b.data.cpu().numpy(),
                                    axis=1).tolist()

            # average the accumulated per-step costs and backpropagate
            loss = loss.mean()
            loss.backward()
            # loss_bl.backward()

            # finally, apply the gradient
            l_optimizer.step()
            # bl_optimizer.step()

            # reset everything for the next batch of episodes
            history = history[BATCH_SIZE:]

        if episode % max(BATCH_SIZE, 32) == 0 and batch is not None:
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>")
            print("Episode # ", episode)
            print("Samples from episode with rewards > 0: ")
            good_rewards = [(s, a_str, r)
                            for s, _, a_str, r in batch if r > 0]
            for s, a, r in random.sample(good_rewards,
                                         min(len(good_rewards), 3)):
                print("prev_state: ", s)
                print("actions: ", a)
                print("reward: ", r)
                # print("return: ", get_returns(r, MAX_TARGET_LEN))
            ret_seq_b_np = ret_seq_b.cpu().numpy()
            print("all returns: min=%f, max=%f, median=%f" %
                  (np.min(ret_seq_b_np), np.max(ret_seq_b_np),
                   np.median(ret_seq_b_np)))
            print("avg reward: ", sum(reward_b) / len(reward_b))
            print("avg loss: ", np.mean(loss.cpu().detach().numpy()))
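# Illustrative check (standalone, not part of main()): the return shaping used
# above maps discounted returns in [0, 1] through np.exp((ret - 0.5) * 5) into
# roughly [0.08, 12.2], sharpening the contrast between low- and high-reward
# episodes. With an assumed GAMMA of 0.9 and reward 1.0 over 4 steps:
#
#   rets = list(reversed([1.0 * 0.9 ** t for t in range(4)]))
#   # -> [0.729, 0.81, 0.9, 1.0]
#   np.exp((np.asarray(rets) - 0.5) * 5)
#   # -> [ 3.14,  4.71,  7.39, 12.18]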
        # output_tensor = torch.argmax(output.squeeze(1), 1)
        output_str = get_output_char(result)
        return output_str
    else:
        # keyword name 'beam_with' matches beam_decode's parameter name
        target = beam_search.beam_decode(input_tensor, model, beam_with=5)
        print(target)
        print(len(target[0][0]))
        # target[0][0] is the best hypothesis; drop the leading SOS token
        output_str = get_output_char(target[0][0][1:])
        return output_str


if __name__ == '__main__':
    args = get_args()
    # pad index
    device = torch.device('cpu' if args.no_cuda else 'cuda')
    transformer_model = Transformer(args.sl_vocab_size, args.xl_vocab_size,
                                    hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim,
                                    n_layers=args.n_layers,
                                    n_heads=args.n_head,
                                    dropout=args.dropout,
                                    device=device,
                                    SOS_IDX=SOS_IDX,
                                    PAD_IDX=PAD_IDX,
                                    EOS_IDX=EOS_IDX).to(device)
    # transformer_model.load_state_dict(torch.load('./models-bak/transformer/1121/transformer-model_11.pt', map_location='cpu'))
    transformer_model.load_state_dict(
        torch.load('./models-bak/transformer/1122/transformer-model_500.pt',
                   map_location='cpu'))
    transformer_model.eval()

    text = '欲出烦恼须无我'  # first line of a couplet, used as a smoke test
    print(predict_xl(text, transformer_model, device, is_beam_search=True))
    # df = pd.read_excel('./couplet/result-test.xlsx')
    # df['transformer'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=False))
    # df['transformer_beam'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=True))
    # df.to_excel('./couplet/result-test.xlsx', index=False)
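# Hedged usage sketch: batch prediction over a plain list of first lines, using
# only predict_xl() as defined above; the example inputs are placeholders:
first_lines = ['春风得意马蹄疾', '海内存知己']
for line in first_lines:
    greedy = predict_xl(line, transformer_model, device, is_beam_search=False)
    beam = predict_xl(line, transformer_model, device, is_beam_search=True)
    print(line, '->', greedy, '|', beam)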
for epoch in range(config["num_epochs"]):
    start = time.time()
    epoch_metrics = dict()

    # output an example
    greedy_output_example(model, val_dataset, device, vocab)

    # run each phase per epoch
    for phase in ["train", "val"]:
        if phase == "train":
            # set model to training mode
            model.train()
            dataloader = data_loader_train
            batch_size = config["train_batch_size"]
        else:
            # set model to evaluation mode
            model.eval()
            dataloader = data_loader_val
            batch_size = config["val_batch_size"]

        # initialize metrics
        phase_metrics = dict()
        epoch_loss = list()
        average_epoch_loss = None
        n_word_total = 0
        n_correct = 0
        n_word_correct = 0

        # batch_idx avoids shadowing the epoch counter above
        for batch_idx, batch in enumerate(
                tqdm(dataloader, mininterval=2, desc=phase, leave=False)):
            # forward
            pred, gold = forward(phase, batch, model, optimizer)
            # backward
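            # Hedged sketch of the metric update that typically follows here
            # (the common cal_performance pattern, assuming pred has shape
            # [batch * seq, vocab] and gold [batch, seq]); PAD_IDX is an
            # assumption, not a name from this file:
            gold_flat = gold.contiguous().view(-1)
            pred_ids = pred.max(1)[1]
            non_pad_mask = gold_flat.ne(PAD_IDX)
            n_word_total += non_pad_mask.sum().item()
            n_word_correct += pred_ids.eq(gold_flat) \
                .masked_select(non_pad_mask).sum().item()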
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--gpt2_model_name", type=str, default="gpt2",
                        help="Name of the model, e.g. openai-gpt")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=30,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=4,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--keyword_module", type=str, default="new",
                        help="Keyword module variant: add, attention, ...")
    parser.add_argument("--temperature", type=float, default=0.8,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k", type=int, default=30,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p", type=float, default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    args = parser.parse_args()
    args.d_word_vec = args.d_model

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer_class = (GPT2Tokenizer if "gpt2" in args.gpt2_model_name
                       else OpenAIGPTTokenizer)
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)
    num_tokens = len(tokenizer.encoder)
    # doesn't add tokens that are already there
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout,
        n_position=512,
        keyword_module=args.keyword_module).to(args.device)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.eval()

    sourceList, targetList, scoreList = get_test_datasetEN(
        tokenizer, tokenizer, args.dataset_path)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    f1 = open(args.model_checkpoint + current_time + "_output.txt", 'w')
    for line in tqdm(zip(sourceList, targetList, scoreList),
                     total=len(sourceList)):
        out_ids = sample_sequence(line[0], line[2], tokenizer, model,
                                  tokenizer, args)
        # decode() returns a single string; strip sentencepiece markers and
        # the end-of-sequence tag before writing
        out_text = tokenizer.decode(out_ids)
        f1.write(out_text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
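# Hedged usage sketch: one possible invocation of this script; the filename and
# paths are hypothetical placeholders:
#
#   python generate.py \
#       --dataset_path data/test_set.txt \
#       --model_checkpoint checkpoints/transformer.pt \
#       --temperature 0.8 --top_k 30 --top_p 0.9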
def get_embedding():
    import transformer.Constants as Constants
    from transformer.Models import Transformer
    from transformer.Optim import ScheduledOptim
    from transformer.Modules import LabelSmoothing
    from transformer.Beam import Beam
    from transformer.Translator import translate
    from preprocess import read_instances_from_file, convert_instance_to_idx_seq
    import evals
    from evals import Logger
    from DataLoader import DataLoader

    data = torch.load(opt.data)
    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = 30
    opt.proj_share_weight = True
    opt.d_word_vec = opt.d_model

    # training_data is only needed here to recover the vocabulary sizes
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        shuffle=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size - 4
    opt.d_v = int(opt.d_model / opt.n_head)
    opt.d_k = int(opt.d_model / opt.n_head)

    model = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len_e,
        opt.max_token_seq_len_d,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=False,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers_enc=opt.n_layers_enc,
        n_layers_dec=opt.n_layers_dec,
        n_head=opt.n_head,
        dropout=opt.dropout,
        dec_dropout=opt.dec_dropout,
        encoder=opt.encoder,
        decoder=opt.decoder,
        enc_transform=opt.enc_transform,
        onehot=opt.onehot,
        no_enc_pos_embedding=opt.no_enc_pos_embedding,
        dec_reverse=opt.dec_reverse,
        no_residual=opt.no_residual)

    state_dict = torch.load(opt.results_dir + '/' + opt.mname + '/model.chkpt')
    model.load_state_dict(state_dict['model'])
    model = model.cuda()
    model.eval()

    # numpy.save expects (file, array); this writes 'Embedding.npy'
    W = model.decoder.tgt_word_emb.weight.data.cpu().numpy()
    numpy.save('Embedding', W)
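# Hedged usage sketch: reading the exported matrix back; numpy.save appends the
# .npy extension automatically:
W = numpy.load('Embedding.npy')
print(W.shape)  # (tgt_vocab_size, d_word_vec)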