def gen_best_23_error():
    data_loader = DataLoader()
    Checkpoint.CHECKPOINT_DIR_NAME = args.checkpoint_dir_name
    checkpoint_path = os.path.join("./experiment", Checkpoint.CHECKPOINT_DIR_NAME, 'best')
    checkpoint = Checkpoint.load(checkpoint_path)

    seq2seq = checkpoint.model
    if args.cuda_use:
        seq2seq = seq2seq.cuda()
    seq2seq.eval()

    emb_model = seq2seq.encoder.embedding
    emb_np = emb_model.weight.cpu().data.numpy()
    np.save("./data/rl_train_data/emb.npy", emb_np)

    evaluator = Evaluator(vocab_dict=data_loader.vocab_dict,
                          vocab_list=data_loader.vocab_list,
                          decode_classes_dict=data_loader.decode_classes_dict,
                          decode_classes_list=data_loader.decode_classes_list,
                          loss=NLLLoss(),
                          cuda_use=args.cuda_use)
    evaluator.gen_rl_data(model=seq2seq,
                          data_loader=data_loader,
                          data_list=data_loader.math23k_train_list,
                          template_flag=False,
                          batch_size=16,
                          evaluate_type=0,
                          use_rule=False,
                          mode=args.mode,
                          filename=args.load_name)
def step_one_test():
    data_loader = DataLoader(args)

    #Checkpoint.CHECKPOINT_DIR_NAME = "0120_0030"
    Checkpoint.CHECKPOINT_DIR_NAME = args.checkpoint_dir_name
    checkpoint_path = os.path.join("./experiment", Checkpoint.CHECKPOINT_DIR_NAME, "best")
    checkpoint = Checkpoint.load(checkpoint_path)

    seq2seq = checkpoint.model
    if args.cuda_use:
        seq2seq = seq2seq.cuda()
    seq2seq.eval()

    evaluator = Evaluator(vocab_dict=data_loader.vocab_dict,
                          vocab_list=data_loader.vocab_list,
                          decode_classes_dict=data_loader.decode_classes_dict,
                          decode_classes_list=data_loader.decode_classes_list,
                          loss=NLLLoss(),
                          cuda_use=args.cuda_use)

    name = args.run_flag
    if name == 'test_23k':
        test_temp_acc, test_ans_acc = evaluator.evaluate(model=seq2seq,
                                                         data_loader=data_loader,
                                                         data_list=data_loader.math23k_test_list,
                                                         template_flag=True,
                                                         batch_size=64,
                                                         evaluate_type=0,
                                                         use_rule=False,
                                                         mode=args.mode,
                                                         post_flag=args.post_flag,
                                                         name_save=name)
        print(test_temp_acc, test_ans_acc)
def step_three():
    data_loader = DataLoader(args)

    Checkpoint.CHECKPOINT_DIR_NAME = args.checkpoint_dir_name
    checkpoint_path = os.path.join("./experiment", Checkpoint.CHECKPOINT_DIR_NAME, "best")
    checkpoint = Checkpoint.load(checkpoint_path)

    seq2seq = checkpoint.model
    if args.cuda_use:
        seq2seq = seq2seq.cuda()
    seq2seq.eval()

    evaluator = Evaluator(vocab_dict=data_loader.vocab_dict,
                          vocab_list=data_loader.vocab_list,
                          decode_classes_dict=data_loader.decode_classes_dict,
                          decode_classes_list=data_loader.decode_classes_list,
                          loss=NLLLoss(),
                          cuda_use=args.cuda_use)

    test_temp_acc, test_ans_acc = evaluator.evaluate(model=seq2seq,
                                                     data_loader=data_loader,
                                                     data_list=data_loader.math57k_data_list,
                                                     template_flag=False,
                                                     batch_size=64,
                                                     evaluate_type=0,
                                                     use_rule=True,
                                                     mode=args.mode)
    print(test_temp_acc, test_ans_acc)
def gen_math57k_error():
    data_loader = DataLoader()
    Checkpoint.CHECKPOINT_DIR_NAME = args.checkpoint_dir_name
    checkpoint_path = os.path.join("./experiment", Checkpoint.CHECKPOINT_DIR_NAME, args.load_name)
    checkpoint = Checkpoint.load(checkpoint_path)

    seq2seq = checkpoint.model
    if args.cuda_use:
        seq2seq = seq2seq.cuda()
    seq2seq.eval()

    evaluator = Evaluator(vocab_dict=data_loader.vocab_dict,
                          vocab_list=data_loader.vocab_list,
                          decode_classes_dict=data_loader.decode_classes_dict,
                          decode_classes_list=data_loader.decode_classes_list,
                          loss=NLLLoss(),
                          cuda_use=args.cuda_use)
    evaluator.gen_rl_data(model=seq2seq,
                          data_loader=data_loader,
                          data_list=data_loader.math57k_data_list,
                          template_flag=False,
                          batch_size=16,
                          evaluate_type=0,
                          use_rule=True,
                          mode=args.mode,
                          filename=args.load_name)
def get(self, Y_pred, Y_true):
    # Cross-entropy loss and its gradient for a batch of logits Y_pred
    # against one-hot targets Y_true.
    N = Y_pred.shape[0]
    softmax = Softmax()
    prob = softmax._forward(Y_pred)
    loss = NLLLoss(prob, Y_true)
    Y_serial = np.argmax(Y_true, axis=1)
    # Gradient of softmax + NLL w.r.t. the logits: predicted probability
    # minus the one-hot target.
    dout = prob.copy()
    dout[np.arange(N), Y_serial] -= 1
    return loss, dout
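# A minimal, self-contained numpy sketch of the same softmax + cross-entropy
# gradient, for illustration only. It assumes a plain softmax and a mean
# negative log-likelihood rather than the project's own Softmax/NLLLoss
# helpers; all names here are illustrative.
import numpy as np

def softmax_nll_example():
    logits = np.array([[2.0, 1.0, 0.1],
                       [0.5, 2.5, 0.3]])            # (N, C) raw scores
    y_true = np.array([[1, 0, 0],
                       [0, 1, 0]], dtype=float)      # one-hot targets
    N = logits.shape[0]

    exp = np.exp(logits - logits.max(axis=1, keepdims=True))
    prob = exp / exp.sum(axis=1, keepdims=True)      # softmax probabilities

    y_idx = np.argmax(y_true, axis=1)
    loss = -np.mean(np.log(prob[np.arange(N), y_idx]))

    # Same gradient identity as get() above: prob - one_hot(target).
    dout = prob.copy()
    dout[np.arange(N), y_idx] -= 1
    return loss, dout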
def train(self, model, data_loader, batch_size, n_epoch, template_flag,
          resume=False, optimizer=None, mode=0, teacher_forcing_ratio=0, post_flag=False):
    self.evaluator = Evaluator(vocab_dict=self.vocab_dict,
                               vocab_list=self.vocab_list,
                               decode_classes_dict=self.decode_classes_dict,
                               decode_classes_list=self.decode_classes_list,
                               loss=NLLLoss(),
                               cuda_use=self.cuda_use)

    if resume:
        checkpoint_path = Checkpoint.get_certain_checkpoint("./experiment", "best")
        resume_checkpoint = Checkpoint.load(checkpoint_path)
        model = resume_checkpoint.model
        self.optimizer = resume_checkpoint.optimizer

        # Rebuild the wrapped optimizer so its parameter groups point at the
        # restored model's parameters.
        resume_optim = self.optimizer.optimizer
        defaults = resume_optim.param_groups[0]
        defaults.pop('params', None)
        self.optimizer.optimizer = resume_optim.__class__(model.parameters(), **defaults)

        start_epoch = resume_checkpoint.epoch
        start_step = resume_checkpoint.step
        self.train_acc_list = resume_checkpoint.train_acc_list
        self.test_acc_list = resume_checkpoint.test_acc_list
        self.loss_list = resume_checkpoint.loss_list
    else:
        start_epoch = 1
        start_step = 0
        self.train_acc_list = []
        self.test_acc_list = []
        self.loss_list = []

        model_opt = NoamOpt(512, 1, 2000,
                            torch.optim.Adam(model.parameters(), lr=0,
                                             betas=(0.9, 0.98), eps=1e-9))
        if optimizer is None:
            optimizer = Optimizer(optim.Adam(model.parameters()), max_grad_norm=0)
        # The Noam-scheduled optimizer is the one actually used for training.
        self.optimizer = model_opt

    self._train_epoches(data_loader=data_loader,
                        model=model,
                        batch_size=batch_size,
                        start_epoch=start_epoch,
                        start_step=start_step,
                        n_epoch=n_epoch,
                        mode=mode,
                        template_flag=template_flag,
                        teacher_forcing_ratio=teacher_forcing_ratio,
                        post_flag=post_flag)
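# Hedged sketch: assuming NoamOpt follows the warmup schedule from
# "Attention Is All You Need" (as in the Annotated Transformer), the call
# NoamOpt(512, 1, 2000, Adam(...)) above scales the learning rate roughly
# like the function below. This is illustrative, not the project's class.
def noam_rate(step, model_size=512, factor=1, warmup=2000):
    # lr = factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5)
    step = max(step, 1)
    return factor * (model_size ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

# Example: the rate ramps up during warmup, then decays:
# noam_rate(100) < noam_rate(2000) > noam_rate(20000)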
def sample(self, batch_size, max_length=140, temperature=1.):
    """
    Sample a batch of sequences.

    :param batch_size: Number of sequences to sample.
    :param max_length: Maximum length of the sequences.
    :param temperature: Factor by which the logits are divided. Small values
        make the model more confident at each position but also more
        conservative; large values make the predictions at each step more
        random.
    :return:
        seqs: (batch_size, seq_length) The sampled sequences.
        log_probs: (batch_size) Log likelihood for each sequence.
        entropy: (batch_size) The entropies of the sequences. Not currently used.
    """
    start_token = Variable(torch.zeros(batch_size).long())
    start_token[:] = self.voc.vocab["^"]
    h = None  # the RNN creates a zero hidden state by default
    x = start_token
    unfinished = torch.ones_like(start_token, dtype=torch.uint8)

    sequences = []
    log_probs = Variable(torch.zeros(batch_size))
    entropy = Variable(torch.zeros(batch_size))
    for step in range(max_length):
        logits, h = self.rnn(x, h)
        logits = logits / temperature
        prob = F.softmax(logits, dim=1)
        log_prob = F.log_softmax(logits, dim=1)
        x = torch.multinomial(prob, 1).view(-1)
        sequences.append(x.view(-1, 1))
        # Stop accumulating log probability and entropy for finished sequences.
        log_prob = log_prob * unfinished.unsqueeze(1).float()
        log_probs += NLLLoss(log_prob, x)
        entropy += -torch.sum((log_prob * prob), 1)

        x = Variable(x.data)
        EOS_sampled = (x == self.voc.vocab['$'])
        unfinished = torch.eq(unfinished - EOS_sampled, 1)
        if torch.sum(unfinished) == 0:
            break

    sequences = torch.cat(sequences, 1)
    return sequences.data, log_probs, entropy
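# Standalone sketch of the temperature-scaled sampling step used in the loop
# above, with dummy logits instead of the project's RNN. Illustrative only;
# it uses gather() rather than the project's NLLLoss helper to pick out the
# log probability of each sampled token.
import torch
import torch.nn.functional as F

def temperature_sample_step(logits, temperature=1.0):
    # Dividing the logits by the temperature sharpens (<1) or flattens (>1)
    # the distribution before drawing one token per row.
    scaled = logits / temperature
    prob = F.softmax(scaled, dim=1)
    log_prob = F.log_softmax(scaled, dim=1)
    x = torch.multinomial(prob, 1).view(-1)                   # one sampled index per row
    step_log_prob = log_prob.gather(1, x.view(-1, 1)).view(-1)
    return x, step_log_prob

# x, lp = temperature_sample_step(torch.randn(4, 10), temperature=0.7)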
def likelihood(self, target, temperature=1.):
    """
    Retrieves the likelihood of a given sequence.

    :param target: (batch_size, sequence_length) A batch of sequences.
    :param temperature: Factor by which the logits are divided. Small values
        make the model more confident at each position but also more
        conservative; large values make the predictions at each step more
        random.
    :return:
        log_probs: (batch_size) Log likelihood for each example.
        entropy: (batch_size) The entropies of the sequences. Not currently used.
    """
    batch_size, seq_length = target.size()
    start_token = Variable(torch.zeros(batch_size, 1).long())
    start_token[:] = self.voc.vocab["^"]
    x = torch.cat((start_token, target[:, :-1]), 1)
    h = None  # the RNN creates a zero hidden state by default
    unfinished = torch.ones_like(start_token, dtype=torch.uint8)

    log_probs = Variable(torch.zeros(batch_size))
    entropy = Variable(torch.zeros(batch_size))
    for step in range(seq_length):
        logits, h = self.rnn(x[:, step], h)
        logits = logits / temperature
        log_prob = F.log_softmax(logits, dim=1)
        prob = F.softmax(logits, dim=1)
        # Mask out positions after the end-of-sequence token so they do not
        # contribute to the likelihood or the entropy.
        log_prob = log_prob * unfinished.float()
        log_probs += NLLLoss(log_prob, target[:, step])
        entropy += -torch.sum((log_prob * prob), 1)

        EOS_sampled = (x[:, step] == self.voc.vocab['$']).unsqueeze(1)
        unfinished = torch.eq(unfinished - EOS_sampled, 1)
        if torch.sum(unfinished) == 0:
            break
    return log_probs, entropy
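# Hedged sketch of teacher-forced sequence log-likelihood: sum the log
# probability assigned to each target token over the sequence. It assumes
# the per-step term behaves like the gather() below; the project's NLLLoss
# helper and EOS masking may differ. Names are illustrative.
import torch
import torch.nn.functional as F

def sequence_log_likelihood(step_logits, target):
    # step_logits: (batch, seq_len, vocab) teacher-forced decoder outputs
    # target:      (batch, seq_len) token indices
    log_prob = F.log_softmax(step_logits, dim=-1)
    per_step = log_prob.gather(2, target.unsqueeze(-1)).squeeze(-1)  # (batch, seq_len)
    return per_step.sum(dim=1)                                       # (batch,)

# ll = sequence_log_likelihood(torch.randn(2, 5, 30), torch.randint(0, 30, (2, 5)))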
def step_one():
    if args.mode == 0:
        encoder_cell = 'lstm'
        decoder_cell = 'lstm'
    elif args.mode == 1:
        encoder_cell = 'gru'
        decoder_cell = 'gru'
    elif args.mode == 2:
        encoder_cell = 'gru'
        decoder_cell = 'lstm'
    else:
        encoder_cell = 'lstm'
        decoder_cell = 'gru'

    data_loader = DataLoader(args)

    embed_model = nn.Embedding(data_loader.vocab_len, 128)
    #embed_model.weight.data.copy_(torch.from_numpy(data_loader.word2vec.emb_vectors))
    encode_model = EncoderRNN(vocab_size=data_loader.vocab_len,
                              embed_model=embed_model,
                              emb_size=128,
                              hidden_size=256,
                              input_dropout_p=0.3,
                              dropout_p=0.4,
                              n_layers=2,
                              bidirectional=True,
                              rnn_cell=None,
                              rnn_cell_name=encoder_cell,
                              variable_lengths=True)
    decode_model = DecoderRNN_3(vocab_size=data_loader.vocab_len,
                                class_size=data_loader.classes_len,
                                embed_model=embed_model,
                                emb_size=128,
                                hidden_size=512,
                                n_layers=2,
                                rnn_cell=None,
                                rnn_cell_name=decoder_cell,
                                sos_id=data_loader.vocab_dict['END_token'],
                                eos_id=data_loader.vocab_dict['END_token'],
                                input_dropout_p=0.3,
                                dropout_p=0.4)
    seq2seq = Seq2seq(encode_model, decode_model)
    if args.cuda_use:
        seq2seq = seq2seq.cuda()

    weight = torch.ones(data_loader.classes_len)
    pad = data_loader.decode_classes_dict['PAD_token']
    loss = NLLLoss(weight, pad)

    st = SupervisedTrainer(vocab_dict=data_loader.vocab_dict,
                           vocab_list=data_loader.vocab_list,
                           decode_classes_dict=data_loader.decode_classes_dict,
                           decode_classes_list=data_loader.decode_classes_list,
                           cuda_use=args.cuda_use,
                           loss=loss,
                           print_every=10,
                           teacher_schedule=False,
                           checkpoint_dir_name=args.checkpoint_dir_name)

    print('start training')
    st.train(model=seq2seq,
             data_loader=data_loader,
             batch_size=128,
             n_epoch=300,
             template_flag=True,
             resume=args.resume,
             optimizer=None,
             mode=args.mode,
             teacher_forcing_ratio=args.teacher_forcing_ratio,
             post_flag=args.post_flag)
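# Hedged sketch: NLLLoss(weight, pad) above is the project's own loss wrapper.
# A roughly equivalent setup with plain PyTorch zeroes the weight of the PAD
# class so padded decoder positions do not contribute to the loss. The helper
# name and arguments below are illustrative, not part of the project.
import torch
import torch.nn as nn

def make_masked_nll(num_classes, pad_idx):
    weight = torch.ones(num_classes)
    weight[pad_idx] = 0.0                      # ignore the PAD class
    return nn.NLLLoss(weight=weight)           # expects log-probabilities as input

# criterion = make_masked_nll(num_classes=20, pad_idx=0)
# loss = criterion(torch.log_softmax(torch.randn(8, 20), dim=1), torch.randint(0, 20, (8,)))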