def test():
    from args import conf

    data = SNLI(conf)
    setattr(conf, 'char_vocab_size', len(data.char_vocab))
    setattr(conf, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(conf, 'class_size', len(data.LABEL.vocab))
    setattr(conf, 'max_word_len', data.max_word_len)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model = model.to(conf.device)

    _, acc = evaluate(model, conf, data)
    print(f'test acc: {acc:.3f}')
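# Hedged sketch, not the repo's implementation: `evaluate(model, conf, data)` as
# called above is assumed to return (average loss, accuracy) over the dev or test
# iterator. The actual helper may also build character inputs and clip sentence
# lengths the way the training loop does.
import torch
import torch.nn.functional as F


def evaluate(model, conf, data, mode='test'):
    iterator = data.dev_iter if mode == 'dev' else data.test_iter
    model.eval()
    total_loss, n_correct, n_total = 0.0, 0, 0
    with torch.no_grad():
        for batch in iterator:
            pred = model(batch.premise, batch.hypothesis)
            total_loss += F.cross_entropy(pred, batch.label, reduction='sum').item()
            n_correct += (pred.argmax(dim=1) == batch.label).sum().item()
            n_total += batch.label.size(0)
    model.train()
    return total_loss / n_total, n_correct / n_total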
def test():
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model.word_emb.weight.requires_grad = True
    model = model.to(conf.device).eval()

    batch = next(iter(data.dev_iter))
    output = F.softmax(model(batch.premise, batch.hypothesis), 1)
    original_scores, original_predictions = torch.max(output, 1)
    original_scores = original_scores.detach().cpu().numpy()
    original_predictions = original_predictions.detach().cpu().numpy()

    reduced, removed_indices = get_rawr(
        model, batch,
        max_beam_size=rawr_conf.max_beam_size,
        conf_threshold=rawr_conf.conf_threshold,
        p_not_h=False)
    reduced_hypothesis = padding_tensor(
        [torch.LongTensor(r[0]) for r in reduced])
    reduced_hypothesis = reduced_hypothesis.to(conf.device)

    # score the reduced hypotheses (not the original ones) against the premise
    output = F.softmax(model(batch.premise, reduced_hypothesis), 1)
    reduced_scores, reduced_predictions = torch.max(output, 1)
    reduced_scores = reduced_scores.detach().cpu().numpy()
    reduced_predictions = reduced_predictions.detach().cpu().numpy()
    print(all(reduced_predictions == original_predictions))
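# Hedged sketch, assuming `padding_tensor` right-pads a list of 1-D LongTensors
# with index 0 and stacks them into a (batch, max_len) LongTensor so the reduced
# hypotheses can be fed through BIMPM as a batch. The repo's helper may pad with
# a different index or also return the original lengths.
import torch


def padding_tensor(sequences, pad_index=0):
    max_len = max(s.size(0) for s in sequences)
    padded = torch.full((len(sequences), max_len), pad_index, dtype=torch.long)
    for i, s in enumerate(sequences):
        padded[i, :s.size(0)] = s
    return padded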
def main():
    from args import conf, tune_conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument(
        '--ent-train',
        default='/scratch0/shifeng/rawr/new_snli/rawr.train.pkl')
    parser.add_argument(
        '--ent-dev',
        default='/scratch0/shifeng/rawr/new_snli/rawr.dev.pkl')
    args = parser.parse_args()
    out_dir = prepare_output_dir(args, args.root_dir)

    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(out_dir, 'output.log'))
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    log.addHandler(fh)
    log.addHandler(ch)
    log.info('===== {} ====='.format(out_dir))

    # load regular data
    log.info('loading regular training data')
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    log.info('loading entropy dev data {}'.format(tune_conf.ent_dev))
    with open(tune_conf.ent_dev, 'rb') as f:
        ent_dev = pickle.load(f)
    if isinstance(ent_dev[0], list):
        ent_dev = list(itertools.chain(*ent_dev))
    log.info('{} entropy dev examples'.format(len(ent_dev)))
    ent_dev = [[x['data']['premise'],
                x['data']['hypothesis'],
                x['data']['label']]
               for x in ent_dev]

    log.info('loading entropy training data {}'.format(tune_conf.ent_train))
    with open(tune_conf.ent_train, 'rb') as f:
        ent_train = pickle.load(f)
    if isinstance(ent_train[0], list):
        ent_train = list(itertools.chain(*ent_train))
    log.info('{} entropy training examples'.format(len(ent_train)))
    ent_train = [[x['data']['premise'],
                  x['data']['hypothesis'],
                  x['data']['label']]
                 for x in ent_train]

    train_ent_batches = batchify(ent_train, tune_conf.batch_size)
    log.info('{} entropy training batches'.format(len(train_ent_batches)))

    log.info('loading model from {}'.format(args.baseline))
    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    # model.word_emb.weight.requires_grad = True
    model.cuda(conf.gpu)

    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adam(parameters, lr=tune_conf.lr)
    ent_optimizer = optim.Adam(parameters, lr=tune_conf.ent_lr)
    criterion = nn.CrossEntropyLoss()

    init_loss, init_acc = evaluate(model, data.dev_iter)
    log.info("initial loss {:.4f} accuracy {:.4f}".format(init_loss, init_acc))
    best_acc = init_acc

    dev_ent_batches = batchify(ent_dev, tune_conf.batch_size)
    init_ent, init_ent_acc = evaluate_ent(model, dev_ent_batches)
    log.info("initial entropy {:.4f} ent_acc {:.4f}".format(
        init_ent, init_ent_acc))

    epoch = 0
    i_ent, i_mle = 0, 0  # number of examples
    train_loss, train_ent = 0, 0
    train_mle_iter = iter(data.train_iter)
    train_ent_iter = iter(train_ent_batches)
    while True:
        model.train()

        # maximize entropy on reduced examples
        for i in range(tune_conf.n_ent):
            try:
                prem, hypo, label = next(train_ent_iter)
            except StopIteration:
                random.shuffle(train_ent_batches)
                train_ent_iter = iter(train_ent_batches)
                i_ent = 0
                train_ent = 0
                break
            output = forward(model, prem, hypo, conf.max_sent_len)
            output = F.softmax(output, 1)
            ent = entropy(output).sum()
            train_ent += ent.data.cpu().numpy()[0]
            loss = -tune_conf.gamma * ent
            ent_optimizer.zero_grad()
            loss.backward()
            ent_optimizer.step()
            i_ent += prem.shape[0]

        # regular maximum-likelihood training on full examples
        end_of_epoch = False
        for i in range(tune_conf.n_mle):
            if i_mle >= len(data.train_iter):
                epoch += 1
                end_of_epoch = True
                data.train_iter.init_epoch()
                train_mle_iter = iter(data.train_iter)
                i_mle = 0
                train_loss = 0
                break
            batch = next(train_mle_iter)
            output = forward(model, batch.premise, batch.hypothesis,
                             conf.max_sent_len)
            loss = criterion(output, batch.label)
            train_loss += loss.data.cpu().numpy()[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            i_mle += batch.premise.shape[0]

        if i_mle % 1000 == 0:
            _loss = train_loss / i_mle if i_mle != 0 else 0
            _ent = train_ent / i_ent if i_ent != 0 else 0
            log.info(
                'epoch [{:2}] [{} / {}] loss[{:.5f}] entropy[{:.5f}]'.format(
                    epoch, i_mle, len(data.train_iter), _loss, _ent))

        if end_of_epoch or i_mle % 1e5 == 0:
            dev_loss, dev_acc = evaluate(model, data.dev_iter)
            dev_ent, dev_ent_acc = evaluate_ent(model, dev_ent_batches)
            log.info("dev acc: {:.4f} ent: {:.4f} ent_acc: {:.4f}".format(
                dev_acc, dev_ent, dev_ent_acc))
            model_path = os.path.join(
                out_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            torch.save(model.state_dict(), model_path)
            if dev_acc > best_acc:
                best_acc = dev_acc
                model_path = os.path.join(out_dir, 'best_model.pt')
                torch.save(model.state_dict(), model_path)
                log.info("best model saved {}".format(dev_acc))

        if epoch > 40:
            break
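# Hedged sketch of the `entropy` helper maximized in the fine-tuning loop above:
# per-example Shannon entropy of a (batch, n_classes) softmax output. The small
# epsilon to avoid log(0) is an assumption; the repo's version may differ.
import torch


def entropy(probs, eps=1e-8):
    return -(probs * torch.log(probs + eps)).sum(dim=1)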
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', required=True)
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument('--pnoth', default=False, action='store_true',
                        help='reduce premise instead of hypothesis')
    parser.add_argument('--truth', default=False, action='store_true',
                        help='use label instead of prediction as target')
    args = parser.parse_args()

    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len
    q_vocab = data.TEXT.vocab.itos
    a_vocab = data.LABEL.vocab.itos

    out_dir = prepare_output_dir(conf, 'results', 'rawr')
    print('Generating [{}] rawr data from [{}].'.format(
        args.fold, args.baseline))
    print(out_dir)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    model.word_emb.weight.requires_grad = True
    model.to(conf.device)

    datasets = {'train': data.train_iter, 'dev': data.dev_iter}

    if args.pnoth:
        fname = 'rawr.{}.premise.pkl'.format(args.fold)
    else:
        fname = 'rawr.{}.hypothesis.pkl'.format(args.fold)

    checkpoint = []
    for batch_i, batch in enumerate(tqdm(datasets[args.fold])):
        if batch_i > len(datasets[args.fold]):
            # otherwise train iter will loop forever!
            break
        batch_size = batch.hypothesis.shape[0]

        model.eval()
        output = F.softmax(model(batch.premise, batch.hypothesis), 1)
        original_scores, original_predictions = torch.max(output, 1)
        original_scores = original_scores.detach().cpu().numpy()
        original_predictions = original_predictions.detach().cpu().numpy()

        batch_cpu = Batch(batch.premise.data.cpu(),
                          batch.hypothesis.data.cpu(),
                          batch.label.data.cpu())

        reduced, removed_indices = get_rawr(
            model, batch,
            max_beam_size=rawr_conf.max_beam_size,
            conf_threshold=rawr_conf.conf_threshold,
            p_not_h=args.pnoth)

        for i in range(batch_size):
            og = {
                'premise': batch_cpu.premise[i],
                'hypothesis': batch_cpu.hypothesis[i],
                'premise_readable': to_text(batch_cpu.premise[i], q_vocab),
                'hypothesis_readable': to_text(batch_cpu.hypothesis[i], q_vocab),
                'prediction': original_predictions[i],
                'prediction_readable': a_vocab[original_predictions[i]],
                'score': original_scores[i],
                'label': batch_cpu.label[i],
                'label_readable': a_vocab[batch_cpu.label[i]],
            }
            checkpoint.append({'original': og, 'reduced': []})

            s1 = batch.hypothesis[i] if args.pnoth else batch.premise[i]
            s1 = s1.to(conf.device)
            for j, s2 in enumerate(reduced[i]):
                s2 = torch.LongTensor(s2).to(conf.device)
                model.eval()
                if args.pnoth:
                    output = model(s2.unsqueeze(0), s1.unsqueeze(0))
                else:
                    output = model(s1.unsqueeze(0), s2.unsqueeze(0))
                output = F.softmax(output, 1)
                pred_scores, pred = torch.max(output, 1)
                pred = pred.detach().cpu().numpy()[0]
                pred_scores = pred_scores.detach().cpu().numpy()[0]

                if args.pnoth:
                    hypo, prem = s1.cpu(), s2.cpu()
                else:
                    prem, hypo = s1.cpu(), s2.cpu()

                checkpoint[-1]['reduced'].append({
                    'premise': prem,
                    'hypothesis': hypo,
                    'premise_readable': to_text(prem, q_vocab),
                    'hypothesis_readable': to_text(hypo, q_vocab),
                    'prediction': pred,
                    'prediction_readable': a_vocab[pred],
                    'score': pred_scores,
                    'label': batch_cpu.label[i],
                    'label_readable': a_vocab[batch_cpu.label[i]],
                    'removed_indices': removed_indices[i][j],
                    'which_reduced': 'premise' if args.pnoth else 'hypothesis',
                })

        if batch_i % 1000 == 0 and batch_i > 0:
            out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
            with open(out_path, 'wb') as f:
                pickle.dump(checkpoint, f)
            checkpoint = []

    if len(checkpoint) > 0:
        out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
        with open(out_path, 'wb') as f:
            pickle.dump(checkpoint, f)
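# Hedged sketch of `to_text`, assumed to turn a 1-D tensor of token ids into a
# readable string via the torchtext `itos` list. The pad index of 1 is an
# assumption (torchtext's default '<pad>' position); the repo's helper may also
# strip other special tokens.
def to_text(ids, itos, pad_index=1):
    return ' '.join(itos[int(i)] for i in ids if int(i) != pad_index)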
def main():
    from args import conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', default='results/rawr.train.hypothesis.pkl')
    parser.add_argument('--dev', default='results/rawr.dev.hypothesis.pkl')
    parser.add_argument('--truth', default=False, action='store_true',
                        help='use label instead of prediction as target')
    parser.add_argument('--ogdev', default=False, action='store_true',
                        help='use original dev set instead of reduced')
    parser.add_argument('--full', default=0, type=float,
                        help='amount of full examples to include')
    args = parser.parse_args()
    conf.train_data = args.train
    conf.dev_data = args.dev

    print('loading regular data...')
    regular_data = SNLI(conf)
    conf.char_vocab_size = len(regular_data.char_vocab)
    conf.word_vocab_size = len(regular_data.TEXT.vocab)
    conf.class_size = len(regular_data.LABEL.vocab)
    conf.max_word_len = regular_data.max_word_len
    conf.out_dir = prepare_output_dir(conf, 'results', 'reduced')

    print('loading reduced data from [{}]'.format(conf.train_data))
    with open(conf.train_data, 'rb') as f:
        train = pickle.load(f)
    print('loading reduced data from [{}]'.format(conf.dev_data))
    with open(conf.dev_data, 'rb') as f:
        dev = pickle.load(f)

    train_label = 'label' if args.truth else 'prediction'
    train = [(x['premise'], x['hypothesis'], ex['original'][train_label])
             for ex in train for x in ex['reduced']]
    # dev = [(x['premise'], x['hypothesis'], x['label'])
    #        for ex in dev for x in ex['reduced']]
    dev = [(x['premise'], x['hypothesis'], x['label'])
           for ex in dev for x in ex['reduced'][:1]]

    train_batches = batchify(train, conf.batch_size)
    if args.full > 0:
        n_examples = int(len(regular_data.train_iter) * args.full)
        print('use {} ({}) full training data'.format(
            n_examples * conf.batch_size, args.full))
        full_batches = []
        for j, x in enumerate(regular_data.train_iter):
            if j > n_examples:
                break
            full_batches.append((x.premise, x.hypothesis, x.label))
        # train_batches += full_batches
        train_batches = full_batches
    print(len(train_batches))

    if args.ogdev:
        dev_batches = list(regular_data.dev_iter)
        dev_batches = [(x.premise, x.hypothesis, x.label) for x in dev_batches]
    else:
        # use the reduced dev set (not the reduced training set)
        dev_batches = batchify(dev, conf.batch_size)

    model = BIMPM(conf, regular_data)
    if conf.gpu > -1:
        model.cuda(conf.gpu)

    print('begin training')
    best_model = train_reduced(model, train_batches, dev_batches, conf)
    torch.save(best_model.state_dict(), os.path.join(conf.out_dir, 'best.pt'))
    print('training finished!')
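# Hedged sketch of `batchify` as used above: split (premise, hypothesis, label)
# triples into fixed-size batches, padding each field to the batch maximum.
# The pad index and the decision not to move tensors to the GPU here are
# assumptions; the repo's helper may differ.
import torch


def batchify(examples, batch_size, pad_index=1):
    def pad_stack(seqs):
        seqs = [torch.as_tensor(s, dtype=torch.long) for s in seqs]
        max_len = max(s.size(0) for s in seqs)
        out = torch.full((len(seqs), max_len), pad_index, dtype=torch.long)
        for i, s in enumerate(seqs):
            out[i, :s.size(0)] = s
        return out

    batches = []
    for start in range(0, len(examples), batch_size):
        chunk = examples[start:start + batch_size]
        prem = pad_stack([p for p, _, _ in chunk])
        hypo = pad_stack([h for _, h, _ in chunk])
        label = torch.as_tensor([int(l) for _, _, l in chunk], dtype=torch.long)
        batches.append((prem, hypo, label))
    return batches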
def train(conf, data):
    model = BIMPM(conf, data)
    model = model.to(conf.device)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(parameters, lr=conf.lr)
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss, last_epoch = 0, -1
    max_dev_acc, max_test_acc = 0, 0

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == conf.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
            ckp_dir = 'results/baseline_checkpoints/baseline_epoch_{}.pt'
            torch.save(model.state_dict(), ckp_dir.format(present_epoch + 1))
        last_epoch = present_epoch

        s1, s2 = batch.premise, batch.hypothesis

        # limit the lengths of input sentences up to max_sent_len
        if conf.max_sent_len >= 0:
            if s1.size()[1] > conf.max_sent_len:
                s1 = s1[:, :conf.max_sent_len]
            if s2.size()[1] > conf.max_sent_len:
                s2 = s2[:, :conf.max_sent_len]

        kwargs = {'p': s1, 'h': s2}
        if conf.use_char_emb:
            char_p = torch.LongTensor(data.characterize(s1))
            char_h = torch.LongTensor(data.characterize(s2))
            char_p = char_p.to(conf.device)
            char_h = char_h.to(conf.device)
            kwargs['char_p'] = char_p
            kwargs['char_h'] = char_h

        # pred = model(**kwargs)
        pred = model(s1, s2)

        optimizer.zero_grad()
        batch_loss = criterion(pred, batch.label)
        loss += batch_loss.data.item()
        batch_loss.backward()
        optimizer.step()

        if (i + 1) % conf.print_freq == 0:
            dev_loss, dev_acc = evaluate(model, conf, data, mode='dev')
            test_loss, test_acc = evaluate(model, conf, data)
            # c = (i + 1) // conf.print_freq
            # writer.add_scalar('loss/train', loss, c)
            # writer.add_scalar('loss/dev', dev_loss, c)
            # writer.add_scalar('acc/dev', dev_acc, c)
            # writer.add_scalar('loss/test', test_loss, c)
            # writer.add_scalar('acc/test', test_acc, c)
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f} / '
                  f'test loss: {test_loss:.3f} / '
                  f'dev acc: {dev_acc:.3f} / test acc: {test_acc:.3f}')

            if dev_acc > max_dev_acc:
                max_dev_acc = dev_acc
                max_test_acc = test_acc
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    # writer.close()
    print(f'max dev acc: {max_dev_acc:.3f} / max test acc: {max_test_acc:.3f}')
    return best_model
"wb")) import json json.dump( config, open( "/data/xuht/guoxin/poc/duplicate_sentence_model/duplicate_models/bimpm1/config.json", "w")) api = ModelAPI( "/data/xuht/guoxin/poc/duplicate_sentence_model/duplicate_models/bimpm1", "/data/xuht/guoxin/poc/duplicate_sentence_model/duplicate_models/bimpm1" ) api.load_config() model = BIMPM() api.build_graph(model) api.train_step([ train_anchor_matrix, train_check_matrix, train_label_matrix, train_anchor_len_matrix, train_check_len_matrix ], [ dev_anchor_matrix, dev_check_matrix, dev_label_matrix, dev_anchor_len_matrix, dev_check_len_matrix ]) elif model_type == "siamese_cnn": os.environ["CUDA_VISIBLE_DEVICES"] = "3" config = { "vocab_size": vocab_size, "max_length": 200,