Example 1
import argparse
import os
import pickle

import pandas as pd
import torch
import torch.nn as nn

# create_tokenizer, GRUEncoder, DualEncoder, train and evaluate are assumed to be
# defined elsewhere in the original module.
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("--train_file", default=None, type=str, required=True,
						help="training file")
	parser.add_argument("--dev_file", default=None, type=str, required=True,
						help="development file")
	parser.add_argument("--output_dir", default=None, type=str, required=True,
						help="output directory for tokenizers and models")
	parser.add_argument("--num_epochs", default=10, type=int, required=False,
						help="number of epochs for training")
	parser.add_argument("--vocab_size", default=50000, type=int, required=False,
						help="vocabulary size")
	parser.add_argument("--hidden_size", default=300, type=int, required=False,
						help="hidden size of GRU")
	parser.add_argument("--embed_size", default=300, type=int, required=False,
						help="word embedding size")
	parser.add_argument("--batch_size", default=64, type=int, required=False,
						help="batch size for train and eval")
	parser.add_argument("--loss_function", default="hinge", type=str, required=False,
						choices=["CrossEntropy", "hinge"],
						help="which loss function to choose")
	args = parser.parse_args()

	# load dataset
	train_df = pd.read_csv(args.train_file)[["title", "reply"]]
	dev_df = pd.read_csv(args.dev_file)[["title", "reply"]]
	texts = list(train_df["title"]) + list(train_df["reply"])
	tokenizer = create_tokenizer(texts, args.vocab_size)

	title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
	reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
	model = DualEncoder(title_encoder, reply_encoder, type=args.loss_function)
	if args.loss_function == "CrossEntropy":
		# the "CrossEntropy" option is trained with a binary match / no-match objective over title-reply pairs
		loss_fn = nn.BCEWithLogitsLoss()
	elif args.loss_function == "hinge":
		loss_fn = nn.CosineEmbeddingLoss()
	optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
	model = model.to(device)
	
	if not os.path.exists(args.output_dir):
		os.makedirs(args.output_dir)
	pickle.dump(tokenizer, open(os.path.join(args.output_dir, "tokenizer.pickle"), "wb"))

	best_acc = 0.
	for epoch in range(args.num_epochs):
		print("start epoch {}".format(epoch))
		train(train_df, model, loss_fn, optimizer, device, tokenizer, args)	
		acc = evaluate(dev_df, model, loss_fn, device, tokenizer, args)
		if acc > best_acc:
			best_acc = acc
			print("saving best model")
			torch.save(model.state_dict(), os.path.join(args.output_dir, "faq_model.pth"))
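Example 1 depends on GRUEncoder and DualEncoder classes (as well as create_tokenizer, train and evaluate helpers) that are defined elsewhere in the original repository and are not shown here. The following is only a rough, hypothetical sketch of what the two model classes could look like, assuming the encoder mean-pools GRU outputs over a padding mask and the dual encoder either produces one logit per pair (for BCEWithLogitsLoss) or returns both representations (for CosineEmbeddingLoss):

import torch
import torch.nn as nn


class GRUEncoder(nn.Module):
    """Hypothetical sketch: embed tokens, run a GRU, mean-pool over non-padded positions."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, x, x_mask):
        # x: (batch, seq_len) token ids; x_mask: (batch, seq_len), 1 for real tokens
        out, _ = self.gru(self.embedding(x))
        mask = x_mask.unsqueeze(-1).float()
        return (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-8)


class DualEncoder(nn.Module):
    """Hypothetical sketch of the two-tower model used in Example 1."""

    def __init__(self, encoder1, encoder2, type="hinge"):
        super().__init__()
        self.encoder1 = encoder1  # first tower (title encoder in Example 1)
        self.encoder2 = encoder2  # second tower (reply encoder in Example 1)
        self.type = type          # keyword name kept to match the call in Example 1

    def forward(self, x, x_mask, y, y_mask):
        x_rep = self.encoder1(x, x_mask)
        y_rep = self.encoder2(y, y_mask)
        if self.type == "CrossEntropy":
            # one logit per (title, reply) pair, fed into BCEWithLogitsLoss
            return (x_rep * y_rep).sum(dim=-1)
        # otherwise return both representations for CosineEmbeddingLoss
        return x_rep, y_rep
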
Example 2
import argparse
import os
import pickle

import numpy as np
import pandas as pd
import torch
# cosine_similarity is assumed to be scikit-learn's implementation
from sklearn.metrics.pairwise import cosine_similarity

# GRUEncoder, DualEncoder, list2tensor and prepare_replies are assumed to be
# defined elsewhere in the original module.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="training file")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="output directory for tokenizers and models")
    parser.add_argument("--batch_size", default=64, type=int, required=False,
                        help="batch size for train and eval")
    parser.add_argument("--hidden_size", default=300, type=int, required=False,
                        help="hidden size of GRU")
    parser.add_argument("--embed_size", default=300, type=int, required=False,
                        help="word embedding size")
    args = parser.parse_args()

    # load dataset
    train_df = pd.read_csv(args.train_file)[["title", "reply"]]
    tokenizer = pickle.load(open(os.path.join(args.output_dir, "tokenizer.pickle"), "rb"))

    title_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
    reply_encoder = GRUEncoder(tokenizer.vocab_size, args.embed_size, args.hidden_size)
    model = DualEncoder(title_encoder, reply_encoder)
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "faq_model.pth")))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    model = model.to(device)

    candidate_file = os.path.join(args.output_dir, "reply_candidates.pickle")
    if not os.path.isfile(candidate_file):
        replies, vectors = prepare_replies(train_df, model, device, tokenizer, args) 
        pickle.dump([replies, vectors], open(candidate_file, "wb"))
    else:
        replies, vectors = pickle.load(open(candidate_file, "rb"))


    while True:
        title = input("你的问题是?\n")  # prompt: "What is your question?"
        if len(title.strip()) == 0:
            continue
        title = [title] 
        x, x_mask = list2tensor(title, tokenizer)
        x = x.to(device)	
        x_mask = x_mask.to(device)
        x_rep = model.encoder2(x, x_mask).data.cpu().numpy()
        scores = cosine_similarity(x_rep, vectors)[0]
        index = np.argmax(scores)
        print("可能的答案:", replies[index])
Example 3
import torch
import torch.nn as nn
from torchvision import transforms

# Hyperparameters (epochs, batch_size, learning rates, checkpoint paths, device, ...)
# and the model / dataset / helper definitions (Encoder, DualEncoder, DecoderWithAttention,
# CaptionDataset, train, validate, save_checkpoint, adjust_learning_rate) are module-level
# names from the original training script and are assumed to be defined above.
def main():
    """
    Training and validation.
    """

    global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map

    if dual_encoder:  # the dual encoder is always initialized from pre-trained models
        print("DUAL ENCODER")
        if dual_encoder_checkpoint is not None:
            print('Loaded Dual Encoder Checkpoint')
            # assumption: the dual-encoder checkpoint path is the one to load here
            dual_branch_checkpoint = torch.load(dual_encoder_checkpoint,
                                                map_location='cuda:0')
            encoder = dual_branch_checkpoint['encoder']

            decoder = dual_branch_checkpoint['decoder']
            decoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, decoder.parameters()),
                                                 lr=decoder_lr)

        else:
            main_branch_checkpoint = torch.load(checkpoint,
                                                map_location='cuda:0')
            encoder = DualEncoder(sketch_resnet=sketch_encoder_resnet)
            encoder.m_resnet = main_branch_checkpoint['encoder'].resnet
            print("Use pre-trained resnet")
            # encoder.m_adaptive_pool = main_branch_checkpoint['encoder'].adaptive_pool

            decoder = main_branch_checkpoint['decoder']
            decoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, decoder.parameters()),
                                                 lr=decoder_lr)

        if fine_tune_encoder is True:
            print("!!! Will fine tune Encoder !!!")
            encoder.fine_tune(fine_tune_encoder)
        # the encoder optimizer is created over trainable parameters either way
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr)

    else:  # the following branch is for the single-encoder architecture
        # Initialize / load checkpoint
        if checkpoint is None:
            decoder = DecoderWithAttention(attention_dim=attention_dim,
                                           embed_dim=emb_dim,
                                           decoder_dim=decoder_dim,
                                           vocab_size=len(word_map),
                                           dropout=dropout)
            decoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, decoder.parameters()),
                                                 lr=decoder_lr)
            encoder = Encoder(specify_resnet=main_encoder_resnet)
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(
                params=filter(lambda p: p.requires_grad, encoder.parameters()),
                lr=encoder_lr) if fine_tune_encoder else None

        else:
            checkpoint = torch.load(checkpoint, map_location='cuda:0')
            # start_epoch = checkpoint['epoch'] + 1
            # epochs_since_improvement = checkpoint['epochs_since_improvement']
            # best_bleu4 = checkpoint['bleu-4'] this metric is unfair when we switch to a different domain
            decoder = checkpoint['decoder']
            # decoder_optimizer = checkpoint['decoder_optimizer']
            decoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, decoder.parameters()),
                                                 lr=decoder_lr)
            if main_encoder_resnet is not None:
                encoder = Encoder(
                    specify_resnet=main_encoder_resnet
                )  # specify here so the encoder remove the last 2 layers of resnet
                encoder.adaptive_pool = checkpoint['encoder'].adaptive_pool

            else:
                encoder = checkpoint['encoder']

            # encoder_optimizer = checkpoint['encoder_optimizer']
            # if fine_tune_encoder is True and encoder_optimizer is None:

            if fine_tune_encoder is True:
                print("Will fine tune Encoder")
                encoder.fine_tune(fine_tune_encoder)
            # the encoder optimizer is created over trainable parameters either way
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

    # Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # data augmentation for the nycc dataset
    augment = transforms.Compose([
        transforms.RandomAffine(20, (0.1, 0.1), (0.8, 1.2)),
        transforms.RandomHorizontalFlip(p=0.5)
    ])

    train_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TRAIN',
        transform=transforms.Compose([augment, normalize])),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'VAL',
        transform=transforms.Compose([normalize])),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=workers,
                                             pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):

        # Learning-rate decay / early stopping after many epochs without improvement (disabled below)
        # if epochs_since_improvement == 40:
        #    break
        # if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
        #    adjust_learning_rate(decoder_optimizer, 0.8)
        #    if fine_tune_encoder:
        #        adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)

        # One epoch's validation
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion,
                                epoch=epoch)

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0
            # Save checkpoint
            print(" *** saving model with bleu score: ", recent_bleu4)
            save_checkpoint(data_name, epoch, epochs_since_improvement,
                            encoder, decoder, encoder_optimizer,
                            decoder_optimizer, recent_bleu4, is_best)

    print(" *** LAST EPOCH saving model with bleu score: ", recent_bleu4)
    save_checkpoint(data_name, epoch, epochs_since_improvement, encoder,
                    decoder, encoder_optimizer, decoder_optimizer,
                    recent_bleu4, is_best)
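
Example 3 calls a save_checkpoint helper that is not shown above. A minimal, hypothetical sketch, assuming the checkpoint is simply a dict holding the objects main() passes in, using the same keys the resume branch reads or references ('encoder', 'decoder', plus the commented-out 'epoch', 'epochs_since_improvement' and 'bleu-4' fields), with an extra 'BEST_' copy when is_best is set:

import torch


def save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, bleu4, is_best):
    """Hypothetical sketch: persist everything main() needs to resume or evaluate."""
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'bleu-4': bleu4,
             'encoder': encoder,
             'decoder': decoder,
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    filename = 'checkpoint_' + data_name + '.pth.tar'
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-scoring model so it is never overwritten
        torch.save(state, 'BEST_' + filename)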