# Example 1
def main(args):
    """Score generated definitions with a pretrained Encoder2Decoder model.

    Loads the word/sememe vocabularies, runs ``gen_score`` over the
    generation file one item at a time, and writes one score per line to
    ``args.output_path``. Returns 0 on success.
    """
    # Vocabulary mappings (token -> index) are stored as JSON.
    with open(args.word2idx_path, 'r') as fr:
        word2idx = json.load(fr)
    with open(args.sememe2idx_path, 'r') as fr:
        sememe2idx = json.load(fr)

    dataset = ResDataset(args.gen_file_path, word2idx, sememe2idx)
    res_loader = data.DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    use_cuda = torch.cuda.is_available()

    def _load_emb(path):
        # Pretrained embedding matrix saved with numpy; move to GPU if possible.
        emb = torch.Tensor(np.load(path))
        return emb.cuda() if use_cuda else emb

    pretrained_word_emb = _load_emb(args.pretrained_word_emb_path)
    pretrained_sememe_emb = _load_emb(args.pretrained_sememe_emb_path)

    # Build the model and restore the pretrained weights.
    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)
    if use_cuda:
        adaptive.cuda()
    adaptive.load_state_dict(torch.load(args.pretrained))

    scores = gen_score(adaptive, res_loader)
    with codecs.open(args.output_path, 'w', 'utf-8') as fw:
        fw.write('\n'.join(scores))
    return 0
# Example 2
def main(args):
    """Evaluate an Encoder2Decoder captioner on the test split.

    Optionally restores weights from ``args.start_from`` and prints the
    resulting CIDEr score.
    """
    if args.gpu is not None:
        # Restrict visible devices; let cuDNN autotune kernels.
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        torch.backends.cudnn.benchmark = True

    # Seed CPU (and GPU, when present) RNGs for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Vocabulary wrapper is a pickled object.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    adaptive = Encoder2Decoder(args, len(vocab))
    if torch.cuda.is_available():
        adaptive.cuda()

    # Resume weights only when a checkpoint file actually exists.
    start_from = vars(args).get('start_from', None)
    if start_from is not None and os.path.isfile(start_from):
        adaptive.load_state_dict(torch.load(start_from))

    cider, metrics = coco_eval(adaptive, args, 0, split='test')
    print('Testing Model: CIDEr score %.2f' % cider)
    def main(self):
        """Load vocab + pretrained model, then caption the COCO val and test
        image sets via ``self.eval`` (Python 2 code: print statements)."""
        print "********************Overhead Operations***************************"

        # Vocabulary wrapper is a pickled object.
        with open(self.vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)

        # Image transformation
        # ImageNet-style normalization; see https://github.com/pytorch/vision#models
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load model
        print "Loading model {}......".format(self.pretrained)
        self.model = Encoder2Decoder(256, len(self.vocab), 512)
        self.model.load_state_dict(torch.load(self.pretrained))
        # eval() before cuda(): order is harmless but deliberate here.
        self.model.eval()
        if torch.cuda.is_available():
            self.model.cuda()
        print "Model loaded!"

        print "********************Validation Phase***************************"

        # NOTE(review): data paths are hard-coded relative to the CWD — confirm
        # the script is always launched from the project root.
        images_path = './data/resized/val2014/'
        caption_path = './data/annotations/captions_val2014.json'

        self.eval(images_path, caption_path, self.args.val_saved_name)

        print "********************Test Phase***************************"

        images_path = './data/resized/test2014/'
        caption_path = './data/annotations/image_info_test2014.json'

        self.eval(images_path, caption_path, self.args.test_saved_name)
# Example 4
def main(args):
    """Run beam-search caption inference over a directory of images and dump
    the results as a JSON list of ``{'image_id', 'caption'}`` records.

    NOTE(review): uses the Python 2-only builtin ``unicode`` at the end and the
    deprecated ``transforms.Scale`` — this example will not run under Python 3
    as written.
    """
    # Vocabulary wrapper is a pickled object.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)
    adaptive.load_state_dict(torch.load(args.pretrained))
    if torch.cuda.is_available():
        adaptive.cuda()
    adaptive.eval()

    # Resize (no random crop — inference) + ImageNet normalization.
    transform = transforms.Compose([
        transforms.Scale((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    inference_data_loader = torch.utils.data.DataLoader(
        InferenceLoader(args.image_dir, img_transform=transform),
        batch_size=args.eval_size,
        shuffle=False,
        num_workers=args.num_workers,
        drop_last=False)
    results = []
    print(
        '---------------------Start Inference on AI-challenger dataset-----------------------'
    )

    for i, (images, file_prefix) in enumerate(inference_data_loader):
        images = to_var(images)
        generated_captions = adaptive.sampler_beam_search(
            images, args.beam_size)

        # Decode token ids until the first '<end>' marker.
        sampled_caption = []
        #_generated_captions=generated_captions.cpu().data.numpy()
        for word_id in generated_captions:
            #print(word_id.int())
            word = vocab.idx2word[int(word_id.cpu().data.numpy())]
            if word == '<end>':
                break
            else:
                sampled_caption.append(word)

        # Drop the first token (presumably '<start>' — confirm) and join
        # without spaces (Chinese text).
        sentence = ''.join(sampled_caption[1:])
        temp = {'image_id': file_prefix[0], 'caption': sentence}
        results.append(temp)

        # Disp evaluation process
        if (i + 1) % 10 == 0:
            print('[%d/%d]' % ((i + 1), len(inference_data_loader)))

    #json.dump(results,open(args.inference_output_json,"w"),ensure_ascii=False,sort_keys=True, indent=2, separators=(',', ': '))
    # ensure_ascii=False keeps non-ASCII captions readable in the output file.
    with io.open(args.inference_output_json, 'w', encoding='utf-8') as fd:
        fd.write(
            unicode(
                json.dumps(results,
                           ensure_ascii=False,
                           sort_keys=True,
                           indent=2,
                           separators=(',', ': '))))
def main(args):
    """Decode definitions for the test set with a pretrained model.

    Uses greedy decoding when ``args.beam_size == 1`` and beam search
    otherwise; writes ``word ||| sememes ||| definition`` lines to
    ``args.output_path``. Returns 0 on success.
    """
    # Seed RNGs so decoding runs are reproducible.
    torch.manual_seed(args.seed)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    def _read_json(path):
        # Index <-> token mappings are stored as JSON.
        with open(path, 'r') as fr:
            return json.load(fr)

    word2idx = _read_json(args.word2idx_path)
    idx2word = _read_json(args.idx2word_path)
    idx2sememe = _read_json(args.idx2sememe_path)

    # Build test data loader (no shuffling at inference time).
    test_loader = get_loader(args.test_path,
                             args.test_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             mode='test')

    def _load_emb(path):
        # Pretrained embedding matrix saved with numpy; move to GPU if possible.
        emb = torch.Tensor(np.load(path))
        return emb.cuda() if use_cuda else emb

    pretrained_word_emb = _load_emb(args.pretrained_word_emb_path)
    pretrained_sememe_emb = _load_emb(args.pretrained_sememe_emb_path)

    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)
    if use_cuda:
        adaptive.cuda()
    adaptive.load_state_dict(torch.load(args.pretrained))

    # beam_size == 1 degenerates to greedy decoding.
    sampler = greedy_sampler if args.beam_size == 1 else beam_sampler
    results = sampler(adaptive, test_loader, idx2word, idx2sememe)

    with codecs.open(args.output_path, 'w', 'utf-8') as fw:
        for word, sememes, definition in results:
            fw.write('%s ||| %s ||| %s\n' % (word, sememes, definition))
    return 0
    def __init__(self, dictionaries, model_path):
        """Restore a trained Encoder2Decoder checkpoint and build the image
        preprocessing pipeline.

        ``dictionaries`` is a ``(seg2idx, idx2seg)`` pair of vocab mappings.
        """
        self.model_path = model_path
        self.seg2idx, self.idx2seg = dictionaries

        # Load the checkpoint onto CPU first, then move the model to GPU.
        self.model = Encoder2Decoder(256, len(self.seg2idx), 512)
        state_dict = torch.load(self.model_path, map_location='cpu')
        self.model.load_state_dict(state_dict)
        self.model.cuda()
        self.model.eval()

        # Fixed 224x224 resize + ImageNet-style normalization.
        crop_size = 224
        self.transform = transforms.Compose([
            transforms.Resize((crop_size, crop_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225)),
        ])
# Example 7
def main(args):
    """Train the Encoder2Decoder captioner with TensorBoard logging.

    Builds the data pipeline, optionally resumes model/optimizer/bookkeeping
    state from ``args.start_from``, fine-tunes the CNN once
    ``epoch > args.cnn_epoch``, checkpoints every epoch, and evaluates CIDEr
    on both the val and test splits, tracking the best epoch of each.
    Loops until ``epoch > 80``.
    """
    # All checkpoints/logs for this run live under log_<dataset>_<pattern>/<session>.
    args.checkpoint_path = os.path.join(
        'log_' + args.dataset + '_' + args.pattern, args.session)
    tb_summary_writer = SummaryWriter(args.checkpoint_path)
    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)
        torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        print('### CUDA is available!')
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.checkpoint_path):
        os.makedirs(args.checkpoint_path)
    if not os.path.exists('data'):
        os.mkdir('data')

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
        args.vocab = vocab

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    # Resume bookkeeping (epoch counter, vocab, args) from a previous run.
    infos = {}
    if args.start_from is not None:
        with open(
                os.path.join(args.start_from,
                             'infos_' + args.dataset + '.pkl')) as f:
            infos = cPickle.load(f)
            # saved_model_opt = infos['args']
            # need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            # for checkme in need_be_same:
            #     assert vars(saved_model_opt)[checkme] == vars(args)[
            #         checkme], "Command line argument and saved model disagree on '%s' " % checkme
    # NOTE(review): resume reads 'model.pth' here, but the loop below saves
    # 'model.pkl' — the filenames disagree; confirm which is intended.
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "model.pth")):
        adaptive.load_state_dict(
            torch.load(os.path.join(args.start_from, 'model.pth')))

    epoch = infos.get('epoch', 1)

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(adaptive.encoder.vgg_conv.children())
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]

    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "cnn_optimizer.pth")):
        cnn_optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'cnn_optimizer.pth')))

    # Other parameter optimization
    params = list(adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    best_cider_test = 0.0
    best_epoch_test = 0
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'optimizer.pth')))

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    # NOTE(review): update_lr_flag is never set back to True inside the loop,
    # so the decay branch below runs at most once per process — confirm this
    # is the intended schedule.
    update_lr_flag = True
    while True:
        if update_lr_flag:
            if epoch > args.lr_decay:
                # NOTE(review): frac is measured from cnn_epoch, not lr_decay —
                # confirm this asymmetry is intentional.
                frac = (epoch -
                        args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)

                # Decay the learning rate
                learning_rate = learning_rate * decay_factor
                for group in optimizer.param_groups:
                    group['lr'] = learning_rate
            update_lr_flag = False
        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' %
              (epoch))
        cur_time = time.time()
        for i, (images, captions, lengths) in enumerate(data_loader):
            start_time = time.time()
            # print('### images:', images.size())
            # print('### captions:', captions.size())
            # print('### lengths:', len(lengths))
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            # Targets are the captions shifted left by one (next-token prediction).
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths, args.pattern)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.lstm_cell.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f, Elapsed: %.2fs' % \
                      (epoch, args.num_epochs,
                       i, total_step,
                       loss.item(),
                       np.exp(loss.item()),
                       time.time() - start_time))

                add_summary_value(tb_summary_writer, 'loss', loss.item(),
                                  epoch)
        print('##### Per Epoch Cost time: %.2fs' % (time.time() - cur_time))
        # Persist bookkeeping + optimizer/model state every epoch.
        # NOTE(review): saved as 'infos.pkl' but resume above reads
        # 'infos_<dataset>.pkl' — filenames disagree; confirm.
        infos['epoch'] = epoch
        infos['vocab'] = vocab
        infos['args'] = args
        with open(os.path.join(args.checkpoint_path, 'infos.pkl'), 'wb') as f:
            cPickle.dump(infos, f)
        torch.save(optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'optimizer.pth'))
        torch.save(cnn_optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'cnn_optimizer.pth'))
        torch.save(adaptive.state_dict(),
                   os.path.join(args.checkpoint_path, 'model.pkl'))
        # with open(os.path.join(args.checkpoint_path, 'histories.pkl'), 'wb') as f:
        #     cPickle.dump(infos, f)
        # Evaluation on validation set
        cider, metrics = coco_eval(adaptive, args, epoch, split='val')
        cider_scores.append(cider)
        add_summary_dict(tb_summary_writer, 'metrics', metrics, epoch)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

            # Save the Adaptive Attention model after each epoch
            # name = str(args.yml).split('.')[0].split('/')[-1]
            torch.save(adaptive.state_dict(),
                       os.path.join(args.checkpoint_path, 'model-best.pkl'))
            with open(os.path.join(args.checkpoint_path, 'infos-best.pkl'),
                      'wb') as f:
                cPickle.dump(infos, f)
        print('Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch, best_cider))

        # Test on test set
        # Temporarily swap the annotation path from 'val' to 'test', then restore.
        caption_val_path = args.caption_val_path
        args.caption_val_path = args.caption_val_path.replace('val', 'test')
        cider_test, metrics_test = coco_eval(adaptive,
                                             args,
                                             epoch,
                                             split='test')
        args.caption_val_path = caption_val_path
        if cider_test > best_cider_test:
            best_cider_test = cider_test
            best_epoch_test = epoch
        print('Test Phase: Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch_test, best_cider_test))

        epoch += 1
        # Hard cap: stop after epoch 80 regardless of args.num_epochs.
        if epoch > 80:
            break
# Example 8
def main(args):
    """Train the adaptive-attention captioner with CNN fine-tuning and
    CIDEr-based early stopping.

    Python 2 code (print statements, legacy PyTorch ``loss.data[0]``).
    Resumes from ``args.pretrained`` when given; the starting epoch is
    parsed from the checkpoint filename ('.../name-<epoch>.pkl').
    """

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:

        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1

    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(
        adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]

    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = list( adaptive.encoder.affine_a.parameters() ) + list( adaptive.encoder.affine_b.parameters() ) \
                + list( adaptive.decoder.parameters() )

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):

        # NOTE(review): a fresh Adam is created every epoch, discarding the
        # optimizer's moment estimates — confirm this reset is intentional.
        optimizer = torch.optim.Adam(params, lr=learning_rate)

        # Language Modeling Training
        print '------------------Training for Epoch %d----------------' % (
            epoch)
        for i, (images, captions, lengths, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            # Targets are the captions shifted left by one (next-token prediction).
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:],
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay
            # NOTE(review): this multiplies learning_rate once per BATCH (not
            # per epoch), and takes effect only when next epoch's optimizer is
            # built — confirm the compounding decay is intended.
            if epoch > args.lr_decay:

                frac = (epoch -
                        args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)

                # Decay the learning rate
                learning_rate = learning_rate * decay_factor

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:

                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print 'Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f' % (
                    epoch, args.num_epochs, i, total_step, loss.data[0],
                    np.exp(loss.data[0]))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        cider = coco_eval(adaptive, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:

            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_max != best_cider:

                print 'No improvement with CIDEr in the last 6 epochs...Early stopping triggered.'
                print 'Model of best epoch #: %d with CIDEr score %.2f' % (
                    best_epoch, best_cider)
                break
def main(args):
    """Load a checkpoint and run a single Flickr visualization pass.

    Despite the training-style scaffolding (transforms, optimizer, loss),
    the epoch loop breaks immediately after ``start_epoch``, so this is
    effectively evaluation-only: exactly one ``Flickr_visual`` call.
    """
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             args.det_file,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:

        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1

    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    # NOTE(review): cnn_optimizer / params / LMcriterion below are built but
    # never stepped — dead scaffolding left over from the training variant.

    ch = list(adaptive.encoder.resnet_conv.children())
    #for i in range(len(ch)):
    #    print i,'th:',ch[i]
    #cnn_subs = list(adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]

    cnn_subs = list(adaptive.encoder.resnet_conv.children())

    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]

    #print  "cnn_params",cnn_params
    cnn_params = [item for sublist in cnn_params for item in sublist]

    #print "cnn_params",cnn_params
    #www
    cnn_optimizer = torch.optim.Adam(cnn_params,
                                     lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = list(adaptive.encoder.affine_a.parameters()) + list(
        adaptive.encoder.affine_b.parameters()) + list(
            adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # NOTE(review): this overrides any weights loaded from args.pretrained
    # above — confirm checkpoint_file is the authoritative checkpoint.
    adaptive.load_state_dict(torch.load(args.checkpoint_file))
    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):
        # Break after the first iteration: only one evaluation is performed.
        if epoch > start_epoch:
            break

        # Evaluation on validation set
        Flickr_visual(adaptive, args, epoch)
# Example 10
def main(args):
    """Train the definition-generation Encoder2Decoder model.

    Trains with cross-entropy (padding index 0 ignored), checkpoints every
    epoch, evaluates validation perplexity via ``defseq_eval``, and early-stops
    when validation PPL has not improved for 6 consecutive epochs.

    NOTE(review): uses legacy PyTorch APIs (``loss.data[0]``, the deprecated
    ``clip_grad_norm``) — this example targets PyTorch < 0.4.
    """

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load word2idx
    with open(args.word2idx_path, 'r') as fr:
        word2idx = json.loads(fr.read())

    # Build training data loader
    data_loader = get_loader(args.train_path,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers,
                             mode='train')

    # Load pretrained embeddings
    if torch.cuda.is_available():
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path)).cuda()
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path)).cuda()
    else:
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path))
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path))

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #,
        # note that model is named as
        # '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Will decay later
    # learning_rate = args.learning_rate

    # Language Modeling Loss
    # ignore_index=0: padding tokens do not contribute to the loss.
    LMcriterion = nn.CrossEntropyLoss(ignore_index=0)

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    ppl_scores = []
    # best_ppl == 0.0 doubles as an "unset" sentinel (see comparison below).
    best_ppl = 0.0
    best_epoch = 0

    # Start Learning Rate Decay
    # if epoch > args.lr_decay:

    #     frac = float(epoch -
    #                     args.lr_decay) / args.learning_rate_decay_every
    #     decay_factor = math.pow(0.5, frac)

    #     # Decay the learning rate
    #     learning_rate = args.learning_rate * decay_factor

    # print('Learning Rate for Epoch %d: %.6f' % (epoch, learning_rate))

    # Only optimize parameters that require gradients (frozen embeddings stay put).
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        adaptive.parameters()),
                                 lr=args.learning_rate,
                                 betas=(args.alpha, args.beta))

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):

        epoch_loss = []

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' %
              (epoch))
        for i, (word, sememes, definition) in enumerate(data_loader):
            # Set mini-batch dataset
            word = to_var(word)
            sememes = to_var(sememes)
            definition = to_var(definition)
            # Targets are the definition tokens shifted left by one.
            targets = definition[:, 1:]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            scores, _ = adaptive(word, sememes, definition)
            # Drop the last time step and move the class dim to position 1,
            # as CrossEntropyLoss expects (batch, classes, seq).
            scores = scores[:, :-1, :].transpose(1, 2)

            # Compute loss and backprop
            loss = LMcriterion(scores, targets)
            epoch_loss.append(loss.data[0])
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            # for p in adaptive.decoder.LSTM.parameters():
            #     p.data.clamp_(-args.clip, args.clip)

            clip_grad_norm(
                filter(lambda p: p.requires_grad, adaptive.parameters()),
                args.clip)
            # print(args.clip)
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))
        train_loss = np.mean(epoch_loss)
        train_ppl = np.exp(train_loss)
        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        valid_ppl = defseq_eval(adaptive, args, epoch)
        ppl_scores.append(valid_ppl)

        print(
            'Epoch [%d/%d], Train Loss: %.4f, Train PPL: %5.4f, Valid PPL: %5.4f'
            % (epoch, args.num_epochs, train_loss, train_ppl, valid_ppl))

        # Lower perplexity is better; 0.0 means "no best recorded yet".
        if valid_ppl < best_ppl or best_ppl == 0.0:
            best_ppl = valid_ppl
            best_epoch = epoch

        if len(ppl_scores) > 5:
            last_6 = ppl_scores[-6:]
            last_6_min = min(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_min != best_ppl:

                print(
                    'No improvement with ppl in the last 6 epochs...Early stopping triggered.'
                )
                print('Model of best epoch #: %d with ppl score %.2f' %
                      (best_epoch, best_ppl))
                break