Code example #1
File: modules.py Project: ruchikachavhan/SLP
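# Value network for caption generation: an EncoderCNN embeds the image, a weight-normalized
# MLP scores each generated-word embedding with a tanh-bounded value, and loss() regresses
# those values against rewards with smooth L1.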
class ValueNet(nn.Module):
	def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
		super(ValueNet, self).__init__()
		self.embed_size = embed_size
		self.vocab_size = vocab_size
		self.hidden_size = hidden_size
		self.vocab = vocab
		self.CNNv = EncoderCNN(embed_size)
		self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
		self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size*2 , embed_size))
		self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
		self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
		self.relu = nn.LeakyReLU(0.2, inplace = True)
		self.norm1 = nn.LayerNorm(embed_size)
		self.norm2 = nn.LayerNorm(embed_size)

	def features_extract(self,images):
		features = self.CNNv(images)
		features = features.unsqueeze(1)
		return features
	# Returns a value estimate for each position of the generated caption, given the image
	def forward(self, images, generated_embed):
		features = self.features_extract(images)
		generated_embed = generated_embed.to(device)
		bs, max_seq,_, _, _ = generated_embed.shape
		value = torch.zeros(generated_embed.shape[1], 1)
		in_features = features.to(device)
		for index in range(0, generated_embed.shape[1]):
			captions_in = generated_embed[:, index, :, :, :]
			# captions_in = captions_in.view(bs, 1, -1).to(device)
			captions_in = captions_in.squeeze(0)
			input = torch.cat((in_features, captions_in), 2)
			fc_1 = self.relu(self.norm1(self.fc1(input)))
			fc_1 = self.relu(self.norm2(self.fc2(fc_1)))
			fc_1 = torch.tanh(self.fc3(fc_1))
			value[index] = fc_1
			in_features = generated_embed[:, index, :, :, :].squeeze(0)
		return value

	#Loss for this network
	def loss(self, image, reward, captions):
		value = self.forward(image, captions)
		value = value.to(device)
		reward = reward.to(device)
		Loss = F.smooth_l1_loss(value, reward)
		return Loss	

	# Embeds the ground-truth captions with RNNv and returns the resulting hidden states
	def captions_hidden_state(self, caption):
		in_captions = self.RNNv.embed(caption)
		hiddens, hiddens_list = self.RNNv.forward_captions(in_captions)
		return hiddens, hiddens_list
Code example #2
File: train.py Project: BoomHaha/Image_caption
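# COCO image-captioning training loop: a CNN encoder feeds a DecoderRNN, only the encoder's
# fc/bn1d layers and the decoder are optimized, and a checkpoint with epoch/step counters is
# saved every `save_model_step` steps.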
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu:', args.gpu)
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)
    # preprocess
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.random_crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # dataset
    coco_loader = get_dataloader(root=args.dataset_path, json_path=args.json_path, vocab=vocab, batch_size=args.batch_size, num_workers=args.num_workers,
                                 transform=preprocess, shuffle=False)
    # models
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).cuda()
    loss_cls = nn.CrossEntropyLoss().cuda()
    params = list(encoder.fc.parameters()) + list(encoder.bn1d.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    # resume
    if args.resume:
        model_states = torch.load(os.path.join(args.save_model_path, 'model.ckpt'))
        print('checkpoint epoch: %d\tstep: %d' % (model_states['epoch'], model_states['step']))
        encoder.load_state_dict(model_states['encoder'])
        decoder.load_state_dict(model_states['decoder'])
        print('load successfully')
    # train
    total_step = len(coco_loader)
    print('total step in each epoch : ', total_step)
    encoder.fc.train(mode=True)
    encoder.bn1d.train(mode=True)
    encoder.encoder.eval()
    decoder.train(mode=True)
    input('ready')
    for cur_epoch in range(args.num_epochs):
        for cur_step, (image, caption, length) in enumerate(coco_loader):
            image = image.cuda()
            caption = caption.cuda()
            target = pack_padded_sequence(caption, length, batch_first=True)[0]
            out = decoder(encoder(image), caption, length)
            loss = loss_cls(out, target)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            optimizer.step()
            if (cur_step + 1) % args.print_step == 0:
                print('Epoch : %d/%d\tStep : %d/%d\tLoss : %.8f\tPerplexity : %.8f' % (
                    cur_epoch + 1, args.num_epochs, cur_step + 1, total_step, loss.item(), np.exp(loss.item())))
            if (cur_step + 1) % args.save_model_step == 0:
                torch.save({'epoch': cur_epoch + 1, 'step': cur_step + 1, 'encoder': encoder.state_dict(), 'decoder': decoder.state_dict()},
                           os.path.join(args.save_model_path, 'model.ckpt'))
                print('model saved at E:%d\tS:%d' % (cur_epoch + 1, cur_step + 1))
Code example #3
File: modules.py Project: ruchikachavhan/SLP
	def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
		super(ValueNet, self).__init__()
		self.embed_size = embed_size
		self.vocab_size = vocab_size
		self.hidden_size = hidden_size
		self.vocab = vocab
		self.CNNv = EncoderCNN(embed_size)
		self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
		self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size*2 , embed_size))
		self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
		self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
		self.relu = nn.LeakyReLU(0.2, inplace = True)
		self.norm1 = nn.LayerNorm(embed_size)
		self.norm2 = nn.LayerNorm(embed_size)
Code example #4
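# English-French seq2seq training: builds a Lang vocabulary for each language from a parallel
# text file, pairs the sentences, and runs trainIters for 75,000 iterations.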
def main():
    lang1 = "eng"
    lang2 = "fra"
    f = open("../data/data/" + lang1 + "-" + lang2 + ".txt", encoding='utf-8')
    print(f)
    lines = f.readlines()
    eng_sentences, fra_sentences = data_loaders.getSentences(lines)
    print(len(eng_sentences), len(fra_sentences))
    eng_lang = Lang(lang1)
    eng_lang.parseSentences(eng_sentences)
    fra_lang = Lang(lang2)
    fra_lang.parseSentences(fra_sentences)
    print("No of eng words: ", len(eng_lang.vocab))
    print("No of fra words: ", len(fra_lang.vocab))
    pairs = data_loaders.createPairs(eng_sentences, fra_sentences)
    print("Length of pairs: ", len(pairs))

    hidden_size = 256
    encoder1 = EncoderRNN(len(eng_lang.vocab), hidden_size).to(device)
    attn_decoder1 = DecoderRNN(len(fra_lang.vocab), hidden_size,
                               len(fra_lang.vocab)).to(device)

    train.trainIters(encoder1,
                     attn_decoder1,
                     75000,
                     pairs,
                     eng_lang,
                     fra_lang,
                     print_every=5000)
Code example #5
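# Video-captioning inference: builds an S2VTModel or S2VTAttModel from opt, wraps it with a
# 'nasnetalarge' CNN front end (ConvS2VT), and decodes a caption for each input video file.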
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            # bp ---
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
Code example #6
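# S2VT-with-attention training setup: EncoderRNN over precomputed video features, DecoderRNN
# over words, LanguageModelCriterion, Adam, and a StepLR decay schedule.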
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    #model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #7
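# Chinese-caption validation: rebuilds the attention model with a hard-coded combined
# vocabulary size (13491), loads the saved weights on CPU, and runs test().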
def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    opt["vocab_size"] = 13491  #dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"],
                         opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"],
                         opt["max_len"],
                         opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Setup the model
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #8
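# Caption a single video from the command line: extract frame features, load a trained
# S2VTAttModel (vocab size hard-coded to 16860), and decode one sentence.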
def main(opt):
    video_path = opt["video_path"]

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)

    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                            input_dropout_p=opt["input_dropout_p"],
                            rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()

    model.load_state_dict(torch.load(opt["saved_model"]))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
Code example #9
File: eval.py Project: htt98/AI_View
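# Evaluation script: rebuild the chosen model (S2VTModel or S2VTAttModel), wrap it in
# nn.DataParallel, load the saved weights, and generate captions on the test split.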
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
Code example #10
File: eval.py Project: Blues5/video-caption-pytorch
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #11
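# Resumes training of a video-captioning model from a saved DataParallel checkpoint
# (data/save_vatex_batch_noc3d/model_500.pth), keeping the validation dataset and
# dataloader in globals.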
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True, num_workers=0, pin_memory=True)
    global dataset_val
    global dataloader_val
    dataset_val =  VideoDataset(opt, 'val')
    dataloader_val = DataLoader(dataset_val, batch_size=opt["batch_size"], shuffle=True, num_workers=0, pin_memory=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load('data/save_vatex_batch_noc3d/model_500.pth'))
    crit = utils.LanguageModelCriterion()
    # rl_crit is referenced by train() below but was missing in this snippet; assumed to be
    # the same RewardCriterion used in the other training examples.
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),lr=opt["learning_rate"],weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=opt["learning_rate_decay_every"],gamma=opt["learning_rate_decay_rate"])
    print("Data Loaded")
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #12
File: modules.py Project: ruchikachavhan/SLP
	def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
		super(PolicyNet, self).__init__()
		self.embed_size = embed_size
		self.vocab_size = vocab_size
		self.hidden_size = hidden_size
		self.vocab = vocab
		self.CNNp = EncoderCNN(embed_size)
		self.RNNp = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
Code example #13
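# Caption a single image: restore encoder/decoder weights from a checkpoint, preprocess the
# image, sample a word-id sequence from the decoder, and convert it back to text.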
def main(args):
  
  vocab = load_vocab()
  
  encoder = CNNEncoder()
  decoder = DecoderRNN(512,512,len(vocab))
  
  encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(args.checkpoint_file,False)
  encoder.load_state_dict(encoder_state_dict)
  decoder.load_state_dict(decoder_state_dict)
  
  if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()
    
  transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
  
  inp = cv2.imread(args.image_path)
  inp = transform(Image.fromarray(inp)).unsqueeze(0)
  inp = utils.to_var(inp, volatile=True)
  
  features = encoder(inp)
  sampled_ids = decoder.sample(features)
  
  sampled_ids = sampled_ids.cpu().data.numpy()[0]
  sentence = utils.convert_back_to_text(sampled_ids, vocab)
  
  print('Caption:', sentence)
Code example #14
File: train.py Project: jackson1895/beautifulday
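# Training variant with an extra Two_Lstm stage between the EncoderRNN and DecoderRNN
# (S2VTAttModel takes three modules here); a test dataset is built up front and passed to
# train() for evaluation.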
def main(opt):
    opt_test = opt
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)
Code example #15
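# Compact test-split evaluation: rebuild the encoder-decoder model, wrap it in DataParallel,
# load the saved weights, and run test() with the dataset vocabulary.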
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],input_dropout_p=opt["input_dropout_p"],rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #16
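# Resumes S2VT/S2VTAttModel training from a saved checkpoint (vgg16_model_460.pth), which is
# loaded after the criterion, optimizer, and scheduler are created.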
def main(opt):

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=8,
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    model.load_state_dict(
        torch.load(
            "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"
        ))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #17
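# Partial training setup: builds EncoderCNN/DecoderRNN, collects their trainable parameters,
# prepares the transform and data loader, and creates an SGD optimizer (the training loop
# itself is not shown in this snippet).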
def main(args):

    print("Process %s, running on %s: starting (%s)" % (
        os.getpid(), os.name, time.asctime()))

    encoder = EncoderCNN()
    decoder = DecoderRNN()
    if torch.cuda.is_available() and args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    encoder_trainables = [p for p in encoder.parameters() if p.requires_grad]
    decoder_trainables = [p for p in decoder.parameters() if p.requires_grad]

    params = encoder_trainables + decoder_trainables

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    data_loader = trainloader(transform=transform)
    optimizer = torch.optim.SGD(params=params, lr=args.lr, momentum=0.9)
Code example #18
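# Speech-recognition inference setup: load a Korean syllable vocabulary, build a 3-layer
# bidirectional LSTM encoder and 2-layer LSTM decoder (Seq2Seq), and restore a trained
# checkpoint onto the global `model`.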
def setup():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    global model
    global device

    char2index, index2char = label_loader.load_label_json(
        "../data/kor_syllable_zeroth.json")
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    print(f"device: {device}")

    input_size = int(161)
    enc = EncoderRNN(input_size,
                     512,
                     n_layers=3,
                     dropout_p=0.3,
                     bidirectional=True,
                     rnn_cell='LSTM',
                     variable_lengths=False)

    dec = DecoderRNN(len(char2index),
                     128,
                     512,
                     512,
                     SOS_token,
                     EOS_token,
                     n_layers=2,
                     rnn_cell='LSTM',
                     dropout_p=0.3,
                     bidirectional_encoder=True)

    model = Seq2Seq(enc, dec).to(device)

    model_path = "../models/zeroth_korean_trimmed/LSTM_512x3_512x2_zeroth_korean_trimmed/final.pth"
    print("Loading checkpoint model %s" % model_path)
    state = torch.load(model_path, map_location=device)
    model.load_state_dict(state['model'])
    print('Model loaded')
Code example #19
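# Trains either S2VTModel or a Vid2seq encoder-decoder with attention on train/val splits,
# creating the checkpoint directory if it does not exist.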
def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.vocab_size
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset,
                                batch_size=opt.batch_size,
                                shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "Vid2seq":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             use_attention=True,
                             rnn_dropout_p=opt.rnn_dropout_p)
        model = Vid2seq(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)
Code example #20
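    # GUI callback variant of the single-video captioning demo: captions the selected video,
    # translates the result to Indonesian (dest='id'), updates the UI labels, and passes both
    # strings to text-to-speech.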
    def main(self, opt):
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        video_path = self.ent1.get().replace("/", "\\")
        image_feats = self.extract_image_feats(video_path)
        image_feats = torch.from_numpy(image_feats).type(
            torch.FloatTensor).unsqueeze(0)

        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(16860,
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=bool(opt["bidirectional"]))
        model = S2VTAttModel(encoder, decoder).cuda()
        model.load_state_dict(torch.load("data/save/model_500.pth"))
        model.eval()
        opt = dict()
        opt['child_sum'] = True
        opt['temporal_attention'] = True
        opt['multimodel_attention'] = True
        with torch.no_grad():
            _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
        vocab = json.load(open('data/info.json'))['ix_to_word']
        self.sent = NLUtils.decode_sequence(vocab, seq_preds)
        hasil = self.translator.translate(self.sent[0], dest='id')
        print(self.sent[0])
        self.hasilPred.configure(text=self.sent[0])
        self.hasiltrans.configure(text=hasil.text)
        # coba = self.sent[0]
        self.textToSpeech(self.sent[0], hasil.text)
        del seq_preds
        torch.cuda.empty_cache()
Code example #21
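# Tests an object/relation prediction variant: the DecoderRNN takes separate object and
# relation vocabulary sizes, and evaluation uses utils.ObjRelCriterion().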
def main(opt):
    dataset_test = VideoDataset(opt, 'test')
    dataloader_test = DataLoader(dataset_test,
                                 batch_size=opt["batch_size"],
                                 shuffle=False)
    opt["obj_vocab_size"] = dataset_test.get_obj_vocab_size()
    opt["rel_vocab_size"] = dataset_test.get_rel_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["obj_vocab_size"],
                             opt["rel_vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    model = model.cuda()
    model.load_state_dict(torch.load(opt['ckpt_path']))
    crit = utils.ObjRelCriterion()
    test(model, crit, opt, dataloader_test)
Code example #22
File: eval.py Project: sadari1/VideoCaptioningAttack
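# Evaluation script that switches to DataParallel when multiple GPUs are detected and
# otherwise runs on a single CUDA or CPU device before loading the saved model and scoring
# the test split.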
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #23
File: sample.py Project: BoomHaha/Image_caption
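# Caption a single image with a saved checkpoint: rebuild encoder/decoder, load the state
# dicts, sample word ids, and join the decoded words until '<end>' is reached.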
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)
    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('load successfully at\tepoch:%d\tstep:%d' %
          (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()
    # image
    img = load_image(args.img_path, preprocess).cuda()
    outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)
    # caption
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
Code example #24
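# NSML speech-recognition training entry point: parses the hyperparameters, builds a
# GRU-based EncoderRNN/DecoderRNN Seq2seq model, then runs the epoch loop with per-epoch
# evaluation, best-model checkpointing, and wall-clock timing.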
def main():

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # lnw add get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # lnw  valid_ratio=0.05 ->  valid_ratio=0.1  or 0.03
    #train_batch_num, train_dataset_list, valid_dataset = split_dataset(args, wav_paths, script_paths, valid_ratio=0.05)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    #lnw add
    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    #lnw block
    #logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        #lnw add
        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        #lnw modified  print_batch 10 -> 100, 450
        #train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 450,
                                      args.teacher_forcing)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss

            #lnw add. save best model
            torch.save(model, 'ModelBestSave.pt')

        #lnw end time, duration
        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end), "Duration:",
              str(lepoch_end - lepoch_start), "SratTime-NowTime:",
              str(lepoch_end - lstart_time))

    #lnw add
    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
Code example #25
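# Image-captioning train/validation loop: trains EncoderCNN + DecoderRNN with cross-entropy,
# tracks average losses per epoch, and saves both state dicts whenever validation loss
# improves.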
def train(n_epochs, train_loader, valid_loader, save_location_path, embed_size,
          hidden_size, vocab_size):

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # This is to make sure that the 1st loss is  lower than sth and
    # Save the model according to this comparison
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs + 1):

        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        for data in valid_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))

            valid_loss += loss.item() * images.size(0)

        # Average losses, computed once per epoch after the validation loop
        train_loss = train_loss / len(train_loader)
        valid_loss = valid_loss / len(valid_loader)

        print(
            f"Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}"
        )

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min} --> {valid_loss}).  Saving model ..."
            )
            torch.save(encoder.state_dict(),
                       save_location_path + f'/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       save_location_path + f'/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
Code example #26
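# Inference-time setup: rebuilds EncoderRNN, DecoderRNN, and an Attention_layer from saved
# word dictionaries, puts them in eval mode, and loads pickled weights (the maxlen_q/maxlen_a
# names suggest a question-answering model).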
import pickle

from torch import nn, optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from models import Attention_layer, EncoderRNN, DecoderRNN
# load words dictionary
with open("word_index_dict", "rb") as f:
    word_index_dict = pickle.load(f)

with open("index_word_dict", "rb") as f:
    index_word_dict = pickle.load(f)

maxlen_q, maxlen_a = 19, 19
# build the model now
encoder = EncoderRNN(len(word_index_dict) + 1, 1024, 1024)  #.cuda()
decoder = DecoderRNN(1024, 1024, len(index_word_dict) + 2)  #.cuda()
attention = Attention_layer(maxlen_q + 1)  #.cuda()
encoder.eval()
decoder.eval()
attention.eval()
params_encoder,params_decoder,params_attention=\
    list(encoder.parameters()),list(decoder.parameters()),list(attention.parameters())

# load weights into model
with open("weights/encoder", "rb") as f:
    weights_encoder = pickle.load(f)

with open("weights/decoder", "rb") as f:
    weights_decoder = pickle.load(f)

with open("weights/attention", "rb") as f:
Code example #27
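# NSML speech hackathon baseline training script, like example #24, with an extra --feature
# flag for choosing between mel and log-mel feature extraction.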
def main():

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    parser.add_argument(
        '--feature',
        type=str,
        default='mel',
        help='select feature extraction function. mel or log_mel ')

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py ; N_FFT = size of the Fourier Transform
    feature_size = N_FFT / 2 + 1  # N_FFT size = 512

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # make tensors able to be computed on multiple devices in parallel and copy tensors to GPU
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # val ratio can be adjusted -> 10% ??
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        # load train data
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # train epoch
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))

        train_loader.join()

        # eval for each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
Code example #28
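# Multi-language speech recognition training: selects a Korean or English label set, builds
# the same GRU-based Seq2seq model, downloads TIMIT, and reads the training file list from a
# CSV.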
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='speech recognition for multi language')
    parser.add_argument('--language',
                        type=str,
                        default='english',
                        help='target language')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='name under which the model is saved (nsml or local)')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    if args.language == 'korean':
        char2index, index2char = label_loader.load_label(
            'korean.labels', args.language)
    else:
        char2index, index2char = label_loader.load_label(
            'english.json', args.language)
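    # special token indices used by the decoder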
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py; a spectrogram frame has N_FFT // 2 + 1 frequency bins
    feature_size = N_FFT // 2 + 1

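    # encoder: GRU over spectrogram frames (hidden size doubles when bidirectional)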
    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

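    # DataParallel wrapper; the underlying model stays reachable via model.module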
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
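    # sum-reduced cross entropy; padded target positions are skipped via ignore_index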
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.mode != "train":
        return

    download_TIMIT()

    train_paths = np.loadtxt("dataset/TRAIN_list.csv",
                             delimiter=',',
                             dtype=str)
    valid_paths = np.loadtxt("dataset/TEST_developmentset_list.csv",
                             delimiter=',',
                             dtype=str)
    test_paths = np.loadtxt("dataset/TEST_coreset_list.csv",
                            delimiter=',',
                            dtype=str)

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset, test_dataset = split_dataset(
        args, train_paths, valid_paths, test_paths)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        best_model = (eval_loss < best_loss)

        if best_model:
            best_loss = eval_loss
コード例 #29
0
def main(opt):
    dataset = VideoDataset(opt, 'train')

    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    elif opt["model"] == "CTCmodel":
        # input_dim, hidden_dim, output_dim, num_layers, biFlag, dropout = 0.5
        # model = CTCmodel(opt["dim_vid"],opt["dim_hidden"],opt["vocab_size"]+1)
        model = CTCmodel(opt['vocab_size'], opt['dim_hidden'])
    elif opt["model"] == "CTC_Hieratical_LSTM":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = two_lstm(
            opt["dim_hidden"] * 2,
            opt['vocab_size'],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        model = CTC_Hieratical_LSTM(encoder, second_lstm, opt['vocab_size'],
                                    opt['dim_word'], opt['dim_hidden'],
                                    opt['duration'], opt['video_duration'])

    # model = model.cuda()
    # crit = utils.LanguageModelCriterion()
    # rl_crit = utils.RewardCriterion()
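    # CTC loss over the frame-level logits, averaged over the batch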
    ctc_loss = nn.CTCLoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    if opt['lr_schluder'] == 'StepLR':
        lr_scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=opt["learning_rate_decay_every"],
            gamma=opt["learning_rate_decay_rate"])
    elif opt['lr_schluder'] == 'ReduceLROnPlateau':
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=opt['patience'],
            verbose=True,
            threshold_mode='rel',
            threshold=opt['threshold'],
            cooldown=0,
            min_lr=opt['min_lr'],
            eps=1e-8)
    else:
        raise NotImplementedError('Only implement ReduceLROnPlateau | StepLR')
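    # set check_bool to True to resume from a checkpoint saved under opt['check_path']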
    opt['check_bool'] = False
    if opt['check_bool']:
        check_path = os.path.join(opt['check_path'], 'model_10.pth')
        model.load_state_dict(torch.load(check_path))
        opt['root_model_path'] = opt['check_path']
        print('have loaded model info from:', check_path)
        #TODO: resume training from this checkpoint
        val(model, ctc_loss, opt)
    else:
        # build the run directory name once so the opt file and the model
        # directory always share the same timestamp
        timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
        root_model_path = os.path.join(opt['checkpoint_path'], timestamp)
        opt_json = os.path.join(root_model_path, 'opt_info.json')
        opt['root_model_path'] = root_model_path
        if not os.path.isdir(opt["checkpoint_path"]):
            os.mkdir(opt["checkpoint_path"])
        if not os.path.isdir(root_model_path):
            os.mkdir(root_model_path)
        with open(opt_json, 'w') as f:
            json.dump(opt, f)
        print('save opt details to %s' % opt_json)
    train(dataloader, model, ctc_loss, optimizer, lr_scheduler, opt)
コード例 #30
0
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='LAS')
    parser.add_argument('--model-name', type=str, default='LAS')
    # Dataset
    parser.add_argument('--train-file',
                        type=str,
                        help='data list about train dataset',
                        default='data/ClovaCall/train_ClovaCall.json')
    parser.add_argument('--test-file-list',
                        nargs='*',
                        help='data list about test dataset',
                        default=['data/ClovaCall/test_ClovCall.json'])
    parser.add_argument('--labels-path',
                        default='data/kor_syllable.json',
                        help='Path to the Korean syllable label file')
    parser.add_argument('--dataset-path',
                        default='data/ClovaCall/clean',
                        help='Target dataset path')
    # Hyperparameters
    parser.add_argument('--rnn-type',
                        default='lstm',
                        help='Type of the RNN. rnn|gru|lstm are supported')
    parser.add_argument('--encoder_layers',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--encoder_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--decoder_layers',
                        type=int,
                        default=2,
                        help='number of decoder layers (default: 2)')
    parser.add_argument('--decoder_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.3,
                        help='Dropout rate in training (default: 0.3)')
    parser.add_argument(
        '--no-bidirectional',
        dest='bidirectional',
        action='store_false',
        default=True,
        help='Turn off bidirectional RNNs in the encoder')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size in training (default: 32)')
    parser.add_argument(
        '--num_workers',
        type=int,
        default=4,
        help='Number of workers in dataset loader (default: 4)')
    parser.add_argument('--num_gpu',
                        type=int,
                        default=1,
                        help='Number of gpus (default: 1)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='Number of max epochs in training (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=3e-4,
                        help='Learning rate (default: 3e-4)')
    parser.add_argument('--learning-anneal',
                        default=1.1,
                        type=float,
                        help='Annealing learning rate every epoch')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=1.0,
                        help='Teacher forcing ratio in decoder (default: 1.0)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='Maximum characters of sentence (default: 80)')
    parser.add_argument('--max-norm',
                        default=400,
                        type=int,
                        help='Norm cutoff to prevent explosion of gradients')
    # Audio Config
    parser.add_argument('--sample-rate',
                        default=16000,
                        type=int,
                        help='Sampling Rate')
    parser.add_argument('--window-size',
                        default=.02,
                        type=float,
                        help='Window size for spectrogram')
    parser.add_argument('--window-stride',
                        default=.01,
                        type=float,
                        help='Window stride for spectrogram')
    # System
    parser.add_argument('--save-folder',
                        default='models',
                        help='Location to save epoch models')
    parser.add_argument('--model-path',
                        default='models/las_final.pth',
                        help='Location to save best validation model')
    parser.add_argument(
        '--log-path',
        default='log/',
        help='path to save prediction logs for the validation and test datasets')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='use CUDA for training')
    parser.add_argument('--seed',
                        type=int,
                        default=123456,
                        help='random seed (default: 123456)')
    parser.add_argument('--mode',
                        type=str,
                        default='train',
                        help='Train or Test')
    parser.add_argument('--load-model',
                        action='store_true',
                        default=False,
                        help='Load model')
    parser.add_argument('--finetune',
                        dest='finetune',
                        action='store_true',
                        default=False,
                        help='Finetune the model after load model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    char2index, index2char = label_loader.load_label_json(args.labels_path)
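    # indices of the sentence-start, sentence-end, and padding tokens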
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    device = torch.device('cuda' if args.cuda else 'cpu')

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride)

    # effective batch size scales with the number of GPUs
    batch_size = args.batch_size * args.num_gpu

    print(">> Train dataset : ", args.train_file)
    trainData_list = []
    with open(args.train_file, 'r', encoding='utf-8') as f:
        trainData_list = json.load(f)

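    # with multiple GPUs, drop a trailing partial batch that cannot be split across them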
    if args.num_gpu != 1:
        last_batch = len(trainData_list) % batch_size
        if last_batch != 0 and last_batch < args.num_gpu:
            trainData_list = trainData_list[:-last_batch]

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       dataset_path=args.dataset_path,
                                       data_list=trainData_list,
                                       char2index=char2index,
                                       sos_id=SOS_token,
                                       eos_id=EOS_token,
                                       normalize=True)

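    # batch sampler groups utterances into fixed-size batches (similar lengths reduce padding)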
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers,
                                   batch_sampler=train_sampler)

    print(">> Test dataset : ", args.test_file_list)
    testLoader_dict = {}
    for test_file in args.test_file_list:
        testData_list = []
        with open(test_file, 'r', encoding='utf-8') as f:
            testData_list = json.load(f)

        test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                          dataset_path=args.dataset_path,
                                          data_list=testData_list,
                                          char2index=char2index,
                                          sos_id=SOS_token,
                                          eos_id=EOS_token,
                                          normalize=True)
        testLoader_dict[test_file] = AudioDataLoader(
            test_dataset, batch_size=1, num_workers=args.num_workers)

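    # spectrogram feature dimension: n_fft = sample_rate * window_size -> floor(n_fft / 2) + 1 bins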
    input_size = int(math.floor((args.sample_rate * args.window_size) / 2) + 1)
    enc = EncoderRNN(input_size,
                     args.encoder_size,
                     n_layers=args.encoder_layers,
                     dropout_p=args.dropout,
                     bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_type,
                     variable_lengths=False)

    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.decoder_size,
                     args.encoder_size,
                     SOS_token,
                     EOS_token,
                     n_layers=args.decoder_layers,
                     rnn_cell=args.rnn_type,
                     dropout_p=args.dropout,
                     bidirectional_encoder=args.bidirectional)

    model = Seq2Seq(enc, dec)

    save_folder = args.save_folder
    os.makedirs(save_folder, exist_ok=True)

    optim_state = None
    if args.load_model:  # Starting from previous model
        print("Loading checkpoint model %s" % args.model_path)
        state = torch.load(args.model_path)
        model.load_state_dict(state['model'])
        print('Model loaded')

        if not args.finetune:  # Just load model
            optim_state = state['optimizer']

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    if optim_state is not None:
        optimizer.load_state_dict(optim_state)

    criterion = nn.CrossEntropyLoss(reduction='mean').to(device)

    print(model)
    print("Number of parameters: %d" % Seq2Seq.get_param_size(model))

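    # train on the DataParallel wrapper; evaluation below runs on the unwrapped model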
    train_model = nn.DataParallel(model)

    if args.mode != "train":
        for test_file in args.test_file_list:
            test_loader = testLoader_dict[test_file]
            test_loss, test_cer, transcripts_list = evaluate(model,
                                                             test_loader,
                                                             criterion,
                                                             device,
                                                             save_output=True)

            for idx, line in enumerate(transcripts_list):
                # print(line)
                hyp, ref = line.split('\t')
                print("({:3d}/{:3d}) [REF]: {}".format(idx + 1,
                                                       len(transcripts_list),
                                                       ref))
                print("({:3d}/{:3d}) [HYP]: {}".format(idx + 1,
                                                       len(transcripts_list),
                                                       hyp))
                print()

            print("Test {} CER : {}".format(test_file, test_cer))
    else:
        best_cer = 1e10
        begin_epoch = 0

        # start_time = time.time()
        start_time = datetime.datetime.now()

        for epoch in range(begin_epoch, args.epochs):
            train_loss, train_cer = train(train_model, train_loader, criterion,
                                          optimizer, device, epoch,
                                          train_sampler, args.max_norm,
                                          args.teacher_forcing)

            # end_time = time.time()
            # elapsed_time = end_time - start_time
            elapsed_time = datetime.datetime.now() - start_time

            train_log = 'Train({name}) Summary Epoch: [{0}]\tAverage Loss {loss:.3f}\tAverage CER {cer:.3f}\tTime {time:}'.format(
                epoch + 1,
                name='train',
                loss=train_loss,
                cer=train_cer,
                time=elapsed_time)
            print(train_log)

            cer_list = []
            for test_file in args.test_file_list:
                test_loader = testLoader_dict[test_file]
                test_loss, test_cer, _ = evaluate(model,
                                                  test_loader,
                                                  criterion,
                                                  device,
                                                  save_output=False)
                test_log = 'Test({name}) Summary Epoch: [{0}]\tAverage Loss {loss:.3f}\tAverage CER {cer:.3f}\t'.format(
                    epoch + 1, name=test_file, loss=test_loss, cer=test_cer)
                print(test_log)

                cer_list.append(test_cer)

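            # keep the checkpoint with the lowest CER on the first test set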
            if best_cer > cer_list[0]:
                print("Found better validated model, saving to %s" %
                      args.model_path)
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                torch.save(state, args.model_path)
                best_cer = cer_list[0]

            print("Shuffling batches...")
            train_sampler.shuffle(epoch)

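            # anneal the learning rate of every parameter group after each epoch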
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] / args.learning_anneal
            print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))