Example 1
def main():
    project_path = str(Path(__file__).resolve().parents[0])
    tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    train_dataset = ParallelLanguageDataset(
        project_path + "/data/raw/en/train.txt",
        project_path + "/data/raw/fr/train.txt",
        tokenizer,
        max_seq_length,
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )
    valid_dataset = ParallelLanguageDataset(
        project_path + "/data/raw/en/val.txt",
        project_path + "/data/raw/fr/val.txt",
        tokenizer,
        max_seq_length,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

    model = LanguageTransformer(
        tokenizer.vocab_size,
        d_model,
        nhead,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        max_seq_length,
        pos_dropout,
        trans_dropout,
    ).to("cpu")
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)

    optim = ScheduledOptim(
        Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09), d_model,
        n_warmup_steps)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    train_losses, val_losses = train(train_loader, valid_loader, model, optim,
                                     criterion, num_epochs)
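
Each of these snippets wraps a plain Adam optimizer in a ScheduledOptim helper that owns the learning-rate schedule. For reference, here is a minimal sketch of such a wrapper, assuming the Noam warmup schedule from "Attention Is All You Need" and only the methods the examples call (zero_grad, step_and_update_lr); the actual class in each project may differ, for instance some constructors below also take an initial learning rate or a restored step count.

# Minimal ScheduledOptim sketch (assumption: Noam warmup schedule);
# not the exact class used by any single example on this page.
class ScheduledOptim:
    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0

    def zero_grad(self):
        # clear gradients of the wrapped optimizer
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        self.n_current_steps += 1
        lr = (self.d_model ** -0.5) * min(
            self.n_current_steps ** -0.5,
            self.n_current_steps * self.n_warmup_steps ** -1.5)
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        self._optimizer.step()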
Example 2
def main(**kwargs):
    project_path = str(Path(__file__).resolve().parents[0])

    # train_dataset = ParallelLanguageDataset(project_path + '/data/processed/en/train.pkl',
    #                                         project_path + '/data/processed/fr/train.pkl',
    #                                         kwargs['num_tokens'], kwargs['max_seq_length'])
    # train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
    # valid_dataset = ParallelLanguageDataset(project_path + '/data/processed/en/val.pkl',
    #                                         project_path + '/data/processed/fr/val.pkl',
    #                                         kwargs['num_tokens'], kwargs['max_seq_length'])
    # valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)

    train_dataset = TranslationDataset(
        project_path + '/data_enru/processed/en/train.pkl',
        project_path + '/data_enru/processed/ru/train.pkl',
        kwargs['num_tokens'], kwargs['max_seq_length'])
    train_loader = DataLoader(train_dataset,
                              batch_size=1,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    valid_dataset = TranslationDataset(
        project_path + '/data_enru/processed/en/val.pkl',
        project_path + '/data_enru/processed/ru/val.pkl', kwargs['num_tokens'],
        kwargs['max_seq_length'])
    valid_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)

    model = LanguageTransformer(
        kwargs['vocab_size'], kwargs['d_model'], kwargs['nhead'],
        kwargs['num_encoder_layers'], kwargs['num_decoder_layers'],
        kwargs['dim_feedforward'], kwargs['max_seq_length'],
        kwargs['pos_dropout'], kwargs['trans_dropout']).to('cuda')
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)

    optim = ScheduledOptim(
        Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        kwargs['d_model'], kwargs['n_warmup_steps'])

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    train_losses, val_losses = train(train_loader, valid_loader, model, optim,
                                     criterion, kwargs['num_epochs'])
Example 3
    def train(self, maxEpoch):
        if use_cuda:
            print('use cuda')
            self.model = self.model.cuda()
            self.model.use_cuda = True
            self.model.tensor = torch.cuda.LongTensor

        self.model.train()
        #opt = optim.Adam(self.model.parameters(),betas=(0.9, 0.98), eps=1e-09)
        #opt = optim.Adam(self.model.parameters())

        opt = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad,
                              self.model.parameters()),
                       betas=(0.9, 0.98),
                       eps=1e-09), config.d_model, config.n_warmup_steps)

        for ep in range(maxEpoch):
            start = time.time()
            print(ep)
            indices = np.random.permutation(len(self.ds_train.idData))
            batches = pack(indices, 64)

            accLoss = 0
            n_word_total = 0
            n_word_correct = 0
            for batch in tqdm(batches):
                opt.zero_grad()
                idLines = [self.ds_train.idData[b] for b in batch]
                loss, n_correct, ts = self.model.getLoss(idLines)

                # backward and update
                loss.backward()
                opt.step_and_update_lr()

                # keep info
                accLoss += loss.item()
                non_pad_mask = ts.ne(config.pad_id)
                n_word = non_pad_mask.sum().item()
                n_word_total += n_word
                n_word_correct += n_correct

            loss_per_word = accLoss / n_word_total
            accuracy = n_word_correct / n_word_total

            print('  - (Train) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, '\
                        'elapse: {elapse:3.3f} min'.format(
                        ppl=math.exp(min(loss_per_word, 100)), accu=100*accuracy,
                        elapse=(time.time()-start)/60))

            self.evaluate()
Example 4
def train_process(opt):
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b

    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    def load_dataset(data_arg, wvpath, embedding_size):
        dm = data_class(**data_arg)
        return dm

    opt.n_position = 100
    dm = load_dataset(data_arg, None, opt.n_position)

    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)

    n_steps = 0
    optimizer_ = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)

    if opt.restore is not None:
        checkpoint = torch.load(opt.restore)
        model.load_state_dict(checkpoint['net'])
        n_steps = checkpoint['n_steps']
        optimizer_.load_state_dict(checkpoint['opt'])

    optimizer = ScheduledOptim(optimizer_, opt.lr, opt.d_model,
                               opt.n_warmup_steps, n_steps)

    dl = cotk.dataloader.OpenSubtitles(
        opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    train(model, dm, optimizer, device, opt, dl)
Example 5
def make_model(classes,
               n_warmup_steps,
               n_encoder=2,
               d_dim=256,
               dropout=0.1,
               l_byte=1500,
               byte_range=256,
               h_groups=8):
    # 40 bytes are used for classification; l_byte is set to 1500 here only because that is the MTU.
    # In practice, Preprocess.py has already trimmed/padded every packet to 40 bytes.
    model = SAN(classes, n_encoder, d_dim, dropout, l_byte, byte_range,
                h_groups)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), d_dim, n_warmup_steps)

    return model.to(torch.device('cuda')), optimizer
Example 6
    fusion, out_prob, cross_att = net_model(audio_inputs, video_inputs,
                                            args.threshold)

    labels = labels.cpu().data.numpy()
    x_labels = out_prob.cpu().data.numpy()
    acc = compute_acc(labels, x_labels, nb_batch)
    print('[test]acc: ', acc)

    return acc


if __name__ == "__main__":
    args = parser.parse_args()
    print("args: ", args)

    # model and optimizer
    model_name = args.model_name
    if model_name == "PSP":
        net_model = psp_net(128, 512, 128, 29)
    else:
        raise NotImplementedError
    net_model.to(device)
    optimizer = optim.Adam(net_model.parameters(), lr=1e-3)
    optimizer = ScheduledOptim(optimizer)

    if args.train:
        train(args, net_model, optimizer)
    else:
        test_acc = test(args, net_model, model_path=args.trained_model_path)
        print("[test] accuracy: ", test_acc)
Example 7
def main():
    torch_num_threads = 25
    torch.set_num_threads(torch_num_threads)
    ''' Main function'''
    parser = argparse.ArgumentParser()

    #parser.add_argument('-data', required=True)
    parser.add_argument('-torch_threads', type=int, default=25)
    
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=8)
    
    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=8)
    parser.add_argument('-d_inner_hid', type=int, default=8)

    parser.add_argument('-n_warmup_steps', type=int, default=3)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='model')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')

    parser.add_argument('-network', type=int, default=0) # use social network; need features or deepwalk embeddings as initial input
    parser.add_argument('-pos_emb', type=int, default=1)
    parser.add_argument('-warmup', type=int, default=3) # warmup epochs
    parser.add_argument('-notes', default='')
    parser.add_argument('-data_name', default='twitter')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    if opt.network==1:
        opt.network = True
    else:
        opt.network = False
    if opt.pos_emb==1:
        opt.pos_emb = True
    else:
        opt.pos_emb = False
    print(opt.notes)
    

    #========= Preparing DataLoader =========#
    train_data = DataLoader(opt.data_name, data=0, load_dict=True, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)
    valid_data = DataLoader(opt.data_name, data=1, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)
    test_data = DataLoader(opt.data_name, data=2, batch_size=opt.batch_size, cuda=opt.cuda, loadNE=opt.network)

    opt.user_size = train_data.user_size
    if opt.network:
        opt.net = train_data._adj_list
        opt.net_dict = train_data._adj_dict_list
        opt.embeds = train_data._embeds

    #========= Preparing Model =========#
    #print(opt)

    decoder = RNNModel('GRUCell', opt)
    RLLearner = RRModel(decoder)
    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(
            RLLearner.parameters(),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)


    def get_criterion(user_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 1
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        RLLearner = RLLearner.cuda()
        crit = crit.cuda()

    train(RLLearner, train_data, valid_data, test_data, crit, optimizer, opt)
Example 8
def train(opt):
    """ dataset preparation """
    opt.select_data = opt.select_data.split('-')
    opt.batch_ratio = opt.batch_ratio.split('-')
    train_dataset = Batch_Balanced_Dataset(opt)

    AlignCollate_valid = AlignCollate(imgH=opt.imgH,
                                      imgW=opt.imgW,
                                      keep_ratio_with_pad=opt.PAD)
    valid_dataset = hierarchical_dataset(root=opt.valid_data, opt=opt)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=opt.batch_size,
        # 'True' to check training progress with validation function.
        shuffle=True,
        num_workers=int(opt.workers),
        collate_fn=AlignCollate_valid,
        pin_memory=True)
    print('-' * 80)
    """ model configuration """
    if 'Transformer' in opt.SequenceModeling:
        converter = TransformerLabelConverter(opt.character)
    elif 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model = Model(opt)
    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial,
          opt.input_channel, opt.output_channel, opt.hidden_size,
          opt.num_class, opt.batch_max_length, opt.Transformation,
          opt.FeatureExtraction, opt.SequenceModeling, opt.Prediction)

    # weight initialization
    for name, param in model.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception as e:  # for batchnorm.
            if 'weight' in name:
                param.data.fill_(1)
            continue
    """ setup loss """
    if 'Transformer' in opt.SequenceModeling:
        criterion = transformer_loss
    elif 'CTC' in opt.Prediction:
        criterion = torch.nn.CTCLoss(zero_infinity=True).cuda()
    else:
        # ignore [GO] token = ignore index 0
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0).cuda()
    # loss averager
    loss_avg = Averager()

    # keep only the parameters that require gradients
    filtered_parameters = []
    params_num = []
    for p in filter(lambda p: p.requires_grad, model.parameters()):
        filtered_parameters.append(p)
        params_num.append(np.prod(p.size()))
    print('Trainable params num : ', sum(params_num))
    # [print(name, p.numel()) for name, p in filter(lambda p: p[1].requires_grad, model.named_parameters())]

    # setup optimizer
    if opt.adam:
        optimizer = optim.Adam(filtered_parameters,
                               lr=opt.lr,
                               betas=(opt.beta1, 0.999))
    elif 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim:
        optimizer = optim.Adam(filtered_parameters,
                               betas=(0.9, 0.98),
                               eps=1e-09)
        optimizer_schedule = ScheduledOptim(optimizer, opt.d_model,
                                            opt.n_warmup_steps)
    else:
        optimizer = optim.Adadelta(filtered_parameters,
                                   lr=opt.lr,
                                   rho=opt.rho,
                                   eps=opt.eps)
    print("Optimizer:")
    print(optimizer)
    """ final options """
    # print(opt)
    with open(f'./saved_models/{opt.experiment_name}/opt.txt',
              'a') as opt_file:
        opt_log = '------------ Options -------------\n'
        args = vars(opt)
        for k, v in args.items():
            opt_log += f'{str(k)}: {str(v)}\n'
        opt_log += '---------------------------------------\n'
        print(opt_log)
        opt_file.write(opt_log)
    """ start training """
    start_iter = 0

    start_time = time.time()
    best_accuracy = -1
    best_norm_ED = 1e+6
    pickle.load = partial(pickle.load, encoding="latin1")
    pickle.Unpickler = partial(pickle.Unpickler, encoding="latin1")
    if opt.load_weights != '' and check_isfile(opt.load_weights):
        # load pretrained weights but ignore layers that don't match in size
        checkpoint = torch.load(opt.load_weights, pickle_module=pickle)
        if type(checkpoint) == dict:
            pretrain_dict = checkpoint['state_dict']
        else:
            pretrain_dict = checkpoint
        model_dict = model.state_dict()
        pretrain_dict = {
            k: v
            for k, v in pretrain_dict.items()
            if k in model_dict and model_dict[k].size() == v.size()
        }
        model_dict.update(pretrain_dict)
        model.load_state_dict(model_dict)
        print("Loaded pretrained weights from '{}'".format(opt.load_weights))
        del checkpoint
        torch.cuda.empty_cache()
    if opt.continue_model != '':
        print(f'loading pretrained model from {opt.continue_model}')
        checkpoint = torch.load(opt.continue_model)
        print(checkpoint.keys())
        model.load_state_dict(checkpoint['state_dict'])
        start_iter = checkpoint['step'] + 1
        print('continue to train start_iter: ', start_iter)
        if 'optimizer' in checkpoint.keys():
            optimizer.load_state_dict(checkpoint['optimizer'])
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda()
        if 'best_accuracy' in checkpoint.keys():
            best_accuracy = checkpoint['best_accuracy']
        if 'best_norm_ED' in checkpoint.keys():
            best_norm_ED = checkpoint['best_norm_ED']
        del checkpoint
        torch.cuda.empty_cache()
    # data parallel for multi-GPU
    model = torch.nn.DataParallel(model).cuda()
    model.train()
    print("Model size:", count_num_param(model), 'M')
    if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim:
        optimizer_schedule.n_current_steps = start_iter

    for i in tqdm(range(start_iter, opt.num_iter)):
        for p in model.parameters():
            p.requires_grad = True

        cpu_images, cpu_texts = train_dataset.get_batch()
        image = cpu_images.cuda()
        if 'Transformer' in opt.SequenceModeling:
            text, length, text_pos = converter.encode(cpu_texts,
                                                      opt.batch_max_length)
        elif 'CTC' in opt.Prediction:
            text, length = converter.encode(cpu_texts)
        else:
            text, length = converter.encode(cpu_texts, opt.batch_max_length)
        batch_size = image.size(0)

        if 'Transformer' in opt.SequenceModeling:
            preds = model(image, text, tgt_pos=text_pos)
            target = text[:, 1:]  # without <s> Symbol
            cost = criterion(preds.view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))
        elif 'CTC' in opt.Prediction:
            preds = model(image, text).log_softmax(2)
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(preds, text, preds_size, length)
        else:
            preds = model(image, text)
            target = text[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

        model.zero_grad()
        cost.backward()

        if 'Transformer' in opt.SequenceModeling and opt.use_scheduled_optim:
            optimizer_schedule.step_and_update_lr()
        elif 'Transformer' in opt.SequenceModeling:
            optimizer.step()
        else:
            # gradient clipping (max norm = opt.grad_clip, default 5)
            torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)
            optimizer.step()

        loss_avg.add(cost)

        # validation part
        if i > 0 and (i + 1) % opt.valInterval == 0:
            elapsed_time = time.time() - start_time
            print(
                f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}'
            )
            # for log
            with open(f'./saved_models/{opt.experiment_name}/log_train.txt',
                      'a') as log:
                log.write(
                    f'[{i+1}/{opt.num_iter}] Loss: {loss_avg.val():0.5f} elapsed_time: {elapsed_time:0.5f}\n'
                )
                loss_avg.reset()

                model.eval()
                with torch.no_grad():
                    valid_loss, current_accuracy, current_norm_ED, preds, gts, infer_time, length_of_data = validation(
                        model, criterion, valid_loader, converter, opt)
                model.train()

                for pred, gt in zip(preds[:5], gts[:5]):
                    if 'Transformer' in opt.SequenceModeling:
                        pred = pred[:pred.find('</s>')]
                        gt = gt[:gt.find('</s>')]
                    elif 'Attn' in opt.Prediction:
                        pred = pred[:pred.find('[s]')]
                        gt = gt[:gt.find('[s]')]

                    print(f'{pred:20s}, gt: {gt:20s},   {str(pred == gt)}')
                    log.write(
                        f'{pred:20s}, gt: {gt:20s},   {str(pred == gt)}\n')

                valid_log = f'[{i+1}/{opt.num_iter}] valid loss: {valid_loss:0.5f}'
                valid_log += f' accuracy: {current_accuracy:0.3f}, norm_ED: {current_norm_ED:0.2f}'
                print(valid_log)
                log.write(valid_log + '\n')

                # keep best accuracy model
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    state_dict = model.module.state_dict()
                    save_checkpoint(
                        {
                            'best_accuracy': best_accuracy,
                            'state_dict': state_dict,
                        }, False,
                        f'./saved_models/{opt.experiment_name}/best_accuracy.pth'
                    )
                if current_norm_ED < best_norm_ED:
                    best_norm_ED = current_norm_ED
                    state_dict = model.module.state_dict()
                    save_checkpoint(
                        {
                            'best_norm_ED': best_norm_ED,
                            'state_dict': state_dict,
                        }, False,
                        f'./saved_models/{opt.experiment_name}/best_norm_ED.pth'
                    )
                    # torch.save(
                    #     model.state_dict(), f'./saved_models/{opt.experiment_name}/best_norm_ED.pth')
                best_model_log = f'best_accuracy: {best_accuracy:0.3f}, best_norm_ED: {best_norm_ED:0.2f}'
                print(best_model_log)
                log.write(best_model_log + '\n')

        # save a checkpoint every 1000 iterations
        if (i + 1) % 1000 == 0:
            state_dict = model.module.state_dict()
            optimizer_state_dict = optimizer.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'optimizer': optimizer_state_dict,
                    'step': i,
                    'best_accuracy': best_accuracy,
                    'best_norm_ED': best_norm_ED,
                }, False,
                f'./saved_models/{opt.experiment_name}/iter_{i+1}.pth')
Example 9
                dim_c=args.dim_c,
                hidden_size1=hidden_size1,
                hidden_size2=hidden_size2,
                hidden_size3=hidden_size3,
                dropout=args.dropout,
                use_selu=args.use_selu,
                model_path=args.model_path,
                n_warmup_steps=args.n_warmup_steps,
                comp_eff=args.comp_eff)

dim_embedding = args.dim_u + args.dim_s1 + args.dim_s2 + args.dim_s3

## Optimizer
optimizer_ttime = ScheduledOptim(
    torch.optim.Adam(TTime_combine.parameters(),
                     betas=(0.9, 0.98),
                     eps=1e-9,
                     amsgrad=False), args.lr, dim_embedding,
    args.n_warmup_steps)

## Preparing the data
trainfiles = list(
    filter(lambda x: x.endswith(".h5"), sorted(os.listdir(args.trainpath))))
validfiles = list(
    filter(lambda x: x.endswith(".h5"), sorted(os.listdir(args.validpath))))
train_dataloader = DataLoader(args.trainpath)
print("Loading the training data...")
train_dataloader.read_files(trainfiles)
valid_dataloader = DataLoader(args.validpath)
print("Loading the validation data...")
valid_dataloader.read_files(validfiles)
train_slot_size = np.array(
Example 10
def train(hp):
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    #device = torch.device('cuda' if hp.cuda else 'cpu') 
    device = torch.device('cuda')
    '''
    model=Transformer(n_src_vocab=hp.n_src_vocab,len_max_seq=hp.len_max_seq,d_word_vec=hp.d_word_vec,d_model=hp.d_model,
                        d_inner=hp.d_inner,n_layers=hp.n_layers,n_head=hp.n_head,
                        d_k=hp.d_k,d_v=hp.d_v,dropout=hp.dropout).to(device)
    '''
    model=make_model().to('cuda')
    print(model)
    
    #model_opt = NoamOpt(512, 1, 400,torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9))
    model_opt=ScheduledOptim(optim.Adam(
        filter(lambda x:x.requires_grad,model.parameters()),betas=(0.9,0.98),eps=1e-09),
        hp.d_model,hp.n_warmup_steps)
    '''
    optimizer=ScheduledOptim(optim.Adam(\
                filter(lambda x:x.requires_grad,model.parameters()),betas=(0.9,0.98),eps=1e-09),
                 hp.d_model,hp.n_warmup_steps)
    '''
    try:
        checkpoint=torch.load(os.path.join(hp.checkpoint_path,'checkpoint_%d.pth.tar'% args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n--------model restored at step %d--------\n" % args.restore_step)
    except:
        print("\n--------Start New Training--------\n")
    if not os.path.exists(hp.checkpoint_path):
        os.mkdir(hp.checkpoint_path)
    model.train() 
    lambda_l1=0.3
    l1=0
    for epoch in range(hp.epochs):
        train_loader, valset, collate_fn = prepare_dataloaders(hparams)
        for i,data in enumerate(train_loader):
            #current_step = i + hp.restore_step + epoch * len(dataloader) + 1
            n_iter = i + 0 + epoch * len(train_loader) + 1
            #print(data[0].shape) 
            #print(data[1].shape)
            #print(data[2].shape)
            #print(data[3].shape)
            #optimizer.zero_grad()
            model_opt.zero_grad() 
            try:
                #mel_input = np.concatenate((np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32),data[2][:,:,1:]), axis=2)
                mel_input = np.concatenate((np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32),data[2][:,:,1:]), axis=2)
            except:
                raise TypeError("not same dimension")
            
            #mel_input = np.concatenate((np.zeros([hp.batch_size, hp.num_mels, 1], dtype=np.float32),data[2][:,:,1:]), axis=2)

            if use_cuda:
                text_padded=Variable(data[0].type(torch.cuda.LongTensor), requires_grad=False).cuda()
                text_zeroone=Variable(data[1].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                #print(text_zeroone)
                #text_zeronne=torch.ones(text_zeroone.shape).cuda()-text_zeroone
                en_mask=make_src_mask(text_padded,0)
                #mel_padded=Variable(torch.from_numpy(mel_input).type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_zeroone=Variable(data[4].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                #print(mel_zeroone)
                mel_zeroone=torch.ones(mel_zeroone.shape).cuda()-mel_zeroone
                de_mask=make_tgt_mask(mel_zeroone,0)
                mel_truth=Variable(data[2].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
                mel_input=Variable(data[3].type(torch.cuda.FloatTensor), requires_grad=False).cuda()
            else:
                text_padded=Variable(torch.from_numpy(data[0]).type(torch.LongTensor), requires_grad=False)
                text_zeroone=Variable(torch.from_numpy(data[1]).type(torch.FloatTensor), requires_grad=False)
                mel_padded=Variable(torch.from_numpy(mel_input).type(torch.FloatTensor), requires_grad=False)
                mel_zeroone=Variable(torch.from_numpy(data[3]).type(torch.FloatTensor), requires_grad=False)
                mel_truth=Variable(torch.from_numpy(data[2]).type(torch.FloatTensor), requires_grad=False)
            '''
            print('epoch:',epoch)
            print('text_padded:',text_padded.shape)
            print('mel_padded:',mel_input.shape)
            print('text_zeroonr',text_zeroone.shape)
            print('mel_zeroone',mel_zeroone.shape)
            print('en_mask',en_mask)
            print('de_mask',de_mask[0,0,:])
            '''
            #l2_regularization=torch.Tensor(0)
            
            #for param in model.parameters():
            #    l2_regularization += torch.norm(param, 2)
            frame,frame_post,stop=model(text_padded,mel_input,en_mask,de_mask)
            #print("0",model.decoder.layers[0].en_attn.attn[0,0,:,:])
            #print("1",model.decoder.layers[1].en_attn.attn[0, 2])

            if n_iter%500==0:
                for layer in range(4):
                    for h in range(hp.n_head):
                        alignment=model.decoder.layers[layer].en_attn.attn[0, h].cpu().data.numpy()
                        tag="alignment_layer{}_head{}".format(layer,h)

                        writer.add_image(tag,np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),n_iter)

                        #plot.plot_alignment(model.decoder.layers[layer].en_attn.attn[0, h],os.path.join(hp.checkpoint_path,'step-%d-layer-%d-head-%d-align.png' % (current_step, layer+1,h+1)),info='%s, %s, %s, step=%d, loss=%.5f' % ('transformer','mo', time_string(), current_step, 0))
            #print(model.decoder.layers[0].en_attn.attn[0, 1]) 
            '''
            before_loss=criterion(before,mel_truth)
            post_loss=criterion(post,mel_truth)
            gate_loss=nn.BCEWithLogitsLoss()(stop, mel_zeroone)
            loss=before_loss+post_loss+gate_loss
            '''
            #l2_regularization=torch.Tensor(0)
            #for param in model.parameters():
            #    l2_regularization += torch.norm(param, 2)
            #print(l2_regularization)
            before = nn.MSELoss()(frame.transpose(-2,-1),mel_truth)
            post=nn.MSELoss()(frame_post,mel_truth)
            gate=nn.BCEWithLogitsLoss()(stop, mel_zeroone)
            loss=before+post+gate
            '''
            for param in model.parameters():
                l1=l1+param.abs().sum()
            loss=loss+lambda_l1*l1
            '''
            loss.backward()
            #nn.utils.clip_grad_norm(model.parameters(), 1.)
            #optimizer.step_and_update_lr()
            nn.utils.clip_grad_norm(model.parameters(), 1.)
            model_opt.step_and_update_lr()
            if i%1000==0:
                writer.add_scalar('Train/before',before, n_iter)
                writer.add_scalar('Train/post', post, n_iter)
                writer.add_scalar('Train/gate', gate, n_iter)
                writer.add_scalar('Train/all', loss, n_iter)

            '''
            if i%100==0:
               for i in range(len(dec_enc_attn_list)):
                   for j in range(0,hp.batch_size*hp.n_head,hp.batch_size):
                       #print('dec_enc_attn:',dec_enc_attn_list[i][j].shape)
                       plot.plot_alignment(dec_enc_attn_list[i][j],os.path.join(hp.checkpoint_path, 'step-%d-layer-%d-head-%d-align.png' % (niter, i+1, j/hp.batch_size+1)),info='%s, %s, %s, step=    %d, loss=%.5f' % ('transformer','mo', time_string(), niter, loss))
            params=list(model.named_parameters())
            print(params)
            '''

        '''