# Example 1
def main():
    """Entry point: parse CLI options, build the seq2seq ASR model, train it.

    Loads the character vocabulary from ``./hackathon.labels``, builds an
    EncoderRNN/DecoderRNN pair wrapped in Seq2seq, then trains for
    ``--max_epochs`` epochs, logging loss/CER, reporting to nsml, and
    checkpointing the best model (and optionally saving to ``--save_dir``).
    """

    # Vocabulary and special tokens are shared with helpers defined
    # elsewhere in this module, hence the globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--rnn_cell', type=str, default='gru')
    parser.add_argument("--iteration", type=int, default=0)
    parser.add_argument('--feature', type=str, default='spec')
    parser.add_argument('--save_dir', type=str, default='')

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    logger.info('Using %s as feature' % args.feature)
    if args.save_dir:
        logger.info('Save directory: %s' % args.save_dir)
        os.makedirs(args.save_dir, exist_ok=True)

    # N_FFT / N_MFCC / N_MELS: defined in loader.py
    if args.feature == 'mfcc':
        feature_size = N_MFCC * 3  # concat of mfcc, mfcc' mfcc''
    elif args.feature == 'melspec':
        feature_size = N_MELS
    elif args.feature == 'spec':
        # Integer division: N_FFT / 2 + 1 would produce a float in
        # Python 3, which is not a valid layer dimension.
        feature_size = N_FFT // 2 + 1
    else:
        raise ValueError('Unsupported feature %s' % args.feature)

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_cell,
                     variable_lengths=False)

    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell=args.rnn_cell,
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Uniform weight init in [-0.08, 0.08] for all parameters.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed cross-entropy (normalisation left to the caller); PAD
    # positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer, args.feature)
    if args.pause != 1:
        # Warm-start from a fixed session checkpoint, then snapshot it.
        nsml.load(checkpoint='10', session='team236/sr-hack-2019-dataset/122')
        nsml.save('init')
        logger.info('Saved!')

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    target_dict = load_targets(target_path)

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    train_dataset, valid_dataset = split_dataset(args,
                                                 wav_paths,
                                                 script_paths,
                                                 target_dict,
                                                 args.feature,
                                                 valid_ratio=0.05)

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)

        # NOTE(review): the literal 10 is presumably train()'s logging
        # interval — confirm against its signature.
        train_loss, train_cer = train(model, train_loader, criterion,
                                      optimizer, device, train_begin, 10,
                                      args.teacher_forcing)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=4,
                                                   shuffle=False,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)

        eval_loss, eval_cer = evaluate(model, valid_loader, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        nsml.save(str(epoch))

        if args.save_dir:
            # %.4f keeps the fractional CER in the filename ('%d' would
            # truncate a 0-1 CER to 0); no './' prefix inside join.
            save_model(
                model, optimizer,
                os.path.join(args.save_dir,
                             'epoch-%d-cer-%.4f.pt' % (epoch, eval_cer)))

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
# Example 2
def main():
    """Entry point: pyramidal-BLSTM (Listener) encoder + attention decoder.

    Variant of the baseline that measures each training wav's duration so
    the dataset can be split/bucketed by audio length, and warm-starts
    from a previously saved nsml checkpoint before training.
    """

    # Vocabulary and special tokens are shared with helpers defined
    # elsewhere in this module, hence the globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=1,
                        help='number of layers of model (default: 1)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout rate in training (default: 0)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='batch size in training (default: 16)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0,
                        help='teacher forcing ratio in decoder (default: 0)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py.  Integer division: N_FFT / 2 + 1 would
    # produce a float in Python 3, which is not a valid layer dimension.
    feature_size = N_FFT // 2 + 1

    # Listener: pyramidal BLSTM encoder (3 layers, bidirectional LSTM).
    enc = PBlstm.Listener(feature_size, args.hidden_size, 3, 'LSTM', True,
                          args.dropout)

    # Decoder hidden size is doubled because the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * 2,
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='LSTM',
                     bidirectional=True,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Uniform weight init in [-0.08, 0.08] for all parameters.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed cross-entropy; PAD positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # Warm-start from a previously trained session, then snapshot it.
    nsml.load(checkpoint='4', session='team147/sr-hack-2019-dataset/787')
    nsml.save('787_4')

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')

    # Collect wav/script paths and measure each wav's duration so the
    # split can sort/bucket utterances by audio length.
    wav_paths = list()
    script_paths = list()
    wav_path_len = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')

            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

            wav_path = (os.path.join(DATASET_PATH, 'train_data', wav_path))
            with contextlib.closing(wave.open(wav_path, 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                length = frames / float(rate)  # duration in seconds
                wav_path_len.append((wav_path, length))

    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, wav_path_len, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    epoch_chk = 0

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # NOTE(review): the literal 10 is presumably train()'s logging
        # interval — confirm against its signature.
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Checkpoint names are strings everywhere else; keep it consistent.
        nsml.save(str(epoch_chk))
        epoch_chk += 1
# Example 3
def main():
    """Entry point: Listen-Attend-Spell style training.

    Builds a ListenRNN encoder and an AttendSpellRNN decoder (beam width
    4), trains with label-smoothed loss, a MultiStepLR schedule and a
    zero-LR first epoch, and checkpoints the best model by CER.
    """

    # Vocabulary and special tokens are shared with helpers defined
    # elsewhere in this module, hence the globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=256, help='hidden size of model (default: 256)')
    parser.add_argument('--embedding_size', type=int, default=64, help=' size of embedding dimension (default: 64)')
    parser.add_argument('--encoder_layer_size', type=int, default=4, help='number of layers of model (default: 4)')
    parser.add_argument('--decoder_layer_size', type=int, default=3, help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=1.0, help='teacher forcing ratio in decoder (default: 1.0)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    vocab_size = len(char2index)

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # MEL_FILTERS: defined in loader.py
    feature_size = MEL_FILTERS

    enc = ListenRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.encoder_layer_size, rnn_cell='gru')

    # Decoder hidden size is doubled (args.hidden_size * 2) to match the
    # encoder output.
    dec = AttendSpellRNN(vocab_size, args.max_len, args.hidden_size * 2,
                     SOS_token, EOS_token,
                     n_layers=args.decoder_layer_size, rnn_cell='gru', embedding_size=args.embedding_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout, beam_width=4, device=device)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    # Halve the LR at epochs 20, 35 and 45.
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 35, 45], gamma=0.5)
    criterion = LabelSmoothingLoss(vocab_size, ignore_index=PAD_token, smoothing=0.1, dim=-1)
    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # First epoch starts at lr=0; train() receives both the
        # first-epoch flag and args.lr, so it presumably ramps the LR up
        # during that epoch (warmup) — confirm in train().
        if epoch == begin_epoch:
            for group in optimizer.param_groups:
                group['lr'] = 0

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing, epoch == begin_epoch, args.lr)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
            step=epoch, train_epoch__loss=train_loss, train_epoch__cer=train_cer,
            eval__loss=eval_loss, eval__cer=eval_cer)

        # Checkpoint every epoch; additionally keep the best model by CER.
        best_model = (eval_cer < best_cer)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best' + str(eval_cer))
            best_cer = eval_cer

        scheduler.step()
# Example 4
def main():
    """Entry point: baseline seq2seq ASR training with queue-based loaders.

    Builds an EncoderRNN/DecoderRNN pair wrapped in Seq2seq, trains with
    Adam (plus a small weight decay) for ``--max_epochs`` epochs using the
    custom MultiLoader/BaseDataLoader queues, and keeps the best model by
    validation loss.
    """

    # Vocabulary and special tokens are shared with helpers defined
    # elsewhere in this module, hence the globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py.  Integer division: N_FFT / 2 + 1 would
    # produce a float in Python 3, which is not a valid layer dimension.
    feature_size = N_FFT // 2 + 1

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Uniform weight init in [-0.08, 0.08] for all parameters.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    # Adam with a small weight decay for regularisation.
    optimizer = optim.Adam(model.module.parameters(),
                           lr=args.lr,
                           weight_decay=1e-5)
    # Summed cross-entropy; PAD positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # NOTE(review): the literal 10 is presumably train()'s logging
        # interval — confirm against its signature.
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Checkpoint every epoch; additionally keep the best model by loss.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
# Example 5
def main():
    """Entry point mixing the speech-hackathon baseline with a pasted
    torchvision finetuning tutorial.

    NOTE(review): the middle of this function (from initialize_model
    through plt.show) is the torchvision transfer-learning tutorial and
    uses names never defined here (model_name, num_classes,
    feature_extract, data_dir, batch_size, num_epochs); the speech code
    that follows also references an undefined `model`.  As written this
    function raises NameError — the seams are marked below.
    """

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)'
                        )  # hidden-layer size of the GRU (see the model)
    parser.add_argument(
        '--layer_size',
        type=int,
        default=3,
        help='number of layers of model (default: 3)')  # three GRU layers of 512 units

    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')

    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')  # limited by memory
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')  # learning rate
    parser.add_argument(
        '--teacher_forcing',
        type=float,
        default=0.5,
        help='teacher forcing ratio in decoder (default: 0.5)'
    )  # idea: start high and decay over the first ~5-10 epochs; the best ratio is worth exploring

    parser.add_argument(
        '--max_len',
        type=int,
        default=80,
        help='maximum characters of sentence (default: 80)')  # not tuned
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')  # whether to run inference instead
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label(
        './hackathon.labels')  # character <-> label-index mappings
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # used when running on GPU

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    # NOTE(review): feature_size is computed but never used below.
    feature_size = N_FFT / 2 + 1

    # NOTE(review): pasted tutorial code begins here — model_name,
    # num_classes and feature_extract are undefined in this function.
    # Initialize the model for this run
    model_ft, input_size = initialize_model(model_name,
                                            num_classes,
                                            feature_extract,
                                            use_pretrained=True)

    # Print the model we just instantiated
    print(model_ft)

    # Data augmentation and normalization for training
    # Just normalization for validation
    data_transforms = {
        'train':
        transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val':
        transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    print("Initializing Datasets and Dataloaders...")

    # Create training and validation datasets
    # NOTE(review): data_dir and batch_size are undefined here.
    image_datasets = {
        x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
        for x in ['train', 'val']
    }
    # Create training and validation dataloaders
    dataloaders_dict = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=4)
        for x in ['train', 'val']
    }

    # Detect if we have a GPU available
    # NOTE(review): rebinds the `device` already chosen from args.no_cuda.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Send the model to GPU
    model_ft = model_ft.to(device)

    # Collect the parameters to optimize: only those requiring grad when
    # feature extracting, otherwise all of them.
    params_to_update = model_ft.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model_ft.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in model_ft.named_parameters():
            if param.requires_grad == True:
                print("\t", name)

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Train and evaluate
    # NOTE(review): num_epochs is undefined here.
    model_ft, hist = train_model(model_ft,
                                 dataloaders_dict,
                                 criterion,
                                 optimizer_ft,
                                 num_epochs=num_epochs,
                                 is_inception=(model_name == "inception"))

    # Initialize the non-pretrained version of the model used for this run
    scratch_model, _ = initialize_model(model_name,
                                        num_classes,
                                        feature_extract=False,
                                        use_pretrained=False)
    scratch_model = scratch_model.to(device)
    scratch_optimizer = optim.SGD(scratch_model.parameters(),
                                  lr=0.001,
                                  momentum=0.9)
    scratch_criterion = nn.CrossEntropyLoss()
    _, scratch_hist = train_model(scratch_model,
                                  dataloaders_dict,
                                  scratch_criterion,
                                  scratch_optimizer,
                                  num_epochs=num_epochs,
                                  is_inception=(model_name == "inception"))

    # Plot the training curves of validation accuracy vs. number
    #  of training epochs for the transfer learning method and
    #  the model trained from scratch
    ohist = []
    shist = []

    ohist = [h.cpu().numpy() for h in hist]
    shist = [h.cpu().numpy() for h in scratch_hist]

    plt.title("Validation Accuracy vs. Number of Training Epochs")
    plt.xlabel("Training Epochs")
    plt.ylabel("Validation Accuracy")
    plt.plot(range(1, num_epochs + 1), ohist, label="Pretrained")
    plt.plot(range(1, num_epochs + 1), shist, label="Scratch")
    plt.ylim((0, 1.))
    plt.xticks(np.arange(1, num_epochs + 1, 1.0))
    plt.legend()
    plt.show()
    # NOTE(review): the speech-baseline code resumes here, but `model` was
    # never defined (the tutorial built `model_ft`) — NameError as written.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)  # uniform weight initialization

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(
        reduction='sum', ignore_index=PAD_token).to(device)  # original note: consider focal loss

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:  # checkpoint the best model by validation loss
            nsml.save('best')
            best_loss = eval_loss
Esempio n. 6
0

"""

import label_loader
import os
import glob

# Vocabulary mappings and special-token ids; the placeholders below are
# replaced immediately by the values loaded from hackathon.labels.
char2index = dict()
index2char = dict()
SOS_token = 0
EOS_token = 0
PAD_token = 0

#### read labels
char2index, index2char = label_loader.load_label('./hackathon.labels')
SOS_token = char2index['<s>']
EOS_token = char2index['</s>']
PAD_token = char2index['_']

#print(char2index)    # {'웰': 234, '와': 294, ... }
#print(index2char)   # {0: '_', 1: '군', 2: '철', ... }

#### get files list
fname = "./sample_dataset/train/train_label0"  # label file to read
outdir = "./sample_dataset/train/train_data/"  # presumably where per-sample files go — TODO confirm

#### read file
# NOTE(review): the cp949-encoded variant is kept commented for reference.
#f = open(fname,'rt',encoding='cp949')
f = open(fname)  # handle is intentionally left open; rest of script is outside this snippet
Esempio n. 7
0
def main():
    """Train a Jasper model with CTC loss on the speech hackathon data.

    Parses hyper-parameters from the command line, builds the Jasper
    network and a CTCLoss criterion, pre-batches log-mel spectrograms
    for the training and validation splits (replacing the old
    queue/loader pipeline), then runs the epoch loop, reporting
    loss/CER to NSML and checkpointing every 5 epochs as well as on
    best validation loss and best CER.
    """

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        default=True,
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=5,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=5e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    # Character <-> label-index mappings and special-token ids.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    jasper_model_definition['labels']['labels'] = index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    featurizer_config = jasper_model_definition['input']

    # N_FFT: defined in loader.py
    # NOTE(review): feature_size is unused since the former RNN seq2seq
    # baseline was replaced by Jasper; kept for reference.
    feature_size = N_FFT / 2 + 1
    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=jasper_model_definition,
                   feat_in=1024,
                   num_classes=len(index2char))

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    # Switched from the CrossEntropyLoss baseline to CTCLoss; an SGD
    # optimizer variant was also tried (see inline note).
    optimizer = optim.Adam(
        model.module.parameters(),
        lr=args.lr)  #optim.SGD(model.module.parameters(), lr=args.lr)#
    criterion = nn.CTCLoss(reduction='sum',
                           blank=PAD_token,
                           zero_infinity=True).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)
    logger.info('wav to melspectrogram in train set')

    # Pre-compute log-mel spectrogram batches for the training split so
    # the epoch loop needs no loader threads or queues.
    batch_idx = 0

    all_train_data = list()
    train_data = list()
    for train_dataset in train_dataset_list:

        for idx, _ in enumerate(train_dataset.wav_paths):
            train_data.append(train_dataset.getitem(idx))
            batch_idx += 1
            if batch_idx % args.batch_size == 0:
                random.shuffle(train_data)
                batch = _collate_fn(train_data)
                all_train_data.append(batch)
                train_data = list()
    # Flush the final, possibly smaller, batch.
    if len(train_data) > 0:
        random.shuffle(train_data)
        batch = _collate_fn(train_data)
        all_train_data.append(batch)

    logger.info('wav to melspectrogram in validation set')
    # Pre-compute log-mel spectrogram batches for the validation split.
    all_valid_data = list()
    valid_data = list()
    batch_idx = 0
    for idx, _ in enumerate(valid_dataset.wav_paths):
        valid_data.append(valid_dataset.getitem(idx))
        batch_idx += 1
        if batch_idx % args.batch_size == 0:
            batch = _collate_fn(valid_data)
            all_valid_data.append(batch)
            valid_data = list()
    if len(valid_data) > 0:
        batch = _collate_fn(valid_data)
        all_valid_data.append(batch)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # Reshuffle the pre-built batches each epoch (replaces the old
        # MultiLoader/queue pipeline).
        random.shuffle(all_train_data)

        train_loss, train_cer = train(model, train_batch_num, all_train_data,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        eval_loss, eval_cer = evaluate(model, all_valid_data, criterion,
                                       device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        best_model_cer = (eval_cer < best_cer)

        # Periodic checkpoint every 5 epochs plus best-loss / best-CER.
        if epoch % 5 == 0:
            nsml.save("%s_model" % epoch)
        if best_model:
            nsml.save('best_loss')
            best_loss = eval_loss
        if best_model_cer:
            nsml.save('best_cer')
            # BUG FIX: was `best_model_cer = eval_cer`, which clobbered the
            # boolean flag and never updated best_cer, so the 'best_cer'
            # checkpoint was re-saved every epoch.
            best_cer = eval_cer
Esempio n. 8
0
def main():
    """Config-driven training entry point for the seq2seq baseline.

    Reads hyper-parameters from a JSON config file, optionally builds
    2..4-gram language models from the training labels, constructs the
    encoder/decoder model, then trains with scheduled sampling and an
    exponentially decaying learning rate, checkpointing every epoch and
    on best validation loss / CER.  With --no_train, only evaluation
    runs each epoch (optionally from a loaded NSML checkpoint).
    """

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--no_train', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument("--USE_LM", action='store_true', default=False)
    parser.add_argument('--config',
                        type=str,
                        default='./config/legacy/cfg0/baseline.cfg0.json')
    args = parser.parse_args()
    cfg = config.utils.read_cfg(args.config)

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ngram_models = None

    if args.USE_LM:
        # Train one n-gram LM per order 2..max_n_gram_size from the
        # training labels, keyed by order.
        print("Begin language model setup")
        ngram_models = {}
        max_n_gram_size = 4
        for n in range(max_n_gram_size - 1):
            ngram_models[n + 2] = n_gram_train(
                os.path.join(DATASET_PATH, 'train_label'), n + 2)
        print("LM setup complete")

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(cfg["model"], feature_size, variable_lengths=False)

    dec = DecoderRNN(cfg["model"], len(char2index), SOS_token, EOS_token)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=cfg["lr"])
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(cfg["data"], model, optimizer, ngram_models)
    if args.no_train and not args.local:
        nsml.load(checkpoint='best', session="team161/sr-hack-2019-50000/78")

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # FIX: both branches of the former `if args.no_train:` conditional
    # were identical, so the dead conditional was collapsed into one call.
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        cfg, wav_paths, script_paths, valid_ratio=0.05)

    # Exponential LR decay: multiply the LR by 0.96 every epoch.
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    logger.info('start')

    nsml.save('notrain')

    train_begin = time.time()
    for epoch in range(begin_epoch, cfg["max_epochs"]):
        print("epoch", epoch)
        if not args.no_train:
            train_queue = queue.Queue(cfg["workers"] * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue,
                                       cfg["batch_size"], cfg["workers"])
            train_loader.start()
            # Scheduled sampling: the teacher-forcing ratio decays
            # linearly from ratio_s to ratio_e over n_epoch_ramp epochs,
            # then holds at ratio_e.
            n_epoch_ramp = 10
            ratio_s = 0.25
            ratio_e = 0
            teacher_forcing_ratio = max(
                ratio_s - (ratio_s - ratio_e) * epoch / n_epoch_ramp, ratio_e)
            train_loss, train_cer = train(
                model, train_batch_num, train_queue, criterion, optimizer,
                device, train_begin, cfg["workers"], 10,
                teacher_forcing_ratio)  # cfg["teacher_forcing"]
            # NOTE(review): passing an epoch to step() is deprecated in
            # recent torch; kept as-is to preserve behavior.
            lr_scheduler.step(epoch)
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                        (epoch, train_loss, train_cer))
            train_loader.join()

        valid_queue = queue.Queue(cfg["workers"] * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      cfg["batch_size"], 0)
        valid_loader.start()
        print("start eval")
        eval_loss, eval_cer = evaluate(model,
                                       valid_loader,
                                       valid_queue,
                                       criterion,
                                       device,
                                       ngram_models=ngram_models)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        valid_loader.join()
        print("end eval")

        if args.no_train:
            # Evaluation-only mode: skip reporting and checkpointing.
            continue

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # save every epoch
        save_name = "model_%03d" % (epoch)
        nsml.save(save_name)
        # save best loss model
        is_best_loss = (eval_loss < best_loss)
        if is_best_loss:
            nsml.save('best')
            best_loss = eval_loss
        # save best cer model
        is_best_cer = (eval_cer < best_cer)
        if is_best_cer:
            nsml.save('cer')
            best_cer = eval_cer