def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512, help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3, help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5, help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--rnn_cell', type=str, default='gru')
    parser.add_argument('--iteration', type=int, default=0)
    parser.add_argument('--feature', type=str, default='spec')
    parser.add_argument('--save_dir', type=str, default='')
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    logger.info('Using %s as feature' % args.feature)
    if args.save_dir:
        logger.info('Save directory: %s' % args.save_dir)
        os.makedirs(args.save_dir, exist_ok=True)

    # N_FFT, N_MFCC, N_MELS: defined in loader.py
    if args.feature == 'mfcc':
        feature_size = N_MFCC * 3  # concatenation of mfcc, mfcc', mfcc''
    elif args.feature == 'melspec':
        feature_size = N_MELS
    elif args.feature == 'spec':
        feature_size = N_FFT // 2 + 1  # integer division; `N_FFT / 2 + 1` would yield a float
    else:
        raise ValueError('Unsupported feature %s' % args.feature)

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size, bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_cell, variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell=args.rnn_cell,
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(model, optimizer, args.feature)

    if args.pause != 1:
        nsml.load(checkpoint='10', session='team236/sr-hack-2019-dataset/122')
        nsml.save('init')
        logger.info('Saved!')

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != 'train':
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    target_dict = load_targets(target_path)

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    train_dataset, valid_dataset = split_dataset(args, wav_paths, script_paths,
                                                 target_dict, args.feature,
                                                 valid_ratio=0.05)

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)
        train_loss, train_cer = train(model, train_loader, criterion, optimizer,
                                      device, train_begin, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))

        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=4,
                                                   shuffle=False,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)
        eval_loss, eval_cer = evaluate(model, valid_loader, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        nsml.save(str(epoch))

        if args.save_dir:
            # '%.4f' keeps the fractional CER; the original '%d' truncated it to 0
            save_model(model, optimizer,
                       os.path.join(args.save_dir, 'epoch-%d-cer-%.4f.pt' % (epoch, eval_cer)))

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
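
# `save_model` is called above but not defined in this file. A minimal sketch
# of what it plausibly does, assuming a plain torch.save of the state dicts;
# the project's real helper may store extra metadata:
def save_model(model, optimizer, path):
    # unwrap nn.DataParallel so the checkpoint loads without the wrapper
    state = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
    torch.save({'model': state, 'optimizer': optimizer.state_dict()}, path)
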
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512, help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=1, help='number of layers of model (default: 1)')
    parser.add_argument('--dropout', type=float, default=0, help='dropout rate in training (default: 0)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=16, help='batch size in training (default: 16)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=100, help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0, help='teacher forcing ratio in decoder (default: 0)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']  ######## embedding tokens

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT // 2 + 1

    # Baseline encoder, replaced by the pyramidal BLSTM listener below:
    # enc = EncoderRNN(feature_size, args.hidden_size,
    #                  input_dropout_p=args.dropout, dropout_p=args.dropout,
    #                  n_layers=args.layer_size, bidirectional=args.bidirectional,
    #                  rnn_cell='gru', variable_lengths=False)
    enc = PBlstm.Listener(feature_size, args.hidden_size, 3, 'LSTM', True, args.dropout)  # LAS-style listener
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * 2,  # encoder is bidirectional, so its output is 2 * hidden_size
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='LSTM', bidirectional=True,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Parameter inspection (debugging aid; its output is kept below for reference):
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name)
    #         print(param.data.shape)
    #
    # encoder.conv.0.weight      torch.Size([32, 1, 41, 11])
    # encoder.conv.0.bias        torch.Size([32])
    # encoder.conv.1.weight      torch.Size([32])
    # encoder.conv.1.bias        torch.Size([32])
    # encoder.conv.3.weight      torch.Size([32, 32, 21, 11])
    # encoder.conv.3.bias        torch.Size([32])
    # encoder.conv.4.weight      torch.Size([32])
    # encoder.conv.4.bias        torch.Size([32])
    # encoder.rnn.weight_ih_l0   torch.Size([1536, 4128])
    # encoder.rnn.weight_hh_l0   torch.Size([1536, 512])
    # encoder.rnn.bias_ih_l0     torch.Size([1536])
    # encoder.rnn.bias_hh_l0     torch.Size([1536])
    # encoder.rnn.weight_ih_l1   torch.Size([1536, 512])
    # encoder.rnn.weight_hh_l1   torch.Size([1536, 512])
    # encoder.rnn.bias_ih_l1     torch.Size([1536])
    # encoder.rnn.bias_hh_l1     torch.Size([1536])
    # encoder.rnn.weight_ih_l2   torch.Size([1536, 512])
    # encoder.rnn.weight_hh_l2   torch.Size([1536, 512])
    # encoder.rnn.bias_ih_l2     torch.Size([1536])
    # encoder.rnn.bias_hh_l2     torch.Size([1536])
    # decoder.rnn.weight_ih_l0   torch.Size([1536, 512])
    # decoder.rnn.weight_hh_l0   torch.Size([1536, 512])
    # decoder.rnn.bias_ih_l0     torch.Size([1536])
    # decoder.rnn.bias_hh_l0     torch.Size([1536])
    # decoder.rnn.weight_ih_l1   torch.Size([1536, 512])
    # decoder.rnn.weight_hh_l1   torch.Size([1536, 512])
    # decoder.rnn.bias_ih_l1     torch.Size([1536])
    # decoder.rnn.bias_hh_l1     torch.Size([1536])
    # decoder.rnn.weight_ih_l2   torch.Size([1536, 512])
    # decoder.rnn.weight_hh_l2   torch.Size([1536, 512])
    # decoder.rnn.bias_ih_l2     torch.Size([1536])
    # decoder.rnn.bias_hh_l2     torch.Size([1536])
    # decoder.embedding.weight   torch.Size([820, 512])
    # decoder.out.weight         torch.Size([820, 512])
    # decoder.out.bias           torch.Size([820])

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    nsml.load(checkpoint='4', session='team147/sr-hack-2019-dataset/787')
    nsml.save('787_4')
    # exit()

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')

    ################################################ original
    # wav_paths = list()
    # script_paths = list()
    # with open(data_list, 'r') as f:
    #     for line in f:
    #         # line: "aaa.wav,aaa.label"
    #         wav_path, script_path = line.strip().split(',')
    #         wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
    #         script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    ######################################## time sorting ###########
    wav_paths = list()
    script_paths = list()
    wav_path_len = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))
            wav_path = os.path.join(DATASET_PATH, 'train_data', wav_path)
            with contextlib.closing(wave.open(wav_path, 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                length = frames / float(rate)  # duration in seconds
            wav_path_len.append((wav_path, length))
            # script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, wav_path_len, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    # ctc = nn.CTCLoss(blank=0, reduction='mean').to(device)
    epoch_chk = 0
    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                      optimizer, device, train_begin, args.workers,
                                      10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        nsml.save(str(epoch_chk))  # checkpoint names are strings elsewhere; the original passed an int
        epoch_chk += 1
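
# The (path, duration) pairs collected above point at length-sorted batching:
# grouping clips of similar duration cuts padding waste and steadies early
# training. A hypothetical sketch of how `split_dataset` might use
# `wav_path_len` (the real implementation lives elsewhere in this repo):
def sort_by_duration(wav_paths, script_paths, wav_path_len):
    duration = {path: length for path, length in wav_path_len}
    order = sorted(range(len(wav_paths)), key=lambda i: duration[wav_paths[i]])
    return [wav_paths[i] for i in order], [script_paths[i] for i in order]
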
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=256, help='hidden size of model (default: 256)')
    parser.add_argument('--embedding_size', type=int, default=64, help='size of embedding dimension (default: 64)')
    parser.add_argument('--encoder_layer_size', type=int, default=4, help='number of encoder layers (default: 4)')
    parser.add_argument('--decoder_layer_size', type=int, default=3, help='number of decoder layers (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=1.0, help='teacher forcing ratio in decoder (default: 1.0)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    vocab_size = len(char2index)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # MEL_FILTERS: defined in loader.py
    feature_size = MEL_FILTERS

    enc = ListenRNN(feature_size, args.hidden_size,
                    input_dropout_p=args.dropout, dropout_p=args.dropout,
                    n_layers=args.encoder_layer_size, rnn_cell='gru')
    dec = AttendSpellRNN(vocab_size, args.max_len, args.hidden_size * 2,
                         SOS_token, EOS_token,
                         n_layers=args.decoder_layer_size, rnn_cell='gru',
                         embedding_size=args.embedding_size,
                         input_dropout_p=args.dropout, dropout_p=args.dropout,
                         beam_width=4, device=device)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 35, 45], gamma=0.5)
    criterion = LabelSmoothingLoss(vocab_size, ignore_index=PAD_token, smoothing=0.1, dim=-1)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        if epoch == begin_epoch:
            # first epoch: zero the lr here; train() receives the warm-up flag
            # and the target lr below, presumably to ramp it back up
            for group in optimizer.param_groups:
                group['lr'] = 0

        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                      optimizer, device, train_begin, args.workers, 10,
                                      args.teacher_forcing, epoch == begin_epoch, args.lr)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_cer < best_cer)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best' + str(eval_cer))
            best_cer = eval_cer

        scheduler.step()
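
# `LabelSmoothingLoss` is imported from elsewhere in this repo. A minimal
# sketch of the standard formulation behind that constructor signature
# (vocab_size, ignore_index, smoothing, dim); the project's own class may
# differ in detail, so this is illustrative only:
class LabelSmoothingLossSketch(nn.Module):
    def __init__(self, vocab_size, ignore_index, smoothing=0.1, dim=-1):
        super().__init__()
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.dim = dim

    def forward(self, logits, target):
        # logits: (N, vocab_size), target: (N,)
        log_probs = torch.log_softmax(logits, dim=self.dim)
        # spread the smoothing mass over the non-target, non-pad classes
        true_dist = torch.full_like(log_probs, self.smoothing / (self.vocab_size - 2))
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        # zero out padded positions, mirroring ignore_index in CrossEntropyLoss
        mask = (target != self.ignore_index).unsqueeze(1).float()
        return torch.sum(-true_dist * log_probs * mask)  # summed, like the CE criteria above
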
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512, help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3, help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5, help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT // 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size, bidirectional=args.bidirectional,
                     rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    # optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # weight_decay added:
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                      optimizer, device, train_begin, args.workers,
                                      10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
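
# Design note on the weight_decay added above: torch.optim.Adam couples L2
# regularization with the adaptive step, whereas AdamW decouples the two. A
# hedged drop-in alternative, not what the variant above actually uses:
def build_optimizer(model, lr, weight_decay=1e-5, decoupled=False):
    # decoupled=True selects AdamW-style (decoupled) weight decay
    cls = optim.AdamW if decoupled else optim.Adam
    return cls(model.module.parameters(), lr=lr, weight_decay=weight_decay)
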
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')  # hidden layer size of the GRU (see the model code)
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')  # 3 layers of 512 GRU units
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')  # constrained by GPU memory
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    # idea: start teacher forcing high and decay it over the first ~5-10 epochs;
    # worth exploring how lower vs. higher ratios trade off
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')  # not specifically tuned
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')  # whether to train or run inference
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')  # maps characters to label indices and back
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # seeds every GPU

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT // 2 + 1

    # --- The following image-finetuning recipe was pasted from the PyTorch
    # --- transfer-learning tutorial. It is unrelated to the speech pipeline and
    # --- relies on names (model_name, num_classes, feature_extract, data_dir,
    # --- batch_size, num_epochs, transforms, datasets, plt, np) defined elsewhere.

    # Initialize the model for this run
    model_ft, input_size = initialize_model(model_name, num_classes, feature_extract, use_pretrained=True)

    # Print the model we just instantiated
    print(model_ft)

    # Data augmentation and normalization for training; just normalization for validation
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    print("Initializing Datasets and Dataloaders...")

    # Create training and validation datasets
    image_datasets = {
        x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
        for x in ['train', 'val']
    }
    # Create training and validation dataloaders
    dataloaders_dict = {
        x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                       shuffle=True, num_workers=4)
        for x in ['train', 'val']
    }

    # Detect if we have a GPU available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Send the model to GPU
    model_ft = model_ft.to(device)

    params_to_update = model_ft.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model_ft.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in model_ft.named_parameters():
            if param.requires_grad:
                print("\t", name)

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Train and evaluate
    model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft,
                                 num_epochs=num_epochs, is_inception=(model_name == "inception"))

    # Initialize the non-pretrained version of the model used for this run
    scratch_model, _ = initialize_model(model_name, num_classes, feature_extract=False, use_pretrained=False)
    scratch_model = scratch_model.to(device)
    scratch_optimizer = optim.SGD(scratch_model.parameters(), lr=0.001, momentum=0.9)
    scratch_criterion = nn.CrossEntropyLoss()
    _, scratch_hist = train_model(scratch_model, dataloaders_dict, scratch_criterion, scratch_optimizer,
                                  num_epochs=num_epochs, is_inception=(model_name == "inception"))

    # Plot the training curves of validation accuracy vs. number of training
    # epochs for the transfer learning method and the model trained from scratch
    ohist = [h.cpu().numpy() for h in hist]
    shist = [h.cpu().numpy() for h in scratch_hist]

    plt.title("Validation Accuracy vs. Number of Training Epochs")
    plt.xlabel("Training Epochs")
    plt.ylabel("Validation Accuracy")
    plt.plot(range(1, num_epochs + 1), ohist, label="Pretrained")
    plt.plot(range(1, num_epochs + 1), shist, label="Scratch")
    plt.ylim((0, 1.))
    plt.xticks(np.arange(1, num_epochs + 1, 1.0))
    plt.legend()
    plt.show()

    # --- End of pasted tutorial code; the speech pipeline resumes below.
    # --- NOTE: `model` is used below but never constructed in this variant;
    # --- the EncoderRNN/DecoderRNN/Seq2seq setup from the baseline is missing.

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)  # model initialization

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)  # idea: try focal loss here

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                      optimizer, device, train_begin, args.workers,
                                      10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:  # keep whichever checkpoint scores best on validation loss
            nsml.save('best')
            best_loss = eval_loss
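
# The comment on the criterion above suggests trying focal loss in place of
# plain cross-entropy. A minimal sketch of that idea (assumed, not part of
# this repo): down-weight easy examples by (1 - p)^gamma so training focuses
# on the hard characters:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, ignore_index=-100):
        super().__init__()
        self.gamma = gamma
        self.ignore_index = ignore_index

    def forward(self, logits, target):
        # per-element cross-entropy; ignored targets contribute zero loss
        ce = nn.functional.cross_entropy(logits, target, reduction='none',
                                         ignore_index=self.ignore_index)
        p = torch.exp(-ce)  # probability assigned to the true class
        return ((1.0 - p) ** self.gamma * ce).sum()
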
""" import label_loader import os import glob char2index = dict() index2char = dict() SOS_token = 0 EOS_token = 0 PAD_token = 0 #### read labels char2index, index2char = label_loader.load_label('./hackathon.labels') SOS_token = char2index['<s>'] EOS_token = char2index['</s>'] PAD_token = char2index['_'] #print(char2index) # {'웰': 234, '와': 294, ... } #print(index2char) # {0: '_', 1: '군', 2: '철', ... } #### get files list fname = "./sample_dataset/train/train_label0" outdir = "./sample_dataset/train/train_data/" #### read file #f = open(fname,'rt',encoding='cp949') f = open(fname)
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512, help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3, help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', default=True, help='use attention between encoder-decoder (default: True)')
    parser.add_argument('--batch_size', type=int, default=5, help='batch size in training (default: 5)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=5e-04, help='learning rate (default: 0.0005)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5, help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    jasper_model_definition['labels']['labels'] = index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # bs.testBeamSearch(index2char, "휴무일")

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    featurizer_config = jasper_model_definition['input']
    # N_FFT: defined in loader.py
    feature_size = N_FFT // 2 + 1

    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=jasper_model_definition,
                   feat_in=1024, num_classes=len(index2char))

    # Baseline seq2seq model, replaced by Jasper above:
    # enc = EncoderRNN(feature_size, args.hidden_size,
    #                  input_dropout_p=args.dropout, dropout_p=args.dropout,
    #                  n_layers=args.layer_size, bidirectional=args.bidirectional,
    #                  rnn_cell='lstm', variable_lengths=False)
    # dec = DecoderRNN(len(char2index), args.max_len,
    #                  args.hidden_size * (2 if args.bidirectional else 1),
    #                  SOS_token, EOS_token,
    #                  n_layers=args.layer_size, rnn_cell='lstm',
    #                  bidirectional=args.bidirectional,
    #                  input_dropout_p=args.dropout, dropout_p=args.dropout,
    #                  use_attention=args.use_attention)
    # model = Seq2seq(enc, dec)
    # model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    # cross-entropy version:
    # optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    # CTC loss version (an Adam -> SGD swap was also tried):
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # optimizer = optim.SGD(model.module.parameters(), lr=args.lr)
    criterion = nn.CTCLoss(reduction='sum', blank=PAD_token, zero_infinity=True).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # resume from a previous session / check whether saved files remain (kept for reference):
    # try:
    #     nsml.load(checkpoint='75_model', session='team138/sr-hack-2019-dataset/151')
    # except Exception:
    #     pass

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    # precompute log-melspectrogram batches for the training set
    logger.info('wav to melspectrogram in train set')
    batch_idx = 0
    all_train_data = list()
    train_data = list()
    for train_dataset in train_dataset_list:
        for idx, _ in enumerate(train_dataset.wav_paths):
            train_data.append(train_dataset.getitem(idx))
            batch_idx += 1
            if batch_idx % args.batch_size == 0:
                random.shuffle(train_data)
                batch = _collate_fn(train_data)
                all_train_data.append(batch)
                train_data = list()
    if len(train_data) > 0:
        random.shuffle(train_data)
        batch = _collate_fn(train_data)
        all_train_data.append(batch)

    # precompute log-melspectrogram batches for the validation set
    logger.info('wav to melspectrogram in validation set')
    all_valid_data = list()
    valid_data = list()
    batch_idx = 0
    for idx, _ in enumerate(valid_dataset.wav_paths):
        valid_data.append(valid_dataset.getitem(idx))
        batch_idx += 1
        if batch_idx % args.batch_size == 0:
            batch = _collate_fn(valid_data)
            all_valid_data.append(batch)
            valid_data = list()
    if len(valid_data) > 0:
        batch = _collate_fn(valid_data)
        all_valid_data.append(batch)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        random.shuffle(all_train_data)

        # queue-based loaders, superseded by the precomputed batches above:
        # train_queue = queue.Queue(args.workers * 2)
        # train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        # train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, all_train_data, criterion,
                                      optimizer, device, train_begin, args.workers,
                                      10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        # train_loader.join()

        # valid_queue = queue.Queue(args.workers * 2)
        # valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        # valid_loader.start()

        eval_loss, eval_cer = evaluate(model, all_valid_data, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        # valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        best_model_cer = (eval_cer < best_cer)

        if epoch % 5 == 0:
            nsml.save("%s_model" % epoch)

        if best_model:
            nsml.save('best_loss')
            best_loss = eval_loss

        if best_model_cer:
            nsml.save('best_cer')
            best_cer = eval_cer  # fixed: the original assigned to best_model_cer, so best_cer never updated
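
# nn.CTCLoss as configured above expects log-probabilities shaped (T, N, C)
# plus per-sample input and target lengths. A hedged sketch of the call site
# inside train()/evaluate() (which live elsewhere; names here are illustrative):
def ctc_step(criterion, logits, logit_lengths, targets, target_lengths):
    # logits: (N, T, C) raw outputs from the acoustic model
    log_probs = logits.log_softmax(dim=-1).transpose(0, 1)  # -> (T, N, C)
    return criterion(log_probs, targets, logit_lengths, target_lengths)
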
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--no_train', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--USE_LM', action='store_true', default=False)
    parser.add_argument('--config', type=str, default='./config/legacy/cfg0/baseline.cfg0.json')
    args = parser.parse_args()

    cfg = config.utils.read_cfg(args.config)

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ngram_models = None
    if args.USE_LM:
        print("Begin language model setup")
        ngram_models = {}
        max_n_gram_size = 4
        # train 2-gram through 4-gram character models on the training labels
        for n in range(max_n_gram_size - 1):
            ngram_models[n + 2] = n_gram_train(os.path.join(DATASET_PATH, 'train_label'), n + 2)
        del n
        print("LM setup complete")

    # N_FFT: defined in loader.py
    feature_size = N_FFT // 2 + 1

    enc = EncoderRNN(cfg["model"], feature_size, variable_lengths=False)
    dec = DecoderRNN(cfg["model"], len(char2index), SOS_token, EOS_token)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=cfg["lr"])
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(cfg["data"], model, optimizer, ngram_models)

    if args.no_train and not args.local:
        nsml.load(checkpoint='best', session="team161/sr-hack-2019-50000/78")

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts up front to reduce disk I/O
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # the original branched on args.no_train here, but both branches made the
    # identical call, so it is collapsed into a single split
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        cfg, wav_paths, script_paths, valid_ratio=0.05)

    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    logger.info('start')
    nsml.save('notrain')
    train_begin = time.time()

    for epoch in range(begin_epoch, cfg["max_epochs"]):
        print("epoch", epoch)
        # tracker.print_diff()

        if not args.no_train:
            train_queue = queue.Queue(cfg["workers"] * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue, cfg["batch_size"], cfg["workers"])
            train_loader.start()

            # scheduled sampling: the teacher forcing ratio decreases linearly
            # from ratio_s to ratio_e over n_epoch_ramp epochs, then holds at ratio_e
            n_epoch_ramp = 10
            ratio_s = 0.25
            ratio_e = 0
            teacher_forcing_ratio = max(ratio_s - (ratio_s - ratio_e) * epoch / n_epoch_ramp, ratio_e)

            train_loss, train_cer = train(model, train_batch_num, train_queue, criterion,
                                          optimizer, device, train_begin, cfg["workers"], 10,
                                          teacher_forcing_ratio)  # cfg["teacher_forcing"]
            lr_scheduler.step()  # once per epoch; passing `epoch` to step() is deprecated
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
            train_loader.join()

        valid_queue = queue.Queue(cfg["workers"] * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, cfg["batch_size"], 0)
        valid_loader.start()

        print("start eval")
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device,
                                       ngram_models=ngram_models)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        valid_loader.join()
        print("end eval")

        if args.no_train:
            continue

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        # save every epoch
        save_name = "model_%03d" % epoch
        nsml.save(save_name)

        # save best-loss model
        is_best_loss = (eval_loss < best_loss)
        if is_best_loss:
            nsml.save('best')
            best_loss = eval_loss

        # save best-CER model
        is_best_cer = (eval_cer < best_cer)
        if is_best_cer:
            nsml.save('cer')
            best_cer = eval_cer
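
import math

# `ngram_models` maps n -> model and is passed to evaluate() for rescoring.
# A hypothetical sketch of scoring a hypothesis with such a model; the actual
# output format of n_gram_train is not shown in this file, so the
# dict-of-dicts shape assumed here is illustrative only:
def lm_score(ngram_models, sentence, n=2):
    # assumes ngram_models[n] maps a context tuple to {next_char: probability}
    model = ngram_models[n]
    score = 0.0
    for i in range(n - 1, len(sentence)):
        context = tuple(sentence[i - n + 1:i])
        probs = model.get(context, {})
        score += math.log(probs.get(sentence[i], 1e-12))  # floor unseen characters
    return score
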