def test_k2_speech_recognition_iterable_dataset_low_max_frames(k2_cut_set):
    """A max_frames budget below any single cut degenerates every batch to one cut."""
    loader = DataLoader(
        K2SpeechRecognitionIterableDataset(k2_cut_set, shuffle=False, max_frames=2),
        batch_size=None,
    )
    # Iterating must not crash; every cut on its own exceeds the 2-frame budget,
    # so each yielded batch holds exactly one item.
    for collated in loader:
        assert collated['features'].shape[0] == 1
def test_k2_speech_recognition_iterable_dataset_shuffling():
    """Shuffled iteration yields every cut exactly once, in a changed order."""
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    dataset = K2SpeechRecognitionIterableDataset(
        cuts=cut_set,
        return_cuts=True,
        shuffle=True,
        # An effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches
        # in a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, num_workers=2)
    batches = []
    seen_ids = []
    for collated in dloader:
        batches.append(collated)
        seen_ids.extend(cut.id for cut in collated['supervisions']['cut'])
    # Invariant 1: one dataloader epoch returns as many items as the CutSet holds.
    assert len(seen_ids) == len(cut_set)
    # Invariant 2: no item is duplicated.
    assert len(set(seen_ids)) == len(seen_ids)
    # Invariant 3: the order differs from the CutSet's, i.e. it was shuffled.
    assert seen_ids != [cut.id for cut in cut_set]
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    """Smoke test: noise augmentation survives a full pass over the dataloader."""
    dataset = K2SpeechRecognitionIterableDataset(
        k2_cut_set, shuffle=False, aug_cuts=k2_noise_cut_set
    )
    # Exhausting the loader must not raise, and it must yield at least one batch.
    collected = [b for b in DataLoader(dataset, batch_size=None)]
    assert len(collected) > 0
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    """A single collated batch carries the expected features and supervisions."""
    from torch import tensor
    dataset = K2SpeechRecognitionIterableDataset(k2_cut_set, shuffle=False)
    # batch_size=None switches off PyTorch's automatic batching, which is
    # required when the Dataset performs the collation itself.
    first_batch = next(iter(DataLoader(dataset, batch_size=None, num_workers=num_workers)))
    assert first_batch['features'].shape == (4, 308, 80)
    sups = first_batch['supervisions']
    # 5 supervision entries in total: one cut contributes two supervisions,
    # the other three cuts contribute one each.
    assert (sups['sequence_idx'] == tensor([0, 1, 2, 3, 3])).all()
    assert sups['text'] == ['IN EIGHTEEN THIRTEEN'] * 5  # a list, not tensor
    assert (sups['start_frame'] == tensor([0] * 4 + [153])).all()
    assert (sups['num_frames'] == tensor([154] * 5)).all()
def test_k2_speech_recognition_iterable_dataset_multiple_workers(k2_cut_set, num_workers):
    """Batches collected across workers add up to the same epoch content."""
    from torch import tensor
    loader = DataLoader(
        K2SpeechRecognitionIterableDataset(k2_cut_set, shuffle=False),
        batch_size=None,
        num_workers=num_workers,
    )
    # The batch count varies per num_workers value because the small 4-cut
    # dataset gets partitioned across workers — so concatenate before checking.
    collected = list(loader)
    assert torch.cat([b['features'] for b in collected]).shape == (4, 308, 80)
    transcripts = [t for b in collected for t in b['supervisions']['text']]
    assert transcripts == ['IN EIGHTEEN THIRTEEN'] * 5  # a list, not tensor
    starts = torch.cat([b['supervisions']['start_frame'] for b in collected])
    assert (starts == tensor([0] * 4 + [153])).all()
    lengths = torch.cat([b['supervisions']['num_frames'] for b in collected])
    assert (lengths == tensor([154] * 5)).all()
def main():
    """Train a TdnnLstm1b CTC model.

    Compiles (or loads a cached) inverted lexicon FST, builds train/dev
    dataloaders from precomputed cut manifests, and runs `num_epochs` epochs
    of AdamW training, saving a per-epoch checkpoint plus a best-so-far
    checkpoint and companion info files under `exp-lstm-adam/`.

    Side effects: writes logs and tensorboard events under the experiment
    directory, caches data/lang_nosp/Linv.pt, and terminates the process if
    no GPU is available.
    """
    fix_random_seed(42)

    exp_dir = 'exp-lstm-adam'
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        # Reuse the inverted lexicon cached by a previous run.
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            # Invert in place (swap labels/aux-labels), arc-sort, and cache.
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = CtcTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table,
                                              oov='<SPOKEN_NOISE>')

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev.json.gz')

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionIterableDataset(cuts_train,
                                               max_frames=90000,
                                               shuffle=True)
    logging.info("About to create dev dataset")
    # concat_cuts=False keeps validation batches comparable across epochs.
    validate = K2SpeechRecognitionIterableDataset(cuts_dev,
                                                  max_frames=90000,
                                                  shuffle=False,
                                                  concat_cuts=False)
    logging.info("About to create train dataloader")
    # batch_size=None: the dataset collates batches itself.
    train_dl = torch.utils.data.DataLoader(train, batch_size=None, num_workers=4)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = TdnnLstm1b(num_features=40,
                       num_classes=len(phone_symbol_table),
                       subsampling_factor=3)

    # NOTE(review): `learning_rate` only feeds the (currently commented-out)
    # SGD/decay paths below; the active AdamW setup uses its default lr and a
    # hard-coded curr_learning_rate for logging.
    learning_rate = 0.00001
    start_epoch = 0
    num_epochs = 10
    best_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only
    global_batch_idx_valid = 0  # for logging only

    if start_epoch > 0:
        # Resume: restore weights and the best objective from the prior epoch.
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        (epoch, learning_rate, objf) = load_checkpoint(filename=model_path,
                                                       model=model)
        best_objf = objf
        logging.info("epoch = {}, objf = {}".format(epoch, objf))

    model.to(device)
    describe(model)

    # optimizer = optim.SGD(model.parameters(),
    #                       lr=learning_rate,
    #                       momentum=0.9,
    #                       weight_decay=5e-4)
    optimizer = optim.AdamW(
        model.parameters(),
        # lr=learning_rate,
        weight_decay=5e-4)

    for epoch in range(start_epoch, num_epochs):
        curr_learning_rate = 1e-3
        # curr_learning_rate = learning_rate * pow(0.4, epoch)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = curr_learning_rate

        tb_writer.add_scalar('learning_rate', curr_learning_rate, epoch)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf = train_one_epoch(dataloader=train_dl,
                               valid_dataloader=valid_dl,
                               model=model,
                               device=device,
                               graph_compiler=graph_compiler,
                               optimizer=optimizer,
                               current_epoch=epoch,
                               tb_writer=tb_writer,
                               num_epochs=num_epochs,
                               global_batch_idx_train=global_batch_idx_train,
                               global_batch_idx_valid=global_batch_idx_valid)
        # the lower, the better
        if objf < best_objf:
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=best_objf,
                               best_objf=best_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')
def main():
    """Train a TdnnLstm1b model with the MMI/MBR objective and MUSAN augmentation.

    Loads or compiles the L, L_disambig and (unigram) G FSTs, builds an
    MmiMbrTrainingGraphCompiler and a bigram phone LM `P` (whose scores are
    trained as a model parameter), then runs `num_epochs` epochs with AdamW +
    a LambdaLR schedule (or SGD + ExponentialLR when `use_adam` is False).
    Checkpoints and info files are written per epoch, plus a best-so-far pair
    selected on the validation objective.

    Fix vs. original: `logging.warn` (deprecated alias, removed in Python 3.13)
    replaced with `logging.warning`; dropped a pointless f-prefix on the
    placeholder-free experiment-dir string. Behavior is otherwise unchanged.
    """
    fix_random_seed(42)

    exp_dir = 'exp-lstm-adam-mmi-mbr-musan'
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f'{exp_dir}/tensorboard')

    if not torch.cuda.is_available():
        logging.warning('No GPU detected!')
        logging.warning('USE CPU (very slow)!')
        device = torch.device('cpu')
    else:
        logging.info('Use GPU')
        device_id = 0
        device = torch.device('cuda', device_id)

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        logging.info('Loading precompiled L')
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        logging.info('Compiling L')
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            # Invert (swap labels/aux-labels), arc-sort, and cache for reuse.
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    logging.info("Loading L_disambig.fst")
    if (lang_dir / 'L_disambig.pt').exists():
        logging.info('Loading precompiled L_disambig')
        L_disambig = k2.Fsa.from_dict(torch.load(lang_dir / 'L_disambig.pt'))
    else:
        logging.info('Compiling L_disambig')
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L_disambig = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_disambig = k2.arc_sort(L_disambig)
            torch.save(L_disambig.as_dict(), lang_dir / 'L_disambig.pt')

    logging.info("Loading G.fst")
    if (lang_dir / 'G_uni.pt').exists():
        logging.info('Loading precompiled G')
        G = k2.Fsa.from_dict(torch.load(lang_dir / 'G_uni.pt'))
    else:
        logging.info('Compiling G')
        with open(lang_dir / 'G_uni.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
            G = k2.arc_sort(G)
            torch.save(G.as_dict(), lang_dir / 'G_uni.pt')

    graph_compiler = MmiMbrTrainingGraphCompiler(L_inv=L_inv,
                                                 L_disambig=L_disambig,
                                                 G=G,
                                                 device=device,
                                                 phones=phone_symbol_table,
                                                 words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)
    # Bigram phone LM with zeroed scores; its scores become a trainable
    # parameter of the model below.
    P = create_bigram_phone_lm(phone_ids)
    P.scores = torch.zeros_like(P.scores)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir / 'cuts_train-clean-100.json.gz')
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir / 'cuts_dev-clean.json.gz')
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    # MUSAN noise is mixed into half the training cuts at 10-20 dB SNR.
    train = K2SpeechRecognitionIterableDataset(cuts_train,
                                               max_frames=30000,
                                               shuffle=True,
                                               aug_cuts=cuts_musan,
                                               aug_prob=0.5,
                                               aug_snr=(10, 20))
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionIterableDataset(cuts_dev,
                                                  max_frames=60000,
                                                  shuffle=False,
                                                  concat_cuts=False)
    logging.info("About to create train dataloader")
    # batch_size=None: the dataset collates batches itself.
    train_dl = torch.utils.data.DataLoader(train, batch_size=None, num_workers=4)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, batch_size=None, num_workers=1)

    logging.info("About to create model")
    model = TdnnLstm1b(
        num_features=40,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=3)
    model.P_scores = nn.Parameter(P.scores.clone(), requires_grad=True)

    start_epoch = 0
    num_epochs = 10
    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only
    use_adam = True

    if start_epoch > 0:
        # Resume: restore weights, objectives and the global batch counter.
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path, model=model)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    model.to(device)
    describe(model)

    P = P.to(device)

    if use_adam:
        learning_rate = 1e-3
        weight_decay = 5e-4
        optimizer = optim.AdamW(model.parameters(),
                                lr=learning_rate,
                                weight_decay=weight_decay)
        # Equivalent to the following in the epoch loop:
        #  if epoch > 6:
        #      curr_learning_rate *= 0.8
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lambda ep: 1.0 if ep < 7 else 0.8**(ep - 6))
    else:
        learning_rate = 5e-5
        weight_decay = 1e-5
        momentum = 0.9
        lr_schedule_gamma = 0.7
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        lr_scheduler = optim.lr_scheduler.ExponentialLR(
            optimizer=optimizer,
            gamma=lr_schedule_gamma,
            last_epoch=start_epoch - 1)

    for epoch in range(start_epoch, num_epochs):
        # LR scheduler can hold multiple learning rates for multiple parameter groups;
        # For now we report just the first LR which we assume concerns most of the parameters.
        curr_learning_rate = lr_scheduler.get_last_lr()[0]
        tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                             global_batch_idx_train)
        tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            P=P,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )
        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch)
        lr_scheduler.step()

    logging.warning('Done')
def main():
    """Train a Wav2Letter model against a composed L∘G decoding graph.

    Loads L, G and the word symbol table, composes and arc-sorts the decoding
    graph once up front, builds the train/dev dataloaders, and runs
    `num_epochs` Adam epochs with a 0.4^epoch learning-rate decay, saving a
    per-epoch checkpoint plus a best-so-far checkpoint under `exp/`.

    Fix vs. original: the local variable `dir` shadowed the `dir()` builtin;
    renamed to `exp_dir` (consistent with the sibling training scripts).
    Behavior is otherwise unchanged.
    """
    # load L, G, symbol_table
    lang_dir = 'data/lang_nosp'
    with open(lang_dir + '/L.fst.txt') as f:
        L = k2.Fsa.from_openfst(f.read(), acceptor=False)
    with open(lang_dir + '/G.fsa.txt') as f:
        G = k2.Fsa.from_openfst(f.read(), acceptor=True)
    with open(lang_dir + '/words.txt') as f:
        symbol_table = k2.SymbolTable.from_str(f.read())

    # Invert L (swap labels/aux-labels), arc-sort both, then intersect to get
    # the training graph; k2.intersect requires arc-sorted inputs.
    L = k2.arc_sort(L.invert_())
    G = k2.arc_sort(G)
    graph = k2.intersect(L, G)
    graph = k2.arc_sort(graph)

    # load dataset
    feature_dir = 'exp/data1'
    cuts_train = CutSet.from_json(feature_dir + '/cuts_train-clean-100.json.gz')
    cuts_dev = CutSet.from_json(feature_dir + '/cuts_dev-clean.json.gz')

    train = K2SpeechRecognitionIterableDataset(cuts_train, shuffle=True)
    validate = K2SpeechRecognitionIterableDataset(cuts_dev, shuffle=False)
    # batch_size=None: the dataset collates batches itself.
    train_dl = torch.utils.data.DataLoader(train, batch_size=None, num_workers=1)
    valid_dl = torch.utils.data.DataLoader(validate, batch_size=None, num_workers=1)

    exp_dir = 'exp'
    setup_logger('{}/log/log-train'.format(exp_dir))

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)
    device_id = 0
    device = torch.device('cuda', device_id)
    model = Wav2Letter(num_classes=364, input_type='mfcc', num_features=40)
    model.to(device)

    learning_rate = 0.001
    start_epoch = 0
    num_epochs = 10
    best_objf = 100000
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=5e-4)
    # optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)

    for epoch in range(start_epoch, num_epochs):
        # Exponential decay: lr shrinks by a factor of 0.4 each epoch.
        curr_learning_rate = learning_rate * pow(0.4, epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = curr_learning_rate

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf = train_one_epoch(dataloader=train_dl,
                               valid_dataloader=valid_dl,
                               model=model,
                               device=device,
                               graph=graph,
                               symbols=symbol_table,
                               optimizer=optimizer,
                               current_epoch=epoch,
                               num_epochs=num_epochs)
        # the lower, the better
        if objf < best_objf:
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=best_objf,
                               best_objf=best_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')
def main():
    """Debug variant of Wav2Letter training that uses only the lexicon L.

    NOTE(review): `cuts_train` deliberately points at the dev-clean manifest
    (the train-clean-100 line is commented out) and `max_frames` is tiny —
    this looks like a reduced setup for quick iteration; confirm before using
    for real training.
    """
    # load L, G, symbol_table
    lang_dir = 'data/lang_nosp'
    with open(lang_dir + '/words.txt') as f:
        symbol_table = k2.SymbolTable.from_str(f.read())

    ## This commented code created LG. We don't need that here.
    ## There were problems with disambiguation symbols; the G has
    ## disambiguation symbols which L.fst doesn't support.
    # if not os.path.exists(lang_dir + '/LG.pt'):
    #     print("Loading L.fst.txt")
    #     with open(lang_dir + '/L.fst.txt') as f:
    #         L = k2.Fsa.from_openfst(f.read(), acceptor=False)
    #     print("Loading G.fsa.txt")
    #     with open(lang_dir + '/G.fsa.txt') as f:
    #         G = k2.Fsa.from_openfst(f.read(), acceptor=True)
    #     print("Arc-sorting L...")
    #     L = k2.arc_sort(L.invert_())
    #     G = k2.arc_sort(G)
    #     print(k2.is_arc_sorted(k2.get_properties(L)))
    #     print(k2.is_arc_sorted(k2.get_properties(G)))
    #     print("Intersecting L and G")
    #     graph = k2.intersect(L, G)
    #     graph = k2.arc_sort(graph)
    #     print(k2.is_arc_sorted(k2.get_properties(graph)))
    #     torch.save(graph.as_dict(), lang_dir + '/LG.pt')
    # else:
    #     d = torch.load(lang_dir + '/LG.pt')
    #     print("Loading pre-prepared LG")
    #     graph = k2.Fsa.from_dict(d)

    print("Loading L.fst.txt")
    with open(lang_dir + '/L.fst.txt') as f:
        L = k2.Fsa.from_openfst(f.read(), acceptor=False)
    # Invert (swap labels/aux-labels) and arc-sort the lexicon.
    L = k2.arc_sort(L.invert_())

    # load dataset
    feature_dir = 'exp/data1'
    print("About to get train cuts")
    #cuts_train = CutSet.from_json(feature_dir +
    #                              '/cuts_train-clean-100.json.gz')
    cuts_train = CutSet.from_json(feature_dir + '/cuts_dev-clean.json.gz')
    print("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir + '/cuts_dev-clean.json.gz')
    print("About to create train dataset")
    train = K2SpeechRecognitionIterableDataset(cuts_train,
                                               max_frames=1000,
                                               shuffle=True)
    print("About to create dev dataset")
    validate = K2SpeechRecognitionIterableDataset(cuts_dev,
                                                  max_frames=1000,
                                                  shuffle=False)
    print("About to create train dataloader")
    # batch_size=None: the dataset collates batches itself.
    train_dl = torch.utils.data.DataLoader(train, batch_size=None, num_workers=1)
    print("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, batch_size=None, num_workers=1)

    exp_dir = 'exp'
    setup_logger('{}/log/log-train'.format(exp_dir))

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    print("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = Wav2Letter(num_classes=364, input_type='mfcc', num_features=40)
    model.to(device)

    learning_rate = 0.001
    start_epoch = 0
    num_epochs = 10
    best_objf = 100000
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')

    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=5e-4)
    # optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)

    for epoch in range(start_epoch, num_epochs):
        # Exponential decay: lr shrinks by a factor of 0.4 each epoch.
        curr_learning_rate = learning_rate * pow(0.4, epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = curr_learning_rate

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf = train_one_epoch(dataloader=train_dl,
                               valid_dataloader=valid_dl,
                               model=model,
                               device=device,
                               L=L,
                               symbols=symbol_table,
                               optimizer=optimizer,
                               current_epoch=epoch,
                               num_epochs=num_epochs)
        # the lower, the better
        if objf < best_objf:
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=best_objf,
                               best_objf=best_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')
def main():
    """Decode the test set with a trained TdnnLstm1b CTC model.

    Loads (or compiles and caches) the CTC-topology-composed LG decoding
    graph, restores the epoch-7 checkpoint, decodes test-clean, logs all
    ref/hyp pairs, and logs a Kaldi-style %WER summary.
    """
    exp_dir = Path('exp-lstm-adam-ctc-musan')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids  # blank takes id 0
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    if not os.path.exists(lang_dir / 'LG.pt'):
        print("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        print("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        # Disambiguation symbols occupy a contiguous id range starting at the
        # first one; compile_LG strips everything from those ids onward.
        first_phone_disambig_id = find_first_disambig_symbol(
            phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        LG = compile_LG(L=L,
                        G=G,
                        ctc_topo=ctc_topo,
                        labels_disambig_id_start=first_phone_disambig_id,
                        aux_labels_disambig_id_start=first_word_disambig_id)
        # Cache the compiled graph for subsequent decoding runs.
        torch.save(LG.as_dict(), lang_dir / 'LG.pt')
    else:
        print("Loading pre-compiled LG")
        d = torch.load(lang_dir / 'LG.pt')
        LG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path('exp/data')
    print("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz')

    print("About to create test dataset")
    test = K2SpeechRecognitionIterableDataset(cuts_test,
                                              max_frames=100000,
                                              shuffle=False,
                                              concat_cuts=False)
    print("About to create test dataloader")
    # batch_size=None: the dataset collates batches itself.
    test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1)

    #  if not torch.cuda.is_available():
    #  logging.error('No GPU detected!')
    #  sys.exit(-1)

    print("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')
    model = TdnnLstm1b(num_features=40,
                       num_classes=len(phone_ids_with_blank))

    checkpoint = os.path.join(exp_dir, 'epoch-7.pt')
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()  # inference mode: disables dropout etc.

    print("convert LG to device")
    LG = LG.to(device)
    # Drop epsilon (0) aux-labels so decoding emits only real word ids.
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)
    LG.requires_grad_(False)
    print("About to decode")
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     LG=LG,
                     symbols=symbol_table)
    s = ''
    for ref, hyp in results:
        s += f'ref={ref}\n'
        s += f'hyp={hyp}\n'
    logging.info(s)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
def main():
    """Decode the test set with a trained Tdnn1a model against an LG graph.

    Intentionally disabled via the assert below until known issues are fixed.
    When enabled it loads (or compiles and caches) LG, restores the epoch-9
    checkpoint, decodes test-clean, prints ref/hyp pairs, and prints a
    Kaldi-style %WER summary.
    """
    # Deliberate kill switch — remove once the script is fixed.
    assert False, 'We are still working on this script as it has some issues, so please do NOT try to run it for now.'
    exp_dir = Path('exp')
    setup_logger('{}/log/log-decode'.format(exp_dir))

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    if not os.path.exists(lang_dir / 'LG.pt'):
        print("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        print("Loading G.fsa.txt")
        with open(lang_dir / 'G.fsa.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=True)
        # NOTE(review): disambiguation-symbol start ids are hard-coded here
        # (347 / 200004) rather than derived from the symbol tables — verify
        # they match the actual lang dir before enabling this script.
        LG = compile_LG(L=L,
                        G=G,
                        labels_disambig_id_start=347,
                        aux_labels_disambig_id_start=200004)
        torch.save(LG.as_dict(), lang_dir / 'LG.pt')
    else:
        print("Loading pre-compiled LG")
        d = torch.load(lang_dir / 'LG.pt')
        LG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = exp_dir / 'data'
    print("About to get test cuts")
    cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz')

    print("About to create test dataset")
    test = K2SpeechRecognitionIterableDataset(cuts_test,
                                              max_frames=100000,
                                              shuffle=False)
    print("About to create test dataloader")
    # batch_size=None: the dataset collates batches itself.
    test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    print("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    device = torch.device('cuda')
    model = Tdnn1a(num_features=40, num_classes=364)
    checkpoint = os.path.join(exp_dir, 'epoch-9.pt')
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()  # inference mode: disables dropout etc.

    LG = LG.to(device)
    LG.requires_grad_(False)
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     LG=LG,
                     symbols=symbol_table)
    for ref, hyp in results:
        print('ref=', ref, ', hyp=', hyp)

    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ['sub', 'ins', 'del', 'total']
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    print(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
def main():
    """Train a Tdnn1a model using a TrainingGraphCompiler built from L_inv.

    Loads (or compiles and caches) the inverted lexicon, builds train/dev
    dataloaders, and runs `num_epochs` SGD epochs with a 0.4^epoch
    learning-rate decay, saving per-epoch and best-so-far checkpoints plus
    companion info files under `exp/`.
    """
    # load L, G, symbol_table
    lang_dir = 'data/lang_nosp'
    symbol_table = k2.SymbolTable.from_file(lang_dir + '/words.txt')

    print("Loading L.fst")
    if os.path.exists(lang_dir + '/Linv.pt'):
        # Reuse the inverted lexicon cached by a previous run.
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir + '/Linv.pt'))
    else:
        with open(lang_dir + '/L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            # Invert (swap labels/aux-labels), arc-sort, and cache.
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir + '/Linv.pt')

    graph_compiler = TrainingGraphCompiler(
        L_inv=L_inv,
        vocab=symbol_table,
    )

    # load dataset
    feature_dir = 'exp/data'
    print("About to get train cuts")
    cuts_train = CutSet.from_json(feature_dir + '/cuts_train-clean-100.json.gz')
    print("About to get dev cuts")
    cuts_dev = CutSet.from_json(feature_dir + '/cuts_dev-clean.json.gz')

    print("About to create train dataset")
    train = K2SpeechRecognitionIterableDataset(cuts_train,
                                               max_frames=100000,
                                               shuffle=True)
    print("About to create dev dataset")
    validate = K2SpeechRecognitionIterableDataset(cuts_dev,
                                                  max_frames=100000,
                                                  shuffle=False)
    print("About to create train dataloader")
    # batch_size=None: the dataset collates batches itself.
    train_dl = torch.utils.data.DataLoader(train, batch_size=None, num_workers=4)
    print("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate, batch_size=None, num_workers=1)

    exp_dir = 'exp'
    setup_logger('{}/log/log-train'.format(exp_dir))
    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    print("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)
    model = Tdnn1a(num_features=40, num_classes=364, subsampling_factor=3)
    model.to(device)

    learning_rate = 0.00001
    start_epoch = 0
    num_epochs = 10
    best_objf = 100000
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')

    optimizer = optim.SGD(model.parameters(),
                          lr=learning_rate,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, num_epochs):
        # Exponential decay: lr shrinks by a factor of 0.4 each epoch.
        curr_learning_rate = learning_rate * pow(0.4, epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = curr_learning_rate

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf = train_one_epoch(dataloader=train_dl,
                               valid_dataloader=valid_dl,
                               model=model,
                               device=device,
                               graph_compiler=graph_compiler,
                               optimizer=optimizer,
                               current_epoch=epoch,
                               num_epochs=num_epochs)
        # the lower, the better
        if objf < best_objf:
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=best_objf,
                               best_objf=best_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf)
        epoch_info_filename = os.path.join(exp_dir, 'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')