def forward(ctx, loglikes, den_graph, supervision, chain_opts):
    """Evaluate the Kaldi chain objective and stash its gradient for backward.

    Args:
        ctx: context object; the gradient tensor is stored on it through
            ``save_for_backward``.
        loglikes: network output tensor. It is detached and copied to host
            memory before being wrapped in a Kaldi matrix.
        den_graph: forwarded unchanged to
            ``kaldi_chain.compute_chain_objf_and_deriv``.
        supervision: forwarded unchanged to the same call.
        chain_opts: options object; also supplies ``xent_regularize``, the
            scale applied to the cross-entropy derivative.

    Returns:
        A tensor built from the first element of the value returned by
        ``compute_chain_objf_and_deriv``.
    """
    host_loglikes = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())

    # NOTE: explicit GPU selection through kaldi.cudamatrix.CuDevice
    # (select_gpu_id('yes') / allow_multithreading()) is currently disabled.
    nnet_out = kaldi_cudamatrix.CuMatrix().from_matrix(host_loglikes)
    n_rows, n_cols = nnet_out.num_rows(), nnet_out.num_cols()

    # Output buffers filled in by the objective computation.
    deriv = kaldi_cudamatrix.CuMatrix().from_size(n_rows, n_cols)
    xent_deriv = kaldi_cudamatrix.CuMatrix().from_size(n_rows, n_cols)

    objf = kaldi_chain.compute_chain_objf_and_deriv(
        chain_opts, den_graph, supervision, nnet_out, deriv, xent_deriv)

    # deriv += xent_regularize * xent_deriv
    deriv.add_mat(chain_opts.xent_regularize, xent_deriv)

    # Copy the combined derivative into a host matrix, then onto the CUDA
    # device as a torch tensor.
    host_deriv = kaldi_matrix.Matrix(n_rows, n_cols)
    deriv.copy_to_mat(host_deriv)
    ctx.save_for_backward(th.from_numpy(host_deriv.numpy()).cuda())

    return th.tensor(objf[0])
def forward(ctx, loglikes_T, loglikes_S, asr_decoder):
    """Compare teacher and student lattice posteriors on a shared lattice.

    The lattice is generated by decoding the teacher output, posteriors are
    computed on it, the lattice is rescored with the student output, and
    posteriors are computed again. The difference of the two pdf-level
    posterior matrices is saved on ``ctx`` for the backward pass.

    Args:
        ctx: context object; the posterior difference is stored on it
            through ``save_for_backward``.
        loglikes_T: teacher output tensor, used to decode the lattice.
        loglikes_S: student output, used to rescore the lattice.
        asr_decoder: object whose ``decode`` returns a mapping with a
            ``"lattice"`` entry.

    Returns:
        The cross-entropy value as a Python float (``loss.item()``).

    Raises:
        SystemExit: with a non-zero status when lattice rescoring fails.
    """
    # Either the teacher or the student model could generate the lattice;
    # the teacher output is used here.
    decode_out = asr_decoder.decode(
        kaldi_matrix.Matrix(loglikes_T.detach().cpu().numpy()))
    # Fixed: the original read the undefined name `decoder_out`.
    lattice = decode_out["lattice"]
    kaldi_lat.functions.top_sort_lattice_if_needed(lattice)
    _, post_T, _ = kaldi_lat.functions.lattice_forward_backward(lattice)

    # NOTE(review): loglikes_S is handed to DecodableMatrixScaled as-is,
    # while the teacher path converts to a kaldi Matrix first -- confirm
    # the expected input type.
    decodable = kaldi_decoder.DecodableMatrixScaled(loglikes_S, 1.0)
    if not kaldi_lat.functions.rescore_lattice(decodable, lattice):
        sys.stderr.write('ERROR: Rescore lattice failed!')
        # Fixed: a failure must not report success to the calling process.
        sys.exit(1)
    _, post_S, _ = kaldi_lat.functions.lattice_forward_backward(lattice)

    # NOTE(review): `trans_model` is not a parameter of this function; it has
    # to be resolvable from the enclosing module scope -- confirm.
    pdf_post_T = kaldi_hmm.Posterior.from_posteriors(post_T).to_pdf_matrix(
        trans_model).numpy()
    pdf_post_S = kaldi_hmm.Posterior.from_posteriors(post_S).to_pdf_matrix(
        trans_model).numpy()

    post_mat = pdf_post_T - pdf_post_S
    ctx.save_for_backward(th.from_numpy(post_mat).cuda())

    # Fixed: `th.from_numpy` needs ndarrays (the original passed Posterior
    # objects) and `th.from_nompy` was a typo. Argument order is unchanged.
    # NOTE(review): a floating-point target of the same shape as the input
    # is treated as class probabilities only by recent PyTorch releases.
    loss = F.cross_entropy(th.from_numpy(pdf_post_T),
                           th.from_numpy(pdf_post_S))
    # NOTE(review): the sibling forward() implementations return a tensor;
    # the float return is kept here to leave the interface unchanged.
    return loss.item()
def forward(ctx, loglikes, den_graph, supervision, chain_opts):
    """Run the Kaldi chain objective and keep its derivative for backward.

    Args:
        ctx: context object that receives the derivative via
            ``save_for_backward``.
        loglikes: network output tensor; detached and moved to host memory
            before conversion to a Kaldi matrix.
        den_graph: passed through to
            ``kaldi_chain.compute_chain_objf_and_deriv``.
        supervision: passed through to the same call.
        chain_opts: options object; ``xent_regularize`` is read from it to
            scale the cross-entropy derivative.

    Returns:
        ``th.tensor`` of the first element returned by
        ``compute_chain_objf_and_deriv``.
    """
    cpu_matrix = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
    nnet_out = kaldi_cudamatrix.CuMatrix().from_matrix(cpu_matrix)
    shape = (nnet_out.num_rows(), nnet_out.num_cols())

    # Derivative buffers written by the objective computation.
    grad_main = kaldi_cudamatrix.CuMatrix().from_size(*shape)
    grad_ce = kaldi_cudamatrix.CuMatrix().from_size(*shape)
    result = kaldi_chain.compute_chain_objf_and_deriv(
        chain_opts, den_graph, supervision, nnet_out, grad_main, grad_ce)

    # Fold the scaled cross-entropy derivative into the main derivative.
    grad_main.add_mat(chain_opts.xent_regularize, grad_ce)

    grad_host = kaldi_matrix.Matrix(*shape)
    grad_main.copy_to_mat(grad_host)
    saved = th.from_numpy(grad_host.numpy()).cuda()
    ctx.save_for_backward(saved)
    return th.tensor(result[0])
def forward(ctx, loglikes, asr_decoder, trans_model, trans_ids):
    """Decode a lattice and compute MMI posteriors on it.

    Args:
        ctx: context object; the pdf-level posterior matrix is stored on it
            through ``save_for_backward``.
        loglikes: network output tensor; detached and copied to host memory
            before decoding.
        asr_decoder: object whose ``decode`` returns a mapping with a
            ``"lattice"`` entry.
        trans_model: transition model passed to
            ``lattice_forward_backward_mmi`` and ``to_pdf_matrix``.
        trans_ids: passed to ``lattice_forward_backward_mmi`` together with
            the lattice.

    Returns:
        ``th.tensor`` of the first value returned by
        ``lattice_forward_backward_mmi``.
    """
    host_loglikes = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
    lattice = asr_decoder.decode(host_loglikes)["lattice"]
    kaldi_lat.functions.top_sort_lattice_if_needed(lattice)

    lat_like, raw_post = kaldi_lat.functions.lattice_forward_backward_mmi(
        trans_model, lattice, trans_ids, True, False, True)

    posterior = kaldi_hmm.Posterior.from_posteriors(raw_post)
    pdf_post = posterior.to_pdf_matrix(trans_model).numpy()
    ctx.save_for_backward(th.from_numpy(pdf_post).cuda())
    return th.tensor(lat_like)
def write(self, key, value):
    """Write one `(key, value)` entry to the table.

    Exists for parity with the C++ API; most users should prefer the
    Pythonic API. The value is wrapped in `_matrix.Matrix` before being
    forwarded to the parent writer, so both Matrix and SubMatrix inputs
    are accepted.

    Args:
        key (str): The key.
        value: The value.
    """
    parent_write = super(MatrixWriter, self).write
    parent_write(key, _matrix.Matrix(value))
def forward(ctx, loglikes, asr_decoder, trans_model, supervision, config):
    """Compute a weighted edit-distance loss over paths drawn from a lattice.

    A lattice is decoded from ``loglikes`` and scaled. A set of distinct
    paths (by output label sequence) is then collected, either by random
    equal-alignment sampling or from the n-best list. Each path is scored by
    its edit distance to ``supervision``.

    Args:
        ctx: context object; the per-frame gradient is stored on it through
            ``save_for_backward``.
        loglikes: network output tensor; its first dimension is used as the
            path length for sampling and its size as the gradient size.
        asr_decoder: object whose ``decode`` returns a mapping with a
            ``"lattice"`` entry.
        trans_model: transition model used for phone conversion and for
            mapping transition ids to pdf ids.
        supervision: reference label sequence. When ``config['phone_level']``
            is set it is replaced by a phone sequence derived from it.
        config: mapping read for ``lm_weight``, ``am_weight``,
            ``phone_level``, ``rand_path``, ``num_paths`` and
            ``equal_weight``.

    Returns:
        The sum of the path edit distances weighted by the normalizer.
    """
    host_loglikes = kaldi_matrix.Matrix(loglikes.detach().cpu().numpy())
    lattice = asr_decoder.decode(host_loglikes)["lattice"]
    kaldi_lat.functions.top_sort_lattice_if_needed(lattice)

    lattice_scale = kaldi_fst.utils.lattice_scale(config['lm_weight'],
                                                  config['am_weight'])
    kaldi_fst.utils.scale_lattice(lattice_scale, lattice)
    # The returned values are not used below; the call is kept as-is.
    _lat_like, _post, _acoustic_like = (
        kaldi_lat.functions.lattice_forward_backward(lattice))

    if config['phone_level']:
        kaldi_lat.functions.convert_lattice_to_phones(trans_model, lattice)
        _, phone_clusters = kaldi_hmm.split_to_phones(trans_model,
                                                      supervision)
        supervision = [
            trans_model.transition_id_to_phone(cluster[0])
            for cluster in phone_clusters
        ]

    std_fst = kaldi_fst.utils.convert_lattice_to_std(lattice)
    num_frames = loglikes.size(0)

    ilabels, olabels, edit_distance, weights = [], [], [], []

    def collect_if_new(path_fst):
        # Record a linear path unless its output labels were already seen.
        ilabel, olabel, weight = kaldi_fst.utils.get_linear_symbol_sequence(
            path_fst)
        if olabel in olabels:
            return
        olabels.append(olabel)
        ilabels.append(ilabel)
        weights.append(weight.value)
        edit_distance.append(editdistance.eval(olabel, supervision))

    if config['rand_path']:
        # Sample until enough distinct paths are found, with at most 500
        # attempts.
        attempts = 0
        while len(olabels) < config['num_paths'] and attempts < 500:
            attempts += 1
            sampled = kaldi_fst.StdVectorFst()
            seed = np.random.randint(0, 10000)
            if kaldi_fst.utils.equal_align(std_fst, num_frames, seed,
                                           sampled):
                collect_if_new(sampled)
    else:
        for path_fst in kaldi_fst.utils.nbest_as_fsts(std_fst,
                                                      config['num_paths']):
            collect_if_new(path_fst)

    if config['equal_weight']:
        num_kept = len(weights)
        normalizer = th.ones(num_kept, dtype=th.float32) * 1.0 / num_kept
    else:
        normalizer = F.softmax(th.FloatTensor(weights) * -1, dim=0)

    distances = th.FloatTensor(edit_distance)
    loss = (distances * normalizer).sum()
    grad_value = (distances - loss) * normalizer

    # Accumulate each path's gradient at (frame, pdf id) along its input
    # label sequence.
    grad_out = th.zeros(loglikes.size())
    for path_grad, ilabel in zip(grad_value, ilabels):
        for frame, trans_id in enumerate(ilabel):
            pdf_id = trans_model.transition_id_to_pdf(trans_id)
            grad_out[frame, pdf_id] += path_grad

    ctx.save_for_backward(grad_out.cuda())
    return loss
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch,
                    asr_decoder, trans_model, silence_ids, aligner, args):
    """Run one epoch of sequence-discriminative training.

    For every batch the loss is ``args.ce_ratio`` times the summed
    cross-entropy, plus one sequence-level term per utterance (MMI when
    ``args.criterion == "mmi"``, otherwise the sMBR function). An utterance
    whose alignment or sequence loss raises is skipped with a warning.

    Args:
        model: network called on the CUDA float32 features.
        optimizer: optimizer stepped once per batch.
        log_prior: subtracted from each utterance's network output before
            alignment and the sequence criterion.
        dataloader: iterable of batches with keys ``"x"``, ``"y"``,
            ``"num_frs"``, ``"utt_ids"`` and ``"aux"``.
        epoch: epoch number, used only in the progress prefix.
        asr_decoder: passed to the sequence criterion.
        trans_model: passed to the sequence criterion.
        silence_ids: passed to the sMBR criterion only.
        aligner: object whose ``align`` returns a mapping with an
            ``"alignment"`` entry.
        args: namespace read for ``criterion``, ``ce_ratio``,
            ``max_grad_norm``, ``save_freq``, ``print_freq`` and
            ``exp_dir``.
    """
    batch_time = utils.AverageMeter('Time', ':6.3f')
    losses = utils.AverageMeter('Loss', ':.4e')
    grad_norm = utils.AverageMeter('grad_norm', ':.4e')
    progress = utils.ProgressMeter(len(dataloader),
                                   batch_time,
                                   losses,
                                   grad_norm,
                                   prefix="Epoch: [{}]".format(epoch))

    ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
    use_mmi = args.criterion == "mmi"
    criterion = ops.MMIFunction.apply if use_mmi else ops.sMBRFunction.apply

    end = time.time()
    for i, batch in enumerate(dataloader):
        feat = batch["x"]
        label = batch["y"]
        num_frs = batch["num_frs"]
        utt_ids = batch["utt_ids"]
        aux = batch["aux"]  # word labels for the sequence-level loss

        x = feat.to(th.float32).cuda()
        y = label.long().cuda()

        prediction = model(x)
        ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]),
                               y.view(-1))
        loss = args.ce_ratio * ce_loss

        for j in range(len(num_frs)):
            # Drop the frames beyond this utterance's length, then remove
            # the prior.
            loglike_j = prediction[j, :num_frs[j], :] - log_prior
            text = th.from_numpy(aux[j][0][0].astype(int)).tolist()
            try:
                align_in = kaldi_matrix.Matrix(
                    loglike_j.detach().cpu().numpy())
                align_out = aligner.align(align_in, text)
                trans_ids = align_out["alignment"]

                if use_mmi:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids)
                else:
                    se_loss = criterion(loglike_j, asr_decoder, trans_model,
                                        trans_ids, args.criterion,
                                        silence_ids)
                loss += se_loss.cuda()
            except Exception as err:
                # Best-effort skip of a bad utterance. Catching Exception
                # rather than everything lets KeyboardInterrupt and
                # SystemExit still stop the run.
                print(
                    "Warning: failed to align utterance {}, skip the utterance for SE loss"
                    .format(utt_ids[j]))
                print("  reason: {!r}".format(err))

        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        norm = nn.utils.clip_grad_norm_(model.parameters(),
                                        args.max_grad_norm)
        optimizer.step()
        grad_norm.update(norm)

        # Record the loss normalized by the frame count of the batch.
        tot_frs = np.array(num_frs).sum()
        losses.update(loss.item() / tot_frs)

        # Measure elapsed time for this batch, then restart the clock so
        # the meter tracks per-batch time rather than time since epoch
        # start.
        batch_time.update(time.time() - end)
        end = time.time()

        is_root = hvd.rank() == 0
        if is_root and i % args.save_freq == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            output_file = args.exp_dir + '/model.se.' + str(i) + '.tar'
            th.save(checkpoint, output_file)

        if is_root and i % args.print_freq == 0:
            progress.print(i)
def main():
    """Decode an evaluation set with a trained model and write lattices.

    Reads the YAML config and command-line options, builds the data loader
    and the LSTM acoustic model, loads the checkpoint, sets up the lattice
    decoder, and writes one compact lattice per utterance to
    ``ark:<out_file>``. For each utterance it prints the decoded text and
    the per-frame log-likelihood.

    Raises:
        SystemExit: with status 1 when the model file, the HCLG graph, the
            words.txt file or the transition model is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    # The help text used to claim a default of 60 while the real default is
    # 200; it is now derived from the actual default.
    parser.add_argument(
        "-sweep_size",
        default=200,
        type=float,
        help="process n hours of data per sweep (default: %(default)s)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # SECURITY NOTE: pickle.load executes arbitrary code from the file;
        # only point -transform at files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"],
                     True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the selected device everywhere instead of hard-coding CUDA, so
    # the script also runs on a machine without a GPU.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    if not os.path.isfile(args.model_path):
        sys.stderr.write("ERROR: model file {} does not exist!\n".format(
            args.model_path))
        sys.exit(1)

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']

    # Remove the 'module.' prefix added by DataParallel. Each key is
    # checked on its own, so an empty or partially prefixed state dict is
    # handled too.
    prefix = "module."
    if any(key.startswith(prefix) for key in state_dict):
        from collections import OrderedDict
        state_dict = OrderedDict(
            (key[len(prefix):] if key.startswith(prefix) else key, value)
            for key, value in state_dict.items())
    model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    # Missing inputs are errors, so exit with a non-zero status.
    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    # The truthiness check keeps a missing -trans_model option from raising
    # TypeError inside os.path.isfile.
    if not (args.trans_model and os.path.isfile(args.trans_model)):
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)
    trans_model = kaldi_hmm.TransitionModel()
    with kaldi_util.io.xopen(args.trans_model) as ki:
        trans_model.read(ki.stream(), ki.binary)

    # Log of the first row of the prior matrix, normalized to sum to one.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can set up the decoder
    decoder_config = config["decoder_config"]
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = decoder_config["beam"]
    decoder_opts.lattice_beam = decoder_config["lattice_beam"]
    decoder_opts.max_active = decoder_config["max_active"]
    acoustic_scale = decoder_config["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # to produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" +
                                                   args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32).to(device)
                prediction = model(x)

                for j in range(len(num_frs)):
                    # Drop the frames beyond this utterance's length, then
                    # remove the prior on the CPU before decoding.
                    loglikes = prediction[j, :, :].detach().cpu()
                    loglikes_j = loglikes[:num_frs[j], :] - log_prior

                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))

                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])
                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / num_frs[j]))

                    # save lattice
                    lat_out[key] = decoder_out["lattice"]