def load_features(self, feat_rxspecifier, vad_rxspecifier):
    """Load a feature matrix and keep only the rows selected by the VAD vector.

    Args:
        feat_rxspecifier: Specifier passed to ``kio.read_matrix`` for the
            features.
        vad_rxspecifier: Specifier passed to ``kio.read_vector`` for the
            per-row VAD labels.

    Returns:
        A numpy array holding the rows of the processed features whose VAD
        label is truthy.
    """
    raw_feats = kio.read_matrix(feat_rxspecifier)
    vad = kio.read_vector(vad_rxspecifier)

    # Append deltas, then normalise; the same matrix is passed as both the
    # input and the output argument of sliding_window_cmn.
    with_deltas = featfuncs.compute_deltas(self.delta_opts, raw_feats)
    featfuncs.sliding_window_cmn(self.cmn_opts, with_deltas, with_deltas)

    # Boolean mask over rows built from the VAD labels.
    keep_rows = vad.numpy().astype(bool)
    return with_deltas.numpy()[keep_rows, :]
def load_wav_to_torch(scp_path):
    """Read a matrix, flatten it, scale by MAX_WAV_VALUE and return a float tensor.

    Args:
        scp_path: Location handed to ``read_matrix``.

    Returns:
        A 1-D ``torch.float32`` tensor of the values divided by
        ``MAX_WAV_VALUE``.
    """
    samples = read_matrix(scp_path).numpy()
    flattened = samples.reshape(-1)
    scaled = flattened / MAX_WAV_VALUE
    return torch.from_numpy(scaled).float()
def test_lda_trans(self):
    """Applying the LDA transform keeps the row count and yields trans.num_rows columns."""
    opts = feat.MfccOptions()
    opts.frame_opts.allow_downsample = True
    opts.frame_opts.snip_edges = False

    raw_mfccs = feat.compute_mfcc_feats(self.wave_data, opts)
    lda = read_matrix("data/lda.mat")

    # Splice with 3 frames of context on each side before projecting.
    spliced = splice_frames(raw_mfccs, 3, 3)
    projected = feat.apply_feat_transform(spliced, lda)

    self.assertEqual(projected.num_rows, spliced.num_rows)
    self.assertEqual(projected.num_cols, lda.num_rows)
def test_compute_feat_for_nnet_internal(self):
    """The feature matrix has the expected frame count and ``self.lda_dim`` columns."""
    wave = feat.read_wav_kaldi(self.wav_path)
    lda = read_matrix(self.lda_path)
    frame_shift = 10  # presumably milliseconds (divided by 1000 below) -- TODO confirm

    feats = ppg.compute_feat_for_nnet_internal(wave, lda,
                                               frame_shift=frame_shift)

    # Columns of the wave matrix divided by the per-shift length, rounded to
    # the closest integer.
    per_shift = wave.samp_freq * frame_shift / 1000
    expected_num_frames = int(round(wave.data().num_cols / per_shift))

    self.assertEqual(feats.num_rows, expected_num_frames)
    self.assertEqual(feats.num_cols, self.lda_dim)
def __init__(self, nnet_path=NNET_PATH, lda_path=LDA_PATH,
             reduce_dim_path=REDUCE_DIM_PATH,
             splice_opts_path=SPLICE_OPTS_PATH):
    """Load the given resources.

    Missing files are logged as errors; loading is still attempted so the
    underlying reader reports the failure.

    Args:
        nnet_path: Path to acoustic model.
        lda_path: Path to LDA.
        reduce_dim_path: Path to pdf-to-Monophone transformation.
        splice_opts_path: Path to splice options.

    Attributes set:
        left_context, right_context: The digit strings captured from the
            first line of the splice options file, or ``None`` when that
            line is empty or does not match
            ``--left-context=N --right-context=M``.
    """
    # Check inputs
    for path in (nnet_path, lda_path, reduce_dim_path, splice_opts_path):
        if not os.path.isfile(path):
            logging.error("File %s does not exist!", path)
    self.nnet_path = nnet_path
    self.lda_path = lda_path
    self.reduce_dim_path = reduce_dim_path
    self.splice_opts_path = splice_opts_path

    # Read in those dependencies
    self.context_parser = re.compile(r"--left-context=(\d+) "
                                     r"--right-context=(\d+)")
    self.nnet = decode.read_nnet3_model(nnet_path)
    self.lda = read_matrix(lda_path)
    self.monophone_trans = feat.read_sparse_mat(reduce_dim_path)

    # Only the first line of the splice options file is consulted.
    with open(splice_opts_path, 'r') as reader:
        splice_opts = reader.readline()
    self.splice_opts = splice_opts

    context = (None, None)
    if not splice_opts:
        logging.warning("Splice options are empty.")
    else:
        parsed = self.context_parser.match(splice_opts)
        if parsed is None:
            # A malformed line used to crash with AttributeError on
            # ``None.groups()``; treat it like the empty case instead.
            logging.warning("Cannot parse splice options: %r", splice_opts)
        else:
            context = parsed.groups()
    self.left_context, self.right_context = context
def compute_feat_for_nnet(wav_path: str, lda_path: str) -> Matrix:
    """This is the external wrapper for computing input features to an AM.

    This function will not apply the fMLLR transform.

    Args:
        wav_path: Path to a wave file.
        lda_path: Path to an LDA transform matrix.

    Returns:
        feats: A T*D feature matrix.

    Raises:
        FileNotFoundError: If ``wav_path`` or ``lda_path`` does not exist.
    """
    # Validate both inputs before any I/O. Previously a missing file was only
    # logged and the function then failed with UnboundLocalError.
    if not os.path.exists(wav_path):
        logging.error("File %s does not exist.", wav_path)
        raise FileNotFoundError("File %s does not exist." % wav_path)
    if not os.path.exists(lda_path):
        logging.error("Transform file %s does not exist.", lda_path)
        raise FileNotFoundError(
            "Transform file %s does not exist." % lda_path)

    wave_data = feat.read_wav_kaldi(wav_path)
    trans = read_matrix(lda_path)
    return compute_feat_for_nnet_internal(wave_data, trans)
def main():
    """Run the acoustic model over an evaluation set and write its outputs.

    Parses the command line, loads the YAML config and checkpoint, builds the
    LSTM acoustic model, and writes one matrix per utterance to the Kaldi
    archive ``ark:<out_file>``. When ``-prior_path`` is given, the log prior
    (log of the normalised first row of that matrix) is subtracted from the
    model output before writing.

    Raises:
        FileNotFoundError: If ``-model_path`` is not an existing file.
    """
    from collections import OrderedDict

    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-prior_path",
                        default=None,
                        help="the path to load the final.occs file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    # The help text used to claim a default of 60 while the default is 200.
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-frame_subsampling_factor",
                        default=1,
                        type=int,
                        help="the factor to subsample the features")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Describe the single evaluation source in the form SpeechDataset reads.
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = [{"type": "Eval", "wav": args.data}]
    config["data_path"] = args.data_path

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only pass transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    print(transform)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    num_batches = len(test_dataloader)
    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(num_batches))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the default CUDA device when present (what the unconditional
    # ``model.cuda()`` did before) and fall back to CPU instead of crashing.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    if not os.path.isfile(args.model_path):
        raise FileNotFoundError(
            "ERROR: model file {} does not exist!".format(args.model_path))

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']

    # Checkpoints saved from a DataParallel wrapper prefix every key with
    # 'module.'. Strip it per key; leave the dict untouched when no key has it.
    prefix = "module."
    if any(key.startswith(prefix) for key in state_dict):
        state_dict = OrderedDict(
            (key[len(prefix):] if key.startswith(prefix) else key, value)
            for key, value in state_dict.items())
    model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    log_prior = None
    if args.prior_path:
        prior = read_matrix(args.prior_path).numpy()
        log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])),
                              dtype=th.float)

    model.eval()
    with th.no_grad(), MatrixWriter("ark:" + args.out_file) as llout:
        for i, data in enumerate(test_dataloader):
            feats = data["x"]
            num_frs = data["num_frs"]
            utt_ids = data["utt_ids"]

            x = feats.to(th.float32)
            if args.frame_subsampling_factor > 1:
                # Keep every factor-th frame along dimension 1.
                # NOTE(review): num_frs is not divided by the factor, so the
                # slice below may keep padded frames -- confirm with the
                # data loader.
                x = x.unfold(1, 1,
                             args.frame_subsampling_factor).squeeze(-1)
            x = x.to(device)
            prediction = model(x)

            # save only unpadded part for each utt in batch
            for j, n_frames in enumerate(num_frs):
                loglikes_j = prediction[j, :, :].detach().cpu()[:n_frames, :]
                # ``if log_prior:`` raised RuntimeError for a multi-element
                # tensor; test against None explicitly.
                if log_prior is not None:
                    loglikes_j = loglikes_j - log_prior
                llout[utt_ids[j][0]] = loglikes_j

            print("Process batch [{}/{}]".format(i + 1, num_batches))
def load_scp_to_torch(scp_path):
    """Read a matrix via ``read_matrix`` and return it as a float tensor.

    Args:
        scp_path: Location handed to ``read_matrix``.

    Returns:
        A ``torch.float32`` tensor built from the matrix's numpy view.
    """
    as_array = read_matrix(scp_path).numpy()
    as_tensor = torch.from_numpy(as_array)
    return as_tensor.float()
# Positional arguments: input and output locations (``po`` is created outside
# this fragment).
matrix_in_fn = po.get_arg(1)
matrix_out_fn = po.get_arg(2)

# A location counts as a table specifier when the classifier returns anything
# other than NO_SPECIFIER as its first element.
in_is_rspecifier = classify_rspecifier(
    matrix_in_fn)[0] != RspecifierType.NO_SPECIFIER
out_is_wspecifier = classify_wspecifier(
    matrix_out_fn)[0] != WspecifierType.NO_SPECIFIER

# Input and output must both be tables or both be plain files.
if in_is_rspecifier != out_is_wspecifier:
    print("Cannot mix archives with regular files (copying matrices)",
          file=sys.stderr)
    sys.exit(1)

# Plain-file case: read one matrix and apply the requested in-place
# operations in this fixed order (scale, log, exp, softmax, power).
if not in_is_rspecifier:
    mat = read_matrix(matrix_in_fn)
    if opts.scale != 1.0:
        mat.scale_(opts.scale)
    if opts.apply_log:
        # Floor at 1e-20 first, presumably so the log never sees zero or
        # negative entries.
        mat.apply_floor_(1.0e-20)
        mat.apply_log_()
    if opts.apply_exp:
        mat.apply_exp_()
    if opts.apply_softmax_per_row:
        apply_softmax_per_row(mat)
    if opts.apply_power != 1.0:
        mat.apply_power_(opts.apply_power)
def main():
    """Decode an evaluation set with the acoustic model and write lattices.

    Parses the command line, loads the YAML config and checkpoint, builds the
    LSTM acoustic model and a ``MappedLatticeFasterRecognizer`` from the
    transition model and decoding graph. For every utterance it subtracts the
    log prior from the model output, decodes it, prints the decoded text and
    per-frame likelihood, and writes the lattice to ``ark:<out_file>``.

    Exits with a non-zero status and a message on stderr when the model file,
    HCLG graph, words.txt or transition model is missing.
    """
    from collections import OrderedDict

    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    # The help text used to claim a default of 60 while the default is 200.
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Describe the single evaluation source in the form SpeechDataset reads.
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only pass transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the default CUDA device when present (what the unconditional
    # ``model.cuda()`` did before) and fall back to CPU instead of crashing.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    if not os.path.isfile(args.model_path):
        sys.exit("ERROR: model file {} does not exist!".format(
            args.model_path))

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']

    # Checkpoints saved from a DataParallel wrapper prefix every key with
    # 'module.'. Strip it per key; leave the dict untouched when no key has it.
    prefix = "module."
    if any(key.startswith(prefix) for key in state_dict):
        state_dict = OrderedDict(
            (key[len(prefix):] if key.startswith(prefix) else key, value)
            for key, value in state_dict.items())
    model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    # Error exits previously used status 0, which reports success to the
    # shell; sys.exit(<str>) prints to stderr and exits with status 1.
    if not os.path.isfile(HCLG):
        sys.exit('ERROR: The HCLG file %s does not exist!' % (HCLG))
    if not os.path.isfile(words_txt):
        sys.exit('ERROR: The words.txt file %s does not exist!' % (words_txt))
    if not os.path.isfile(args.trans_model):
        sys.exit('ERROR: The trans_model %s does not exist!' %
                 (args.trans_model))

    # NOTE(review): the parsed model is not used below -- the recognizer is
    # given the path instead and presumably re-reads it. Kept so the file is
    # still parsed at this point.
    trans_model = kaldi_hmm.TransitionModel()
    with kaldi_util.io.xopen(args.trans_model) as ki:
        trans_model.read(ki.stream(), ki.binary)

    # Log of the first row of the prior matrix, normalised to sum to one.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_config = config["decoder_config"]
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = decoder_config["beam"]
    decoder_opts.lattice_beam = decoder_config["lattice_beam"]
    decoder_opts.max_active = decoder_config["max_active"]
    acoustic_scale = decoder_config["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad(), kaldi_util.table.CompactLatticeWriter(
            "ark:" + args.out_file) as lat_out:
        for data in test_dataloader:
            feats = data["x"]
            num_frs = data["num_frs"]
            utt_ids = data["utt_ids"]

            x = feats.to(th.float32).to(device)
            prediction = model(x)

            for j, n_frames in enumerate(num_frs):
                # Keep only the unpadded frames, then subtract the log prior.
                loglikes_j = prediction[j, :, :].detach().cpu()[:n_frames, :]
                loglikes_j = loglikes_j - log_prior

                decoder_out = asr_decoder.decode(
                    kaldi_matrix.Matrix(loglikes_j.numpy()))

                key = utt_ids[j][0]
                print(key, decoder_out["text"])
                print("Log-like per-frame for utterance {} is {}".format(
                    key, decoder_out["likelihood"] / n_frames))

                # save lattice
                lat_out[key] = decoder_out["lattice"]