def load_features(self, feat_rxspecifier, vad_rxspecifier):
    """Read features, post-process them and keep only VAD-selected rows.

    The matrix read from ``feat_rxspecifier`` is passed through
    ``featfuncs.compute_deltas`` (with ``self.delta_opts``) and then
    ``featfuncs.sliding_window_cmn`` (with ``self.cmn_opts``, using the
    same matrix as input and output).  The vector read from
    ``vad_rxspecifier`` is cast to a boolean mask that selects rows.

    Args:
        feat_rxspecifier: Specifier handed to ``kio.read_matrix``.
        vad_rxspecifier: Specifier handed to ``kio.read_vector``.

    Returns:
        The ``.numpy()`` view of the processed features, indexed by the
        boolean VAD mask along the first axis.
    """
    raw = kio.read_matrix(feat_rxspecifier)
    vad = kio.read_vector(vad_rxspecifier)

    processed = featfuncs.compute_deltas(self.delta_opts, raw)
    # Input and output are the same object, so CMN is applied in place.
    featfuncs.sliding_window_cmn(self.cmn_opts, processed, processed)

    frames = processed.numpy()
    keep = vad.numpy().astype(bool)
    return frames[keep, :]
Exemple #2
0
def load_wav_to_torch(scp_path):
    """Read a matrix, flatten it and return it scaled as a float tensor.

    The matrix obtained from ``read_matrix(scp_path)`` is reshaped to one
    dimension, divided by ``MAX_WAV_VALUE`` and converted with
    ``torch.from_numpy(...).float()``.
    """
    samples = read_matrix(scp_path).numpy()
    scaled = samples.reshape(-1) / MAX_WAV_VALUE
    return torch.from_numpy(scaled).float()
 def test_lda_trans(self):
     """Check the output dimensions of applying an LDA transform.

     MFCCs are computed from ``self.wave_data``, spliced with 3 frames of
     context on each side, and transformed with the matrix read from
     ``data/lda.mat``.  The result must have as many rows as the spliced
     features and as many columns as the transform has rows.
     """
     left_ctx = right_ctx = 3

     opts = feat.MfccOptions()
     opts.frame_opts.allow_downsample = True
     opts.frame_opts.snip_edges = False

     raw_mfccs = feat.compute_mfcc_feats(self.wave_data, opts)
     lda = read_matrix("data/lda.mat")
     spliced = splice_frames(raw_mfccs, left_ctx, right_ctx)
     projected = feat.apply_feat_transform(spliced, lda)

     self.assertEqual(projected.num_rows, spliced.num_rows)
     self.assertEqual(projected.num_cols, lda.num_rows)
Exemple #4
0
 def test_compute_feat_for_nnet_internal(self):
     """Check the shape returned by ``ppg.compute_feat_for_nnet_internal``.

     The expected row count is the number of columns of the wave data
     divided by ``samp_freq * shift / 1000`` (``shift`` is the value
     passed as ``frame_shift``), rounded to the closest integer.  The
     column count must equal ``self.lda_dim``.
     """
     frame_shift = 10

     wav = feat.read_wav_kaldi(self.wav_path)
     lda = read_matrix(self.lda_path)
     output = ppg.compute_feat_for_nnet_internal(wav,
                                                 lda,
                                                 frame_shift=frame_shift)

     num_samples = wav.data().num_cols
     samples_per_shift = wav.samp_freq * frame_shift / 1000
     # Closest int
     expected_num_frames = int(round(num_samples / samples_per_shift))

     self.assertEqual(output.num_rows, expected_num_frames)
     self.assertEqual(output.num_cols, self.lda_dim)
    def __init__(self,
                 nnet_path=NNET_PATH,
                 lda_path=LDA_PATH,
                 reduce_dim_path=REDUCE_DIM_PATH,
                 splice_opts_path=SPLICE_OPTS_PATH):
        """Load the given resources.

        Args:
            nnet_path: Path to acoustic model.
            lda_path: Path to LDA.
            reduce_dim_path: Path to pdf-to-Monophone transformation.
            splice_opts_path: Path to splice options.

        After construction the instance exposes the four paths, the loaded
        ``nnet``, ``lda`` and ``monophone_trans`` objects, the raw first
        line of the splice options file (``splice_opts``), and
        ``left_context`` / ``right_context``.  The two context values are
        the regex capture groups (strings) when the line starts with
        ``--left-context=N --right-context=M``; otherwise both are ``None``.
        """
        # Check inputs.  A missing file is only logged here, not raised.
        for path in (nnet_path, lda_path, reduce_dim_path, splice_opts_path):
            if not os.path.isfile(path):
                logging.error("File %s does not exist!", path)
        self.nnet_path = nnet_path
        self.lda_path = lda_path
        self.reduce_dim_path = reduce_dim_path
        self.splice_opts_path = splice_opts_path

        # Read in those dependencies
        self.context_parser = re.compile(r"--left-context=(\d+) "
                                         r"--right-context=(\d+)")
        self.nnet = decode.read_nnet3_model(nnet_path)
        self.lda = read_matrix(lda_path)
        self.monophone_trans = feat.read_sparse_mat(reduce_dim_path)
        with open(splice_opts_path, 'r') as reader:
            splice_opts = reader.readline()
        self.splice_opts = splice_opts

        context = (None, None)
        if not splice_opts:
            logging.warning("Splice options are empty.")
        else:
            parsed = self.context_parser.match(splice_opts)
            if parsed is None:
                # match() returns None for e.g. a line holding only a
                # newline or options in another format; calling .groups()
                # on it would raise AttributeError.
                logging.warning("Could not parse splice options: %r",
                                splice_opts)
            else:
                context = parsed.groups()
        self.left_context, self.right_context = context
def compute_feat_for_nnet(wav_path: str, lda_path: str) -> Matrix:
    """This is the external wrapper for computing input features to an AM.

    This function will not apply the fMLLR transform.

    Args:
        wav_path: Path to a wave file.
        lda_path: Path to an LDA transform matrix.

    Returns:
        feats: A T*D feature matrix.

    Raises:
        FileNotFoundError: If ``wav_path`` or ``lda_path`` does not exist.
            Both paths are validated before anything is read.
    """
    # Fail explicitly on a missing input.  Logging alone would let the code
    # fall through to an unbound local and die with UnboundLocalError.
    if not os.path.exists(wav_path):
        logging.error("File %s does not exist.", wav_path)
        raise FileNotFoundError("File %s does not exist." % (wav_path))

    if not os.path.exists(lda_path):
        logging.error("Transform file %s does not exist.", lda_path)
        raise FileNotFoundError("Transform file %s does not exist." %
                                (lda_path))

    wave_data = feat.read_wav_kaldi(wav_path)
    trans = read_matrix(lda_path)
    return compute_feat_for_nnet_internal(wave_data, trans)
Exemple #7
0
def main():
    """Run the acoustic model over an evaluation set and write its outputs.

    Steps, in order:
      1. Parse command-line options and load the YAML config.
      2. Build a ``SpeechDataset`` / ``SeqDataloader`` in test-only mode,
         optionally with a pickled feature transform.
      3. Build the ``LSTMStack`` + ``NnetAM`` model and load the checkpoint,
         stripping a ``module.`` prefix from parameter names when present.
      4. Optionally read a prior from ``-prior_path`` and turn it into a
         normalized log-prior.
      5. For every batch, run the model and write the unpadded output of
         each utterance (minus the log-prior, if any) to
         ``ark:<out_file>``.

    Raises:
        FileNotFoundError: If ``-model_path`` is not an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-prior_path",
                        default=None,
                        help="the path to load the final.occs file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-frame_subsampling_factor",
                        default=1,
                        type=int,
                        help="the factor to subsample the features")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Command-line values override / extend whatever the config file holds.
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = [{"type": "Eval", "wav": args.data}]
    config["data_path"] = args.data_path

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only pass -transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    print(transform)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    num_batches = len(test_dataloader)
    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(num_batches))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use one device for the model, the checkpoint and the inputs.  This
    # replaces an unused "cuda:1" device alongside hard-coded .cuda() /
    # 'cuda:0' calls, and adds a CPU fallback.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    # Explicit check instead of `assert`, which is stripped under -O.
    if not os.path.isfile(args.model_path):
        raise FileNotFoundError(
            "ERROR: model file {} does not exist!".format(args.model_path))

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    prefix = "module."  # added to parameter names by DataParallel
    # Decide per key, so an empty state dict or a mixed one cannot trip on
    # a variable that only reflects the last key seen.
    new_state_dict = OrderedDict(
        (k[len(prefix):] if k.startswith(prefix) else k, v)
        for k, v in state_dict.items())
    model.load_state_dict(new_state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    log_prior = None
    if args.prior_path:
        prior = read_matrix(args.prior_path).numpy()
        log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])),
                              dtype=th.float)

    model.eval()
    with th.no_grad(), MatrixWriter("ark:" + args.out_file) as llout:
        for i, data in enumerate(test_dataloader):
            num_frs = data["num_frs"]
            utt_ids = data["utt_ids"]

            x = data["x"].to(th.float32)
            if args.frame_subsampling_factor > 1:
                # Windows of size 1 taken every `factor` steps along dim 1.
                x = x.unfold(1, 1,
                             args.frame_subsampling_factor).squeeze(-1)
            x = x.to(device)
            prediction = model(x)

            # save only unpadded part for each utt in batch
            for j, n_frames in enumerate(num_frs):
                loglikes_j = prediction[j, :, :].data.cpu()[:n_frames, :]
                # `if log_prior:` would raise on a multi-element tensor;
                # test for presence instead.
                if log_prior is not None:
                    loglikes_j = loglikes_j - log_prior

                llout[utt_ids[j][0]] = loglikes_j

            print("Process batch [{}/{}]".format(i + 1, num_batches))
Exemple #8
0
def load_scp_to_torch(scp_path):
    """Read a matrix via ``read_matrix`` and return it as a float tensor."""
    matrix = read_matrix(scp_path)
    return torch.from_numpy(matrix.numpy()).float()
Exemple #9
0
    # NOTE(review): this is the interior of a routine whose header and tail
    # are not visible here; `po` and `opts` are defined outside this span.
    # First and second positional arguments: input and output locations.
    matrix_in_fn = po.get_arg(1)
    matrix_out_fn = po.get_arg(2)

    # A location counts as a table specifier when its classification is
    # anything other than NO_SPECIFIER.
    in_is_rspecifier = classify_rspecifier(
        matrix_in_fn)[0] != RspecifierType.NO_SPECIFIER
    out_is_wspecifier = classify_wspecifier(
        matrix_out_fn)[0] != WspecifierType.NO_SPECIFIER

    # Input and output must be of the same kind (both specifiers or both
    # plain files); otherwise report the problem and exit with status 1.
    if in_is_rspecifier != out_is_wspecifier:
        print("Cannot mix archives with regular files (copying matrices)",
              file=sys.stderr)
        sys.exit(1)

    if not in_is_rspecifier:
        # Plain-file case: read one matrix and apply the requested
        # operations in this fixed order.  The trailing-underscore methods
        # presumably modify `mat` in place -- TODO confirm.
        mat = read_matrix(matrix_in_fn)
        if opts.scale != 1.0:
            mat.scale_(opts.scale)

        if opts.apply_log:
            # Floor at 1e-20 before the log, presumably to avoid log(0).
            mat.apply_floor_(1.0e-20)
            mat.apply_log_()

        if opts.apply_exp:
            mat.apply_exp_()

        if opts.apply_softmax_per_row:
            apply_softmax_per_row(mat)

        if opts.apply_power != 1.0:
            mat.apply_power_(opts.apply_power)
Exemple #10
0
def main():
    """Decode an evaluation set with an LSTM acoustic model into lattices.

    Steps, in order:
      1. Parse command-line options and load the YAML config.
      2. Build a ``SpeechDataset`` / ``SeqDataloader`` in test-only mode,
         optionally with a pickled feature transform.
      3. Build the ``LSTMStack`` + ``NnetAM`` model and load the checkpoint,
         stripping a ``module.`` prefix from parameter names when present.
      4. Verify that ``HCLG.fst``, ``words.txt`` and the transition model
         exist; exit with status 1 otherwise.
      5. Read the prior from ``-prior_path`` and build a normalized
         log-prior, then configure a ``MappedLatticeFasterRecognizer``.
      6. For every utterance, subtract the log-prior from the model output,
         decode it, print the text and per-frame likelihood, and write the
         lattice to ``ark:<out_file>``.

    Raises:
        FileNotFoundError: If ``-model_path`` is not an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    # Command-line values override / extend whatever the config file holds.
    config["sweep_size"] = args.sweep_size
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only pass -transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use one device for the model, the checkpoint and the inputs.  This
    # replaces hard-coded .cuda() / 'cuda:0' calls and adds a CPU fallback.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    # Explicit check instead of `assert`, which is stripped under -O.
    if not os.path.isfile(args.model_path):
        raise FileNotFoundError(
            "ERROR: model file {} does not exist!".format(args.model_path))

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    prefix = "module."  # added to parameter names by DataParallel
    # Decide per key, so an empty state dict or a mixed one cannot trip on
    # a variable that only reflects the last key seen.
    new_state_dict = OrderedDict(
        (k[len(prefix):] if k.startswith(prefix) else k, v)
        for k, v in state_dict.items())
    model.load_state_dict(new_state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    hclg_path = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    # Missing decoding resources are errors, so exit with a non-zero status.
    # Exiting with 0 would tell the calling shell that decoding succeeded.
    if not os.path.isfile(hclg_path):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' %
                         (hclg_path))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    # Only the existence of the transition model is checked here.  The path
    # itself is handed to `from_files` below; the object that used to be
    # read at this point was never used.
    if not os.path.isfile(args.trans_model):
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_config = config["decoder_config"]
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = decoder_config["beam"]
    decoder_opts.lattice_beam = decoder_config["lattice_beam"]
    decoder_opts.max_active = decoder_config["max_active"]
    acoustic_scale = decoder_config["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        hclg_path,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad(), kaldi_util.table.CompactLatticeWriter(
            "ark:" + args.out_file) as lat_out:
        for data in test_dataloader:
            num_frs = data["num_frs"]
            utt_ids = data["utt_ids"]

            x = data["x"].to(th.float32).to(device)
            prediction = model(x)

            for j, n_frames in enumerate(num_frs):
                # Keep only the unpadded frames of utterance j, on the CPU.
                loglikes_j = prediction[j, :, :].data.cpu()[:n_frames, :]
                loglikes_j = loglikes_j - log_prior

                decoder_out = asr_decoder.decode(
                    kaldi_matrix.Matrix(loglikes_j.numpy()))

                key = utt_ids[j][0]
                print(key, decoder_out["text"])

                print("Log-like per-frame for utterance {} is {}".format(
                    key, decoder_out["likelihood"] / n_frames))

                # save lattice
                lat_out[key] = decoder_out["lattice"]