Exemple #1
0
    def __init__(self, idx_to_char, params=None):
        """Load the decoding resources described by ``params``.

        Args:
            idx_to_char: Stored on the instance and forwarded to
                ``create_phone_map`` together with the phones file path.
            params: Configuration mapping. ``phones_path``, ``words_path``,
                ``mdl_path`` and ``fst_path`` are looked up unconditionally.
                Optional entries and their defaults: ``acoustic`` (1.2),
                ``beam`` (13) and ``alphaweight`` (0.3).
        """
        # Use a fresh dict per call rather than one shared mutable default.
        if params is None:
            params = {}

        self.idx_to_char = idx_to_char

        self.reorder_1, self.reorder_2 = create_phone_map(
            params['phones_path'], idx_to_char)
        self.word_syms = SymbolTable.read_text(params['words_path'])

        self.acoustic_scale = params.get('acoustic', 1.2)
        if self.acoustic_scale < 0:
            print("Warning: acoustic scale is less than 0")
        self.alphaweight = params.get('alphaweight', 0.3)

        # Read the transition model from the model file.
        trans_model = TransitionModel()
        with xopen(params['mdl_path']) as ki:
            trans_model.read(ki.stream(), ki.binary)

        decoder_opts = FasterDecoderOptions()
        decoder_opts.beam = params.get('beam', 13)

        decode_fst = read_fst_kaldi(params['fst_path'])

        self.decoder_opts = decoder_opts
        self.trans_model = trans_model
        self.decode_fst = decode_fst

        # Language-model statistics bookkeeping, initially empty.
        self.stats = LMStats()
        self.stats_state = None
        self.add_stats_phase = True
Exemple #2
0
def read_trans_model(model_path: str) -> hmm.TransitionModel:
    """Load the transition model stored at the start of a .mdl file.

    Args:
        model_path: Location of the .mdl file.

    Returns:
        Whatever ``TransitionModel.read`` returns for that file.
    """
    with xopen(model_path) as ki:
        model = hmm.TransitionModel()
        return model.read(ki.stream(), ki.binary)
def read_sparse_mat(sparse_mat_dir: str) -> SparseMatrix:
    """Load a sparse matrix from a file.

    Args:
        sparse_mat_dir: Location of the sparse matrix file.

    Returns:
        The ``SparseMatrix`` populated from the file.
    """
    sparse = SparseMatrix()
    with xopen(sparse_mat_dir, 'r') as ki:
        sparse.read_(ki.stream(), ki.binary)
    return sparse
Exemple #4
0
 def from_kaldi(cls, filename, device):
     """Build a ``Gmm`` from a Kaldi full-covariance GMM file.

     Args:
         filename: File read into a ``KaldiFullGmm``.
         device: Forwarded to the ``Gmm`` constructor.

     Returns:
         ``Gmm`` built from the means, covariances and weights of the file.
     """
     kaldi_gmm = KaldiFullGmm()
     with kio.xopen(filename) as ki:
         kaldi_gmm.read(ki.stream(), ki.binary)

     weights = torch.from_numpy(kaldi_gmm.weights().numpy())
     means = torch.from_numpy(kaldi_gmm.get_means().numpy())

     num_gauss = weights.numel()
     dim = means.size(1)

     # Covariances are assembled on the CPU, one matrix per component.
     covariances = torch.zeros([num_gauss, dim, dim],
                               device='cpu',
                               dtype=torch.float32)
     for i, covar in enumerate(kaldi_gmm.get_covars()):
         dense = KaldiMatrix(covar).numpy()
         covariances[i, :, :] = torch.from_numpy(dense)

     return Gmm(means, covariances, weights, device=device)
Exemple #5
0
def read_nnet3_model(model_path: str) -> nnet3.Nnet:
    """Load an nnet3 network stored in raw format.

    According to the original author's note, a model that is not in raw
    format also happens to load because of Kaldi internals, but that is not
    an official feature.

    Args:
        model_path: Location of a raw nnet3 model, e.g., "data/final.raw"

    Returns:
        The ``nnet3.Nnet`` populated from the file.
    """
    network = nnet3.Nnet()
    with xopen(model_path) as ki:
        stream = ki.stream()
        network.read(stream, ki.binary)
    return network
Exemple #6
0
def get_diag_gmm_params(file_diag, out_dir):
    """Dump the parameters of a diagonal GMM to text files and return them.

    Args:
        file_diag: Path of the model file to read (the original comment
            mentions .mdl or .ubm files).
        out_dir: Prefix prepended verbatim to every output file name. It is
            concatenated rather than joined, so include a trailing path
            separator when it names a directory.

    Returns:
        ``(variances, means, weights, num_gauss)``, or ``None`` when
        ``file_diag`` is not an existing file.
    """
    if not os.path.isfile(file_diag):
        print("File {0} does not exist!".format(file_diag))
        return None

    file_name = os.path.basename(file_diag)
    print("Getting GMM stats from", file_name)

    gmm = DiagGmm()
    # The context manager closes the input stream once the model is read.
    with io.xopen(file_diag) as diag_mdl:
        gmm.read(diag_mdl.stream(), diag_mdl.binary)

    variances = np.asanyarray(gmm.get_vars())
    means = np.asanyarray(gmm.get_means())
    weights = np.asanyarray(gmm.weights())  # priors

    np.savetxt(out_dir + '{}_variances.dubm'.format(file_name), variances)
    np.savetxt(out_dir + '{}_means.dubm'.format(file_name), means)
    np.savetxt(out_dir + '{}_weights.dubm'.format(file_name), weights)
    print("Vars, means and weights saved to:", out_dir)

    return variances, means, weights, gmm.num_gauss()
def get_utterance_pairs(reco2file_and_channel_rxfilename):
    """Group the utterances of a reco2file_and_channel file by call.

    Every line must hold exactly three whitespace-separated fields:
    utterance, call and a third field that is ignored.

    Args:
        reco2file_and_channel_rxfilename: File opened with ``xopen`` in text
            mode.

    Returns:
        A list of lists. Calls with exactly two utterances contribute one
        two-element list; utterances of any other call are added as
        single-element lists, with a note printed to stderr.

    Raises:
        ValueError: If a line does not have exactly three fields.
    """
    call_to_uttlist = defaultdict(list)
    for line in xopen(reco2file_and_channel_rxfilename, "rt"):
        fields = line.split()  # lines like: sw02001-A sw02001 A
        # Checking the count explicitly avoids a bare ``except`` that would
        # also swallow unrelated errors such as KeyboardInterrupt.
        if len(fields) != 3:
            filename = printable_rxfilename(reco2file_and_channel_rxfilename)
            raise ValueError("Expecting 3 fields per line of "
                             "reco2file_and_channel file {}, got: {}".format(
                                 filename, len(fields)))
        utt, call, _ = fields
        call_to_uttlist[call].append(utt)

    utt_pairs = []
    for key, uttlist in call_to_uttlist.items():
        if len(uttlist) == 2:
            utt_pairs.append(uttlist)
        else:
            print("Call {} has {} utterances, expected two; treating them "
                  "singly.".format(key, len(uttlist)),
                  file=sys.stderr)
            utt_pairs.extend([x] for x in uttlist)
    return utt_pairs
Exemple #8
0
def gmm_decode_faster(model_rxfilename, fst_rxfilename,
                      feature_rspecifier, words_wspecifier,
                      alignment_wspecifier="", lattice_wspecifier="",
                      word_symbol_table="", acoustic_scale=0.1,
                      allow_partial=True, decoder_opts=None):
    """Decode features with a GMM acoustic model and ``FasterDecoder``.

    Args:
        model_rxfilename: Model file; a ``TransitionModel`` and then an
            ``AmDiagGmm`` are read from the same stream.
        fst_rxfilename: Decoding graph, read with ``read_fst_kaldi``.
        feature_rspecifier: Passed to ``SequentialMatrixReader``.
        words_wspecifier: Passed to ``IntVectorWriter``; receives the word
            sequence of every decoded utterance.
        alignment_wspecifier: Passed to ``IntVectorWriter``; alignments are
            written only if that writer reports it is open.
        lattice_wspecifier: Passed to ``CompactLatticeWriter``; the best
            path is written only if that writer reports it is open.
        word_symbol_table: If non-empty, read with ``SymbolTable.read_text``
            and used to print decoded words to stderr.
        acoustic_scale: Passed to ``DecodableAmDiagGmmScaled``; when
            non-zero its inverse is applied to the best path before it is
            written as a lattice.
        allow_partial: If false, an utterance whose decoding did not reach a
            final state is counted as a failure.
        decoder_opts: Options for ``FasterDecoder``. A new
            ``FasterDecoderOptions()`` is created when omitted.

    Returns:
        True if at least one utterance was decoded successfully.

    Raises:
        RuntimeError: If the word symbol table could not be read.
    """
    # Create default options per call instead of sharing a single instance
    # that was built when the function was defined.
    if decoder_opts is None:
        decoder_opts = FasterDecoderOptions()

    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    # Read symbol table.
    word_syms = None
    if word_symbol_table != "":
        word_syms = SymbolTable.read_text(word_symbol_table)
        if not word_syms:
            raise RuntimeError("Could not read symbol table from file {}"
                               .format(word_symbol_table))

    # NOTE:
    # It is important to read decode_fst after opening feature reader as
    # it can prevent crashes on systems without enough virtual memory.

    # Read decoding graph and instantiate decoder.
    decode_fst = read_fst_kaldi(fst_rxfilename)
    decoder = FasterDecoder(decode_fst, decoder_opts)

    tot_like = 0.0
    frame_count = 0
    num_success, num_fail = 0, 0
    start = time.time()

    for key, features in feature_reader:
        if features.num_rows == 0:
            num_fail += 1
            logging.warning("Zero-length utterance: {}".format(key))
            continue

        gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                 features, acoustic_scale)
        decoder.decode(gmm_decodable)

        if not (allow_partial or decoder.reached_final()):
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        try:
            best_path = decoder.get_best_path()
        except RuntimeError:
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        if not decoder.reached_final():
            logging.warning("Decoder did not reach end-state, outputting "
                            "partial traceback since --allow-partial=true")

        ali, words, weight = get_linear_symbol_sequence(best_path)

        words_writer[key] = words

        if alignment_writer.is_open():
            alignment_writer[key] = ali

        if clat_writer.is_open():
            if acoustic_scale != 0.0:
                scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                scale_lattice(scale, best_path)
            best_path = convert_lattice_to_compact_lattice(best_path)
            clat_writer[key] = best_path

        if word_syms:
            syms = convert_indices_to_symbols(word_syms, words)
            print(key, " ".join(syms), file=sys.stderr)

        num_success += 1
        frame_count += features.num_rows
        like = -(weight.value1 + weight.value2)
        tot_like += like
        logging.info("Log-like per frame for utterance {} is {} over {} "
                     "frames.".format(key, like / features.num_rows,
                                      features.num_rows))
        logging.debug("Cost for utterance {} is {} + {}"
                      .format(key, weight.value1, weight.value2))

    elapsed = time.time() - start
    # Per-frame figures are undefined when nothing was decoded; guarding
    # here avoids a ZeroDivisionError that would also skip the cleanup below.
    if frame_count > 0:
        logging.info("Time taken [excluding initialization] {}s: real-time "
                     "factor assuming 100 frames/sec is {}"
                     .format(elapsed, elapsed * 100 / frame_count))
    else:
        logging.info("Time taken [excluding initialization] {}s: no frames "
                     "were decoded".format(elapsed))
    logging.info("Done {} utterances, failed for {}"
                 .format(num_success, num_fail))
    if frame_count > 0:
        logging.info("Overall log-likelihood per frame is {} over {} frames."
                     .format(tot_like / frame_count, frame_count))

    feature_reader.close()
    words_writer.close()
    if alignment_writer.is_open():
        alignment_writer.close()
    if clat_writer.is_open():
        clat_writer.close()

    return num_success != 0
Exemple #9
0
def extract_segments(wav_rspecifier, segments_rxfilename, wav_wspecifier,
                     opts):
    """Cut segments out of recordings and write each one as a wave.

    Every line of the segments file has the form
    ``segment-name wav-name start-time end-time [channel]``. Malformed
    lines and segments with unusable ranges are logged and skipped.

    Args:
        wav_rspecifier: Passed to ``RandomAccessWaveReader``.
        segments_rxfilename: Segments file, opened with ``xopen`` in text
            mode.
        wav_wspecifier: Passed to ``WaveWriter``.
        opts: Object providing ``max_overshoot`` and ``min_segment_length``;
            both are multiplied by the sampling frequency before use.

    Raises:
        ValueError: If a recording has more than one channel and the
            segments line does not name a channel.
    """
    with RandomAccessWaveReader(wav_rspecifier) as reader, \
         WaveWriter(wav_wspecifier) as writer:
        num_success, num_lines = 0, 0
        for num_lines, line in enumerate(xopen(segments_rxfilename, "rt"), 1):
            # segments file format:
            #   segment-name wav-name start-time end-time [channel]
            fields = line.split()
            if len(fields) == 4:
                segment, recording, start, end = fields
                channel = None
            elif len(fields) == 5:
                segment, recording, start, end, channel = fields
            else:
                logging.warning(
                    "Invalid line in segments file: {}".format(line))
                continue

            try:
                start = float(start)
            except ValueError:
                logging.warning(
                    "Invalid line in segments file [bad start]: {}".format(
                        line))
                continue

            try:
                end = float(end)
            except ValueError:
                logging.warning(
                    "Invalid line in segments file [bad end]: {}".format(line))
                continue

            # An end time of -1 means "until the end of the recording".
            if ((start < 0 or (end != -1.0 and end <= 0))
                    or (start >= end and end > 0)):
                logging.warning("Invalid line in segments file [empty or "
                                "invalid segment]: {}".format(line))
                continue

            if channel is not None:
                try:
                    channel = int(channel)
                except ValueError:
                    logging.warning("Invalid line in segments file "
                                    "[bad channel]: {}".format(line))
                    continue

            if recording not in reader:
                logging.warning("Could not find recording {}, skipping "
                                "segment {}".format(recording, segment))
                continue

            wave = reader[recording]
            wave_data = wave.data()
            samp_freq = wave.samp_freq
            num_chan, num_samp = wave_data.shape

            # Convert starting time of the segment to corresponding sample
            # number. If end time is -1 then use the whole file starting
            # from start time.
            start_samp = start * samp_freq
            end_samp = end * samp_freq if end != -1 else num_samp
            assert start_samp >= 0 and end_samp > 0, "Invalid start or end."

            # start sample must be less than total number of samples,
            # otherwise skip the segment
            if start_samp < 0 or start_samp >= num_samp:
                logging.warning("Start sample out of range {} [length:] {}, "
                                "skipping segment {}".format(
                                    start_samp, num_samp, segment))
                continue

            # end sample must be less than total number samples
            # otherwise skip the segment
            if end_samp > num_samp:
                if end_samp >= num_samp + int(opts.max_overshoot * samp_freq):
                    logging.warning("End sample too far out of range {} "
                                    "[length:] {}, skipping segment {}".format(
                                        end_samp, num_samp, segment))
                    continue
                end_samp = num_samp  # for small differences, just truncate.

            # Skip if segment size is less than minimum segment length
            # (default 0.1s)
            min_samp = int(opts.min_segment_length * samp_freq)
            if end_samp <= start_samp + min_samp:
                logging.warning(
                    "Segment {} too short, skipping it!".format(segment))
                continue

            # A missing channel is only acceptable for single-channel
            # recordings; for multi-channel data it is an error. An
            # explicit channel that is out of range skips the segment.
            if channel is None:
                if num_chan == 1:
                    channel = 0
                else:
                    raise ValueError(
                        "If your data has multiple channels, you "
                        "must specify the channel in the segments "
                        "file. Processing segment {}".format(segment))
            else:
                if channel >= num_chan:
                    logging.warning(
                        "Invalid channel {} >= {}, skipping segment"
                        " {}".format(channel, num_chan, segment))
                    continue

            segment_matrix = SubMatrix(wave_data, channel, 1, int(start_samp),
                                       int(end_samp - start_samp))
            segment_wave = WaveData.new(samp_freq, segment_matrix)
            writer[segment] = segment_wave  # write segment in wave format
            num_success += 1

        logging.info("Succesfully processed {} lines out of {} in the "
                     "segments file".format(num_success, num_lines))
Exemple #10
0
# Delta feature options: second-order deltas with a window of 3.
delta_opts = DeltaFeaturesOptions()
delta_opts.order = 2
delta_opts.window = 3

# Energy-based VAD options.
vad_opts = VadEnergyOptions()
vad_opts.vad_energy_mean_scale = 0.5
vad_opts.vad_energy_threshold = 5.5

# Combine the extractor and the option objects into one feature pipeline.
feat_pipeline = make_feat_pipeline(mfcc, sliding_opts, vad_opts, delta_opts)

# Load the UBM and the i-vector extractor from fixed paths under
# app/extractor.
# NOTE(review): exit(1) raises SystemExit, which `except Exception` does not
# catch, so the missing-file exits below bypass the handler.
try:
    LOG.info('Loading ubm...')
    if not os.path.exists('app/extractor/final.ubm'):
        LOG.error('Not Found extractor/final.ubm, please recheck file')
        exit(1)
    # Read the full-covariance GMM, then derive a diagonal GMM from it.
    with xopen('app/extractor/final.ubm') as ki:
        fgmm = FullGmm()
        fgmm.read(ki.stream(), ki.binary)
        gmm = DiagGmm()
        gmm.copy_from_full(fgmm)

    if not os.path.exists('app/extractor/final.ie'):
        LOG.error('Not Found app/extractor/final.ie, please recheck file')
        exit(1)

    # Read the i-vector extractor.
    with xopen('app/extractor/final.ie') as ki:
        extractor_ = IvectorExtractor()
        extractor_.read(ki.stream(), ki.binary)
        LOG.info('IvectorExtractor ready')

except Exception:
Exemple #11
0
        cmvn.accumulate(feats)
        cmvn.apply(feats)
        return compute_deltas(opts, feats)

    return feat_pipeline


# MFCC options: 44100 sampling frequency, downsampling allowed, energy
# not used.
mfcc_opts = MfccOptions()
mfcc_opts.frame_opts.samp_freq = 44100
mfcc_opts.frame_opts.allow_downsample = True
mfcc_opts.use_energy = False

feat_pipeline = make_feat_pipeline(Mfcc(mfcc_opts))

# Read the model: the transition model and then the acoustic model are read
# from the same stream, so the order of the two reads matters.
with xopen("models/mono/final.mdl") as ki:
    trans_model = TransitionModel().read(ki.stream(), ki.binary)
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a ``(features, acoustic_scale) -> decodable`` factory.

    The returned function closes over ``trans_model`` and ``acoustic_model``
    and builds a ``DecodableAmDiagGmmScaled`` on every call.
    """
    def decodable_wrapper(features, acoustic_scale):
        decodable = DecodableAmDiagGmmScaled(
            acoustic_model, trans_model, features, acoustic_scale)
        return decodable

    return decodable_wrapper


# Bind the transition and acoustic models read above into the factory.
decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)
Exemple #12
0
from kaldi.util.table import SequentialWaveReader


# Define the feature pipeline: (wav) -> feats
def make_feat_pipeline(base, opts=None):
    """Return a ``(wav) -> feats`` function that appends delta features.

    Args:
        base: Feature extractor exposing ``compute_features``.
        opts: Options passed to ``compute_deltas``. A new
            ``DeltaFeaturesOptions()`` is created when omitted, so pipelines
            do not share one default instance built at definition time.

    Returns:
        A function that takes a wave object and returns its features with
        deltas.
    """
    if opts is None:
        opts = DeltaFeaturesOptions()

    def feat_pipeline(wav):
        # Only the first row of the wave data is used.
        # NOTE(review): the trailing 1.0 is presumably the VTLN warp
        # factor; confirm against the extractor's API.
        feats = base.compute_features(wav.data()[0], wav.samp_freq, 1.0)
        return compute_deltas(opts, feats)

    return feat_pipeline


# Feature pipeline built on an MFCC extractor with default options.
feat_pipeline = make_feat_pipeline(Mfcc(MfccOptions()))

# Read the model: the transition model and then the acoustic model are read
# from the same stream, so the order of the two reads matters.
# NOTE(review): the path below is an absolute, machine-specific location.
with xopen("/home/dogan/tools/pykaldi/egs/models/wsj/final.mdl") as ki:
    trans_model = TransitionModel().read(ki.stream(), ki.binary)
    acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary)


# Define the decodable wrapper: (features, acoustic_scale) -> decodable
def make_decodable_wrapper(trans_model, acoustic_model):
    """Return a ``(features, acoustic_scale) -> decodable`` factory.

    The returned function closes over ``trans_model`` and ``acoustic_model``
    and builds a ``DecodableAmDiagGmmScaled`` on every call.
    """
    def decodable_wrapper(features, acoustic_scale):
        decodable = DecodableAmDiagGmmScaled(
            acoustic_model, trans_model, features, acoustic_scale)
        return decodable

    return decodable_wrapper


# Bind the transition and acoustic models read above into the factory.
decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model)