Example 1
import argparse

import numpy as np
from kaldi.util.table import SequentialWaveReader
from kaldi.matrix import _matrix_ext

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='wav.scp to byte files, i.e., '
                                                 'each line: uttid num_bytes')
    parser.add_argument('wav_rspecifier', type=str,
                        help='input wav.scp filename')
    parser.add_argument('byte_file', type=str,
                        help='output byte file, each line: uttid num_bytes')
    args, unk = parser.parse_known_args()
   
    wav_reader = SequentialWaveReader(args.wav_rspecifier)
    with open(args.byte_file, 'w') as bf:
        for uttid, wave in wav_reader:
            wave_data = _matrix_ext.matrix_to_numpy(wave.data())
            #input must be single-channel audio
            assert wave_data.shape[0] == 1
            #2 bytes per int16 sample
            bf.write('{} {}\n'.format(uttid, 2 * len(wave_data[0])))
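
If the script above is saved as, say, wav_to_bytes.py (a hypothetical name), it would be invoked with a Kaldi rspecifier such as scp:wav.scp plus an output path. A minimal sketch of reading the resulting byte file back, following the 'uttid num_bytes' line format written above; the file name byte_lens.txt is only an illustration:

#load the byte file produced by the script above into a dict
num_bytes = {}
with open('byte_lens.txt', 'r') as bf:
    for line in bf:
        uttid, nbytes = line.split()
        num_bytes[uttid] = int(nbytes)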
Example 2
def otf_utt_generator(data_triplets, rir, noise, args):
    """
    Generates padded training batches on the fly, applying speed and volume
    perturbation to each utterance.

    Args:
        data_triplets: list of (mrk file, seq file, label ark rspecifier) triplets
        rir: list of room impulse responses, List[AudioSegment]
        noise: list of noise segments, List[AudioSegment]
        args: arguments for the loader
    """
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)

    batch_idx = 0
    valid_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1

    #rates for speed perturbation
    speed_rate = [float(rate) for rate in args.speed_rate.split(',')]
    #volume level perturbation
    gain_lo, gain_hi = [-float(gain) for gain in args.gain_range.split(',')]
    #snr range for noise perturbation: 0-20 dB with mean of 10
    #mu, sigma = 10, 10
    #lo, hi = (0 - mu) / sigma, (20 - mu) / sigma
    #Fbank config
    po = ParseOptions('')
    fbank_opt = FbankOptions()
    fbank_opt.register(po)
    #fbank_opt = MfccOptions()
    #fbank_opt.register(po)
    po.read_config_file(args.feat_config)
    fbank = Fbank(fbank_opt)
    #fbank = Mfcc(fbank_opt)

    for data_triplet in data_triplets:
        mrk_fn, seq_fn = data_triplet[0], data_triplet[1]
        ali_rspec = data_triplet[2]
        with open(mrk_fn, 'r', encoding='utf-8') as mrk,\
             open(seq_fn, 'rb') as seq:
            ali_reader = SequentialIntVectorReader(ali_rspec)
            for line, (uttid1, ali) in zip(mrk, ali_reader):
                uttid = line.split()[0]
                assert uttid == uttid1
                seq.seek(int(line.split()[1]))
                num_bytes = int(line.split()[2])
                num_bytes -= num_bytes % 2
                audio_bytes = seq.read(num_bytes)
                audio_np = np.frombuffer(audio_bytes, dtype='int16')
                #data augmentation function goes here
                audio_seg = AudioSegment(audio_np, args.sample_rate)
                #speed perturbation
                spr = speed_rate[randint(0, len(speed_rate) - 1)]
                audio_seg.change_speed(spr)
                audio_seg.normalize(np.random.uniform(gain_lo, gain_hi))
                #noise adding example:
                #snr = truncnorm.rvs(lo, hi, scale=sigma, loc=mu, size=1)
                #audio_seg.add_noise(noise[randint(0, len(noise)-1)], snr)
                #rir adding example:
                #audio_seg.convolve_and_normalize(rir[randint(0, len(rir)-1)])
                audio_np = audio_seg._convert_samples_from_float32(
                    audio_seg.samples, 'int16')
                wave_1ch = Vector(audio_np)
                feats = fbank.compute_features(wave_1ch,
                                               args.sample_rate,
                                               vtnl_warp=1.0)
                ali = np.array(ali)
                if args.reverse_labels:
                    ali = ali[::-1]
                if args.SOS >= 0:
                    ali = np.concatenate(([args.SOS], ali))
                if args.EOS >= 0:
                    ali = np.concatenate((ali, [args.EOS]))
                feats = _matrix_ext.matrix_to_numpy(feats)
                utt_len = feats.shape[0] // args.stride + \
                          int(feats.shape[0] % args.stride != 0)
                #limit on the T*U product imposed by the RNN-T loss;
                #this is still a fairly ad hoc heuristic
                if ali.shape[0] * utt_len // 3 <= args.TU_limit:
                    ali_len[valid_idx] = ali.shape[0]
                    data_buffer[valid_idx, :utt_len, :] = \
                        splice(feats, args.lctx, args.rctx)[::args.stride]
                    target_buffer[valid_idx, :ali_len[valid_idx]] = ali
                    len_buffer[valid_idx] = utt_len
                    if utt_len > batch_max_len:
                        batch_max_len = utt_len
                    if ali_len[valid_idx] > target_max_len:
                        target_max_len = ali_len[valid_idx]
                    valid_idx += 1

                batch_idx += 1

                if batch_idx == batch_size:
                    for b in range(valid_idx):
                        utt_len = len_buffer[b]
                        target_len = ali_len[b]
                        #data and target padding
                        if utt_len > 0:
                            data_buffer[b, utt_len:batch_max_len, :] = \
                                data_buffer[b, utt_len-1, :]
                            target_buffer[b, target_len:target_max_len] = \
                                args.padding_tgt

                    data = data_buffer[:valid_idx, :batch_max_len, :]
                    target = target_buffer[:valid_idx, :target_max_len]

                    if not args.batch_first:
                        data = np.transpose(data, (1, 0, 2))
                        target = np.transpose(target, (1, 0))

                    data = torch.from_numpy(np.copy(data))
                    target = torch.from_numpy(np.copy(target))
                    lens = torch.from_numpy(np.copy(len_buffer[:valid_idx]))
                    ali_lens = torch.from_numpy(np.copy(ali_len[:valid_idx]))

                    if valid_idx > 0:
                        #not doing cuda() here, in main process instead
                        yield data, target, lens, ali_lens
                    else:
                        yield None, None, \
                              torch.IntTensor([0]), torch.IntTensor([0])

                    batch_idx = 0
                    valid_idx = 0
                    target_len = 0
                    batch_max_len = -1
                    target_max_len = -1

            ali_reader.close()

    yield None
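
A minimal sketch of how this generator might be driven from a training loop; the data_triplets, rir, noise and args objects, as well as the loss computation, are assumed to be defined elsewhere and are not part of the original code:

#consume otf_utt_generator: skip rejected batches, stop at the end-of-data sentinel
for batch in otf_utt_generator(data_triplets, rir, noise, args):
    if batch is None:                 #the final bare 'yield None' marks the end of the data
        break
    data, target, lens, ali_lens = batch
    if data is None:                  #every utterance in this batch hit the T*U limit
        continue
    data, target = data.cuda(), target.cuda()   #.cuda() is done here, in the main process
    #...forward pass and RNN-T style loss would go here...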
Example 3
def ctc_utt_generator(align_rspec, feats_rspec, shuffle, args):
    """
    we do not really need 'target' generated
    in MMI/sMBR training from this generator
    so the interface is adjusted to fullfill
    warp_ctc for CTC training, target is now
    a tuple of (label, label_size).
    """
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size

    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size * max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    start_flag = torch.IntTensor([1] * batch_size)

    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)

    batch_idx = 0
    target_len = 0
    batch_max_len = -1

    #NOTE: the feature and alignment streams must be
    #in exactly the same order
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        #in CTC training, the alignment is shorter than the input
        utt_len = feats.shape[0] // args.stride + \
                  int(feats.shape[0] % args.stride != 0)
        assert ali.shape[0] <= utt_len

        ali_len[batch_idx] = ali.shape[0]
        data_buffer[batch_idx, :utt_len, :] = splice(feats, args.lctx,
                                                     args.rctx)[::args.stride]
        target_buffer[target_len:target_len + ali_len[batch_idx]] = ali
        target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len

        if utt_len > batch_max_len:
            batch_max_len = utt_len

        batch_idx += 1

        if batch_idx == batch_size:
            for b in range(batch_size):
                utt_len = len_buffer[b]
                data_buffer[b, utt_len:batch_max_len, :] = 0
                #target_buffer[b, ali_len[b]:batch_max_len]  = -1

            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:target_len]

            if not args.batch_first:
                data = np.transpose(data, (1, 0, 2))
                #target = np.transpose(target, (1, 0))

            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)

            data = torch.from_numpy(data)
            target = torch.from_numpy(target)

            if args.cuda:
                #only the data moves to the GPU; warp_ctc consumes the labels on the CPU
                data = data.cuda(args.local_rank)

            yield Variable(data), (Variable(target),
                                   ali_lens), lens, start_flag

            batch_idx = 0
            target_len = 0
            batch_max_len = -1

    yield None
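
A sketch of unpacking what ctc_utt_generator yields; target arrives as a (label, label_size) tuple with the labels already flattened into a single 1-D IntTensor, which is the layout warp_ctc expects. The loss call itself is omitted because its exact binding is not shown in the original code:

#consume ctc_utt_generator: the trailing bare 'yield None' ends the stream
for batch in ctc_utt_generator(align_rspec, feats_rspec, False, args):
    if batch is None:
        break
    data, (labels, label_lens), lens, start_flag = batch
    #data:   (T, B, D) when batch_first is False, already on the GPU if args.cuda
    #labels: 1-D IntTensor of concatenated per-utterance label sequences, kept on the CPU
    #label_lens, lens: per-utterance label counts / frame counts (numpy arrays)
    #...CTC loss computation would go here...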
Example 4
            mrk_fn = line.split()[0]
            seq_fn = line.split()[1]
            with open(mrk_fn, 'r', encoding='utf-8') as mrk, \
                 open(seq_fn, 'rb') as seq:
                for mrk_line in mrk:
                    seq.seek(int(mrk_line.split()[1]))
                    num_bytes = int(mrk_line.split()[2])
                    #make sure we read an even number of bytes (whole int16 samples)
                    num_bytes -= num_bytes % 2
                    audio_bytes = seq.read(num_bytes)
                    audio_np = np.frombuffer(audio_bytes, dtype='int16')
                    audio_seg = AudioSegment(audio_np, args.sample_rate)
                    spr = speed_rate[randint(0, len(speed_rate) - 1)]
                    audio_seg.change_speed(spr)
                    #random gain between -55 and -10 dB
                    audio_seg.normalize(np.random.uniform(-55, -10))
                    audio_np = audio_seg._convert_samples_from_float32(
                        audio_seg.samples, 'int16')
                    wave_1ch = Vector(audio_np)
                    feats = fbank.compute_features(wave_1ch,
                                                   args.sample_rate,
                                                   vtnl_warp=1.0)
                    if args.cmn:
                        feats = _matrix_ext.matrix_to_numpy(feats)
                        feats -= np.mean(feats, axis=0)
                        feats = Matrix(feats)

                    cmvn.accumulate(feats)

    cmvn.write_stats(args.cmvn_stats, binary=False)
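
This example is a fragment from a CMVN-statistics accumulation script; the fbank and cmvn objects it uses are created outside the excerpt. A sketch of the missing setup, mirroring the Fbank configuration from Example 2; the kaldi.transform.cmvn import and the Cmvn constructor argument are assumptions about the PyKaldi API and should be checked against the installed version:

from kaldi.feat.fbank import Fbank, FbankOptions
from kaldi.util.options import ParseOptions
from kaldi.transform.cmvn import Cmvn   #assumed location of the CMVN accumulator

po = ParseOptions('')
fbank_opt = FbankOptions()
fbank_opt.register(po)
po.read_config_file(args.feat_config)   #args is assumed to be parsed elsewhere
fbank = Fbank(fbank_opt)
cmvn = Cmvn(dim=fbank_opt.mel_opts.num_bins)   #assumed constructor; dim = fbank feature dimension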
Example 5
def utt_generator(align_rspec, feats_rspec, shuffle, args):
    """
    Args:
        align_rspec: kaldi style read rspecifier for alignment
        feats_rspec: kaldi style read rspecifier for features
        shuffle: deprecated
        args: arguments
    """
    ali_reader = SequentialIntVectorReader(align_rspec)
    feats_reader = SequentialMatrixReader(feats_rspec)
    max_len = args.max_len
    batch_size = args.batch_size
    data_buffer = np.zeros((batch_size, max_len, get_inputdim(args)),
                           dtype=np.float32)
    target_buffer = np.zeros((batch_size, max_len), dtype=np.int32)
    len_buffer = np.zeros(batch_size, dtype=np.int32)
    ali_len = np.zeros(batch_size, dtype=np.int32)
    start_flag = torch.IntTensor([1] * batch_size)

    if args.cuda:
        start_flag = start_flag.cuda(args.local_rank)

    batch_idx = 0
    target_len = 0
    batch_max_len = -1
    target_max_len = -1
    for (uttid, ali), (uttid2, feats) in zip(ali_reader, feats_reader):
        assert uttid2 == uttid
        ali = np.array(ali)
        feats = _matrix_ext.matrix_to_numpy(feats)
        utt_len = feats.shape[0] // args.stride + int(
            feats.shape[0] % args.stride != 0)
        #ali/targets should be shorter
        #assert ali.shape[0] <= utt_len
        ali_len[batch_idx] = ali.shape[0]
        data_buffer[batch_idx, :utt_len, :] = \
            splice(feats, args.lctx, args.rctx)[::args.stride]
        target_buffer[batch_idx, :ali_len[batch_idx]] = ali
        #target_len += ali_len[batch_idx]
        len_buffer[batch_idx] = utt_len

        if utt_len > batch_max_len:
            batch_max_len = utt_len

        if ali_len[batch_idx] > target_max_len:
            target_max_len = ali_len[batch_idx]

        batch_idx += 1

        if batch_idx == batch_size:
            for b in range(batch_size):
                utt_len = len_buffer[b]
                target_len = ali_len[b]
                #data and target padding
                data_buffer[b, utt_len:batch_max_len, :] = \
                    data_buffer[b, utt_len-1, :]
                target_buffer[b, target_len:target_max_len] = args.padding_tgt

            data = data_buffer[:, :batch_max_len, :]
            target = target_buffer[:, :target_max_len]

            if not args.batch_first:
                data = np.transpose(data, (1, 0, 2))
                target = np.transpose(target, (1, 0))

            data = np.copy(data)
            target = np.copy(target)
            lens = np.copy(len_buffer)
            ali_lens = np.copy(ali_len)

            data = torch.from_numpy(data)
            target = torch.from_numpy(target).long()

            if args.cuda:
                data = data.cuda(args.local_rank)
                target = target.cuda(args.local_rank)
            yield Variable(data), Variable(target), lens, ali_lens

            batch_idx = 0
            target_len = 0
            batch_max_len = -1
            target_max_len = -1

    yield None
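
Since the targets are padded with args.padding_tgt, a consumer can mask those positions out of the loss. A minimal sketch under the assumption that a frame-level cross-entropy criterion is appropriate and that a model object is defined elsewhere:

import torch.nn as nn

#ignore_index makes the padded target positions contribute nothing to the loss
criterion = nn.CrossEntropyLoss(ignore_index=args.padding_tgt)

for batch in utt_generator(align_rspec, feats_rspec, False, args):
    if batch is None:
        break
    data, target, lens, ali_lens = batch
    logits = model(data)   #hypothetical model; output (T, B, C) when batch_first is False
    loss = criterion(logits.reshape(-1, logits.size(-1)), target.reshape(-1))
    #...backward pass and optimizer step would go here...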