Example 1
def main():
    desc = 'Kaldi outputs token IDs as numbers. We can map them back to ' + \
            'textual form given an ID-to-text mapping. Will output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('fname', help='File to process. We expect each line ' + \
                        'to have tokens separated by whitespace, where ' + \
                        'the first token is a key or name (e.g. utt name) ' + \
                        'that can be skipped, and the rest are ID numbers.')
    parser.add_argument('id_map', help='Mapping from textual form to ID. ' + \
                        'We expect each line to have two tokens separated ' + \
                        'by whitespace, where the first token is the text ' + \
                        'and the second token is the ID number.')
    args = parser.parse_args()

    id_map = common.make_reverse_index(io.dict_read(args.id_map))
    # Check that mapping from number to text is 1-to-1
    for k in id_map.keys():
        if len(id_map[k]) != 1:
            raise ValueError('Mapping at {} not 1-1: {}'.format(k, id_map[k]))
        id_map[k] = id_map[k][0]

    with open(args.fname, 'r') as f:
        for line in f:
            ary = line.strip().split()
            for i in range(1, len(ary)):
                ary[i] = id_map[ary[i]]
            print ' '.join(ary)
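
The remapping in this example boils down to inverting the text-to-ID dictionary and substituting every token after the key. Below is a minimal sketch with a hypothetical pure-Python stand-in for common.make_reverse_index; its exact signature and behavior are assumptions inferred from the checks above.

# Hypothetical stand-in for common.make_reverse_index: invert a
# text -> ID dict into an ID -> [texts] dict (values may collide,
# hence the lists and the 1-to-1 check in the script above).
def make_reverse_index(d):
    rev = {}
    for text, idx in d.items():
        rev.setdefault(idx, []).append(text)
    return rev

text2id = {'a': '1', 'b': '2', 'c': '3'}    # toy id_map contents
id2text = make_reverse_index(text2id)       # {'1': ['a'], '2': ['b'], ...}

line = 'utt001 1 3 2'
ary = line.strip().split()
ary[1:] = [id2text[i][0] for i in ary[1:]]  # keep the key, map the IDs
print(' '.join(ary))                        # -> utt001 a c b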
Example 2
 def parse_args(self):
     parser = common.init_argparse(self._desc())
     self._register_custom_types(parser)
     self._main_args(parser)
     self._validation_args(parser)
     self._data_args(parser)
     self._model_args(parser)
     self._training_args(parser)
     self.args = parser.parse_args()
     self._check_args()
Example 3
def main():
    desc = 'Convert phone to word alignment. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_phones_with_length',
                        help='File containing phone alignment with length ' + \
                        '(generated with ali-to-phones --write-lengths=true)')
    parser.add_argument('text', help='Kaldi word-level transcript')
    parser.add_argument('phone_map', help='Mapping from text to phone ID. ' + \
                        'We expect each line to have two tokens separated ' + \
                        'by whitespace, where the first token is the phone ' + \
                        'and the second token is the ID number.')
    parser.add_argument('lexicon', help='Pronunciation lexicon')
    parser.add_argument('--sil-phones',
                        nargs='+',
                        default=[],
                        help='IDs of phones regarded as silence')
    parser.add_argument('--sil-label',
                        default='sil',
                        help='Label of silence phone/word to use in output')
    args = parser.parse_args()

    alis = ali_with_length_read(args.ali_phones_with_length,
                                ordered=True,
                                expand=False)
    io.log('Loaded {} alignments'.format(len(alis)))
    text = io.dict_read(args.text, lst=True)
    io.log('Loaded transcript containing {} utterances'.format(len(text)))
    phone2id = io.dict_read(args.phone_map)
    io.log('Loaded phone2id containing {} phones'.format(len(phone2id)))
    id2phone = {}
    # We normalize the phone name so that IDs of phone variants will map to
    # the primary phone. For example, IDs of sil, sil_B, sil_E, sil_I, sil_S
    # will all map to sil. The assumption here is that anything after and
    # including the '_' character is not part of the primary phone name.
    for phone in phone2id.keys():
        nphone = phone.split('_')[0]
        id2phone[phone2id[phone]] = nphone
    io.log('Total phones in id2phone: {}'.format(len(set(id2phone.values()))))
    lexicon = io.lexicon_read(args.lexicon)
    io.log('Loaded lexicon containing {} words'.format(len(lexicon)))
    sil_phones = set(args.sil_phones)
    io.log('sil_phones: {} ({}), sil_label: {}'.format(
        sil_phones, [id2phone[i] for i in sil_phones], args.sil_label))

    for key in alis:
        phone_tokens, length = get_phone_tokens(alis[key], id2phone,
                                                sil_phones)
        if len(phone_tokens) == 0:
            io.log('WARNING: {} - no non-silence tokens'.format(key))
            continue
        if key not in text:
            io.log('WARNING: {} not in text'.format(key))
            continue
        phone2word_ali(key, phone_tokens, text[key], lexicon, args.sil_label,
                       length)
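
A tiny illustration of the phone-name normalization described in the comment above, using a toy phone table (the suffixes follow Kaldi's word-position convention):

phone2id = {'sil': '1', 'sil_B': '2', 'ah_I': '7', 'ah_E': '8'}  # toy table
id2phone = {}
for phone in phone2id.keys():
    id2phone[phone2id[phone]] = phone.split('_')[0]  # strip position suffix
print(sorted(set(id2phone.values())))                # -> ['ah', 'sil']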
Example 4
def main():
    desc = 'Extract features with DNN. Output to Kaldi ark.'
    parser = common.init_argparse(desc)
    parser.add_argument('model_in', help='Model that can be read by load_dnn')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('ark_out', help='Output ark file')
    parser.add_argument('--output-layer', type=int, default=-2,
                        help='Layer to use for extracting features. ' + \
                             'Negative index can be used. For example, ' + \
                             '-1 means the last layer, and so on.')
    parser.add_argument('--context',
                        type=int,
                        default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    parser.add_argument('--chunk-size',
                        default='300m',
                        help='Chunk size for data buffering')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
            io.ivector_ark_read(args.ivectors, dtype=theano.config.floatX)
    dataset = init_dataset(args.feats_scp, args.context, args.padding,
                           ivectors)
    io.log('Initializing model')
    dnn = load_dnn(args.model_in)

    # Initializing shared_ds according to chunk_size
    num_items = get_num_items(args.chunk_size, theano.config.floatX)
    max_frames = num_items / dataset.get_dim()
    max_utt_frames = np.max(
        map(dataset.get_num_frames_by_utt_name, dataset.get_utt_names()))
    common.CHK_GE(max_frames, max_utt_frames)
    x = np.zeros((max_frames, dataset.get_dim()), dtype=theano.config.floatX)
    io.log('...getting extraction function')
    extract_fn = dnn.build_extract_feat_function(args.output_layer)
    io.log('Got it!')

    io.log('** Begin outputting to {} **'.format(args.ark_out))
    ark_out = KaldiWriteOut(args.ark_out)
    utt_names, utt_frames, total_frames = [], [], 0
    for utt in dataset.get_utt_names():
        frames = dataset.get_num_frames_by_utt_name(utt)
        if total_frames + frames > max_frames:
            __extract(extract_fn, ark_out, dataset, x, utt_names, utt_frames)
            utt_names, utt_frames, total_frames = [], [], 0
        utt_names.append(utt)
        utt_frames.append(frames)
        total_frames += frames
    __extract(extract_fn, ark_out, dataset, x, utt_names, utt_frames)
    ark_out.close()
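
The chunk-size handling above relies on get_num_items, which is not shown. The sketch below is a hypothetical version of what it might do: parse a size string such as '300m' into a number of array elements for the given dtype. Both the helper and the feature dimension here are assumptions for illustration only.

import numpy as np

def get_num_items(chunk_size, dtype):
    # Hypothetical parser: '300m' -> 300 * 2**20 bytes -> number of items.
    units = {'k': 2 ** 10, 'm': 2 ** 20, 'g': 2 ** 30}
    suffix = chunk_size[-1].lower()
    if suffix in units:
        nbytes = int(chunk_size[:-1]) * units[suffix]
    else:
        nbytes = int(chunk_size)
    return nbytes // np.dtype(dtype).itemsize

num_items = get_num_items('300m', 'float32')  # ~78.6M float32 items
dim = 440                                     # e.g. spliced feature dim
max_frames = num_items // dim                 # frames that fit in one chunk
print(max_frames)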
Example 5
def main():
    desc = 'Convert from alignment with length to regular alignments. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_with_length', help='Alignment with lengths')
    args = parser.parse_args()

    ali = ali_with_length_read(args.ali_with_length, ordered=True, expand=True)
    io.log('Read {} alignments with lengths'.format(len(ali)))

    for key in ali:
        print '{} {}'.format(key, ' '.join(ali[key]))
Example 6
def main():
    desc = 'Convert from speaker i-vectors to utt i-vectors. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('spk_ivectors', help='File containing spk i-vectors.')
    parser.add_argument('utt2spk', help='Kaldi utt2spk mapping.')
    args = parser.parse_args()

    spk_ivectors = ivector_ark_read(args.spk_ivectors)
    utt2spk = io.dict_read(args.utt2spk, ordered=True)
    spk2utt = common.make_reverse_index(utt2spk, ordered=True)

    wrote = 0
    for spk in spk2utt.keys():
        for utt in spk2utt[spk]:
            print_vector(utt, spk_ivectors[spk])
            wrote += 1
    io.log('Wrote {} utt i-vectors for {} spks'.format(wrote, len(spk2utt)))
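
The fan-out itself is simple: every utterance inherits its speaker's i-vector. A minimal sketch with toy data, using hypothetical plain-dict stand-ins for the ark and utt2spk inputs:

spk_ivectors = {'spk1': [0.1, 0.2], 'spk2': [0.3, 0.4]}   # toy spk i-vectors
utt2spk = {'utt1': 'spk1', 'utt2': 'spk1', 'utt3': 'spk2'}

spk2utt = {}                          # reverse index, like make_reverse_index
for utt, spk in utt2spk.items():
    spk2utt.setdefault(spk, []).append(utt)

for spk, utts in spk2utt.items():
    for utt in utts:
        print('{} {}'.format(utt, ' '.join(str(v) for v in spk_ivectors[spk])))
# -> one line per utt, e.g. "utt1 0.1 0.2", "utt2 0.1 0.2", "utt3 0.3 0.4"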
Example 7
def main():
    desc = 'Reads in a pdf alignment and outputs prior counts to disk.'
    parser = common.init_argparse(desc)
    parser.add_argument('alipdf', help='pdf alignment file.')
    parser.add_argument('output_fname', help='File to output prior counts to')
    parser.add_argument('--num-pdfs', type=int, help='Number of pdfs. ' + \
                        'If not set, use max value in `alipdf`.')
    args = parser.parse_args()

    alipdf = io.dict_read(args.alipdf)
    pdfs = []
    for utt in alipdf.keys():
        pdfs.extend(numpy.asarray(alipdf[utt], dtype=numpy.int))
    bins = numpy.bincount(pdfs, minlength=args.num_pdfs)

    fw = open(args.output_fname, 'w')
    fw.write('[ {} ]\n'.format(' '.join(numpy.asarray(bins, dtype=numpy.str))))
    fw.close()
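
A toy run of the counting step, assuming io.dict_read yields utterance names mapped to lists of pdf-ID strings (an assumption based on how alipdf is used above):

import numpy

alipdf = {'utt1': ['0', '2', '2'], 'utt2': ['1', '1'], 'utt3': ['2']}
pdfs = []
for utt in alipdf.keys():
    pdfs.extend(numpy.asarray(alipdf[utt], dtype=int))
bins = numpy.bincount(pdfs, minlength=5)                # pad to 5 pdfs
print('[ {} ]'.format(' '.join(str(b) for b in bins)))  # -> [ 1 2 3 0 0 ]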
Example 8
def main():
    desc = 'Convert from one mapping to another. Will output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('fname', help='File to process. We expect each line ' + \
                        'to have tokens separated by whitespace, where ' + \
                        'the first token is a key or name (e.g. utt name) ' + \
                        'that can be skipped, and the rest are values.')
    parser.add_argument('id_map', help='Mapping from one ID to another ID. ' + \
                        'Each line has two tokens separated by whitespace.')
    args = parser.parse_args()

    id_map = io.dict_read(args.id_map)
    io.log('Read {} mappings'.format(len(id_map)))

    with open(args.fname, 'r') as f:
        for line in f:
            ary = line.strip().split()
            for i in range(1, len(ary)):
                ary[i] = id_map[ary[i]]
            print ' '.join(ary)
Example 9
def main():
    desc = 'Convert from utt i-vectors to spk i-vectors. NOTE: this ' + \
           'script does not check the values of utt i-vectors that belong ' + \
           'to the same spk. It will simply treat the first utt i-vector ' + \
           'it finds from a spk as the i-vector for that spk. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('utt_ivectors', help='File containing utt i-vectors.')
    parser.add_argument('utt2spk', help='Kaldi utt2spk mapping.')
    args = parser.parse_args()

    utt_ivectors = ivector_ark_read(args.utt_ivectors, ordered=True)
    utt2spk = io.dict_read(args.utt2spk)

    processed_spks = set()
    for utt in utt_ivectors.keys():
        spk = utt2spk[utt]
        if spk in processed_spks:
            continue
        print_vector(spk, utt_ivectors[utt])
        processed_spks.add(spk)
    io.log('Wrote {} spk i-vectors'.format(len(processed_spks)))
Example 10
def main():
    desc = 'Use phone alignment to generate VAD vectors. Output to stdout.'
    parser = common.init_argparse(desc)
    parser.add_argument('ali_phones_with_length',
                        help='File containing phone alignment with length ' + \
                        '(generated with ali-to-phones --write-lengths=true)')
    parser.add_argument('silphones', help='List of phones regarded as silence')
    args = parser.parse_args()

    silphones = set(io.read_lines(args.silphones))
    io.log('{} silence phones: {}'.format(len(silphones), ':'.join(silphones)))
    alis = ali_with_length_read(args.ali_phones_with_length,
                                ordered=True,
                                expand=False)
    io.log('Loaded {} alignments'.format(len(alis)))

    for key in alis:
        vad = []
        for ali in alis[key]:
            phone, length = ali
            vad.extend([0.0 if phone in silphones else 1.0] * length)
        print_vector(key, vad)
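
A worked toy example of the expansion, assuming ali_with_length_read(..., expand=False) yields a list of (phone, length) pairs per utterance (inferred from the loop above):

silphones = {'sil', 'spn'}                           # toy silence-phone set
ali = [('sil', 3), ('ah', 2), ('t', 1), ('sil', 2)]  # one utterance

vad = []
for phone, length in ali:
    vad.extend([0.0 if phone in silphones else 1.0] * length)
print(vad)   # -> [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0]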
Example 11
def main():
    desc = 'Outputs Kaldi-compatible log-likelihood to stdout using a pdnn ' + \
           'model. This mimics the design of Kaldi nnet-forward. Use this ' + \
           'for networks that cannot be converted to Kaldi, e.g. factored model'
    parser = common.init_argparse(desc)
    parser.add_argument('model_in', help='Model that can be read by load_dnn')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('--context',
                        type=int,
                        default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--class-frame-counts', help='Kaldi vector with ' + \
                        'frame-counts of pdfs to compute log-priors')
    parser.add_argument('--prior-floor', type=float, default=1e-10,
                        help='Flooring constant for prior probability, ' + \
                             'i.e. pdfs with prior smaller than this ' + \
                             'value will be ignored during decoding.')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    parser.add_argument('--chunk-size',
                        default='300m',
                        help='Chunk size for data buffering')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
            ivector_ark_read(args.ivectors, dtype=theano.config.floatX)
    dataset = init_dataset(args.feats_scp, args.context, args.padding,
                           ivectors)
    io.log('Initializing model')
    dnn = load_dnn(args.model_in)
    io.log('Initializing priors')
    log_priors = get_log_priors(args.class_frame_counts, args.prior_floor)

    # Initializing shared_ds according to chunk_size
    num_items = get_num_items(args.chunk_size, theano.config.floatX)
    max_frames = num_items / dataset.get_dim()
    max_utt_frames = np.max(
        map(dataset.get_num_frames_by_utt_name, dataset.get_utt_names()))
    common.CHK_GE(max_frames, max_utt_frames)
    x = np.zeros((max_frames, dataset.get_dim()), dtype=theano.config.floatX)
    shared_x = theano.shared(x, name='x', borrow=True)
    io.log('Using shared_x with size {} ({})'.format(x.shape, args.chunk_size))
    io.log('...getting output function')
    output_fn = dnn.build_output_function(shared_x)
    io.log('Got it!')

    io.log('** Begin outputting **')
    utt_names, utt_frames, total_frames = [], [], 0
    for utt in dataset.get_utt_names():
        frames = dataset.get_num_frames_by_utt_name(utt)
        if total_frames + frames > max_frames:
            __nnet_fwd(output_fn, dataset, x, shared_x, utt_names, utt_frames,
                       log_priors)
            utt_names, utt_frames, total_frames = [], [], 0
        utt_names.append(utt)
        utt_frames.append(frames)
        total_frames += frames
    __nnet_fwd(output_fn, dataset, x, shared_x, utt_names, utt_frames,
               log_priors)
Example 12
import chaipy.common as common
import chaipy.io as io

from chaipy.data.temporal import TemporalData


def main(args):
    ds = TemporalData.from_kaldi(args.scp)
    io.log('Loaded dataset containing {} utts'.format(len(ds.get_utt_names())))
    utt2label = io.dict_read(args.utt2label)
    io.log('Loaded utt2label containing {} entries'.format(len(utt2label)))

    for utt_name in ds.get_utt_names():
        if utt_name not in utt2label:
            io.log('WARNING: {} not in utt2label, skipping'.format(utt_name))
            continue
        lbl = utt2label[utt_name]
        dur = ds.get_num_frames_by_utt_name(utt_name)
        print '{} {}'.format(utt_name, ' '.join([lbl] * dur))


if __name__ == '__main__':
    desc = 'Takes in a Kaldi scp and utterance-level labels, outputs ' + \
           'frame-level labels of all utterances in the scp to stdout. ' + \
           'Utterances that are in the scp but not in the label mapping ' + \
           'will be skipped.'
    parser = common.init_argparse(desc)
    parser.add_argument('scp', help='Kaldi scp')
    parser.add_argument('utt2label', help='Mapping from utterance to label')
    main(parser.parse_args())

Example 13
def main():
    desc = 'Outputs Kaldi-compatible log-likelihood to stdout using a ' + \
           'Keras model. This mimics the design of Kaldi nnet-forward.'
    parser = common.init_argparse(desc)
    parser.add_argument('model_json', help='JSON description of the model')
    parser.add_argument('model_weights', help='File containing model weights')
    parser.add_argument('feats_scp', help='scp of input features')
    parser.add_argument('--context', type=int, default=8,
                        help='Number of context frames for splicing')
    parser.add_argument('--padding', default='replicate',
                        help='What to do with out-of-bound frames. Valid ' + \
                             'values: [replicate|zero]')
    parser.add_argument('--primary-task', type=int,
                        help='Set to enable multi-task model decoding')
    parser.add_argument('--nutts', type=int, default=10,
                        help='How many utterances to feed to the model at once')
    parser.add_argument('--delay', type=int, default=5,
                        help='Output delay in frames')
    parser.add_argument('--class-frame-counts', help='Kaldi vector with ' + \
                        'frame-counts of pdfs to compute log-priors')
    parser.add_argument('--prior-floor', type=float, default=1e-10,
                        help='Flooring constant for prior probability, ' + \
                             'i.e. pdfs with prior smaller than this ' + \
                             'value will be ignored during decoding.')
    parser.add_argument('--ivectors', help='Utterance i-vectors to append')
    args = parser.parse_args()

    io.log('Initializing dataset')
    ivectors = None if args.ivectors is None else \
            ivector_ark_read(args.ivectors, dtype=np.float32)
    buf_ds = init_dataset(
        args.feats_scp, args.context, args.padding,
        args.nutts, args.delay, ivectors
    )
    io.log('Initializing model')
    json_str = io.json_load(args.model_json)
    model = model_from_json(json_str)
    model.load_weights(args.model_weights)
    io.log('Initializing priors')
    log_priors = get_log_priors(args.class_frame_counts, args.prior_floor)
    if args.primary_task is not None:
        io.log('Multi-task decoding enabled, primary task {}'.format(args.primary_task))

    io.log('** Begin outputting **')
    while True:
        # Load data chunk
        chunk = buf_ds.read_next_chunk()
        if chunk is None:
            break
        Xs, _, eobs, utt_indices = chunk
        X = Xs[0]
        eob = eobs[0]
        utt_names = buf_ds.dataset().get_utt_names_by_utt_indices(utt_indices)
        y = model.predict(X, batch_size=len(utt_indices), verbose=0)
        if args.primary_task is not None:
            y = y[args.primary_task]
        y = np.log(y, y)
        if log_priors is not None:
            y -= log_priors
        for i in range(len(utt_indices)):
            print_matrix(utt_names[i], y[i][buf_ds.get_delay():eob[i]])
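
The core numeric step above converts network posteriors into pseudo log-likelihoods by subtracting log-priors, mirroring Kaldi's nnet-forward. A small self-contained sketch with toy posteriors and hypothetical frame counts:

import numpy as np

# Toy posteriors for one utterance: 3 frames x 4 pdfs, rows sum to 1.
y = np.array([[0.7, 0.1, 0.1, 0.1],
              [0.2, 0.5, 0.2, 0.1],
              [0.1, 0.1, 0.1, 0.7]])

counts = np.array([100., 200., 300., 400.])   # hypothetical frame counts
log_priors = np.log(counts / counts.sum())

loglik = np.log(y) - log_priors               # what the decoder consumes
print(loglik.shape)                           # -> (3, 4)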