Example #1
    def _map_fn(path):
        feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" %
                                                                path)

        feats = []
        spks = []
        pdfs = []
        for (utt, utt_feats) in feats_reader:
            if utt not in utt_to_pdfs:
                continue

            spk = utt_to_spk[utt] if random.random() > si_prob else 0
            utt_pdfs = utt_to_pdfs[utt]

            # Integer division: a float length would break the slicing below.
            utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
            if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
                continue

            utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
            utt_pdfs = utt_pdfs[:utt_subsampled_length]
            chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                                   left_context, right_context,
                                   subsampling_factor)

            feats.extend([chunk[0] for chunk in chunks])
            spks.extend([spk] * len(chunks))
            pdfs.extend([chunk[1] for chunk in chunks])

        feats = np.array(feats, dtype=np.float32)
        spks = np.array(spks, dtype=np.int32)
        pdfs = np.array(pdfs, dtype=np.int32)

        num_chunks = chunks_per_sample * (feats.shape[0] // chunks_per_sample)

        return feats[:num_chunks], spks[:num_chunks], pdfs[:num_chunks]
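
These _map_fn closures take the path of one scp shard and return whole NumPy arrays, which suggests they are meant to feed a tf.data input pipeline. A minimal wiring sketch, assuming TF 1.x and a list scp_paths of shard paths (both the list and the output dtypes below are assumptions, not from the source):

import tensorflow as tf

# Hypothetical pipeline: map each scp shard through _map_fn via tf.py_func,
# which runs the Python-side Kaldi reader once per shard.
dataset = tf.data.Dataset.from_tensor_slices(scp_paths)
dataset = dataset.map(
    lambda path: tf.py_func(_map_fn, [path],
                            (tf.float32, tf.int32, tf.int32)))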
Example #2
    def _map_fn(path):
        if path.endswith("hdf5"):
            f = h5py.File(path, 'r')
            # h5py 3.x removed Dataset.value; indexing with [()] reads the array.
            feats_reader = [(utt, f[utt][()]) for utt in f]
        else:
            feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" %
                                                                    path)

        feats = []
        spks = []
        pdfs = []
        for (utt, utt_feats) in feats_reader:
            if utt not in utt_to_pdfs:
                continue

            # spk = utt_to_spk[utt] if random.random() < 0.5 else 0
            spk = 0  # HACK: speaker labels disabled, always use id 0
            utt_pdfs = utt_to_pdfs[utt]

            utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
            if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
                continue

            utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
            utt_pdfs = utt_pdfs[:utt_subsampled_length]
            chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                                   left_context, right_context,
                                   subsampling_factor)

            feats.extend([chunk[0] for chunk in chunks])
            spks.extend([spk] * len(chunks))
            pdfs.extend([chunk[1] for chunk in chunks])

        return np.array(feats, dtype=np.float32), np.array(
            spks, dtype=np.int32), np.array(pdfs, dtype=np.int32)
Example #3
    def _map_fn(path):
        feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" %
                                                                path)

        feats = []
        spks = []
        pdfs = []
        for (utt, utt_feats) in feats_reader:
            if utt not in utt_to_pdfs:
                continue

            spk = utt_to_spk[utt]
            utt_pdfs = utt_to_pdfs[utt]

            utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
            if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
                continue

            utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
            utt_pdfs = utt_pdfs[:utt_subsampled_length]
            chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                                   left_context, right_context,
                                   subsampling_factor)

            feats.extend([chunk[0] for chunk in chunks])
            spks.extend([spk] * len(chunks))
            pdfs.extend([chunk[1] for chunk in chunks])

        # Zero out speaker ids with probability speaker_independent_prob to
        # mix speaker-independent samples into each batch.
        return (
            np.array(feats, dtype=np.float32), np.array(spks, dtype=np.int32) *
            (np.random.uniform(size=len(spks)) >= speaker_independent_prob),
            np.array(pdfs, dtype=np.int32))
Example #4
def load_utts_per_spk(feats, utt2spk, adapt_pdfs, test_pdfs, subsampling_factor):
    utt_to_adapt_pdfs = load_utt_to_pdfs(adapt_pdfs)
    utt_to_test_pdfs = load_utt_to_pdfs(test_pdfs)
    utt_to_spk = load_utt_to_spk(utt2spk)
    feats_reader = kaldi_io.SequentialBaseFloatMatrixReader(feats)

    utts_per_spk = collections.defaultdict(list)
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_adapt_pdfs or utt not in utt_to_test_pdfs:
            continue

        spk = utt_to_spk[utt]
        utt_adapt_pdfs = utt_to_adapt_pdfs[utt]
        utt_test_pdfs = utt_to_test_pdfs[utt]

        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        if abs(utt_subsampled_length - utt_adapt_pdfs.shape[0]) > 1:
            continue

        if abs(utt_subsampled_length - utt_test_pdfs.shape[0]) > 1:
            continue

        utts_per_spk[spk].append((
            utt_feats[:utt_subsampled_length * subsampling_factor],
            utt_adapt_pdfs[:utt_subsampled_length],
            utt_test_pdfs[:utt_subsampled_length]
        ))

    return utts_per_spk
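
A hypothetical call sketch for load_utts_per_spk, using standard Kaldi table specifiers (all file names below are placeholders, not from the source):

utts_per_spk = load_utts_per_spk(
    "scp:feats.scp", "utt2spk", "ark:adapt_pdfs.ark", "ark:test_pdfs.ark",
    subsampling_factor=3)
for spk, utts in utts_per_spk.items():
    print(spk, len(utts))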
Example #5
    def _map_fn(path):
        feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" %
                                                                path)

        chunks = []
        for (utt, utt_feats) in feats_reader:
            if utt not in utt_to_adapt_pdfs:
                continue

            if utt not in utt_to_test_pdfs:
                continue

            utt_adapt_pdfs = utt_to_adapt_pdfs[utt]
            utt_test_pdfs = utt_to_test_pdfs[utt]

            utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
            if abs(utt_subsampled_length - utt_adapt_pdfs.shape[0]) > 1:
                continue

            if abs(utt_subsampled_length - utt_test_pdfs.shape[0]) > 1:
                continue

            utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
            utt_adapt_pdfs = utt_adapt_pdfs[:utt_subsampled_length]
            utt_test_pdfs = utt_test_pdfs[:utt_subsampled_length]
            chunks.extend(
                create_chunks(utt_feats, utt_adapt_pdfs, utt_test_pdfs,
                              chunk_size, left_context, right_context,
                              subsampling_factor))

        adapt_x = []
        adapt_y = []
        test_x = []
        test_y = []

        # Slide a window of 2 * chunks_per_sample chunks: the first half forms
        # the adaptation set, the second half the test set.
        for offset in range(0,
                            len(chunks) - 2 * chunks_per_sample, chunk_shift):
            adapt_x.append(
                [x[0] for x in chunks[offset:offset + chunks_per_sample]] *
                adaptation_steps)
            adapt_y.append(
                [x[1] for x in chunks[offset:offset + chunks_per_sample]] *
                adaptation_steps)
            test_x.append([
                x[0] for x in chunks[offset + chunks_per_sample:offset +
                                     2 * chunks_per_sample]
            ])
            test_y.append([
                x[2] for x in chunks[offset + chunks_per_sample:offset +
                                     2 * chunks_per_sample]
            ])

        return (
            np.array(adapt_x, dtype=np.float32),
            np.array(adapt_y, dtype=np.int32),
            np.array(test_x, dtype=np.float32),
            np.array(test_y, dtype=np.int32),
        )
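
For intuition: with chunks_per_sample = 2, chunk_shift = 1 and adaptation_steps = 1, the window at offset = 0 uses chunks 0 and 1 as the adaptation set and chunks 2 and 3 as the test set; the next window starts at chunk 1. Multiplying the adaptation lists by adaptation_steps repeats them, so the same adaptation data can be presented for several update steps.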
Example #6
def get_mean_std_from_audio_features(path):
    # Running sum and sum of squares; renamed to avoid shadowing builtin sum.
    feat_sum = np.zeros((43, ))
    feat_sum_sq = np.zeros((43, ))
    n = 0

    with kaldi_io.SequentialBaseFloatMatrixReader(path) as reader:
        for name, feats in reader:
            nframes, nfeats = feats.shape
            n += nframes
            feat_sum += feats.sum(0)
            feat_sum_sq += (feats * feats).sum(0)

    mean = np.asarray(feat_sum / n, dtype=kaldi_io.KALDI_BASE_FLOAT())
    std = np.asarray(np.sqrt(feat_sum_sq / n - mean**2),
                     dtype=kaldi_io.KALDI_BASE_FLOAT())

    return mean, std
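
A usage sketch for the statistics above; the rspecifier is a placeholder:

mean, std = get_mean_std_from_audio_features("scp:feats.scp")

# Apply the global normalization to each utterance.
with kaldi_io.SequentialBaseFloatMatrixReader("scp:feats.scp") as reader:
    for name, feats in reader:
        normalized = (feats - mean) / std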
Example #7
    padding_left = padding
    if args.padding_left is not None:
        padding_left = int(args.padding_left)

    padding_right = padding
    if args.padding_right is not None:
        padding_right = int(args.padding_right)

    if padding_left < 0 or padding_right < 0:
        logging.error("Padding can't be negative!")
        sys.exit(1)

    count = 0
    logging.info("Padding with %d in the left and %d on the right",
                 padding_left, padding_right)

    # Should use a with-statement, but if something happens the files get closed anyway.
    reader = kaldi_io.SequentialBaseFloatMatrixReader(args.in_rxfilename)
    writer = kaldi_io.BaseFloatMatrixWriter(args.out_wxfilename)

    size_writer = None
    if args.orig_size_wxfilename is not None:
        size_writer = kaldi_io.PythonWriter(args.orig_size_wxfilename)

    for name, value in reader:
        count += 1
        if padding_left + padding_right == 0:
            padded = value
        else:
            num_frames, frame_dim = value.shape
            padded = np.empty(shape=(num_frames + padding_left + padding_right,
                                     frame_dim),
                              dtype=value.dtype)
Example #8
# Add uttid information
with open(all_fbank41_scp) as f:
    uttids = [l.strip().split(None, 1)[0] for l in f]
    for row_idx, uttid in enumerate(uttids):
        uttids_ds[row_idx] = uttid

# Add spk information
with open(all_utt2spk) as f:
    utt2spk = [l.strip().split(None, 1)[1] for l in f]
    for row_idx, spk in enumerate(utt2spk):
        spks_ds[row_idx] = spk

feat = 'ark:add-deltas scp:{} ark:- | apply-global-cmvn.py --global-stats=ark:{} ark:- ark:-|'.format(
    all_fbank41_scp, cmvn_stats)
# Add features (deltas added and globally normalized on the fly)
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatMatrixReader(feat)):
    features_shapes[row_idx, :] = value.shape
    features[row_idx] = value.ravel()

ivector = 'ark:apply-global-cmvn-vector.py ark:{} scp:{} ark:-|'.format(
    spk_ivector_cmvn_stats, all_spk_ivectors_scp)
# Add ivectors
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatVectorReader(ivector)):
    frame_wise_value = numpy.tile(value, (features_shapes[row_idx][0], 1))
    ivectors_shapes[row_idx, :] = frame_wise_value.shape
    ivectors[row_idx] = frame_wise_value.ravel()

f['train_si84_rand_indices'] = numpy.random.choice(37394, 7138, replace=False)
train_si84_rand_ref = f['train_si84_rand_indices'].ref

# Split information
split_dict = {
Example #9
config = tf.ConfigProto()  # assumed line; the excerpt starts mid-file
config.intra_op_parallelism_threads = 1
config.inter_op_parallelism_threads = 1
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

if __name__ == '__main__':
    model = sys.argv[1]
    left_context = int(sys.argv[2])
    right_context = int(sys.argv[3])

    if not model.endswith('.h5'):
        raise TypeError(
            'Unsupported model type. Please use h5 format. Update Keras if needed'
        )

    m = keras.models.load_model(model)
    with kaldi_io.SequentialBaseFloatMatrixReader("ark:-") as arkIn, \
            kaldi_io.BaseFloatMatrixWriter("ark,t:-") as arkOut:
        signal(SIGPIPE, SIG_DFL)

        for utt, utt_feats in arkIn:
            # Pad by replicating the first/last frame so every original frame
            # gets full context (assumes left_context and right_context > 0).
            feats = np.zeros(
                (utt_feats.shape[0] + left_context + right_context,
                 utt_feats.shape[1]))
            feats[:left_context, :] = utt_feats[0]
            feats[-right_context:, :] = utt_feats[-1]
            feats[left_context:-right_context, :] = utt_feats
            feats = np.expand_dims(feats, 0)

            logProbMat = np.log(m.predict(feats)[0])
            # Clamp -inf log-probabilities to a finite floor.
            logProbMat[logProbMat == -np.inf] = -100
            arkOut.write(utt, logProbMat)
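
Because the reader is opened on "ark:-" and the writer on "ark,t:-", this decoder is meant to run inside a Kaldi pipe, reading features from stdin and writing text-format log-probability matrices to stdout; the signal(SIGPIPE, SIG_DFL) call keeps Python from raising an exception when a downstream tool closes the pipe early. (The snippet assumes "from signal import signal, SIGPIPE, SIG_DFL" among its elided imports.)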
Example #10
def main():
    parser = argparse.ArgumentParser()
    # general configuration
    parser.add_argument('--gpu',
                        '-g',
                        default='-1',
                        type=str,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--debugmode', default=1, type=int, help='Debugmode')
    parser.add_argument('--seed', default=1, type=int, help='Random seed')
    parser.add_argument('--verbose',
                        '-V',
                        default=1,
                        type=int,
                        help='Verbose option')
    # task related
    parser.add_argument(
        '--recog-feat',
        type=str,
        required=True,
        help='Filename of recognition feature data (Kaldi scp)')
    parser.add_argument('--recog-label',
                        type=str,
                        required=True,
                        help='Filename of recognition label data (json)')
    parser.add_argument('--result-label',
                        type=str,
                        required=True,
                        help='Filename of result label data (json)')
    # model (parameter) related
    parser.add_argument('--model',
                        type=str,
                        required=True,
                        help='Model file parameters to read')
    parser.add_argument('--model-conf',
                        type=str,
                        required=True,
                        help='Model config file')
    # search related
    parser.add_argument('--beam-size', type=int, default=1, help='Beam size')
    parser.add_argument('--penalty',
                        default=0.0,
                        type=float,
                        help='Insertion penalty')
    parser.add_argument('--maxlenratio',
                        default=0.5,
                        type=float,
                        help='Input length ratio to obtain max output length')
    parser.add_argument('--minlenratio',
                        default=0.0,
                        type=float,
                        help='Input length ratio to obtain min output length')
    args = parser.parse_args()

    # logging info
    # elif (not a second if) so that verbose == 1 does not also hit the else.
    if args.verbose == 1:
        logging.basicConfig(
            level=logging.INFO,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    elif args.verbose == 2:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
        logging.warning("Skip DEBUG/INFO messages")

    # display PYTHONPATH
    logging.info('python path = ' + os.environ['PYTHONPATH'])

    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    nseed = args.seed
    random.seed(nseed)
    np.random.seed(nseed)
    os.environ["CHAINER_SEED"] = str(nseed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # read training config
    # Binary mode: pickle.load needs bytes under Python 3.
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from ' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from ' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = MTLLoss(e2e, train_args.mtlalpha)
    chainer.serializers.load_npz(args.model, model)

    # prepare Kaldi reader
    reader = kaldi_io.SequentialBaseFloatMatrixReader(args.recog_feat)

    # read json data
    with open(args.recog_label, 'r') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    for name, feat in reader:
        y_hat = e2e.recognize(feat, args, train_args.char_list)
        y_true = map(int, recog_json[name]['tokenid'].split())

        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat)
        seq_true_text = "".join(seq_true)
        logging.info("groundtruth[%s]: " + seq_true_text, name)
        logging.info("prediction [%s]: " + seq_hat_text, name)

        # copy old json info
        new_json[name] = recog_json[name]

        # added recognition results to json
        new_json[name]['rec_tokenid'] = " ".join(
            [str(idx[0]) for idx in y_hat])
        new_json[name]['rec_token'] = " ".join(seq_hat)
        new_json[name]['rec_text'] = seq_hat_text

    # Write bytes with explicit UTF-8 encoding to avoid character coding problems.
    with open(args.result_label, 'wb') as f:
        f.write(json.dumps({'utts': new_json}, indent=4,
                           ensure_ascii=False).encode('utf_8'))
Example #11
    uttids = [l.strip().split(None, 1)[0] for l in f]
    for row_idx, uttid in enumerate(uttids):
        uttids_ds[row_idx] = uttid

# Add spk information
with open(all_utt2spk) as f:
    utt2spk = [l.strip().split(None, 1)[1] for l in f]
    for row_idx, spk in enumerate(utt2spk):
        spks_ds[row_idx] = spk

feat = 'ark:add-deltas scp:{} ark:- | apply-global-cmvn.py --global-stats=ark:{} ark:- ark:-|'.format(
    all_fbank41_scp, cmvn_stats)

# Add features (deltas added and globally normalized on the fly)
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatMatrixReader(feat)):
    features_shapes[row_idx, :] = value.shape
    features[row_idx] = value.ravel()

ivector = 'ark:apply-global-cmvn-vector.py ark:{} scp:{} ark:-|'.format(
    spk_ivector_cmvn_stats, all_spk_ivectors_scp)

# Add ivectors
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatVectorReader(ivector)):
    frame_wise_value = numpy.tile(value, (features_shapes[row_idx][0], 1))
    ivectors_shapes[row_idx, :] = frame_wise_value.shape
    ivectors[row_idx] = frame_wise_value.ravel()

#f['train_si84_rand_indices'] = numpy.random.choice(37394, 7138, replace=False)
#train_si84_rand_ref = f['train_si84_rand_indices'].ref
Example #12
def get_audio_features_from_file(path, take_every_nth, mean, std):
    # Keep every n-th frame, then apply global mean/variance normalization.
    for (uttid, features) in kaldi_io.SequentialBaseFloatMatrixReader(path):
        features = features[::take_every_nth]
        features = (features - mean) / std

        yield uttid, features
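
A usage sketch combining this generator with the statistics from Example #6 (the rspecifier and subsampling rate are placeholders):

mean, std = get_mean_std_from_audio_features("scp:feats.scp")

# Keep every 3rd frame and normalize with the global statistics.
for uttid, feats in get_audio_features_from_file("scp:feats.scp", 3, mean, std):
    print(uttid, feats.shape)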