def _map_fn(path):
    """Read one Kaldi feature scp and turn it into fixed-size training chunks.

    Relies on names from the enclosing scope: ``utt_to_pdfs``, ``utt_to_spk``,
    ``si_prob``, ``subsampling_factor``, ``chunk_size``, ``left_context``,
    ``right_context``, ``chunks_per_sample`` and ``create_chunks``.

    Args:
        path: Path of the scp file; it is opened as ``"scp:<path>"``.

    Returns:
        A tuple ``(feats, spks, pdfs)`` of numpy arrays (float32, int32,
        int32). Each is truncated to the largest multiple of
        ``chunks_per_sample`` chunks.
    """
    feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" % path)
    feats = []
    spks = []
    pdfs = []
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_pdfs:
            continue
        # With probability si_prob the speaker id is replaced by 0
        # (presumably the "speaker independent" id -- confirm with caller).
        spk = utt_to_spk[utt] if random.random() > si_prob else 0
        utt_pdfs = utt_to_pdfs[utt]
        # Floor division: the result is used as a slice bound below, so it
        # must stay an int on Python 3 as well as Python 2.
        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        # Skip utterances whose alignment length disagrees with the features
        # by more than one subsampled frame.
        if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
            continue
        utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
        utt_pdfs = utt_pdfs[:utt_subsampled_length]
        chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                               left_context, right_context,
                               subsampling_factor)
        feats.extend([chunk[0] for chunk in chunks])
        spks.extend([spk for chunk in chunks])
        pdfs.extend([chunk[1] for chunk in chunks])
    feats = np.array(feats, dtype=np.float32)
    spks = np.array(spks, dtype=np.int32)
    pdfs = np.array(pdfs, dtype=np.int32)
    # Drop the trailing chunks that do not fill a whole sample.
    num_chunks = chunks_per_sample * (feats.shape[0] // chunks_per_sample)
    return feats[:num_chunks], spks[:num_chunks], pdfs[:num_chunks]
def _map_fn(path):
    """Read features from an HDF5 file or a Kaldi scp and chunk them.

    Relies on names from the enclosing scope: ``utt_to_pdfs``,
    ``subsampling_factor``, ``chunk_size``, ``left_context``,
    ``right_context`` and ``create_chunks``.

    Args:
        path: Input path. A path ending in ``"hdf5"`` is read with h5py (one
            dataset per utterance); anything else is opened as
            ``"scp:<path>"``.

    Returns:
        A tuple ``(feats, spks, pdfs)`` of numpy arrays (float32, int32,
        int32). ``spks`` is all zeros in this variant.
    """
    if path.endswith("hdf5"):
        # Load every dataset into memory inside the ``with`` so the file is
        # closed afterwards. ``ds[()]`` replaces ``ds.value``, which was
        # removed in h5py 3.
        with h5py.File(path, 'r') as f:
            feats_reader = [(utt, f[utt][()]) for utt in f]
    else:
        feats_reader = kaldi_io.SequentialBaseFloatMatrixReader(
            "scp:%s" % path)
    feats = []
    spks = []
    pdfs = []
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_pdfs:
            continue
        # HACK: the speaker id is forced to 0 for every utterance; the
        # per-utterance lookup in utt_to_spk is disabled here.
        spk = 0
        utt_pdfs = utt_to_pdfs[utt]
        # Floor division: the result is used as a slice bound below, so it
        # must stay an int on Python 3 as well as Python 2.
        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        # Skip utterances whose alignment length disagrees with the features
        # by more than one subsampled frame.
        if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
            continue
        utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
        utt_pdfs = utt_pdfs[:utt_subsampled_length]
        chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                               left_context, right_context,
                               subsampling_factor)
        feats.extend([chunk[0] for chunk in chunks])
        spks.extend([spk for chunk in chunks])
        pdfs.extend([chunk[1] for chunk in chunks])
    return (
        np.array(feats, dtype=np.float32),
        np.array(spks, dtype=np.int32),
        np.array(pdfs, dtype=np.int32),
    )
def _map_fn(path):
    """Read one Kaldi feature scp and chunk it, randomly masking speaker ids.

    Relies on names from the enclosing scope: ``utt_to_pdfs``, ``utt_to_spk``,
    ``speaker_independent_prob``, ``subsampling_factor``, ``chunk_size``,
    ``left_context``, ``right_context`` and ``create_chunks``.

    Args:
        path: Path of the scp file; it is opened as ``"scp:<path>"``.

    Returns:
        A tuple ``(feats, spks, pdfs)`` of numpy arrays. Each entry of
        ``spks`` is multiplied by an independent uniform draw compared with
        ``speaker_independent_prob``, so it becomes 0 when the draw falls
        below that threshold.
    """
    feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" % path)
    feats = []
    spks = []
    pdfs = []
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_pdfs:
            continue
        spk = utt_to_spk[utt]
        utt_pdfs = utt_to_pdfs[utt]
        # Floor division: the result is used as a slice bound below, so it
        # must stay an int on Python 3 as well as Python 2.
        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        # Skip utterances whose alignment length disagrees with the features
        # by more than one subsampled frame.
        if abs(utt_subsampled_length - utt_pdfs.shape[0]) > 1:
            continue
        utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
        utt_pdfs = utt_pdfs[:utt_subsampled_length]
        chunks = create_chunks(utt_feats, utt_pdfs, utt_pdfs, chunk_size,
                               left_context, right_context,
                               subsampling_factor)
        feats.extend([chunk[0] for chunk in chunks])
        spks.extend([spk for chunk in chunks])
        pdfs.extend([chunk[1] for chunk in chunks])
    # Per-chunk mask: a speaker id is kept only where the uniform draw is
    # >= speaker_independent_prob.
    keep_speaker = (
        np.random.uniform(size=len(spks)) >= speaker_independent_prob)
    return (
        np.array(feats, dtype=np.float32),
        np.array(spks, dtype=np.int32) * keep_speaker,
        np.array(pdfs, dtype=np.int32),
    )
def load_utts_per_spk(feats, utt2spk, adapt_pdfs, test_pdfs,
                      subsampling_factor):
    """Group length-aligned utterances by speaker.

    Args:
        feats: Kaldi rspecifier passed straight to
            ``kaldi_io.SequentialBaseFloatMatrixReader``.
        utt2spk: Argument forwarded to ``load_utt_to_spk``.
        adapt_pdfs: Argument forwarded to ``load_utt_to_pdfs`` for the
            adaptation targets.
        test_pdfs: Argument forwarded to ``load_utt_to_pdfs`` for the test
            targets.
        subsampling_factor: Number of feature frames per target frame.

    Returns:
        A ``collections.defaultdict(list)`` mapping each speaker to a list of
        ``(feats, adapt_pdfs, test_pdfs)`` tuples, all trimmed to a common
        subsampled length. Utterances that lack either alignment, or whose
        alignment length differs from the subsampled feature length by more
        than one frame, are skipped.
    """
    utt_to_adapt_pdfs = load_utt_to_pdfs(adapt_pdfs)
    utt_to_test_pdfs = load_utt_to_pdfs(test_pdfs)
    utt_to_spk = load_utt_to_spk(utt2spk)
    feats_reader = kaldi_io.SequentialBaseFloatMatrixReader(feats)
    utts_per_spk = collections.defaultdict(list)
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_adapt_pdfs or utt not in utt_to_test_pdfs:
            continue
        spk = utt_to_spk[utt]
        utt_adapt_pdfs = utt_to_adapt_pdfs[utt]
        utt_test_pdfs = utt_to_test_pdfs[utt]
        # Floor division: the result is used as a slice bound below, so it
        # must stay an int on Python 3 as well as Python 2.
        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        if abs(utt_subsampled_length - utt_adapt_pdfs.shape[0]) > 1:
            continue
        if abs(utt_subsampled_length - utt_test_pdfs.shape[0]) > 1:
            continue
        utts_per_spk[spk].append((
            utt_feats[:utt_subsampled_length * subsampling_factor],
            utt_adapt_pdfs[:utt_subsampled_length],
            utt_test_pdfs[:utt_subsampled_length],
        ))
    return utts_per_spk
def _map_fn(path):
    """Build paired adaptation/test samples from one Kaldi feature scp.

    Relies on names from the enclosing scope: ``utt_to_adapt_pdfs``,
    ``utt_to_test_pdfs``, ``subsampling_factor``, ``chunk_size``,
    ``left_context``, ``right_context``, ``chunks_per_sample``,
    ``chunk_shift``, ``adaptation_steps`` and ``create_chunks``.

    Args:
        path: Path of the scp file; it is opened as ``"scp:<path>"``.

    Returns:
        A tuple ``(adapt_x, adapt_y, test_x, test_y)`` of numpy arrays
        (float32, int32, float32, int32). Each sample uses
        ``chunks_per_sample`` consecutive chunks for adaptation (repeated
        ``adaptation_steps`` times) and the following ``chunks_per_sample``
        chunks for testing.
    """
    feats_reader = kaldi_io.SequentialBaseFloatMatrixReader("scp:%s" % path)
    chunks = []
    for (utt, utt_feats) in feats_reader:
        if utt not in utt_to_adapt_pdfs:
            continue
        if utt not in utt_to_test_pdfs:
            continue
        utt_adapt_pdfs = utt_to_adapt_pdfs[utt]
        utt_test_pdfs = utt_to_test_pdfs[utt]
        # Floor division: the result is used as a slice bound below, so it
        # must stay an int on Python 3 as well as Python 2.
        utt_subsampled_length = utt_feats.shape[0] // subsampling_factor
        if abs(utt_subsampled_length - utt_adapt_pdfs.shape[0]) > 1:
            continue
        if abs(utt_subsampled_length - utt_test_pdfs.shape[0]) > 1:
            continue
        utt_feats = utt_feats[:utt_subsampled_length * subsampling_factor]
        utt_adapt_pdfs = utt_adapt_pdfs[:utt_subsampled_length]
        utt_test_pdfs = utt_test_pdfs[:utt_subsampled_length]
        chunks.extend(
            create_chunks(utt_feats, utt_adapt_pdfs, utt_test_pdfs,
                          chunk_size, left_context, right_context,
                          subsampling_factor))
    adapt_x = []
    adapt_y = []
    test_x = []
    test_y = []
    # NOTE(review): the range stops before offset == len(chunks) -
    # 2 * chunks_per_sample, which would still fit one full sample; kept
    # as-is -- confirm whether excluding it is intentional.
    for offset in range(0, len(chunks) - 2 * chunks_per_sample, chunk_shift):
        adapt_chunks = chunks[offset:offset + chunks_per_sample]
        test_chunks = chunks[offset + chunks_per_sample:
                             offset + 2 * chunks_per_sample]
        # Element 0 of a chunk is used as input, element 1 as the adaptation
        # target and element 2 as the test target.
        adapt_x.append([x[0] for x in adapt_chunks] * adaptation_steps)
        adapt_y.append([x[1] for x in adapt_chunks] * adaptation_steps)
        test_x.append([x[0] for x in test_chunks])
        test_y.append([x[2] for x in test_chunks])
    return (
        np.array(adapt_x, dtype=np.float32),
        np.array(adapt_y, dtype=np.int32),
        np.array(test_x, dtype=np.float32),
        np.array(test_y, dtype=np.int32),
    )
def get_mean_std_from_audio_features(path):
    """Compute the per-dimension mean and standard deviation of all frames.

    The feature dimension is taken from the first matrix read rather than
    being hard-coded, so any feature width is supported.

    Args:
        path: Kaldi rspecifier passed to
            ``kaldi_io.SequentialBaseFloatMatrixReader``.

    Returns:
        A tuple ``(mean, std)`` of 1-D arrays with dtype
        ``kaldi_io.KALDI_BASE_FLOAT()``.

    Raises:
        ValueError: If the reader yields no frames, since the statistics
            would be undefined.
    """
    total = None
    total_sq = None
    num_frames = 0
    with kaldi_io.SequentialBaseFloatMatrixReader(path) as reader:
        for _, feats in reader:
            if total is None:
                # Accumulate in float64 regardless of the input dtype.
                total = np.zeros((feats.shape[1], ))
                total_sq = np.zeros((feats.shape[1], ))
            num_frames += feats.shape[0]
            total += feats.sum(0)
            total_sq += (feats * feats).sum(0)
    if num_frames == 0:
        raise ValueError("no feature frames found in %r" % (path, ))
    mean = total / num_frames
    # Rounding can push E[x^2] - E[x]^2 slightly below zero for
    # near-constant dimensions; clamp so the sqrt never yields NaN.
    variance = np.maximum(total_sq / num_frames - mean**2, 0.0)
    base_float = kaldi_io.KALDI_BASE_FLOAT()
    return (np.asarray(mean, dtype=base_float),
            np.asarray(np.sqrt(variance), dtype=base_float))
# Resolve the per-side padding: each side falls back to the common `padding`
# value unless its own command-line option was given.
padding_left = padding
if args.padding_left is not None:
    padding_left = int(args.padding_left)
padding_right = padding
if args.padding_right is not None:
    padding_right = int(args.padding_right)
# Negative padding is rejected up front; the script exits with status 1.
if padding_left < 0 or padding_right < 0:
    logging.error("Padding can't be negative!")
    sys.exit(1)
# Counts the matrices read from the input.
count = 0
logging.info("Padding with %d in the left and %d on the right", padding_left, padding_right)
# Should use `with`, but if something happens the files will get closed anyway.
reader = kaldi_io.SequentialBaseFloatMatrixReader(args.in_rxfilename)
writer = kaldi_io.BaseFloatMatrixWriter(args.out_wxfilename)
# Optional extra writer, created only when --orig-size-wxfilename was given
# (presumably records the unpadded sizes -- its use is outside this view).
size_writer = None
if args.orig_size_wxfilename is not None:
    size_writer = kaldi_io.PythonWriter(args.orig_size_wxfilename)
for name, value in reader:
    count += 1
    if padding_left + padding_right == 0:
        # Nothing to pad: pass the matrix through unchanged.
        padded = value
    else:
        num_frames, frame_dim = value.shape
        # Uninitialised buffer with room for the extra frames on both sides;
        # it is filled by code that follows this fragment.
        padded = np.empty(shape=(num_frames + padding_left + padding_right, frame_dim), dtype=value.dtype)
# Add uttid information with open(all_fbank41_scp) as f: uttids = [l.strip().split(None, 1)[0] for l in f] for row_idx, uttid in enumerate(uttids): uttids_ds[row_idx] = uttid # Add spk information with open(all_utt2spk) as f: utt2spk = [l.strip().split(None, 1)[1] for l in f] for row_idx, spk in enumerate(utt2spk): spks_ds[row_idx] = spk feat = 'ark:add-detlas scp:{} ark:- | apply-global-cmvn.py --global-stats=ark:{} ark:- ark:-|'.format(all_fbank41_scp, cmvn_stats) # Add features (deltas added and globally normalized on the fly) for row_idx, (uttid, value) in enumerate(kaldi_io.SequentialBaseFloatMatrixReader(feat)): features_shapes[row_idx,:] = value.shape features[row_idx] = value.ravel() ivector= 'ark:apply-global-cmvn-vector.py ark:{} scp:{} ark:-|'.format(spk_ivector_cmvn_stats, all_spk_ivectors_scp) # Add ivectors for row_idx, (uttid, value) in enumerate(kaldi_io.SequentialBaseFloatVectorReader(ivector)): frame_wise_value = numpy.tile(value, (features_shapes[row_idx][0], 1)) ivectors_shapes[row_idx,:] = frame_wise_value.shape ivectors[row_idx] = frame_wise_value.ravel() f['train_si84_rand_indices'] = numpy.random.choice(37394, 7138, replace=False) train_si84_rand_ref = f['train_si84_rand_indices'].ref # Split information split_dict = {
# Restrict TensorFlow to a single thread for both intra-op and inter-op work.
config.intra_op_parallelism_threads = 1
config.inter_op_parallelism_threads = 1
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

if __name__ == '__main__':
    # Usage: <script> MODEL.h5 LEFT_CONTEXT RIGHT_CONTEXT
    # Reads feature matrices from stdin ("ark:-") and writes log-probability
    # matrices to stdout as a text archive ("ark,t:-").
    model = sys.argv[1]
    left_context = int(sys.argv[2])
    right_context = int(sys.argv[3])
    if not model.endswith('.h5'):
        raise TypeError(
            'Unsupported model type. Please use h5 format. Update Keras if needed'
        )
    m = keras.models.load_model(model)
    with kaldi_io.SequentialBaseFloatMatrixReader("ark:-") as arkIn, \
            kaldi_io.BaseFloatMatrixWriter("ark,t:-") as arkOut:
        # Restore the default SIGPIPE action so a closed downstream pipe
        # terminates the process instead of raising in Python.
        signal(SIGPIPE, SIG_DFL)
        for utt, utt_feats in arkIn:
            num_frames = utt_feats.shape[0]
            feats = np.zeros(
                (num_frames + left_context + right_context,
                 utt_feats.shape[1]))
            # Pad by repeating the first/last frame. Positive end indices are
            # used instead of `-right_context` because `-0` would select the
            # whole array (and an empty centre slice) when right_context is 0.
            feats[:left_context, :] = utt_feats[0]
            feats[left_context + num_frames:, :] = utt_feats[-1]
            feats[left_context:left_context + num_frames, :] = utt_feats
            # Add the batch axis expected by predict().
            feats = np.expand_dims(feats, 0)
            logProbMat = np.log(m.predict(feats)[0])
            # log(0) gives -inf; replace it with a large negative finite value.
            logProbMat[logProbMat == -np.inf] = -100
            arkOut.write(utt, logProbMat)
def main():
    """Decode every utterance of a Kaldi feature archive and save the results.

    Parses the command line, loads the pickled model configuration and the
    model parameters, runs ``e2e.recognize`` on every feature matrix, logs
    the reference and hypothesis text, and writes a JSON file that extends
    the input label JSON with ``rec_tokenid``, ``rec_token`` and ``rec_text``.
    """
    parser = argparse.ArgumentParser()
    # general configuration
    parser.add_argument('--gpu', '-g', default='-1', type=str,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--debugmode', default=1, type=int, help='Debugmode')
    parser.add_argument('--seed', default=1, type=int, help='Random seed')
    parser.add_argument('--verbose', '-V', default=1, type=int,
                        help='Verbose option')
    # task related
    parser.add_argument(
        '--recog-feat', type=str, required=True,
        help='Filename of recognition feature data (Kaldi scp)')
    parser.add_argument('--recog-label', type=str, required=True,
                        help='Filename of recognition label data (json)')
    parser.add_argument('--result-label', type=str, required=True,
                        help='Filename of result label data (json)')
    # model (parameter) related
    parser.add_argument('--model', type=str, required=True,
                        help='Model file parameters to read')
    parser.add_argument('--model-conf', type=str, required=True,
                        help='Model config file')
    # search related
    parser.add_argument('--beam-size', type=int, default=1, help='Beam size')
    parser.add_argument('--penalty', default=0.0, type=float,
                        help='Incertion penalty')
    parser.add_argument('--maxlenratio', default=0.5, type=float,
                        help='Input length ratio to obtain max output length')
    parser.add_argument('--minlenratio', default=0.0, type=float,
                        help='Input length ratio to obtain min output length')
    args = parser.parse_args()

    # logging info
    # A single if/elif/else chain: previously verbose == 1 also fell into the
    # final `else` and logged a misleading "Skip DEBUG/INFO messages".
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO, format=log_format)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # display PYTHONPATH (may be unset, so do not index os.environ directly)
    logging.info('python path = ' + os.environ.get('PYTHONPATH', ''))

    # display chainer version
    logging.info('chainer version = ' + chainer.__version__)

    # seed setting (chainer seed may not need it)
    nseed = args.seed
    random.seed(nseed)
    np.random.seed(nseed)
    os.environ["CHAINER_SEED"] = str(nseed)
    logging.info('chainer seed = ' + os.environ['CHAINER_SEED'])

    # read training config
    # Pickle data is binary, so the file is opened in "rb" mode.
    # SECURITY: pickle.load executes arbitrary code from the file; only use
    # model configs from a trusted source.
    with open(args.model_conf, "rb") as f:
        logging.info('reading a model config file from' + args.model_conf)
        idim, odim, train_args = pickle.load(f)

    for key in sorted(vars(args).keys()):
        logging.info('ARGS: ' + key + ': ' + str(vars(args)[key]))

    # specify model architecture
    logging.info('reading model parameters from' + args.model)
    e2e = E2E(idim, odim, train_args)
    model = MTLLoss(e2e, train_args.mtlalpha)
    chainer.serializers.load_npz(args.model, model)

    # prepare Kaldi reader
    reader = kaldi_io.SequentialBaseFloatMatrixReader(args.recog_feat)

    # read json data
    with open(args.recog_label, 'r') as f:
        recog_json = json.load(f)['utts']

    new_json = {}
    for name, feat in reader:
        y_hat = e2e.recognize(feat, args, train_args.char_list)
        y_true = [int(idx) for idx in recog_json[name]['tokenid'].split()]

        # print out decoding result
        seq_hat = [train_args.char_list[int(idx)] for idx in y_hat]
        seq_true = [train_args.char_list[int(idx)] for idx in y_true]
        seq_hat_text = "".join(seq_hat)
        seq_true_text = "".join(seq_true)
        # The text is passed as a logging argument, not concatenated into the
        # format string, so a literal '%' in a transcript cannot break it.
        logging.info("groundtruth[%s]: %s", name, seq_true_text)
        logging.info("prediction [%s]: %s", name, seq_hat_text)

        # copy old json info
        new_json[name] = recog_json[name]

        # added recognition results to json
        new_json[name]['rec_tokenid'] = " ".join(
            [str(idx[0]) for idx in y_hat])
        new_json[name]['rec_token'] = " ".join(seq_hat)
        new_json[name]['rec_text'] = seq_hat_text

    # TODO fix character coding problems when saving it
    # The payload is encoded to bytes, so the file is opened in binary mode.
    with open(args.result_label, 'wb') as f:
        f.write(json.dumps({'utts': new_json}, indent=4).encode('utf_8'))
# Utterance ids: first whitespace-separated field of every line of `f`
# (`f` is opened before this fragment).
uttids = [l.strip().split(None, 1)[0] for l in f]
for row_idx, uttid in enumerate(uttids):
    uttids_ds[row_idx] = uttid
# Add spk information: second field of every line of the utt2spk file.
with open(all_utt2spk) as f:
    utt2spk = [l.strip().split(None, 1)[1] for l in f]
for row_idx, spk in enumerate(utt2spk):
    spks_ds[row_idx] = spk
# Kaldi pipeline: add delta features, then apply global CMVN.
feat = 'ark:add-deltas scp:{} ark:- | apply-global-cmvn.py --global-stats=ark:{} ark:- ark:-|'.format(
    all_fbank41_scp, cmvn_stats)
# Add features (deltas added and globally normalized on the fly)
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatMatrixReader(feat)):
    # Store the shape separately because the matrix itself is flattened.
    features_shapes[row_idx, :] = value.shape
    features[row_idx] = value.ravel()
ivector = 'ark:apply-global-cmvn-vector.py ark:{} scp:{} ark:-|'.format(
    spk_ivector_cmvn_stats, all_spk_ivectors_scp)
# Add ivectors
for row_idx, (uttid, value) in enumerate(
        kaldi_io.SequentialBaseFloatVectorReader(ivector)):
    # Repeat the vector once per feature frame of the same row.
    # NOTE(review): assumes the ivector archive is row-aligned with the
    # feature archive -- confirm.
    frame_wise_value = numpy.tile(value, (features_shapes[row_idx][0], 1))
    ivectors_shapes[row_idx, :] = frame_wise_value.shape
    ivectors[row_idx] = frame_wise_value.ravel()
# Disabled: random subset of row indices stored alongside the data.
#f['train_si84_rand_indices'] = numpy.random.choice(37394, 7138, replace=False)
#train_si84_rand_ref = f['train_si84_rand_indices'].ref
def get_audio_features_from_file(path, take_every_nth, mean, std):
    """Yield ``(uttid, features)`` pairs, subsampled and normalised.

    Args:
        path: Kaldi rspecifier passed to
            ``kaldi_io.SequentialBaseFloatMatrixReader``.
        take_every_nth: Step of the slice along the first axis; only every
            n-th row of each matrix is kept.
        mean: Value subtracted from the kept rows.
        std: Value the centred rows are divided by.

    Yields:
        Tuples ``(uttid, features)``, one per entry of the reader.
    """
    reader = kaldi_io.SequentialBaseFloatMatrixReader(path)
    for uttid, matrix in reader:
        subsampled = matrix[::take_every_nth]
        yield uttid, (subsampled - mean) / std