def gen_dummy_meta(num_spk, num_utt_per_spk):
    ''' Build an in-memory dummy meta.

    Creates `num_spk` speakers named '0', '1', ... with `num_utt_per_spk`
    utterances each, keyed '<spk>_<utt index>'. Every utterance gets
    placeholder feat / vad paths; nothing is written to disk.

    Returns the populated kaldi_dir.KaldiMetaData, after
    collect_spks_from_utts() has been called on it.
    '''
    dummy_meta = kaldi_dir.KaldiMetaData()
    for spk_no in range(num_spk):
        spk_id = str(spk_no)
        for utt_no in range(num_utt_per_spk):
            utt_id = '%s_%d' % (spk_id, utt_no)
            entry = kaldi_dir.Utt()
            entry.feat = 'foo/bar/feat/%s' % (utt_id)
            entry.vad = 'foo/bar/vad/%s' % (utt_id)
            entry.spk = spk_id
            dummy_meta.utts[utt_id] = entry
    dummy_meta.collect_spks_from_utts()
    return dummy_meta
def gen_dummy_data_dir(data_dir,
                       num_spk,
                       num_utt_per_spk,
                       feat_len=100,
                       feat_dim=40):
    ''' Create a dummy data directory on disk and return the meta loaded from it.

    For each of `num_spk` speakers, `num_utt_per_spk` utterances keyed
    '<spk>_<utt index>' are generated. Each utterance gets an all-ones
    float32 feature matrix of shape (feat_len, feat_dim) and an all-ones
    float32 vad vector of length feat_len.

    The meta is dumped to `data_dir` first; feats.ark/feats.scp and
    vad.ark/vad.scp are then written there with kaldiio in text mode.
    Finally a fresh KaldiMetaData is loaded from `data_dir` and returned.
    '''
    os.makedirs(data_dir, exist_ok=True)

    dummy_meta = kaldi_dir.KaldiMetaData()
    feat_by_utt = {}
    vad_by_utt = {}
    for spk_no in range(num_spk):
        spk_id = str(spk_no)
        for utt_no in range(num_utt_per_spk):
            utt_id = '%s_%d' % (spk_id, utt_no)
            entry = kaldi_dir.Utt()
            feat_by_utt[utt_id] = np.ones((feat_len, feat_dim), dtype='float32')
            entry.featlen = feat_len
            vad_by_utt[utt_id] = np.ones((feat_len,), dtype='float32')
            entry.spk = spk_id
            dummy_meta.utts[utt_id] = entry
    dummy_meta.collect_spks_from_utts()
    dummy_meta.dump(data_dir, True)

    # The arks are written after the meta dump, so these scp files are the
    # ones left on disk when the directory is reloaded below.
    kaldiio.save_ark(
        os.path.join(data_dir, 'feats.ark'),
        feat_by_utt,
        scp=os.path.join(data_dir, 'feats.scp'),
        text=True)
    kaldiio.save_ark(
        os.path.join(data_dir, 'vad.ark'),
        vad_by_utt,
        scp=os.path.join(data_dir, 'vad.scp'),
        text=True)

    reloaded = kaldi_dir.KaldiMetaData()
    reloaded.load(data_dir)
    return reloaded
def test_dump_and_load(self):
    ''' Round-trip a dummy meta through dump / load and compare its keys.

    Only the number of entries and the presence of every original key are
    checked, for both the utterance and the speaker tables.
    '''
    work_dir = self.get_temp_dir()
    num_spk, num_utt_per_spk = 5, 3
    expected = gen_dummy_meta(num_spk, num_utt_per_spk)
    expected.dump(work_dir, True)

    # Log the dumped feats.scp so a failing run can be inspected.
    with open(os.path.join(work_dir, 'feats.scp'), 'r') as scp_file:
        scp_text = scp_file.read()
    logging.info('feats.scp:\n%s' % (scp_text))

    actual = kaldi_dir.KaldiMetaData()
    actual.load(work_dir)

    # Same checks for both tables, utterances first and then speakers.
    for table_name in ('utts', 'spks'):
        want = getattr(expected, table_name)
        got = getattr(actual, table_name)
        self.assertEqual(len(want), len(got))
        for key in want.keys():
            self.assertIn(key, got)
def _str_to_bool(value):
    ''' Parse a command-line string into a bool (argparse `type=` converter).

    `type=bool` does not work for this: bool('False') is True because any
    non-empty string is truthy. The empty string still maps to False, as it
    did with bool(), so existing invocations keep their meaning.

    Raises argparse.ArgumentTypeError for unrecognised values.
    '''
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false), got %r' % (value,))


def main():
    ''' Split a data directory into training and cross-validation parts.

    Positional arguments:
      data_dir     directory the source meta is loaded from.
      data_dir_tr  directory the training subset is dumped to (overwritten).
      data_dir_cv  directory the cv subset is dumped to (overwritten).

    Options:
      --num-spk-cv / --num-utt-cv
          Passed through to kaldi_dir_utils.subset_data_dir_tr_cv.
      --cv-spk-percent / --cv-utt-percent
          When > 0, override the matching --num-* option with percent / 100.
      --fair-choice
          Boolean, passed through as `fair_choice` (default True).

    If both resulting cv amounts are 0, num_spk_cv falls back to 0.1.

    Raises:
      ValueError: if a percent option is >= 100.
    '''
    logging.set_verbosity(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--num-spk-cv', type=float, default=0)
    parser.add_argument('--num-utt-cv', type=float, default=0)
    parser.add_argument('--cv-spk-percent', type=float, default=0.0)
    parser.add_argument('--cv-utt-percent', type=float, default=0.0)
    # Not type=bool: that would turn '--fair-choice False' into True.
    parser.add_argument('--fair-choice', type=_str_to_bool, default=True)
    parser.add_argument('data_dir')
    parser.add_argument('data_dir_tr')
    parser.add_argument('data_dir_cv')
    args = parser.parse_args()

    num_spk_cv = args.num_spk_cv
    num_utt_cv = args.num_utt_cv

    # A percent option, when given, takes precedence over the --num-* value.
    if args.cv_spk_percent > 0:
        if args.cv_spk_percent >= 100:
            raise ValueError('cv_spk_percent cannot >= 100')
        num_spk_cv = args.cv_spk_percent / 100
    if args.cv_utt_percent > 0:
        if args.cv_utt_percent >= 100:
            raise ValueError('cv_utt_percent cannot >= 100')
        num_utt_cv = args.cv_utt_percent / 100

    # Nothing requested at all: fall back to a default speaker amount.
    if num_spk_cv == 0 and num_utt_cv == 0:
        num_spk_cv = 0.1

    meta = kaldi_dir.KaldiMetaData()
    meta.load(args.data_dir)
    meta_tr, meta_cv = kaldi_dir_utils.subset_data_dir_tr_cv(
        meta,
        num_spk_cv=num_spk_cv,
        num_utt_cv=num_utt_cv,
        fair_choice=args.fair_choice)
    logging.info('#spks tr: %d, cv: %d; #utts tr: %d, cv: %d' %
                 (len(meta_tr.spks), len(meta_cv.spks), len(meta_tr.utts),
                  len(meta_cv.utts)))

    meta_tr.dump(args.data_dir_tr, overwrite=True)
    meta_cv.dump(args.data_dir_cv, overwrite=True)