def dump_pseudo_label_mfcc(km_path, task, sample_rate, nj):
    """Compute k-means pseudo-labels from MFCC features for each utterance.

    The utterances are enumerated through ``get_path_iterator`` on
    ``{task}/wav.scp``; features come from ``MfccFeatureReader.get_feats``
    and are mapped to labels by an ``ApplyKmeans`` instance.

    Args:
        km_path: Forwarded to ``ApplyKmeans``.
        task: Directory that contains ``wav.scp``.
        sample_rate: Forwarded to ``MfccFeatureReader``.
        nj: Number of joblib workers. Values greater than 1 run feature
            extraction and label assignment through ``joblib.Parallel``;
            otherwise everything runs sequentially in this process.

    Returns:
        A tuple ``(utt_ids, p_labs)`` of two lists in matching order: the
        utterance ids and, for each one, its labels converted with
        ``.tolist()`` (in both the parallel and the sequential path).
    """
    apply_kmeans = ApplyKmeans(km_path)
    reader = MfccFeatureReader(sample_rate)
    # NOTE(review): the meaning of the 1.0 argument is defined by
    # get_path_iterator (presumably a sampling fraction) -- confirm there.
    generator, num = get_path_iterator(f"{task}/wav.scp", 1.0)

    if nj > 1:
        # Materialise the (utt_id, path) pairs once so ids and labels are
        # derived from the same pass instead of relying on a second call to
        # generator() yielding the identical sequence.
        entries = list(generator())
        utt_ids = [utt_id for utt_id, _ in entries]

        feats = joblib.Parallel(n_jobs=nj)(
            joblib.delayed(reader.get_feats)(path)
            for _, path in tqdm.tqdm(entries, total=num)
        )
        labels = joblib.Parallel(n_jobs=nj)(
            joblib.delayed(apply_kmeans)(feat)
            for feat in tqdm.tqdm(feats, total=num)
        )
        # Convert exactly like the sequential branch so the return type does
        # not depend on nj.
        p_labs = [label.tolist() for label in labels]
    else:
        utt_ids, p_labs = [], []
        for utt_id, path in tqdm.tqdm(generator(), total=num):
            feat = reader.get_feats(path)
            p_labs.append(apply_kmeans(feat).tolist())
            utt_ids.append(utt_id)

    return utt_ids, p_labs
def dump_pseudo_label_hubert(km_path, task, sample_rate, url, dir, layer):
    """Compute k-means pseudo-labels from HuBERT features for each utterance.

    The utterances are enumerated through ``get_path_iterator`` on
    ``{task}/wav.scp``; features come from ``HubertFeatureReader.get_feats``
    and are mapped to labels by an ``ApplyKmeans`` instance. Processing is
    sequential.

    Args:
        km_path: Forwarded to ``ApplyKmeans``.
        task: Directory that contains ``wav.scp``.
        sample_rate: Forwarded to ``HubertFeatureReader``.
        url: Forwarded to ``HubertFeatureReader``.
        dir: Forwarded to ``HubertFeatureReader``.
        layer: Forwarded to ``HubertFeatureReader``.

    Returns:
        A tuple ``(utt_ids, p_labs)`` of two lists in matching order: the
        utterance ids and, for each one, its labels converted with
        ``.tolist()``.
    """
    kmeans = ApplyKmeans(km_path)
    extractor = HubertFeatureReader(sample_rate, url, dir, layer)
    make_iterator, total = get_path_iterator(f"{task}/wav.scp", 1.0)

    # One (id, labels) pair per utterance, in the order the iterator yields.
    labelled = [
        (key, kmeans(extractor.get_feats(wav_path)).tolist())
        for key, wav_path in tqdm.tqdm(make_iterator(), total=total)
    ]

    utt_ids = [key for key, _ in labelled]
    p_labs = [labels for _, labels in labelled]
    return utt_ids, p_labs