def validating_noise_data(in_path_raw):
  # preparing
  noise_dataset = ['musan', 'rirs']
  all_files = defaultdict(list)
  n_files = sum(len(sre_file_list[i])
                for i in noise_dataset
                if i in sre_file_list)
  n_non_exist = 0
  n_exist = 0
  prog = Progbar(target=n_files, print_summary=True,
                 name="Validating noise dataset")
  prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
  prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
  # check all dataset
  for ds_name in noise_dataset:
    if ds_name not in sre_file_list:
      continue
    if ds_name not in in_path_raw:
      continue
    base_path = in_path_raw[ds_name]
    base_ds = all_files[ds_name]
    # start validating
    for row in sre_file_list[ds_name]:
      # check file
      path, channel, name, noise_type, duration = row[:5]
      path = os.path.join(base_path, path)
      if os.path.exists(path):
        base_ds.append([path, channel, name, noise_type, duration])
        n_exist += 1
      else:
        n_non_exist += 1
      # update progress
      prog['ds'] = ds_name
      prog['#Exist'] = n_exist
      prog['#Non-exist'] = n_non_exist
      prog.add(1)
  # ====== return ====== #
  # Header:
  #  0      1        2     3           4
  #  path, channel, name, noise_type, duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}
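# --- hedged usage sketch (illustration only, not part of the original recipe) ---
# The root directories below are hypothetical; `validating_noise_data` only
# needs a mapping from noise-dataset name ('musan', 'rirs') to its folder and
# returns a dict of validated file arrays.
def _example_validate_noise_data():
  noise_files = validating_noise_data(in_path_raw={
      'musan': '/path/to/musan',       # assumed location
      'rirs': '/path/to/RIRS_NOISES',  # assumed location
  })
  # each value is an array of rows: path, channel, name, noise_type, duration
  for ds_name, rows in noise_files.items():
    print(ds_name, rows.shape)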
def filter_utterances(X, indices, spkid,
                      min_dur=None, min_utt=None,
                      remove_min_length=True, remove_min_uttspk=True,
                      n_speakers=None, ncpu=None, save_path=None,
                      title=''):
  """
  X : 2-D matrix
    input features
  indices : Mapping
    utterance_name -> (start, end) in `X`
  spkid : Mapping
    utterance_name -> speaker_id
  remove_min_length : bool (default: True)
    if True, remove all files shorter than MINIMUM_UTT_DURATION
  remove_min_uttspk : bool (default: True)
    if True, remove all speakers with lower amount of utterances than
    MINIMUM_UTT_PER_SPEAKERS
  n_speakers : {None, int} (default: None)
    if given, downsample the dataset by given number of speakers
  save_path : {None, str} (default: None)
    if given, pickle all filtered files to disk
  """
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS
  minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
  save_data = {}

  prog = Progbar(target=len(indices),
                 print_report=True, print_summary=True,
                 name='Filtering broken utterances: %s' % title)
  prog.set_summarizer('zero-length', fn=lambda x: x[-1])
  prog.set_summarizer('min-frames', fn=lambda x: x[-1])
  prog.set_summarizer('zero-var', fn=lambda x: x[-1])
  prog.set_summarizer('small-var', fn=lambda x: x[-1])
  prog.set_summarizer('overflow', fn=lambda x: x[-1])

  # ====== mpi function for checking ====== #
  @nb.jit(nopython=True, nogil=True)
  def _fast_mean_var_ax0(z):
    # using this function for calculating mean and variance
    # can double the speed but cannot check overflow,
    # only accept float32 or float64 input
    s1 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    s2 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    for i in range(z.shape[0]):
      s1 += z[i]
      s2 += np.power(z[i], 2)
    mean = s1 / z.shape[0]
    var = s2 / z.shape[0] - np.power(mean, 2)
    return mean, var

  def _mpi_func(jobs):
    for name, (start, end) in jobs:
      y = X[start:end]
      # flags
      is_zero_len = False
      is_zero_var = False
      is_small_var = False
      is_min_frames = False
      is_overflow = False
      # checking length
      if y.shape[0] == 0:
        is_zero_len = True
      elif y.shape[0] < minimum_amount_of_frames:
        is_min_frames = True
      # checking statistics
      else:
        with catch_warnings_error(RuntimeWarning):
          try:
            # mean = np.mean(y, axis=-1)
            var = np.var(y, axis=-1)
            # min_val = np.min(y, axis=-1)
            # max_val = np.max(y, axis=-1)
          # numerical unstable
          except RuntimeWarning as w:
            if 'overflow encountered' in str(w):
              is_overflow = True
            else:
              print(name, ':', w)
          # process with more numerical filtering
          else:
            if np.any(np.isclose(var, 0)):
              is_zero_var = True
            # very heuristic and aggressive here
            # filter-out anything with ~16.67% of low-var
            # this could remove 1/3 of the original data
            if np.sum(var < 0.01) > (len(y) / 6):
              is_small_var = True
      # return the flags
      yield (name,
             is_zero_len, is_min_frames,
             is_zero_var, is_small_var,
             is_overflow)

  # ====== running the multiprocessing filter ====== #
  zero_len_files = {}
  min_frame_files = {}
  zero_var_files = {}
  small_var_files = {}
  overflow_files = {}
  for res in mpi.MPI(jobs=sorted(indices.items(), key=lambda x: x[1][0]),
                     func=_mpi_func,
                     ncpu=NCPU if ncpu is None else int(ncpu),
                     batch=250):
    name = res[0]
    if res[1]:
      zero_len_files[name] = 1
    if res[2]:
      min_frame_files[name] = 1
    if res[3]:
      zero_var_files[name] = 1
    if res[4]:
      small_var_files[name] = 1
    if res[5]:
      overflow_files[name] = 1
    # update progress
    prog['name'] = name[:48]
    prog['zero-length'] = len(zero_len_files)
    prog['min-frames'] = len(min_frame_files)
    prog['zero-var'] = len(zero_var_files)
    prog['small-var'] = len(small_var_files)
    prog['overflow'] = len(overflow_files)
    prog.add(1)
  # ====== remove broken files ====== #
  if not bool(remove_min_length):
    min_frame_files = {}
  new_indices = {name: (start, end)
                 for name, (start, end) in indices.items()
                 if name not in zero_len_files and
                 name not in min_frame_files and
                 name not in zero_var_files and
                 name not in small_var_files and
                 name not in overflow_files}
  print("Filtered #utterances: %s/%s (files)" %
        (ctext(len(indices) - len(new_indices), 'lightcyan'),
         ctext(len(indices), 'cyan')))
  indices = new_indices
  # ====== store save data ====== #
  save_data['zero_len'] = zero_len_files
  save_data['min_dur'] = min_frame_files
  save_data['zero_var'] = zero_var_files
  save_data['small_var'] = small_var_files
  save_data['overflow'] = overflow_files
  # ====== filter-out by number of utt-per-speaker ====== #
  if bool(remove_min_uttspk):
    spk2utt = defaultdict(list)
    for name in indices.keys():
      spk2utt[spkid[name]].append(name)

    n_utt_removed = 0
    n_spk_removed = 0
    removed_utt = []
    keep_utt = []
    for spk, utt in spk2utt.items():
      if len(utt) < min_utt:
        n_utt_removed += len(utt)
        n_spk_removed += 1
        removed_utt += utt
      else:
        keep_utt += utt

    removed_utt = set(removed_utt)
    keep_utt = set(keep_utt)
    save_data['min_utt'] = removed_utt

    print("Removed min-utt/spk: %s/%s(utt) %s/%s(spk)" % (
        ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
        ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')))
    assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"
    indices = {name: (start, end)
               for name, (start, end) in indices.items()
               if name in keep_utt}
  # ====== sample by number of speakers ====== #
  if isinstance(n_speakers, Number) and n_speakers > 0:
    spk2utt = defaultdict(list)
    for name, (start, end) in indices.items():
      spk2utt[spkid[name]].append((name, (start, end)))

    n_org_spk = len(spk2utt)
    n_org_ids = len(indices)
    # only need down-sampling with smaller number of speaker
    if n_speakers < n_org_spk:
      rand = np.random.RandomState(seed=Config.SUPER_SEED)
      tmp = list(spk2utt.keys())
      rand.shuffle(tmp)
      sampled_spk = tmp[:n_speakers]

      indices = []
      for spk in sampled_spk:
        indices += spk2utt[spk]
      indices = dict(indices)
    else:
      sampled_spk = spk2utt
    # print some log
    print("Selected: %s/%s(spk) which have %s/%s(utt)" % (
        ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
        ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')))
  # ====== return the new indices ====== #
  if save_path is not None:
    try:
      with open(save_path, 'wb') as save_file:
        pickle.dump(save_data, save_file)
    except Exception as e:
      print("Cannot save filtering data to path: '%s', error: '%s'" %
            (save_path, str(e)))
  return indices
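# --- hedged usage sketch (illustration only, not part of the original recipe) ---
# `feat`, `indices` and `spkid` are hypothetical placeholders for the feature
# matrix and the two mappings produced earlier in the pipeline; the save path
# is an assumption as well.
def _example_filter_utterances(feat, indices, spkid):
  filtered = filter_utterances(
      X=feat, indices=indices, spkid=spkid,
      min_dur=None,   # falls back to MINIMUM_UTT_DURATION
      min_utt=None,   # falls back to MINIMUM_UTT_PER_SPEAKERS
      n_speakers=None,                    # keep every remaining speaker
      save_path='/tmp/filtered_utt.pkl',  # assumed scratch location
      title='train')
  return filtered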
def validating_training_data(in_path_raw, training_dataset):
  file_list = {ds: sre_file_list[ds]
               for ds in training_dataset
               if ds in sre_file_list}
  # ====== meta info ====== #
  all_files = []
  non_exist_files = []
  extension_count = defaultdict(int)
  total_data = sum(v.shape[0]
                   for k, v in file_list.items()
                   if k not in ('musan', 'rirs'))
  # ====== progress ====== #
  prog = Progbar(target=total_data,
                 print_summary=True, print_report=True,
                 name="Preprocessing File List")
  prog.set_summarizer('#Files', fn=lambda x: x[-1])
  prog.set_summarizer('#Non-exist', fn=lambda x: x[-1])
  # ====== iterating ====== #
  for ds_name, data in sorted(file_list.items(), key=lambda x: x[0]):
    if ds_name in ('musan', 'rirs'):
      continue
    for row in data:
      path, channel, name, spkid = row[:4]
      assert channel in ('0', '1')
      # check path provided
      if ds_name in in_path_raw:
        path = os.path.join(in_path_raw[ds_name], path)
      # create new row
      start_time = '-'
      end_time = '-'
      if ds_name == 'mx6':
        start_time, end_time = row[-2:]
      new_row = [path, channel, name,
                 ds_name + '_' + spkid, ds_name,
                 start_time, end_time]
      # check file exist
      if os.path.exists(path):
        all_files.append(new_row)
      else:
        non_exist_files.append(new_row)
      # extension
      ext = os.path.splitext(path)[-1]
      extension_count[ext + '-' + ds_name] += 1
      # update progress
      prog['Dataset'] = ds_name
      prog['#Files'] = len(all_files)
      prog['#Non-exist'] = len(non_exist_files)
      prog.add(1)
  # final results
  all_files = np.array(all_files)
  if len(all_files) == 0:
    return all_files, np.array(non_exist_files), extension_count
  # ====== check no duplicated name ====== #
  n_files = len(all_files)
  n_unique_files = len(np.unique(all_files[:, 2]))
  assert n_files == n_unique_files, \
      'Found duplicated name: %d != %d' % (n_files, n_unique_files)
  # ====== check no duplicated speaker ====== #
  n_spk = sum(len(np.unique(dat[:, 3]))
              for name, dat in file_list.items()
              if name not in ('musan', 'rirs'))
  n_unique_spk = len(np.unique(all_files[:, 3]))
  assert n_spk == n_unique_spk, \
      'Found duplicated speakers: %d != %d' % (n_spk, n_unique_spk)
  # ====== return ====== #
  # Header:
  #  0      1        2     3      4        5           6
  #  path, channel, name, spkid, dataset, start_time, end_time
  return all_files, np.array(non_exist_files), extension_count
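# --- hedged usage sketch (illustration only, not part of the original recipe) ---
# The dataset key and root path below are assumptions; any name present in
# both `sre_file_list` and `in_path_raw` would be handled the same way.
def _example_validate_training_data():
  all_files, non_exist, ext_count = validating_training_data(
      in_path_raw={'mx6': '/path/to/mixer6'},  # assumed location
      training_dataset=['mx6'])
  # all_files rows: path, channel, name, spkid, dataset, start_time, end_time
  print("#exist:", len(all_files), "#missing:", len(non_exist))
  print(dict(ext_count))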