def process(self, name, X):
  # ====== not enough data points for sequencing ====== #
  if self.end == 'cut' and \
      any(x.shape[0] < self.frame_length for x in X):
    return None
  if self.end == 'ignore' and \
      any(x.shape[0] > self.frame_length for x in X):
    return None
  end = self.end
  if end == 'ignore':
    end = 'pad'
  # ====== preprocessing data-idx, label-idx ====== #
  data_idx = axis_normalize(axis=self.data_idx,
                            ndim=len(X),
                            return_tuple=True)
  # ====== segments X ====== #
  X_new = []
  for idx, x in enumerate(X):
    ## for data
    if idx in data_idx:
      # 'mix': cut utterances longer than `frame_length`, pad shorter ones
      end_mode = ('cut' if x.shape[0] >= self.frame_length else 'pad') \
          if end == 'mix' else end
      x = segment_axis(a=x,
                       frame_length=self.frame_length,
                       step_length=self.step_length,
                       axis=0, end=end_mode,
                       pad_value=self.pad_value,
                       pad_mode=self.pad_mode)
    ## for all
    X_new.append(x)
  return name, X_new
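
# A minimal, self-contained sketch of the framing semantics the recipe above
# relies on. `frame_1d` is a hypothetical stand-in for the library's
# `segment_axis` (1-D case, 'post' padding only), illustrating the difference
# between the 'cut' and 'pad' end modes.
import numpy as np

def frame_1d(x, frame_length, step_length, end='cut', pad_value=0):
  """Slice a 1-D array into (possibly overlapping) frames of `frame_length`."""
  n = len(x)
  if end == 'pad' and (n - frame_length) % step_length != 0:
    # pad at the end so the last partial frame becomes a complete one
    total = frame_length + \
        int(np.ceil(max(n - frame_length, 0) / step_length)) * step_length
    x = np.concatenate([x, np.full(total - n, pad_value, dtype=x.dtype)])
  n_frames = max((len(x) - frame_length) // step_length + 1, 0)
  if n_frames == 0:
    return np.empty((0, frame_length), dtype=x.dtype)
  return np.stack([x[i * step_length:i * step_length + frame_length]
                   for i in range(n_frames)])

# usage: 11 samples, frames of 4 with step 3
x = np.arange(11)
print(frame_1d(x, 4, 3, end='cut'))   # drops the incomplete tail -> shape (3, 4)
print(frame_1d(x, 4, 3, end='pad'))   # zero-pads the tail        -> shape (4, 4)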
def process(self, name, X):
  # not enough data points for stacking
  if X[0].shape[0] < self.frame_length:
    return None
  data_idx, label_idx = _get_data_label_idx(
      self.data_idx, self.label_idx, len(X))
  # ====== stacking ====== #
  X_new = []
  for idx, x in enumerate(X):
    if idx in data_idx:
      # data features: stack every `frame_length` consecutive frames
      if x.ndim == 1:
        x = np.expand_dims(x, axis=-1)
      x = stack_frames(x,
                       frame_length=self.frame_length,
                       step_length=self.shift,
                       keep_length=self.keep_length)
    elif idx in label_idx:
      # labels: window them the same way, then reduce each window to a
      # single label according to `label_mode`
      if not self.keep_length:
        x = segment_axis(x,
                         frame_length=self.frame_length,
                         step_length=self.shift,
                         axis=0, end='cut')
        x = _apply_label_mode(x, self.label_mode)
    else:
      raise NotImplementedError  # TODO
    X_new.append(x)
  return name, X_new
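
# A small sketch of what frame stacking does to a feature matrix: every
# `frame_length` consecutive rows are concatenated into one long context
# vector, moving `shift` rows at a time. `stack_context` is a hypothetical
# stand-in for the library's `stack_frames`, shown for the
# keep_length=False case only.
import numpy as np

def stack_context(x, frame_length, shift):
  """(n_frames, n_feats) -> (n_windows, frame_length * n_feats)."""
  n_windows = (x.shape[0] - frame_length) // shift + 1
  return np.stack([x[i * shift:i * shift + frame_length].ravel()
                   for i in range(n_windows)])

# usage: 6 frames of 2-D features, stacked 3 at a time with shift 2
x = np.arange(12).reshape(6, 2)
print(stack_context(x, frame_length=3, shift=2).shape)  # -> (2, 6)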
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
  """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speakers in the training set
  """
  # ====== load the dataset ====== #
  frame_length = int(utt_length / FRAME_SHIFT)
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name]
                   for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  train_indices, valid_indices = train_valid_test_split(
      x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Valid files:", ctext(len(valid_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  # ====== create feeders sharing the same recipes ====== #
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length,
                           step_length=frame_length,
                           end='pad', pad_value=0, pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1),
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP,
                           'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP,
                           'test_%s_%d.ids' % (feat, int(utt_length)))
  # validate cache files
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as f:
      ids = pickle.load(f)
    if len(ids) != len(test_indices):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # caching
  if not os.path.exists(cache_dat):
    dat = F.MmapData(cache_dat, dtype='float16',
                     shape=(0, frame_length, X.shape[1]))
    ids = {}
    prog = Progbar(target=len(test_indices))
    s = 0
    for name, (start, end) in test_indices.items():
      y = X[start:end]
      y = segment_axis(y, axis=0,
                       frame_length=frame_length, step_length=frame_length,
                       end='pad', pad_value=0, pad_mode='post')
      dat.append(y)
      # update indices
      ids[name] = (s, s + len(y))
      s += len(y)
      # update progress
      prog.add(1)
    dat.flush()
    dat.close()
    with open(cache_ids, 'wb') as f:
      pickle.dump(ids, f)
  # ====== re-load ====== #
  dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as f:
    ids = pickle.load(f)
  # ====== save some samples ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  for i, (name, (start, end)) in enumerate(
      sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                    k=12, seed=87654321)):
    x = dat[start:end][:].astype('float32')
    ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                            ax=(12, 1, i + 1), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder, ids, dat, all_speakers)
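
# Hypothetical usage of `prepare_dnn_data`, assuming the module-level
# constants (PATH_ACOUSTIC_FEAT, PATH_EXP, FRAME_SHIFT, TRAIN_DATA) are
# configured; the recipe/feature name and utterance length below are
# illustrative values, not ones prescribed by this code.
train, valid, test_ids, test_dat, speakers = prepare_dnn_data(
    recipe='mspec', feat='mspec', utt_length=3)
# `train` and `valid` yield (sequenced_features, one_hot_speaker_label)
# minibatches according to the recipe list above; `test_dat[start:end]`
# with (start, end) taken from `test_ids` retrieves the cached, padded
# test utterances.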
def test_func(s, sr, maximum_duration=30, minimum_duration=None,
              frame_length=256, nb_mixtures=3, threshold=0.3,
              return_vad=False, return_voices=False, return_cut=False):
  """ Split an audio signal based on a VAD indicator.

  * The audio is segmented into frames of length `frame_length`.
  * Log-energy is calculated for each frame.
  * A Gaussian mixture with `nb_mixtures` components is fitted and
    outputs a VAD indicator for each frame.
  * A flat (all-ones) window of `frame_length` is convolved with the
    VAD indices.
  * All frames at or above the `threshold` percentile are treated as
    voiced.
  * The splitting process is greedy: frames are grouped until reaching
    the maximum duration.

  Parameters
  ----------
  s: 1-D numpy.ndarray
      loaded audio array
  sr: int
      sample rate
  maximum_duration: float (seconds)
      maximum duration of each segment, in seconds
  minimum_duration: None, or float (seconds)
      all segments shorter than this are merged into longer segments;
      if None, half of `maximum_duration` is used.
  frame_length: int
      number of samples per frame for windowing
  nb_mixtures: int
      number of Gaussian mixtures for the energy-based VAD
      (the higher, the more overfitting).
  threshold: float (0. to 1.)
      percentile cutoff for voiced frames: frames whose smoothed VAD
      value is at or above the `threshold` percentile are treated as
      voiced, so higher values select fewer frames.
  return_vad: bool
      if True, return the VAD confidence values
  return_voices: bool
      if True, return the indices of voiced frames
  return_cut: bool
      if True, return the cut points of the audio.

  Return
  ------
  segments: list of audio arrays
  vad (optional): list of 0, 1 VAD indices
  voices (optional): list of thresholded VAD for more precise voiced
      frames.
  cut (optional): list of 0, 1 indicators (1 marks a cut point)

  Note
  ----
  This function does not guarantee that every output segment is shorter
  than `maximum_duration`; the more candidate cut points survive the
  threshold, the better the chance that everything ends up shorter than
  `maximum_duration`.
  """
  frame_length = int(frame_length)
  maximum_duration = maximum_duration * sr
  results = []
  # ====== check if the audio is long enough ====== #
  if len(s) < maximum_duration:
    if return_cut or return_vad or return_voices:
      raise ValueError("Cannot return `cut` points, `vad` or `voices` since "
                       "the original audio is shorter than "
                       "`maximum_duration`, hence there is no need for "
                       "splitting.")
    return [s]
  # from here on, durations are measured in frames
  maximum_duration /= frame_length
  if minimum_duration is None:
    minimum_duration = maximum_duration // 2
  else:
    minimum_duration = minimum_duration * sr / frame_length
  minimum_duration = np.clip(minimum_duration, 0., 0.99 * maximum_duration)
  # ====== start splitting ====== #
  frames = signal.segment_axis(s, frame_length, frame_length,
                               axis=0, end='pad', endvalue=0.)
  energy = signal.get_energy(frames, log=True)
  vad = signal.vad_energy(energy, distrib_nb=nb_mixtures, nb_train_it=33)[0]
  vad = signal.smooth(vad, win=frame_length, window='flat')
  # explicitly return VAD
  if return_vad:
    results.append(vad)
  # ====== get all candidate cut points ====== #
  # all voiced frame indices
  indices = np.where(vad >= np.percentile(vad, q=threshold * 100))[0].tolist()
  if len(vad) - 1 not in indices:
    indices.append(len(vad) - 1)
  # explicitly return voiced frames
  if return_voices:
    tmp = np.zeros(shape=(len(vad),))
    tmp[indices] = 1
    results.append(tmp)
  # ====== split the audio ====== #
  segments = []
  start = 0
  prev_end = 0
  # greedily add frames until reaching the desired maximum length
  for end in indices:
    # over-reached the maximum length
    if end - start > maximum_duration:
      segments.append((start, prev_end))
      start = prev_end
    # exactly the maximum length
    elif end - start == maximum_duration:
      segments.append((start, end))
      start = end
    prev_end = end
  # if NO segments were found, just return the original file
  if len(segments) == 0:
    return [s]
  # add the ending index if necessary
  if indices[-1] != segments[-1][-1]:
    segments.append((start, indices[-1]))
  # refining: short segments are merged into bigger ones
  # (the index-based loop avoids duplicating the middle segments,
  # which a pairwise zip over `segments` would do)
  found_under_length = True
  while found_under_length:
    new_segments = []
    found_under_length = False
    i = 0
    while i < len(segments):
      s1, e1 = segments[i]
      if i + 1 < len(segments):
        s2, e2 = segments[i + 1]
        # merge the pair if either segment is shorter than `minimum_duration`
        if (e1 - s1) < minimum_duration or (e2 - s2) < minimum_duration:
          new_segments.append((s1, e2))
          found_under_length = True
          i += 2
          continue
      # keep the segment as-is
      new_segments.append((s1, e1))
      i += 1
    segments = new_segments
  # explicitly return cut points
  if return_cut:
    tmp = np.zeros(shape=(segments[-1][-1] + 1,))
    for i, j in segments:
      tmp[i] = 1
      tmp[j] = 1
    results.append(tmp)
  # ====== convert everything back to raw signal indices ====== #
  segments = [[i * frame_length, j * frame_length]
              for i, j in segments]
  segments[-1][-1] = s.shape[0]
  # cut the segments out of the raw audio array
  segments = [s[i:j] for i, j in segments]
  results = [segments] + results
  return results[0] if len(results) == 1 else results
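
# A self-contained sketch of the energy-based VAD idea used above.
# `signal.vad_energy` is the library call; this GMM-on-log-energy version
# built with scikit-learn is an assumed equivalent, not the exact
# implementation.
import numpy as np
from sklearn.mixture import GaussianMixture

def energy_vad(frames, nb_mixtures=3):
  """Return a 0/1 voiced indicator per frame from its log-energy."""
  log_e = np.log(np.sum(frames.astype('float64') ** 2, axis=1) + 1e-12)
  gmm = GaussianMixture(n_components=nb_mixtures, max_iter=33,
                        random_state=8)
  gmm.fit(log_e[:, None])
  # frames assigned to the highest-mean component are treated as voiced
  voiced_component = np.argmax(gmm.means_.ravel())
  return (gmm.predict(log_e[:, None]) == voiced_component).astype(int)

# usage on a toy signal: quiet noise with a loud burst in the middle
rng = np.random.RandomState(8)
sig = rng.randn(16000) * 0.01
sig[6000:10000] += rng.randn(4000)
frames = sig[:len(sig) // 256 * 256].reshape(-1, 256)
print(energy_vad(frames))  # expected: mostly 0s, 1s around the loud burst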