def parallel_audio_processing(self, clean_filename):
    clean_audio, _ = read_audio(clean_filename, self.sample_rate)

    # remove silent frames from the clean audio
    clean_audio = self._remove_silent_frames(clean_audio)

    noise_filename = self._sample_noise_filename()

    # read the sampled noise file
    noise_audio, sr = read_audio(noise_filename, self.sample_rate)

    # remove silent frames from the noise audio
    noise_audio = self._remove_silent_frames(noise_audio)

    # sample a random fixed-size snippet of the clean audio
    clean_audio = self._audio_random_crop(clean_audio, duration=self.audio_max_duration)

    # add noise to the clean audio
    noisy_input = self._add_noise_to_clean_audio(clean_audio, noise_audio)

    # extract STFT features from the noisy audio
    noisy_input_fe = FeatureExtractor(noisy_input, windowLength=self.window_length,
                                      overlap=self.overlap, sample_rate=self.sample_rate)
    noise_spectrogram = noisy_input_fe.get_stft_spectrogram()

    # phase angle (in radians); alternatively:
    # noisy_stft_magnitude, noisy_stft_phase = librosa.magphase(noise_spectrogram)
    noise_phase = np.angle(noise_spectrogram)

    # spectral magnitude of the noisy audio
    noise_magnitude = np.abs(noise_spectrogram)

    # extract STFT features from the clean audio
    clean_audio_fe = FeatureExtractor(clean_audio, windowLength=self.window_length,
                                      overlap=self.overlap, sample_rate=self.sample_rate)
    clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
    # clean_spectrogram = clean_audio_fe.get_mel_spectrogram()

    # clean phase
    clean_phase = np.angle(clean_spectrogram)

    # clean spectral magnitude
    clean_magnitude = np.abs(clean_spectrogram)
    # clean_magnitude = 2 * clean_magnitude / np.sum(scipy.signal.hamming(self.window_length, sym=False))

    clean_magnitude = self._phase_aware_scaling(clean_magnitude, clean_phase, noise_phase)

    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    noise_magnitude = scaler.fit_transform(noise_magnitude)
    clean_magnitude = scaler.transform(clean_magnitude)

    return noise_magnitude, clean_magnitude, noise_phase
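# The helper _phase_aware_scaling is called above but not shown here. A minimal
# sketch, assuming the usual phase-aware target from the speech-enhancement
# literature (clean magnitude scaled by the cosine of the phase difference):
def _phase_aware_scaling(self, clean_spectral_magnitude, clean_phase, noise_phase):
    # |S| * cos(theta_clean - theta_noisy); the two phase arrays must have the same shape
    return clean_spectral_magnitude * np.cos(clean_phase - noise_phase)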
def _get_pad_wave_data(file):  # 3 s
    wave_data, sr = utils.read_audio(file)  # data, fs
    # tile the waveform until it is at least LEN_WAWE_PAD_TO samples long
    while len(wave_data) < PARAM.LEN_WAWE_PAD_TO:
        wave_data = np.tile(wave_data, 2)
    len_wave = len(wave_data)
    # take a random fixed-length crop
    wave_begin = np.random.randint(len_wave - PARAM.LEN_WAWE_PAD_TO + 1)
    return wave_data[wave_begin:wave_begin + PARAM.LEN_WAWE_PAD_TO]
def parallel_audio_processing(self, clean_filename):
    clean_audio, _ = read_audio(clean_filename, self.sample_rate)

    # remove silent frames from the clean audio
    clean_audio = self._remove_silent_frames(clean_audio)

    # sample a random fixed-size snippet of the clean audio
    clean_audio = self._audio_random_crop(clean_audio, duration=self.audio_max_duration)

    # extract STFT features from the clean audio
    clean_audio_fe = FeatureExtractor(clean_audio, windowLength=self.window_length,
                                      overlap=self.overlap, sample_rate=self.sample_rate)
    clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
    # clean_spectrogram = clean_audio_fe.get_mel_spectrogram()

    # clean phase
    clean_phase = np.angle(clean_spectrogram)

    # clean spectral magnitude
    clean_magnitude = np.abs(clean_spectrogram)

    # noise generation
    noise_magnitude = self._gen_noise_stft(clean_magnitude, 0)
    # clean_magnitude = self._phase_aware_scaling(clean_magnitude, clean_phase, noise_phase)

    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    noise_magnitude = scaler.fit_transform(noise_magnitude)
    clean_magnitude = scaler.transform(clean_magnitude)

    return noise_magnitude, clean_magnitude, clean_phase
def _check_data_properties(self):
    base_descr_file = os.path.join(self.output_base_path, 'base_description.yml')
    with open(base_descr_file, 'r') as f:
        base_descr = yaml.safe_load(f)
    target_sr = base_descr['data_properties']['sample_rate']
    n_channels = base_descr['data_properties']['n_channels']

    meta_file_general = os.path.join(self.output_base_path, base_descr['general_meta'])
    df = pd.read_csv(meta_file_general, sep=';')
    data_path = os.path.join(self.output_base_path, base_descr['data_path'])

    for i, row in df.iterrows():
        f_name = os.path.join(data_path, row['cur_name'])
        try:
            _, wav_data = read_audio(f_name, target_sr, dtype='float')
        except Exception as e:
            print(str(e))
            # raise CheckBaseError(str(e))
            continue  # skip unreadable files; otherwise wav_data would be undefined below
        if len(wav_data.shape) != n_channels:
            raise CheckBaseError(
                'Wrong number of channels! Target is {}, current is {}. '
                'File: "{}"'.format(n_channels, len(wav_data.shape), f_name))
        begin, end = float(row['begin']), float(row['end'])
        if abs(len(wav_data) / target_sr - (end - begin)) > 0.1:
            print('Wrong audio length. It must be the same as (end - begin) in meta! '
                  'File: {} target_sr={}, begin={}, end={}, len(wav_data)={}'
                  .format(f_name, target_sr, begin, end, len(wav_data)))
def audio_bytes_to_np(wav_data: bytes, normalize_db: float = 0.1):
    # Parse and normalize the audio. pydub AudioSegments are immutable, so the
    # effects return new segments and must be reassigned.
    audio = AudioSegment.from_file(io.BytesIO(wav_data))
    audio = audio.remove_dc_offset()
    if normalize_db is not None:
        audio = audio.normalize(headroom=normalize_db)
    # Save to a tempfile and load with librosa (read before the tempfile is deleted).
    with tempfile.NamedTemporaryFile(suffix='.wav') as temp_wav_file:
        fname = temp_wav_file.name
        audio.export(fname, format='wav')
        wav = read_audio(fname)
    return wav
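# A quick usage sketch for audio_bytes_to_np (the input path is hypothetical):
with open('sample.wav', 'rb') as f:  # hypothetical input file
    wav = audio_bytes_to_np(f.read(), normalize_db=0.1)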
def __init__(self, chunk_len, filter_, hq_path, cutoff, duration=None, start=8):
    hq, sr = u.read_audio(hq_path)               # high-quality target
    lq = u.lowpass(hq, cutoff, filter_=filter_)  # low-quality input

    # CROP
    song_len = lq.shape[-1]
    if duration is None:  # keep the entire song
        test_start = 0
        test_len = song_len
    else:
        test_start = start * sr  # start from the n-th second
        test_len = duration * sr
    test_len = min(test_len, song_len - test_start)
    lq = lq[:, test_start:test_start + test_len]
    hq = hq[:, test_start:test_start + test_len]

    # Save full samples
    self.x_full = lq.copy()
    self.t_full = hq.copy()

    # Trim so all chunks have equal length for minibatching
    time_len = lq.shape[-1]
    n_chunks, rem = divmod(time_len, chunk_len)
    lq = lq[..., :-rem or None]  # "or None" handles rem == 0
    hq = hq[..., :-rem or None]

    # adjust the full-length copies to the trimmed length
    self.x_full = self.x_full[..., :lq.shape[-1] or None]
    self.t_full = self.t_full[..., :lq.shape[-1] or None]

    # split into lists of chunks
    self.lq = np.split(lq, n_chunks, axis=-1)
    self.hq = np.split(hq, n_chunks, axis=-1)
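# Usage sketch: the constructor above exposes equal-length chunk lists plus
# full-length copies. The class name "ChunkedSong" is hypothetical; only the
# attributes mirror the code above.
# ds = ChunkedSong(chunk_len=8192, filter_='cheby1', hq_path='song.wav', cutoff=4000)
# for x, t in zip(ds.lq, ds.hq):
#     pass  # each x, t has shape (channels, chunk_len)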
def __getitem__(self, idx):
    try:
        hq, sr = u.read_audio(self.file_list[idx])  # high-quality target
        # take a chunk starting at a random location
        x_length = hq.shape[1]
        start_loc = random.randint(0, x_length - self.input_len - 1)
        hq = hq[:, start_loc:start_loc + self.input_len]
        # select a filter randomly from the list
        random_filter = random.choice(self.filters)
        # apply the low-pass filter
        lq = u.lowpass(hq, self.cutoff, filter_=random_filter)  # low-quality input
        hq = torch.from_numpy(hq)  # convert to torch tensors
        lq = torch.from_numpy(lq)
        return lq, hq  # input, target
    except Exception:
        # In case of a problem, Nones are filtered out later.
        return None
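# The "return None" path above implies failed samples are dropped at batching
# time; a minimal sketch of such a collate function ("collate_drop_none" is a
# hypothetical name, not taken from the original repo):
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

def collate_drop_none(batch):
    # drop samples for which __getitem__ returned None, then batch the rest
    batch = [item for item in batch if item is not None]
    return default_collate(batch)

# usage: loader = DataLoader(dataset, batch_size=16, collate_fn=collate_drop_none)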
def get_noisy_audio(self, *, filename):
    return read_audio(filename, self.sample_rate)
# (fragment: multi-channel branch; the trailing elif belongs to the enclosing
#  conditional on args.single)
dir_path = os.path.dirname(cur_line)
f_name = os.path.basename(cur_line)
for ch in range(1, 9):
    if args.is_real == 1:
        dir_path = os.path.dirname(cur_line)
        f_name = os.path.basename(cur_line)
        part1 = f_name.split('-')[0]
        part2 = f_name.split('-')[1]
        part3 = f_name.split('-')[2].split('_')[1]
        f_name_new = part1 + '-' + part2 + '-' + '{}'.format(ch) + '_' + part3
        f = os.path.join(dir_path, f_name_new)
    elif args.is_real == 0:
        f_name_no_ch = f_name.split('_')[0]
        f_with_ch = f_name_no_ch + '_ch{}.wav'.format(ch)
        f = os.path.join(dir_path, f_with_ch)
    audio_data = read_audio(f)
    fx, tx, s_x = signal.stft(audio_data, fs=16000, nperseg=512,
                              noverlap=512 - 128, nfft=512)
    s_x = np.transpose(s_x)
    # allocate the multi-channel STFT buffer on the first channel
    if ch == 1:
        T, F = s_x.shape
        Y = np.zeros((8, T, F), dtype=np.complex64)
    Y[ch - 1, :, :] = s_x
    s_x_abs = 20 * log_sp(np.abs(s_x))
    s_x_abs = stack_features(s_x_abs.astype(np.float32), 5)
    s_x_abs = Variable(s_x_abs)
    if args.gpu >= 0:
        s_x_abs.to_gpu(args.gpu)
    s_x_abs_list.append(s_x_abs)
elif args.single == 1:
    audio_data = read_audio(cur_line)
def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: analyze.py <path to audio file> <n_clusters>\n')
        sys.exit(1)

    # Initialize Config:
    #   n_clusters: integer set by the user
    #   text_processor: defaults to nltk.stem.snowball.SnowballStemmer
    #   sample_rate: defaults to 16 kHz due to the ASR model specs
    #   aggressivness: required for VAD, defaults to the maximum (3) since audio files are long
    config = Config(n_clusters=int(args[1]))

    print("If you want to check any specific target vocabulary, please type the words\n",
          "Ex.: train, dog, work, seventeen, Brazil\n",
          "Otherwise, hit enter to skip")
    # In Python 3, an empty input() simply returns '' (no exception to catch)
    lesson_vocabulary = input().lower()

    lesson = LessonSegment(
        lesson_vocabulary,                       # target vocabulary
        read_audio(args[0], config.sample_rate)  # audio to get pcm_data
    )
    # update the lesson dictionary to collect statistics
    lesson.update_dictionary(config.text_processor)

    # VAD
    vad = webrtcvad.Vad(config.aggressivness)
    frames = frame_generator(30, lesson.bytes, config.sample_rate)
    frames = list(frames)
    segments = vad_collector(config.sample_rate, 10, 150, vad, frames)

    # ASR
    asr = KaldiRecognizer(Model("model"), config.sample_rate)

    lesson_segments = []  # LessonSegment instances
    features = []         # static tempo and pitch of each LessonSegment
    for segment in segments:
        seg = LessonSegment('', segment)
        seg.transcribe(asr)
        features.append(seg.get_features(config.sample_rate))
        lesson_segments.append(seg)

    # Clustering
    features = MinMaxScaler().fit_transform(np.array(features))
    cl = GaussianMixture(n_components=config.n_clusters, covariance_type='full')
    clusters = cl.fit_predict(features)

    # Resegmentation: create n_clusters empty LessonSegments
    segments = [LessonSegment('', b'') for _ in range(config.n_clusters)]
    for i, cluster in enumerate(clusters):
        cluster = int(cluster)
        segments[cluster].bytes += lesson_segments[i].bytes
        segments[cluster].transcript.extend(lesson_segments[i].transcript)

    for segment in segments:
        segment.get_staistics(lesson.dictionary)

    for i, segment in enumerate(segments):
        path = 'resegmentation/cluster-%002d.mp3' % (i,)
        print('Writing %s' % (path,))
        write_audio(path, segment.bytes, config.sample_rate)
        print("\n", segment.statistics, "\n")
def evaluate(folder_audio):
    results_file = os.path.join(FOLDER, 'results.csv')
    if os.path.exists(results_file):
        results_file = os.path.join(
            FOLDER, 'results' + os.path.split(folder_audio)[1] + '.csv')

    with open(results_file, mode='a', newline='') as csv_file:
        PR_STOIS = []
        OR_STOIS = []
        fieldnames = ['Sample', 'Speech', 'Noise', 'SNR',
                      'STOI orig.', 'STOI pred.',
                      'eSTOI orig.', 'eSTOI pred.',
                      'PESQ orig.', 'PESQ pred.']

        class excel_semicolon(csv.excel):
            delimiter = ';'

        writer = csv.DictWriter(csv_file, fieldnames=fieldnames,
                                dialect=excel_semicolon, extrasaction='ignore')
        writer.writeheader()
        sleep(0.1)  # for tqdm

        pred_stois, orig_stois = [], []
        pred_estois, orig_estois = [], []
        pred_pesqs, orig_pesqs = [], []
        speech_names, noise_names = [], []
        snrs = []

        index = 0
        n = get_count_of_audiofiles(folder_audio) // 3
        for i in tqdm(range(n), total=n, desc='Calculating STOI & PESQ'):
            list_audio = [k for k in get_list_of_files(folder_audio) if '.wav' in k]
            list_audio.sort()
            assert len(list_audio) % 3 == 0

            filename = list_audio[index][:-9]
            fsx, x = read_audio(filename + 'noisy.wav')
            fsy, y = read_audio(filename + 'clean.wav')
            fsyh, y_hat = read_audio(filename + 'predi.wav')
            x, y = x[:len(y_hat)], y[:len(y_hat)]
            assert fsx == fsy == fsyh == target_fs
            assert len(x) == len(y) == len(y_hat)
            index += 3

            # filenames ("is" string comparisons replaced with "==": "is"
            # tests identity, not equality, and is unreliable for strings)
            _, f = os.path.split(filename)
            speech_noise_name = f[:-5] if f[-4] == '-' else f[:-4]
            sn = speech_noise_name.split('_')
            sn = [s.strip() for s in sn if s.strip()]
            speech_name = sn[0]
            noise_name = sn[1]
            speech_names.append(speech_name)
            noise_names.append(noise_name)

            # SNR
            snr_string = f[-5:-3]
            snr = int(snr_string[1]) if snr_string[0] == '_' else int(snr_string)
            snrs.append(snr)

            # STOI
            pred_stoi = np.round(stoi(y, y_hat, target_fs), 3)
            orig_stoi = np.round(stoi(y, x, target_fs), 3)
            # eSTOI
            pred_estoi = np.round(stoi(y, y_hat, target_fs, extended=True), 3)
            orig_estoi = np.round(stoi(y, x, target_fs, extended=True), 3)
            # PESQ
            pred_pesq = np.round(pypesq(fs=target_fs, ref=y, deg=y_hat, mode='wb'), 3)
            orig_pesq = np.round(pypesq(fs=target_fs, ref=y, deg=x, mode='wb'), 3)

            # Results
            pred_stois.append(pred_stoi)
            pred_estois.append(pred_estoi)
            pred_pesqs.append(pred_pesq)
            orig_stois.append(orig_stoi)
            orig_estois.append(orig_estoi)
            orig_pesqs.append(orig_pesq)
            writer.writerow({'Sample': i, 'Speech': speech_name, 'Noise': noise_name,
                             'SNR': snr,
                             'STOI orig.': orig_stoi, 'STOI pred.': pred_stoi,
                             'eSTOI orig.': orig_estoi, 'eSTOI pred.': pred_estoi,
                             'PESQ orig.': orig_pesq, 'PESQ pred.': pred_pesq})
        sleep(0.15)  # for tqdm

    # Results analysis with pandas (the with-blocks close the files; no
    # explicit close() calls are needed)
    total_metrics = ('Orig. STOI: %s - eSTOI: %s - PESQ: %s \n'
                     'Pred. STOI: %s - eSTOI: %s - PESQ: %s' %
                     (mean_std(np.array(orig_stois)), mean_std(np.array(orig_estois)),
                      mean_std(np.array(orig_pesqs)), mean_std(np.array(pred_stois)),
                      mean_std(np.array(pred_estois)), mean_std(np.array(pred_pesqs))))
    with open(os.path.join(FOLDER, 'results_total.txt'), 'a') as file:
        file.write(total_metrics)

    df = pd.read_csv(results_file, sep=';')

    # One grouped bar chart per metric, grouped by noise type and by SNR
    # (numeric_only=True keeps groupby().mean() from choking on string columns)
    plots = [('Noise', 'STOI', 'metrics_1stoi.png'),
             ('Noise', 'eSTOI', 'metrics_2estoi.png'),
             ('Noise', 'PESQ', 'metrics_3pesq.png'),
             ('SNR', 'STOI', 'metrics_snr_1stoi.png'),
             ('SNR', 'eSTOI', 'metrics_snr_2estoi.png'),
             ('SNR', 'PESQ', 'metrics_snr_3pesq.png')]
    for group_key, metric, fname in plots:
        fig, ax = plt.subplots()
        grouped = df.groupby(group_key).mean(numeric_only=True)
        grouped['%s orig.' % metric].plot(kind='bar', ax=ax, position=1,
                                          width=0.3, color='C0')
        grouped['%s pred.' % metric].plot(kind='bar', ax=ax, position=0,
                                          width=0.3, color='C1')
        plt.legend()
        plt.savefig(os.path.join(FOLDER, fname), dpi=600)  # bbox_inches='tight'
        plt.clf()
        plt.cla()
        plt.close()

    PR_STOIS.extend(pred_stois)
    OR_STOIS.extend(orig_stois)

    print('_' * 98)
    print('Evaluation Results: (%d files)\n' % n)
    print(total_metrics)
    print('_' * 98)
    return total_metrics
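# mean_std is used above but not defined in this excerpt; a minimal sketch of
# what it presumably does (the exact format string is an assumption):
def mean_std(values):
    return '%.3f (+/- %.3f)' % (values.mean(), values.std())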
second = C.set_start_second(max_value=audio_info["duration"])
sr = C.set_sampling_rate(audio_info["sample_rate"])
options = st.sidebar.selectbox(
    "Audio option", options=["normal", "preprocessing", "augmentations"])
utils.display_media_audio(audio_path, second)

annotation = st.sidebar.file_uploader("Upload annotation file if it exists")
if annotation is not None:
    event_level_annotation = utils.read_csv(annotation)
else:
    event_level_annotation = None

y = utils.read_audio(audio_path, audio_info, sr=sr)
if options == "preprocessing":
    y_processed = C.preprocess_on_wave(y, sr=sr, audio_path=str(audio_path))
    if y_processed is not None:
        st.text("Processed audio")
        utils.display_media_audio_from_ndarray(y_processed, sr)
        if event_level_annotation is None:
            C.waveplot(y, sr, y_processed)
            C.specshow(y, sr, y_processed)
        else:
            C.waveplot_with_annotation(y, sr, event_level_annotation,
                                       audio_file_name, y_processed)
            C.specshow_with_annotation(y, sr, event_level_annotation,
                                       audio_file_name, y_processed)
def test_one(checkpoint_path, model_type, cuda, test_fold, test_wav, test_segment):
    # test_bgn_time = time.time()

    # Load model
    Model = eval(model_type)
    model = Model(config.classes_num, activation='logsoftmax')
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model'])
    if cuda:
        model.cuda()

    # test_fin_time = time.time()
    # test_time = test_fin_time - test_bgn_time
    # print(test_time)

    audio_path = os.path.join(test_fold, test_wav)

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    frames_num_clip = config.frames_num_clip
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    audio_duration_clip = config.audio_duration_clip
    audio_stride_clip = config.audio_stride_clip
    audio_duration = config.audio_duration
    audio_num = config.audio_num
    total_frames = config.total_frames

    (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Pad or truncate the recording to a fixed length
    audio = pad_truncate_sequence(audio, total_samples)

    # Slice the log mel spectrogram into overlapping clips
    fea_list = np.zeros((1, audio_num, frames_num_clip, mel_bins))
    feature = feature_extractor.transform(audio)
    feature = feature[0:total_frames]
    for i in range(audio_num):
        feature_clip = feature[i * frames_per_second * audio_stride_clip:
                               (i + audio_duration_clip) * frames_per_second * audio_stride_clip]
        fea_list[0, i, :, :] = feature_clip
    fea_list = move_data_to_gpu(fea_list, cuda)

    # Per-clip prediction
    pred = np.zeros((audio_num), dtype=int)
    for i in range(audio_num):
        output = model(fea_list[:, i, :, :])
        output = np.argmax(output.data.cpu().numpy(), axis=-1)
        pred[i] = output

    # Find the span of clips predicted as class 0 (event present)
    start = -1
    end = -1
    # print(pred)
    for i in range(len(pred)):
        # first second
        if pred[i] == 0 and start == -1:
            start = i
            end = i + 3
        elif pred[i] == 0 and start != -1:
            end = i + 3
    if start != -1:
        return True, start, end
    else:
        return False, -1, -1
def decode_and_getMeature(mixed_file_list, ref_list, sess, model,
                          decode_ans_file, save_audio, ans_file):
    '''
    (mixed_dir, ref_dir, sess, model, 'decode_nnet_C001_8_2', False, 'xxxans.txt')
    '''
    if os.path.exists(os.path.join(decode_ans_file, ans_file)):
        os.remove(os.path.join(decode_ans_file, ans_file))

    pesq_raw_sum, pesq_en_sum = 0, 0
    sdr_raw_sum, sdr_en_sum = 0, 0
    for i, mixed_dir in enumerate(mixed_file_list):
        print('\n', i + 1, mixed_dir)
        waveData, sr = utils.read_audio(mixed_dir)
        reY, mask = decode_one_wav(sess, model, waveData)
        # clip the enhanced waveform to the valid amplitude range
        reY = np.where(reY > PARAM.AMP_MAX, PARAM.AMP_MAX, reY)
        reY = np.where(reY < -PARAM.AMP_MAX, -PARAM.AMP_MAX, reY)
        file_name = mixed_dir[mixed_dir.rfind('/') + 1:mixed_dir.rfind('.')]
        if save_audio:
            utils.write_audio(
                os.path.join(decode_ans_file,
                             (ckpt + '_%03d_' % (i + 1)) + mixed_dir[mixed_dir.rfind('/') + 1:]),
                reY, sr)
            utils.picture_spec(
                mask,
                os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) + file_name))
        if i < len(ref_list):
            ref, sr = utils.read_audio(ref_list[i])
            print(' refer: ', ref_list[i])
            len_small = min(len(ref), len(waveData), len(reY))
            ref = np.array(ref[:len_small])
            waveData = np.array(waveData[:len_small])
            # SDR
            sdr_raw = utils.cal_SDR(np.array([ref]), np.array([waveData]))
            sdr_en = utils.cal_SDR(np.array([ref]), np.array(reY))
            sdr_raw_sum += sdr_raw
            sdr_en_sum += sdr_en
            # PESQ
            pesq_raw = pesqexe.calc_pesq(ref, waveData, sr)
            pesq_en = pesqexe.calc_pesq(ref, reY, sr)
            pesq_raw_sum += pesq_raw
            pesq_en_sum += pesq_en
            print("SR = %d" % sr)
            print("SDR_raw: %.3f, SDR_en: %.3f, SDR_imp: %.3f. "
                  % (sdr_raw, sdr_en, sdr_en - sdr_raw))
            sys.stdout.flush()
            with open(os.path.join(decode_ans_file, ans_file), 'a+') as f:
                f.write(file_name + '\r\n')
                f.write("  |-PESQ_raw: %.3f, PESQ_en: %.3f, PESQimp: %.3f. \r\n"
                        % (pesq_raw, pesq_en, pesq_en - pesq_raw))
                f.write("  |-SDR_raw: %.3f, SDR_en: %.3f, SDR_imp: %.3f. \r\n"
                        % (sdr_raw, sdr_en, sdr_en - sdr_raw))

    len_list = len(ref_list)
    with open(os.path.join(decode_ans_file, ans_file), 'a+') as f:
        f.write('PESQ_raw:%.3f, PESQ_en:%.3f, PESQi_avg:%.3f. \r\n'
                % (pesq_raw_sum / len_list, pesq_en_sum / len_list,
                   (pesq_en_sum - pesq_raw_sum) / len_list))
        f.write('SDR_raw:%.3f, SDR_en:%.3f, SDRi_avg:%.3f. \r\n'
                % (sdr_raw_sum / len_list, sdr_en_sum / len_list,
                   (sdr_en_sum - sdr_raw_sum) / len_list))
    print('\n\n\n-----------------------------------------')
    print('PESQ_raw:%.3f, PESQ_en:%.3f, PESQi_avg:%.3f. \r\n'
          % (pesq_raw_sum / len_list, pesq_en_sum / len_list,
             (pesq_en_sum - pesq_raw_sum) / len_list))
    print('SDR_raw:%.3f, SDR_en:%.3f, SDRi_avg:%.3f. \r\n'
          % (sdr_raw_sum / len_list, sdr_en_sum / len_list,
             (sdr_en_sum - sdr_raw_sum) / len_list))
    sys.stdout.flush()
def upload_audio(normalize_db: Optional[float] = None):
    audio_files = files.upload()
    fnames = list(audio_files.keys())
    if len(fnames) == 0:
        return None
    return read_audio(fnames[0])
windowLength = args.windowLength
overlap = args.overlap
ffTLength = args.ffTLength
inputFs = args.inputFs
fs = args.fs
numFeatures = ffTLength // 2 + 1
numSegments = 8

model = models.build_model(l2_strength=0.0)
model.summary()
model.load_weights(os.path.join(mozilla_basepath, 'denoiser_cnn_log_mel_generator.h5'))

cleanAudio, sr = read_audio(os.path.join(mozilla_basepath, 'clips',
                                         'common_voice_en_16526.mp3'),
                            sample_rate=fs)
print("Min:", np.min(cleanAudio), "Max:", np.max(cleanAudio))

noiseAudio, sr = read_audio(os.path.join(urbansound_basepath, 'audio', 'fold10',
                                         '7913-3-0-0.wav'),
                            sample_rate=fs)
print("Min:", np.min(noiseAudio), "Max:", np.max(noiseAudio))

cleanAudioFeatureExtractor = FeatureExtractor(cleanAudio, windowLength=windowLength,
                                              overlap=overlap, sample_rate=sr)
stft_features = cleanAudioFeatureExtractor.get_stft_spectrogram()
stft_features = np.abs(stft_features)
print("Min:", np.min(stft_features), "Max:", np.max(stft_features))
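# numFeatures and numSegments are set above but unused in this excerpt. A
# minimal sketch, assuming the denoiser consumes sliding windows of
# numSegments consecutive STFT frames ("prepare_input_windows" is a
# hypothetical helper, not the repo's exact function):
def prepare_input_windows(stft_mag, num_segments=8):
    # stft_mag: (numFeatures, numFrames) magnitude spectrogram
    windows = [stft_mag[:, i:i + num_segments]
               for i in range(stft_mag.shape[1] - num_segments + 1)]
    return np.stack(windows)  # (numWindows, numFeatures, numSegments)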
def create_mixture_csv(data_type):
    """Create a csv containing mixture information.

    Each line in the .csv file contains [speech_name, noise_name, noise_onset, noise_offset].

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train'; number of noises
        selected to mix with each speech clip. E.g., with magnification=3,
        4620 speech clips create 4620*3 mixtures. magnification should not be
        larger than the number of noise types.
    """
    workspace = config.workspace
    data_dir = config.data_dir
    speech_dir = os.path.join(data_dir, '{}_speech'.format(data_type))
    noise_dir = os.path.join(data_dir, '{}_noise'.format(data_type))
    magnification = config.magnification
    fs = config.sample_rate

    speech_names = [na for na in os.listdir(speech_dir) if na.lower().endswith(".wav")]
    noise_names = [na for na in os.listdir(noise_dir) if na.lower().endswith(".wav")]

    rs = np.random.RandomState(0)
    out_csv_path = os.path.join(workspace, "mixture_csvs", "%s.csv" % data_type)
    create_folder(os.path.dirname(out_csv_path))

    cnt = 0
    f = open(out_csv_path, 'w')
    f.write("%s\t%s\t%s\t%s\n" % ("speech_name", "noise_name", "noise_onset", "noise_offset"))
    for speech_na in speech_names:
        # Read speech.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path)
        len_speech = len(speech_audio)

        # For training data, mix each speech clip with `magnification` randomly
        # picked noises. For test data, mix each speech clip with all noises.
        if data_type == 'train':
            selected_noise_names = rs.choice(noise_names, size=magnification, replace=False)
        elif data_type == 'test':
            selected_noise_names = noise_names
        else:
            raise Exception("data_type must be train | test!")

        # Mix one speech clip with different noises.
        for noise_na in selected_noise_names:
            noise_path = os.path.join(noise_dir, noise_na)
            (noise_audio, _) = read_audio(noise_path)
            len_noise = len(noise_audio)

            if len_noise <= len_speech:
                noise_onset = 0
                noise_offset = len_speech
            else:
                # If the noise is longer than the speech, randomly select a segment.
                noise_onset = rs.randint(0, len_noise - len_speech, size=1)[0]
                noise_offset = noise_onset + len_speech

            if cnt % 100 == 0:
                print(cnt)
            cnt += 1
            f.write("%s\t%s\t%d\t%d\n" % (speech_na, noise_na, noise_onset, noise_offset))
    f.close()
    print(out_csv_path)
    print("Create %s mixture csv finished!" % data_type)
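# A quick way to inspect the generated csv (assumes pandas is available; the
# mixture csv is tab-separated, as written above):
import pandas as pd
df = pd.read_csv(os.path.join(config.workspace, 'mixture_csvs', 'train.csv'), sep='\t')
print(df.head())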
def calculate_mixture_features(data_type):
    """Calculate spectrograms for the mixed, speech and noise audio,
    then write the features to disk.

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      snr: float, signal-to-noise ratio of the mixture.
    """
    workspace = config.workspace
    data_dir = config.data_dir
    speech_dir = os.path.join(data_dir, '{}_speech'.format(data_type))
    noise_dir = os.path.join(data_dir, '{}_noise'.format(data_type))
    fs = config.sample_rate
    if data_type == 'train':
        snr = config.Tr_SNR
    elif data_type == 'test':
        snr = config.Te_SNR
    else:
        raise Exception("data_type must be train | test!")

    # Open mixture csv.
    mixture_csv_path = os.path.join(workspace, "mixture_csvs", "%s.csv" % data_type)
    with open(mixture_csv_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        lis = list(reader)

    t1 = time.time()
    cnt = 0
    for i1 in range(1, len(lis)):
        [speech_na, noise_na, noise_onset, noise_offset] = lis[i1]
        noise_onset = int(noise_onset)
        noise_offset = int(noise_offset)

        # Read speech audio.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path, target_fs=fs)

        # Read noise audio.
        noise_path = os.path.join(noise_dir, noise_na)
        (noise_audio, _) = read_audio(noise_path, target_fs=fs)

        if len(noise_audio) < len(speech_audio):
            # Repeat the noise to the same length as the speech.
            n_repeat = int(np.ceil(float(len(speech_audio)) / float(len(noise_audio))))
            noise_audio_ex = np.tile(noise_audio, n_repeat)
            noise_audio = noise_audio_ex[0:len(speech_audio)]
        else:
            # Truncate the noise to the same length as the speech.
            noise_audio = noise_audio[noise_onset:noise_offset]

        # Scale speech to the given SNR.
        scaler = get_amplitude_scaling_factor(speech_audio, noise_audio, snr=snr)
        speech_audio *= scaler

        # Get the normalized mixture, speech and noise.
        (mixed_audio, speech_audio, noise_audio, alpha) = additive_mixing(speech_audio, noise_audio)

        # Write out the mixed audio.
        out_bare_na = os.path.join("%s.%s" % (os.path.splitext(speech_na)[0],
                                              os.path.splitext(noise_na)[0]))
        out_audio_path = os.path.join(workspace, "mixed_audios", "spectrogram",
                                      data_type, "%ddb" % int(snr),
                                      "%s.wav" % out_bare_na)
        create_folder(os.path.dirname(out_audio_path))
        write_audio(out_audio_path, mixed_audio, fs)

        # Extract spectrograms.
        mixed_complx_x = calc_sp(mixed_audio, mode='complex')
        speech_x = calc_sp(speech_audio, mode='magnitude')
        noise_x = calc_sp(noise_audio, mode='magnitude')

        # Write out features.
        out_feat_path = os.path.join(workspace, "features", "spectrogram",
                                     data_type, "%ddb" % int(snr),
                                     "%s.p" % out_bare_na)
        create_folder(os.path.dirname(out_feat_path))
        data = [mixed_complx_x, speech_x, noise_x, alpha, out_bare_na]
        pickle.dump(data, open(out_feat_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

        if cnt % 100 == 0:
            print(cnt)
        cnt += 1

    print("Extracting feature time: %s" % (time.time() - t1))
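# get_amplitude_scaling_factor and additive_mixing are not shown in this
# excerpt. A minimal sketch of the usual RMS-based SNR scaling the former
# presumably performs (the repo's exact helper may differ):
def rms(x):
    return np.sqrt(np.mean(x ** 2))

def get_amplitude_scaling_factor(speech, noise, snr):
    # scale speech so that rms(speech) / rms(noise) matches the target SNR in dB
    original_ratio = rms(speech) / rms(noise)
    target_ratio = 10.0 ** (float(snr) / 20.0)
    return target_ratio / original_ratio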
        # (fragment: tail of WebrtcVAD.perform_vad, classifying each frame)
        is_speech = self.vad.is_speech(audio[start:stop],
                                       sample_rate=self.sample_rate)
        if is_speech:
            vad_res.append(1)
        else:
            vad_res.append(0)
        # print(is_speech)
    return vad_res


if __name__ == "__main__":
    filepath = '../db/test/file003_e.wav'
    vad = WebrtcVAD()
    audio = read_audio(filepath)
    vad_res = vad.perform_vad(audio)
    vad_res = np.array(vad_res)
    print(vad_res)

    # build a time axis: 640 / 16000 seconds per VAD frame
    x = len(vad_res) * 640 / 16000
    x = np.linspace(0, x, len(vad_res))
    print(x.shape, vad_res.shape)
    plt.plot(x, vad_res)
    plt.xticks(np.arange(0, 42, step=2))
    plt.show()
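# The WebrtcVAD class itself is not shown above; a minimal sketch of a wrapper
# consistent with that usage (the frame size, aggressiveness, and the
# 640-bytes-per-frame assumption are guesses, not taken from the original repo):
import webrtcvad

class WebrtcVAD:
    def __init__(self, aggressiveness=3, sample_rate=16000, frame_bytes=640):
        self.vad = webrtcvad.Vad(aggressiveness)
        self.sample_rate = sample_rate
        self.frame_bytes = frame_bytes  # 640 bytes = 320 16-bit samples = 20 ms at 16 kHz

    def perform_vad(self, audio):
        # audio: raw 16-bit PCM bytes; classify each fixed-size frame
        vad_res = []
        for start in range(0, len(audio) - self.frame_bytes + 1, self.frame_bytes):
            stop = start + self.frame_bytes
            is_speech = self.vad.is_speech(audio[start:stop],
                                           sample_rate=self.sample_rate)
            vad_res.append(1 if is_speech else 0)
        return vad_res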
# "exp/rnn_speech_enhancement/8k/2_00_8k_raw.wav", ] decode_file_list_16k = [ "exp/test_oc/refer_wav/hebing2_ref.wav", "exp/test_oc/refer_wav/test1_ref.wav", ] if PARAM.FS == 8000: decode_file_list = decode_file_list_8k elif PARAM.FS == 16000: decode_file_list = decode_file_list_16k else: print('PARAM.FS error, exit.'), exit(-1) for i, mixed_dir in enumerate(decode_file_list): print(i + 1, mixed_dir) waveData, sr = utils.read_audio(mixed_dir) reY, mask = decode_one_wav(sess, model, waveData) print(np.max(reY)) abs_max = (2**(PARAM.AUDIO_BITS - 1) - 1) reY = np.where(reY > abs_max, abs_max, reY) reY = np.where(reY < -abs_max, -abs_max, reY) utils.write_audio( os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) + mixed_dir[mixed_dir.rfind('/') + 1:]), reY, sr) file_name = mixed_dir[mixed_dir.rfind('/') + 1:mixed_dir.rfind('.')] utils.picture_spec( mask, os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) + file_name))
def calculate_feature_for_all_audio_files(args):
    '''Calculate features of audio files and write them out to an hdf5 file.

    Args:
      dataset_dir: string
      workspace: string
      mini_data: bool, set True for debugging on a small part of the data
    '''
    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    audio_duration_clip = config.audio_duration_clip
    audio_stride_clip = config.audio_stride_clip
    audio_duration = config.audio_duration
    audio_num = config.audio_num
    total_frames = config.total_frames

    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''
    audios_dir = os.path.join(dataset_dir, 'audio')
    metadata_path = os.path.join(dataset_dir, 'meta', 'esc50.csv')
    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins.h5'.format(prefix, frames_per_second, mel_bins))
    create_folder(os.path.dirname(feature_path))

    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets; optionally subsample for debugging
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['filename'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()

    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='filename',
        data=[filename.encode() for filename in meta_dict['filename']],
        dtype='S80')
    if 'fold' in meta_dict.keys():
        hf.create_dataset(name='fold',
                          data=[fold for fold in meta_dict['fold']],
                          dtype=np.int64)
    if 'target' in meta_dict.keys():
        hf.create_dataset(name='target',
                          data=[target for target in meta_dict['target']],
                          dtype=np.int64)
    if 'category' in meta_dict.keys():
        hf.create_dataset(
            name='category',
            data=[category.encode() for category in meta_dict['category']],
            dtype='S80')
    if 'esc10' in meta_dict.keys():
        # np.bool was removed in NumPy >= 1.24; the builtin bool works in both
        hf.create_dataset(name='esc10',
                          data=[esc10 for esc10 in meta_dict['esc10']],
                          dtype=bool)
    if 'src_file' in meta_dict.keys():
        hf.create_dataset(
            name='src_file',
            data=[src_file for src_file in meta_dict['src_file']],
            dtype=np.int64)
    if 'take' in meta_dict.keys():
        hf.create_dataset(name='take',
                          data=[take.encode() for take in meta_dict['take']],
                          dtype='S24')

    hf.create_dataset(name='feature',
                      shape=(0, audio_num, frames_num, mel_bins),
                      maxshape=(None, audio_num, frames_num, mel_bins),
                      dtype=np.float32)

    for (n, filename) in enumerate(meta_dict['filename']):
        audio_path = os.path.join(audios_dir, filename)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

        # Pad or truncate the recording to the same length
        audio = pad_truncate_sequence(audio, total_samples)

        # Extract features
        fea_list = []
        # for i in range(audio_num):
        #     audio_clip = audio[i*sample_rate*audio_stride_clip: (i+2)*sample_rate*audio_stride_clip]
        #     feature = feature_extractor.transform(audio_clip)
        #     feature = feature[0 : frames_per_second*audio_duration_clip]
        #     fea_list.append(feature)
        feature = feature_extractor.transform(audio)
        # Remove the extra log mel spectrogram frames caused by zero padding
        feature = feature[0:total_frames]
        for i in range(audio_num):
            feature_clip = feature[i * frames_per_second * audio_stride_clip:
                                   (i + audio_duration_clip) * frames_per_second * audio_stride_clip]
            fea_list.append(feature_clip)

        hf['feature'].resize((n + 1, audio_num, frames_num, mel_bins))
        hf['feature'][n] = fea_list

    hf.close()
    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path, time.time() - extract_time))
file_list = in_file.read().split('\n')
del file_list[-1]  # drop the trailing empty entry
perm = np.random.permutation(len(file_list))
bno = 0
for i in tqdm(range(0, len(file_list), args.batch_size),
              desc='Generating data for {}'.format(data_type)):
    bno = bno + 1
    s_n_abs_list = []
    s_x_abs_list = []
    for bid in range(0, args.batch_size):
        if i + bid < len(file_list):
            f_template = file_list[perm[i + bid]]
            for ch in range(1, 9):
                f = f_template + '_ch{}.wav'.format(ch)
                f_no_ltr = f_template + '_ch{}.NLR.wav'.format(ch)
                ltr_audio = read_audio(f)
                no_ltr_audio = read_audio(f_no_ltr)
                fx, tx, s_x = signal.stft(no_ltr_audio, fs=16000, nperseg=512,
                                          noverlap=512 - 128, nfft=512)
                fn, tn, s_n = signal.stft(ltr_audio, fs=16000, nperseg=512,
                                          noverlap=512 - 128, nfft=512)
                s_x = np.transpose(s_x)
                s_n = np.transpose(s_n)
                s_x_abs = 20 * log_sp(np.abs(s_x))
                s_n_abs = 20 * log_sp(np.abs(s_n))