Example #1
    def parallel_audio_processing(self, clean_filename):

        clean_audio, _ = read_audio(clean_filename, self.sample_rate)

        # remove silent frame from clean audio
        clean_audio = self._remove_silent_frames(clean_audio)

        noise_filename = self._sample_noise_filename()

        # read the noise audio
        noise_audio, sr = read_audio(noise_filename, self.sample_rate)

        # remove silent frame from noise audio
        noise_audio = self._remove_silent_frames(noise_audio)

        # sample random fixed-sized snippets of audio
        clean_audio = self._audio_random_crop(clean_audio,
                                              duration=self.audio_max_duration)

        # add noise to the clean audio
        noiseInput = self._add_noise_to_clean_audio(clean_audio, noise_audio)

        # extract stft features from noisy audio
        noisy_input_fe = FeatureExtractor(noiseInput,
                                          windowLength=self.window_length,
                                          overlap=self.overlap,
                                          sample_rate=self.sample_rate)
        noise_spectrogram = noisy_input_fe.get_stft_spectrogram()

        # Or get the phase angle (in radians)
        # noisy_stft_magnitude, noisy_stft_phase = librosa.magphase(noise_spectrogram)
        noise_phase = np.angle(noise_spectrogram)

        # get the spectral magnitude
        noise_magnitude = np.abs(noise_spectrogram)

        # extract stft features from clean audio
        clean_audio_fe = FeatureExtractor(clean_audio,
                                          windowLength=self.window_length,
                                          overlap=self.overlap,
                                          sample_rate=self.sample_rate)
        clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
        # clean_spectrogram = clean_audio_fe.get_mel_spectrogram()

        # get the clean phase
        clean_phase = np.angle(clean_spectrogram)

        # get the clean spectral magnitude
        clean_magnitude = np.abs(clean_spectrogram)
        # clean_magnitude = 2 * clean_magnitude / np.sum(scipy.signal.hamming(self.window_length, sym=False))

        clean_magnitude = self._phase_aware_scaling(clean_magnitude,
                                                    clean_phase, noise_phase)

        scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
        noise_magnitude = scaler.fit_transform(noise_magnitude)
        clean_magnitude = scaler.transform(clean_magnitude)

        return noise_magnitude, clean_magnitude, noise_phase
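
A note on using these outputs: a minimal reconstruction sketch, assuming FeatureExtractor wraps a librosa-style STFT and that overlap is the hop size (the function and parameter names here are illustrative, not this repository's API):

import librosa
import numpy as np

def reconstruct_audio(denoised_magnitude, noise_phase, scaler,
                      window_length=256, overlap=64):
    # Undo the StandardScaler that was fit on the noisy magnitudes.
    magnitude = scaler.inverse_transform(denoised_magnitude)
    # Re-attach the noisy phase and invert the STFT.
    stft = magnitude * np.exp(1j * noise_phase)
    return librosa.istft(stft, hop_length=overlap, win_length=window_length)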
Example #2
def _get_pad_wave_data(file): # 3s
    wave_data, sr = utils.read_audio(file)  # data, fs
    while len(wave_data) < PARAM.LEN_WAWE_PAD_TO:
        wave_data = np.tile(wave_data, 2)
    len_wave = len(wave_data)
    wave_begin = np.random.randint(len_wave - PARAM.LEN_WAWE_PAD_TO + 1)
    return wave_data[wave_begin:wave_begin + PARAM.LEN_WAWE_PAD_TO]
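
The tile-then-crop pattern above guarantees a fixed-length snippet even when the file is shorter than the target; a self-contained check with a stand-in for PARAM.LEN_WAWE_PAD_TO:

import numpy as np

TARGET_LEN = 48000  # stand-in for PARAM.LEN_WAWE_PAD_TO (3 s at 16 kHz)

wave = np.random.randn(10000)  # a clip shorter than the target
while len(wave) < TARGET_LEN:
    wave = np.tile(wave, 2)  # double the clip until it is long enough
begin = np.random.randint(len(wave) - TARGET_LEN + 1)
snippet = wave[begin:begin + TARGET_LEN]
assert len(snippet) == TARGET_LEN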
Example #3
    def parallel_audio_processing(self, clean_filename):

        clean_audio, _ = read_audio(clean_filename, self.sample_rate)

        # remove silent frame from clean audio
        clean_audio = self._remove_silent_frames(clean_audio)
        
        # sample random fixed-sized snippets of audio
        clean_audio = self._audio_random_crop(clean_audio, duration=self.audio_max_duration)
        
        ## extract stft features from clean audio ##
        clean_audio_fe = FeatureExtractor(clean_audio, windowLength=self.window_length,
                                          overlap=self.overlap, sample_rate=self.sample_rate)
        clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
        ## clean_spectrogram = clean_audio_fe.get_mel_spectrogram()
        
        # get the clean phase
        clean_phase = np.angle(clean_spectrogram)
        # get the clean spectral magnitude
        clean_magnitude = np.abs(clean_spectrogram)
        
        # noise generation
        noise_magnitude = self._gen_noise_stft(clean_magnitude, 0)
        #clean_magnitude = self._phase_aware_scaling(clean_magnitude, clean_phase, noise_phase)
        scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
        noise_magnitude = scaler.fit_transform(noise_magnitude)
        clean_magnitude = scaler.transform(clean_magnitude)

        return noise_magnitude, clean_magnitude, clean_phase
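
_gen_noise_stft is not shown in this example; a plausible stand-in that corrupts the clean magnitude with Gaussian noise at a requested SNR could look like the sketch below (an assumption about its behavior, not the repository's implementation):

import numpy as np

def gen_noise_stft(clean_magnitude, snr_db, seed=0):
    # Hypothetical stand-in: add Gaussian noise to a magnitude spectrogram
    # at the requested SNR in dB.
    rng = np.random.default_rng(seed)
    signal_power = np.mean(clean_magnitude ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10))
    noise = rng.normal(0.0, np.sqrt(noise_power), clean_magnitude.shape)
    return np.abs(clean_magnitude + noise)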
Example #4
    def _check_data_properties(self):
        base_descr_file = os.path.join(self.output_base_path,
                                       'base_description.yml')
        with open(base_descr_file, 'r') as f:
            base_descr = yaml.safe_load(f)
        target_sr = base_descr['data_properties']['sample_rate']
        n_channels = base_descr['data_properties']['n_channels']

        meta_file_general = os.path.join(self.output_base_path,
                                         base_descr['general_meta'])
        df = pd.read_csv(meta_file_general, sep=';')
        data_path = os.path.join(self.output_base_path,
                                 base_descr['data_path'])
        for i, row in df.iterrows():
            f_name = os.path.join(data_path, row['cur_name'])
            try:
                _, wav_data = read_audio(f_name, target_sr, dtype='float')
            except Exception as e:
                print(str(e))
                # raise CheckBaseError(str(e))
                continue  # skip this file: wav_data would be undefined below
            if len(wav_data.shape) != n_channels:
                raise CheckBaseError(
                    'Wrong number of channels! Target is {}, current is {}. File: '
                    '"{}"'.format(n_channels, len(wav_data.shape), f_name))
            begin, end = float(row['begin']), float(row['end'])
            if abs(len(wav_data) / target_sr - (end - begin)) > 0.1:
                print(
                    'Wrong audio length. It must be the same as (end - begin) in meta! File: {} target_sr={}, begin={}, end={}, len(wav_data)={}'
                    .format(f_name, target_sr, begin, end, len(wav_data)))
Example #5
def audio_bytes_to_np(wav_data: bytes, normalize_db: float = 0.1):
    # Parse and normalize the audio.
    audio = AudioSegment.from_file(io.BytesIO(wav_data))
    # AudioSegments are immutable: both calls below return new segments.
    audio = audio.remove_dc_offset()
    if normalize_db is not None:
        audio = audio.normalize(headroom=normalize_db)
    # Save to tempfile and load with librosa.
    with tempfile.NamedTemporaryFile(suffix='.wav') as temp_wav_file:
        fname = temp_wav_file.name
        audio.export(fname, format='wav')
        wav = read_audio(fname)
    return wav
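
Since the function takes raw bytes, a typical call reads the file in binary mode first (file name illustrative):

with open('speech.wav', 'rb') as f:
    wav = audio_bytes_to_np(f.read(), normalize_db=0.1)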
Example #6
    def __init__(self,
                 chunk_len,
                 filter_,
                 hq_path,
                 cutoff,
                 duration=None,
                 start=8):

        hq, sr = u.read_audio(hq_path)  # high quality target
        lq = u.lowpass(hq, cutoff, filter_=filter_)  # low quality input

        # CROP
        song_len = lq.shape[-1]

        if duration is None:  # save entire song
            test_start = 0
            test_len = song_len
        else:
            test_start = start * sr  # start from n th second
            test_len = duration * sr

        test_len = min(test_len, song_len - test_start)

        lq = lq[:, test_start:test_start + test_len]
        hq = hq[:, test_start:test_start + test_len]

        self.x_full = lq.copy()
        self.t_full = hq.copy()

        # To have equal length chunks for minibatching
        time_len = lq.shape[-1]
        n_chunks, rem = divmod(time_len, chunk_len)
        lq = lq[..., :-rem or None]  # or None handles rem=0
        hq = hq[..., :-rem or None]

        # adjust lengths
        self.x_full = self.x_full[..., :lq.shape[-1] or None]
        self.t_full = self.t_full[..., :lq.shape[-1] or None]

        # split into lists of equal-length chunks for minibatching
        self.lq = np.split(lq, n_chunks, axis=-1)
        self.hq = np.split(hq, n_chunks, axis=-1)
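
The divmod-based trimming keeps only whole chunks; the arithmetic can be checked standalone:

import numpy as np

chunk_len = 4096
audio = np.zeros((2, 100000))  # stereo, arbitrary length
n_chunks, rem = divmod(audio.shape[-1], chunk_len)
trimmed = audio[..., :-rem or None]  # ':-0' would be empty, hence 'or None'
chunks = np.split(trimmed, n_chunks, axis=-1)
assert all(c.shape[-1] == chunk_len for c in chunks)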
Example #7
    def __getitem__(self, idx):
        try:
            hq, sr = u.read_audio(self.file_list[idx])  # high-quality target

            # take a chunk starting at random location
            x_length = hq.shape[1]
            start_loc = random.randint(0, x_length - self.input_len - 1)
            hq = hq[:, start_loc:start_loc + self.input_len]
            # select filter randomly from the list
            random_filter = random.choice(self.filters)
            # apply low-pass filter
            lq = u.lowpass(hq, self.cutoff,
                           filter_=random_filter)  # low-quality input

            hq = torch.from_numpy(hq)  # convert to torch tensor
            lq = torch.from_numpy(lq)  # convert to torch tensor

            return lq, hq  # input, target
        except Exception:
            # In case of a problem, Nones are filtered out later
            # (see the collate sketch below).
            return None
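
The None returns are meant to be filtered downstream; with a PyTorch DataLoader this is usually done in a custom collate_fn, e.g. (a sketch, not code from this repository):

from torch.utils.data.dataloader import default_collate

def collate_skip_none(batch):
    # Drop failed samples before stacking; return None if the whole batch failed.
    batch = [item for item in batch if item is not None]
    return default_collate(batch) if batch else None

# usage: DataLoader(dataset, batch_size=16, collate_fn=collate_skip_none)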
Example #8
    def get_noisy_audio(self, *, filename):
        return read_audio(filename, self.sample_rate)
Example #9
     dir_path = os.path.dirname(cur_line)
     f_name = os.path.basename(cur_line)
     for ch in range(1, 9):
         if args.is_real == 1:
             dir_path = os.path.dirname(cur_line)
             f_name = os.path.basename(cur_line)
             part1 = f_name.split('-')[0]
             part2 = f_name.split('-')[1]
             part3 = f_name.split('-')[2].split('_')[1]
             f_name_new = part1 + '-' + part2 + '-' + '{}'.format(ch) + '_' + part3
             f = os.path.join(dir_path, f_name_new)
         elif args.is_real == 0:
             f_name_no_ch = f_name.split('_')[0]
             f_with_ch = f_name_no_ch + '_ch{}.wav'.format(ch)
             f = os.path.join(dir_path, f_with_ch)
         audio_data = read_audio(f)
         fx, tx, s_x = signal.stft(audio_data, fs=16000, nperseg=512,
                                   noverlap=512-128, nfft=512)
         s_x = np.transpose(s_x)
         if ch == 1:
             T, F = s_x.shape
             Y = np.zeros((8, T, F), dtype=np.complex64)
         Y[ch-1, :, :] = s_x
         s_x_abs = 20 * log_sp(np.abs(s_x))
         s_x_abs = stack_features(s_x_abs.astype(np.float32), 5)
         s_x_abs = Variable(s_x_abs)
         if args.gpu >= 0:
             s_x_abs.to_gpu(args.gpu)
         s_x_abs_list.append(s_x_abs)
 elif args.single == 1:
     audio_data = read_audio(cur_line)
Example #10
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: analyze.py <path to audio file> <n_clusters>\n')
        sys.exit(1)
    """
    Initialize Config
    input:
    n_clusters: Integer set by a user
    text_processor: by default it is set to nltk.stem.snowball.SnowballStemmer
    sample_rate: by default set to 16 kHz due to ASR model specs
    aggressivness: required for VAD, by default set to maximum=3 as audiofiles are long
    """
    config = Config(n_clusters=int(args[1]))

    print(
        "If you want to check any specific target vocabulary, please type them\n",
        "Ex.: train, dog, work, seventeen, Brazil\n",
        "Otherwise, hit enter to skip")

    lesson_vocabulary = ''
    try:
        lesson_vocabulary = input().lower()
    except SyntaxError:
        pass

    lesson = LessonSegment(
        lesson_vocabulary,  # target_vocabulary
        read_audio(args[0], config.sample_rate)  # audio to get pcm_data
    )

    # update lesson dictionary to collect statistics
    lesson.update_dictionary(config.text_processor)

    # VAD
    vad = webrtcvad.Vad(config.aggressivness)
    frames = frame_generator(30, lesson.bytes, config.sample_rate)
    frames = list(frames)
    segments = vad_collector(config.sample_rate, 10, 150, vad, frames)

    # ASR
    asr = KaldiRecognizer(Model("model"), config.sample_rate)

    # store LessonSegment instances
    lesson_segments = []
    # store static tempo and pitch of each LessonSegment
    features = []
    for segment in segments:
        seg = LessonSegment('', segment)
        seg.transcribe(asr)
        features.append(seg.get_features(config.sample_rate))
        lesson_segments.append(seg)

    # Clustering
    features = MinMaxScaler().fit_transform(np.array(features))
    cl = GaussianMixture(n_components=config.n_clusters,
                         covariance_type='full')
    clusters = cl.fit_predict(features)

    # Resegmentation: create config.n_clusters empty LessonSegments
    segments = [LessonSegment('', b'') for _ in range(config.n_clusters)]
    for i, cluster in enumerate(clusters):
        cluster = int(cluster)
        segments[cluster].bytes += lesson_segments[i].bytes
        segments[cluster].transcript.extend(lesson_segments[i].transcript)

    for segment in segments:
        segment.get_staistics(lesson.dictionary)

    for i, segment in enumerate(segments):
        path = 'resegmentation/cluster-%002d.mp3' % (i, )
        print('Writing %s' % (path, ))
        write_audio(path, segment.bytes, config.sample_rate)
        print("\n", segment.statistics, "\n")
Example #11
def evaluate(folder_audio):
    results_file = os.path.join(FOLDER, 'results.csv')
    if os.path.exists(results_file):
        results_file = os.path.join(
            FOLDER, 'results' + os.path.split(folder_audio)[1] + '.csv')
    with open(results_file, mode='a', newline='') as csv_file:
        PR_STOIS = []
        OR_STOIS = []
        fieldnames = [
            'Sample', 'Speech', 'Noise', 'SNR', 'STOI orig.', 'STOI pred.',
            'eSTOI orig.', 'eSTOI pred.', 'PESQ orig.', 'PESQ pred.'
        ]

        class excel_semicolon(csv.excel):
            delimiter = ';'

        writer = csv.DictWriter(csv_file,
                                fieldnames=fieldnames,
                                dialect=excel_semicolon,
                                extrasaction='ignore')
        writer.writeheader()
        sleep(0.1)  # for tqdm
        pred_stois, orig_stois = [], []
        pred_estois, orig_estois = [], []
        pred_pesqs, orig_pesqs = [], []
        speech_names, noise_names = [], []
        snrs = []
        index = 0
        n = get_count_of_audiofiles(folder_audio) // 3
        list_audio = [
            k for k in get_list_of_files(folder_audio) if '.wav' in k
        ]
        list_audio.sort()
        assert len(list_audio) % 3 == 0
        for i in tqdm(range(n), total=n, desc='Calculating STOI & PESQ'):
            filename = list_audio[index][:-9]
            fsx, x = read_audio(filename + 'noisy.wav')
            fsy, y = read_audio(filename + 'clean.wav')
            fsyh, y_hat = read_audio(filename + 'predi.wav')
            x, y = x[:len(y_hat)], y[:len(y_hat)]
            assert fsx == fsy == fsyh == target_fs
            assert len(x) == len(y) == len(y_hat)

            index += 3
            # filenames
            _, f = os.path.split(filename)
            speech_noise_name = f[:-5] if f[-4] == '-' else f[:-4]
            sn = speech_noise_name.split('_')
            sn = [x.strip() for x in sn if x.strip()]
            speech_name = sn[0]
            noise_name = sn[1]
            speech_names.append(speech_name)
            noise_names.append(noise_name)
            # snr
            snr_string = f[-5:-3]
            snr = int(
                snr_string[1]) if snr_string[0] == '_' else int(snr_string)
            snrs.append(snr)
            # STOI
            pred_stoi = np.round(stoi(y, y_hat, target_fs), 3)
            orig_stoi = np.round(stoi(y, x, target_fs), 3)
            # eSTOI
            pred_estoi = np.round(stoi(y, y_hat, target_fs, extended=True), 3)
            orig_estoi = np.round(stoi(y, x, target_fs, extended=True), 3)
            # PESQ
            pred_pesq = np.round(
                pypesq(fs=target_fs, ref=y, deg=y_hat, mode='wb'), 3)
            orig_pesq = np.round(pypesq(fs=target_fs, ref=y, deg=x, mode='wb'),
                                 3)
            # Results
            pred_stois.append(pred_stoi)
            pred_estois.append(pred_estoi)
            pred_pesqs.append(pred_pesq)
            orig_stois.append(orig_stoi)
            orig_estois.append(orig_estoi)
            orig_pesqs.append(orig_pesq)
            writer.writerow({
                'Sample': i,
                'Speech': speech_name,
                'Noise': noise_name,
                'SNR': snr,
                'STOI orig.': orig_stoi,
                'STOI pred.': pred_stoi,
                'eSTOI orig.': orig_estoi,
                'eSTOI pred.': pred_estoi,
                'PESQ orig.': orig_pesq,
                'PESQ pred.': pred_pesq
            })
        sleep(0.15)  # for tqdm

        # Results analysis with pandas
        csv_file.close()  # flush the rows so pandas can re-read the file below
        total_metrics = 'Orig. STOI: %s - eSTOI: %s - PESQ: %s \nPred. STOI: %s - eSTOI: %s - PESQ: %s' % \
                        (mean_std(np.array(orig_stois)), mean_std(np.array(orig_estois)), mean_std(np.array(orig_pesqs)),
                         mean_std(np.array(pred_stois)), mean_std(np.array(pred_estois)), mean_std(np.array(pred_pesqs)))
        with open(os.path.join(FOLDER, 'results_total.txt'), 'a') as file:
            file.write(total_metrics)
        df = pd.read_csv(results_file, sep=';')
        fig, ax = plt.subplots()
        df.groupby('Noise').mean()['STOI orig.'].plot(kind='bar',
                                                      ax=ax,
                                                      position=1,
                                                      width=0.3,
                                                      color='C0')
        df.groupby('Noise').mean()['STOI pred.'].plot(kind='bar',
                                                      ax=ax,
                                                      position=0,
                                                      width=0.3,
                                                      color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_1stoi.png',
                    dpi=600)  # , bbox_inches='tight')
        plt.clf()
        plt.cla()
        plt.close()

        fig, ax = plt.subplots()
        df.groupby('Noise').mean()['eSTOI orig.'].plot(kind='bar',
                                                       ax=ax,
                                                       position=1,
                                                       width=0.3,
                                                       color='C0')
        df.groupby('Noise').mean()['eSTOI pred.'].plot(kind='bar',
                                                       ax=ax,
                                                       position=0,
                                                       width=0.3,
                                                       color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_2estoi.png',
                    dpi=600)  # , bbox_inches='tight')
        # plt.show()
        plt.clf()
        plt.cla()
        plt.close()

        fig, ax = plt.subplots()
        df.groupby('Noise').mean()['PESQ orig.'].plot(kind='bar',
                                                      ax=ax,
                                                      position=1,
                                                      width=0.3,
                                                      color='C0')
        df.groupby('Noise').mean()['PESQ pred.'].plot(kind='bar',
                                                      ax=ax,
                                                      position=0,
                                                      width=0.3,
                                                      color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_3pesq.png',
                    dpi=600)  # , bbox_inches='tight')
        # plt.show()
        plt.clf()
        plt.cla()
        plt.close()

        fig, ax = plt.subplots()
        df.groupby('SNR').mean()['STOI orig.'].plot(kind='bar',
                                                    ax=ax,
                                                    position=1,
                                                    width=0.3,
                                                    color='C0')
        df.groupby('SNR').mean()['STOI pred.'].plot(kind='bar',
                                                    ax=ax,
                                                    position=0,
                                                    width=0.3,
                                                    color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_snr_1stoi.png',
                    dpi=600)  # , bbox_inches='tight')
        # plt.show()
        plt.clf()
        plt.cla()
        plt.close()

        fig, ax = plt.subplots()
        df.groupby('SNR').mean()['eSTOI orig.'].plot(kind='bar',
                                                     ax=ax,
                                                     position=1,
                                                     width=0.3,
                                                     color='C0')
        df.groupby('SNR').mean()['eSTOI pred.'].plot(kind='bar',
                                                     ax=ax,
                                                     position=0,
                                                     width=0.3,
                                                     color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_snr_2estoi.png',
                    dpi=600)  # , bbox_inches='tight')
        # plt.show()
        plt.clf()
        plt.cla()
        plt.close()

        fig, ax = plt.subplots()
        df.groupby('SNR').mean()['PESQ orig.'].plot(kind='bar',
                                                    ax=ax,
                                                    position=1,
                                                    width=0.3,
                                                    color='C0')
        df.groupby('SNR').mean()['PESQ pred.'].plot(kind='bar',
                                                    ax=ax,
                                                    position=0,
                                                    width=0.3,
                                                    color='C1')
        plt.legend()
        plt.savefig(FOLDER + '/metrics_snr_3pesq.png',
                    dpi=600)  # , bbox_inches='tight')
        # plt.show()
        plt.clf()
        plt.cla()
        plt.close()

        PR_STOIS.extend(pred_stois)
        OR_STOIS.extend(orig_stois)

        print(
            '__________________________________________________________________________________________________'
        )
        print('Evaluation Results: (%d files)\n' % (n))
        print(total_metrics)
        print(
            '__________________________________________________________________________________________________'
        )

    return total_metrics
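
The six plotting blocks differ only in the group key, the column pair and the output file; they could be collapsed into one helper (a refactoring sketch, not part of the original code):

import matplotlib.pyplot as plt

def grouped_metric_bars(df, group_key, orig_col, pred_col, out_path):
    # Paired bars of original vs. predicted metric, averaged per group.
    fig, ax = plt.subplots()
    means = df.groupby(group_key).mean(numeric_only=True)
    means[orig_col].plot(kind='bar', ax=ax, position=1, width=0.3, color='C0')
    means[pred_col].plot(kind='bar', ax=ax, position=0, width=0.3, color='C1')
    ax.legend()
    fig.savefig(out_path, dpi=600)
    plt.close(fig)

# usage: grouped_metric_bars(df, 'Noise', 'STOI orig.', 'STOI pred.', FOLDER + '/metrics_1stoi.png')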
Example #12
        second = C.set_start_second(max_value=audio_info["duration"])
        sr = C.set_sampling_rate(audio_info["sample_rate"])

        options = st.sidebar.selectbox(
            "Audio option",
            options=["normal", "preprocessing", "augmentations"])
        utils.display_media_audio(audio_path, second)

        annotation = st.sidebar.file_uploader(
            "Upload annotation file if one exists")
        if annotation is not None:
            event_level_annotation = utils.read_csv(annotation)
        else:
            event_level_annotation = None

        y = utils.read_audio(audio_path, audio_info, sr=sr)
        if options == "preprocessing":
            y_processed = C.preprocess_on_wave(y,
                                               sr=sr,
                                               audio_path=str(audio_path))
            if y_processed is not None:
                st.text("Processed audio")
                utils.display_media_audio_from_ndarray(y_processed, sr)
                if event_level_annotation is None:
                    C.waveplot(y, sr, y_processed)
                    C.specshow(y, sr, y_processed)
                else:
                    C.waveplot_with_annotation(y, sr, event_level_annotation,
                                               audio_file_name, y_processed)
                    C.specshow_with_annotation(y, sr, event_level_annotation,
                                               audio_file_name, y_processed)
Example #13
def test_one(checkpoint_path, model_type, cuda, test_fold, test_wav, test_segment):
#     test_bgn_time = time.time()
    Model = eval(model_type)
    model = Model(config.classes_num, activation='logsoftmax')         
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model'])
    if cuda:
        model.cuda()
#     test_fin_time = time.time()
#     test_time = test_fin_time - test_bgn_time
#     print(test_time)
    audio_path = os.path.join(test_fold, test_wav)
    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    frames_num_clip = config.frames_num_clip
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    audio_duration_clip = config.audio_duration_clip
    audio_stride_clip = config.audio_stride_clip
    audio_duration = config.audio_duration
    audio_num = config.audio_num
    total_frames = config.total_frames
    
    (audio, _) = read_audio(
            audio_path=audio_path, 
            target_fs=sample_rate)
    feature_extractor = LogMelExtractor(
        sample_rate=sample_rate, 
        window_size=window_size, 
        hop_size=hop_size, 
        mel_bins=mel_bins, 
        fmin=fmin, 
        fmax=fmax)
    audio = pad_truncate_sequence(audio, total_samples)
    fea_list = np.zeros((1, audio_num, frames_num_clip, mel_bins))
    feature = feature_extractor.transform(audio)
    feature = feature[0 : total_frames]
    for i in range(audio_num):
        feature_clip = feature[i * frames_per_second * audio_stride_clip:
                               (i + audio_duration_clip) *
                               frames_per_second * audio_stride_clip]
        fea_list[0, i, :, :] = feature_clip
    fea_list = move_data_to_gpu(fea_list, cuda)
    
    pred = np.zeros((audio_num), dtype=int)
    for i in range(audio_num): 
        output = model(fea_list[:, i, :, :])
        output = np.argmax(output.data.cpu().numpy(), axis=-1)
        pred[i] = output
    start = -1
    end = -1
#     print(pred)
    for i in range(len(pred)):
        # class 0 detected: record the first hit, then keep extending the end
        if pred[i] == 0 and start == -1:
            start = i
            end = i + 3
        if pred[i] == 0 and start != -1:
            end = i + 3
    if start != -1:
        return True, start, end
    else:
        return False, -1, -1
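
The clip slicing walks a window of audio_duration_clip seconds with a stride of audio_stride_clip seconds over the log-mel frames; with concrete stand-in values the indices are easy to verify:

frames_per_second = 100  # stand-in values for the config constants
audio_duration_clip = 3
audio_stride_clip = 1

for i in range(4):
    start = i * frames_per_second * audio_stride_clip
    end = (i + audio_duration_clip) * frames_per_second * audio_stride_clip
    print(start, end)  # (0, 300), (100, 400), (200, 500), (300, 600)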
Example #14
def decode_and_getMeature(mixed_file_list, ref_list, sess, model,
                          decode_ans_file, save_audio, ans_file):
    '''
    Example call:
    decode_and_getMeature(mixed_dir, ref_dir, sess, model, 'decode_nnet_C001_8_2', False, 'xxxans.txt')
    '''
    if os.path.exists(os.path.join(decode_ans_file, ans_file)):
        os.remove(os.path.join(decode_ans_file, ans_file))
    pesq_raw_sum, pesq_en_sum = 0, 0
    sdr_raw_sum, sdr_en_sum = 0, 0
    for i, mixed_dir in enumerate(mixed_file_list):
        print('\n', i + 1, mixed_dir)
        waveData, sr = utils.read_audio(mixed_dir)
        reY, mask = decode_one_wav(sess, model, waveData)
        reY = np.where(reY > PARAM.AMP_MAX, PARAM.AMP_MAX, reY)
        reY = np.where(reY < -PARAM.AMP_MAX, -PARAM.AMP_MAX, reY)
        file_name = mixed_dir[mixed_dir.rfind('/') + 1:mixed_dir.rfind('.')]
        if save_audio:
            utils.write_audio(
                os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) +
                             mixed_dir[mixed_dir.rfind('/') + 1:]), reY, sr)
            utils.picture_spec(
                mask,
                os.path.join(decode_ans_file,
                             (ckpt + '_%03d_' % (i + 1)) + file_name))
        if i < len(ref_list):
            ref, sr = utils.read_audio(ref_list[i])
            print(' refer: ', ref_list[i])
            len_small = min(len(ref), len(waveData), len(reY))
            ref = np.array(ref[:len_small])
            waveData = np.array(waveData[:len_small])

            # sdr
            sdr_raw = utils.cal_SDR(np.array([ref]), np.array([waveData]))
            sdr_en = utils.cal_SDR(np.array([ref]), np.array(reY))
            sdr_raw_sum += sdr_raw
            sdr_en_sum += sdr_en

            # pesq
            pesq_raw = pesqexe.calc_pesq(ref, waveData, sr)
            pesq_en = pesqexe.calc_pesq(ref, reY, sr)
            pesq_raw_sum += pesq_raw
            pesq_en_sum += pesq_en

            print("SR = %d" % sr)
            print("SDR_raw: %.3f, SDR_en: %.3f, SDR_imp: %.3f. " %
                  (sdr_raw, sdr_en, sdr_en - sdr_raw))
            sys.stdout.flush()
            with open(os.path.join(decode_ans_file, ans_file), 'a+') as f:
                f.write(file_name + '\r\n')
                f.write(
                    "    |-PESQ_raw: %.3f, PESQ_en: %.3f, PESQimp: %.3f. \r\n"
                    % (pesq_raw, pesq_en, pesq_en - pesq_raw))
                f.write(
                    "    |-SDR_raw: %.3f, SDR_en: %.3f, SDR_imp: %.3f. \r\n" %
                    (sdr_raw, sdr_en, sdr_en - sdr_raw))

        len_list = len(ref_list)
        with open(os.path.join(decode_ans_file, ans_file), 'a+') as f:
            f.write('PESQ_raw:%.3f, PESQ_en:%.3f, PESQi_avg:%.3f. \r\n' %
                    (pesq_raw_sum / len_list, pesq_en_sum / len_list,
                     (pesq_en_sum - pesq_raw_sum) / len_list))
            f.write('SDR_raw:%.3f, SDR_en:%.3f, SDRi_avg:%.3f. \r\n' %
                    (sdr_raw_sum / len_list, sdr_en_sum / len_list,
                     (sdr_en_sum - sdr_raw_sum) / len_list))
        print('\n\n\n-----------------------------------------')
        print('PESQ_raw:%.3f, PESQ_en:%.3f, PESQi_avg:%.3f. \r\n' %
              (pesq_raw_sum / len_list, pesq_en_sum / len_list,
               (pesq_en_sum - pesq_raw_sum) / len_list))
        print('SDR_raw:%.3f, SDR_en:%.3f, SDRi_avg:%.3f. \r\n' %
              (sdr_raw_sum / len_list, sdr_en_sum / len_list,
               (sdr_en_sum - sdr_raw_sum) / len_list))
        sys.stdout.flush()
Example #15
def upload_audio(normalize_db: Optional[float] = None):
    audio_files = files.upload()
    fnames = list(audio_files.keys())
    if len(fnames) == 0:
        return None
    return read_audio(fnames[0])
Example #16
    windowLength = args.windowLength
    overlap = args.overlap
    ffTLength = args.ffTLength
    inputFs = args.inputFs
    fs = args.fs
    numFeatures = ffTLength // 2 + 1
    numSegments = 8

    model = models.build_model(l2_strength=0.0)
    model.summary()

    model.load_weights(
        os.path.join(mozilla_basepath, 'denoiser_cnn_log_mel_generator.h5'))

    cleanAudio, sr = read_audio(os.path.join(mozilla_basepath, 'clips',
                                             'common_voice_en_16526.mp3'),
                                sample_rate=fs)
    print("Min:", np.min(cleanAudio), "Max:", np.max(cleanAudio))

    noiseAudio, sr = read_audio(os.path.join(urbansound_basepath, 'audio',
                                             'fold10', '7913-3-0-0.wav'),
                                sample_rate=fs)
    print("Min:", np.min(noiseAudio), "Max:", np.max(noiseAudio))

    cleanAudioFeatureExtractor = FeatureExtractor(cleanAudio,
                                                  windowLength=windowLength,
                                                  overlap=overlap,
                                                  sample_rate=sr)
    stft_features = cleanAudioFeatureExtractor.get_stft_spectrogram()
    stft_features = np.abs(stft_features)
    print("Min:", np.min(stft_features), "Max:", np.max(stft_features))
Example #17
def create_mixture_csv(data_type):
    """Create csv containing mixture information. 
    Each line in the .csv file contains [speech_name, noise_name, noise_onset, noise_offset]
    
    Args:
      workspace: str, path of workspace. 
      speech_dir: str, path of speech data. 
      noise_dir: str, path of noise data. 
      data_type: str, 'train' | 'test'. 
      magnification: int, only used when data_type='train', number of noise 
          selected to mix with a speech. E.g., when magnication=3, then 4620
          speech with create 4620*3 mixtures. magnification should not larger 
          than the species of noises. 
    """

    workspace = config.workspace
    data_dir = config.data_dir
    speech_dir = os.path.join(data_dir, '{}_speech'.format(data_type))
    noise_dir = os.path.join(data_dir, '{}_noise'.format(data_type))
    magnification = config.magnification
    fs = config.sample_rate

    speech_names = [
        na for na in os.listdir(speech_dir) if na.lower().endswith(".wav")
    ]
    noise_names = [
        na for na in os.listdir(noise_dir) if na.lower().endswith(".wav")
    ]

    rs = np.random.RandomState(0)
    out_csv_path = os.path.join(workspace, "mixture_csvs",
                                "%s.csv" % data_type)
    create_folder(os.path.dirname(out_csv_path))

    cnt = 0
    f = open(out_csv_path, 'w')
    f.write("%s\t%s\t%s\t%s\n" %
            ("speech_name", "noise_name", "noise_onset", "noise_offset"))
    for speech_na in speech_names:
        # Read speech.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path)
        len_speech = len(speech_audio)

        # For training data, mix each speech with randomly picked #magnification noises.
        if data_type == 'train':
            selected_noise_names = rs.choice(noise_names,
                                             size=magnification,
                                             replace=False)
        # For test data, mix each speech with all noises.
        elif data_type == 'test':
            selected_noise_names = noise_names
        else:
            raise Exception("data_type must be train | test!")

        # Mix one speech with different noises many times.
        for noise_na in selected_noise_names:
            noise_path = os.path.join(noise_dir, noise_na)
            (noise_audio, _) = read_audio(noise_path)

            len_noise = len(noise_audio)

            if len_noise <= len_speech:
                noise_onset = 0
                noise_offset = len_speech
            # If noise is longer than speech, randomly select a segment of it.
            else:
                noise_onset = rs.randint(0, len_noise - len_speech, size=1)[0]
                noise_offset = noise_onset + len_speech

            if cnt % 100 == 0:
                print(cnt)

            cnt += 1
            f.write("%s\t%s\t%d\t%d\n" %
                    (speech_na, noise_na, noise_onset, noise_offset))
    f.close()
    print(out_csv_path)
    print("Create %s mixture csv finished!" % data_type)
Example #18
def calculate_mixture_features(data_type):
    """Calculate spectrogram for mixed, speech and noise audio. Then write the 
    features to disk. 
    
    Args:
      workspace: str, path of workspace. 
      speech_dir: str, path of speech data. 
      noise_dir: str, path of noise data. 
      data_type: str, 'train' | 'test'. 
      snr: float, signal to noise ratio to be mixed. 
    """
    workspace = config.workspace
    data_dir = config.data_dir
    speech_dir = os.path.join(data_dir, '{}_speech'.format(data_type))
    noise_dir = os.path.join(data_dir, '{}_noise'.format(data_type))

    fs = config.sample_rate

    if data_type == 'train':
        snr = config.Tr_SNR
    elif data_type == 'test':
        snr = config.Te_SNR
    else:
        raise Exception("data_type must be train | test!")

    # Open mixture csv.
    mixture_csv_path = os.path.join(workspace, "mixture_csvs",
                                    "%s.csv" % data_type)
    with open(mixture_csv_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        lis = list(reader)

    t1 = time.time()
    cnt = 0
    for i1 in range(1, len(lis)):
        [speech_na, noise_na, noise_onset, noise_offset] = lis[i1]
        noise_onset = int(noise_onset)
        noise_offset = int(noise_offset)

        # Read speech audio.
        speech_path = os.path.join(speech_dir, speech_na)
        (speech_audio, _) = read_audio(speech_path, target_fs=fs)

        # Read noise audio.
        noise_path = os.path.join(noise_dir, noise_na)
        (noise_audio, _) = read_audio(noise_path, target_fs=fs)

        # Repeat noise to the same length as speech.
        if len(noise_audio) < len(speech_audio):
            n_repeat = int(
                np.ceil(float(len(speech_audio)) / float(len(noise_audio))))
            noise_audio_ex = np.tile(noise_audio, n_repeat)
            noise_audio = noise_audio_ex[0:len(speech_audio)]
        # Truncate noise to the same length as speech.
        else:
            noise_audio = noise_audio[noise_onset:noise_offset]

        # Scale speech to given snr.
        scaler = get_amplitude_scaling_factor(speech_audio,
                                              noise_audio,
                                              snr=snr)
        speech_audio *= scaler

        # Get normalized mixture, speech, noise.
        (mixed_audio, speech_audio, noise_audio,
         alpha) = additive_mixing(speech_audio, noise_audio)

        # Write out mixed audio.
        out_bare_na = os.path.join(
            "%s.%s" %
            (os.path.splitext(speech_na)[0], os.path.splitext(noise_na)[0]))
        out_audio_path = os.path.join(workspace, "mixed_audios", "spectrogram",
                                      data_type, "%ddb" % int(snr),
                                      "%s.wav" % out_bare_na)
        create_folder(os.path.dirname(out_audio_path))
        write_audio(out_audio_path, mixed_audio, fs)

        # Extract spectrogram.
        mixed_complx_x = calc_sp(mixed_audio, mode='complex')
        speech_x = calc_sp(speech_audio, mode='magnitude')
        noise_x = calc_sp(noise_audio, mode='magnitude')

        # Write out features.
        out_feat_path = os.path.join(workspace, "features", "spectrogram",
                                     data_type, "%ddb" % int(snr),
                                     "%s.p" % out_bare_na)
        create_folder(os.path.dirname(out_feat_path))
        data = [mixed_complx_x, speech_x, noise_x, alpha, out_bare_na]
        with open(out_feat_path, 'wb') as feat_file:
            pickle.dump(data, feat_file, protocol=pickle.HIGHEST_PROTOCOL)

        # Print.
        if cnt % 100 == 0:
            print(cnt)

        cnt += 1

    print("Extracting feature time: %s" % (time.time() - t1))
Example #19
                is_speech = self.vad.is_speech(audio[start:stop],
                                               sample_rate=self.sample_rate)
                vad_res.append(1 if is_speech else 0)

                #print(is_speech)

        return vad_res


if __name__ == "__main__":
    filepath = '../db/test/file003_e.wav'

    vad = WebrtcVAD()

    audio = read_audio(filepath)
    vad_res = vad.perform_vad(audio)

    vad_res = np.array(vad_res)
    print(vad_res)

    x = len(vad_res) * 640 / 16000
    x = np.linspace(0, x, len(vad_res))
    print(x.shape, vad_res.shape)

    plt.plot(x, vad_res)
    plt.xticks(np.arange(0, 42, step=2))
    plt.show()
Example #20
            # "exp/rnn_speech_enhancement/8k/2_00_8k_raw.wav",
        ]

        decode_file_list_16k = [
            "exp/test_oc/refer_wav/hebing2_ref.wav",
            "exp/test_oc/refer_wav/test1_ref.wav",
        ]
        if PARAM.FS == 8000:
            decode_file_list = decode_file_list_8k
        elif PARAM.FS == 16000:
            decode_file_list = decode_file_list_16k
        else:
            print('PARAM.FS error, exit.')
            exit(-1)
        for i, mixed_dir in enumerate(decode_file_list):
            print(i + 1, mixed_dir)
            waveData, sr = utils.read_audio(mixed_dir)
            reY, mask = decode_one_wav(sess, model, waveData)
            print(np.max(reY))
            abs_max = (2**(PARAM.AUDIO_BITS - 1) - 1)
            reY = np.where(reY > abs_max, abs_max, reY)
            reY = np.where(reY < -abs_max, -abs_max, reY)
            utils.write_audio(
                os.path.join(decode_ans_file, (ckpt + '_%03d_' % (i + 1)) +
                             mixed_dir[mixed_dir.rfind('/') + 1:]), reY, sr)
            file_name = mixed_dir[mixed_dir.rfind('/') +
                                  1:mixed_dir.rfind('.')]
            utils.picture_spec(
                mask,
                os.path.join(decode_ans_file,
                             (ckpt + '_%03d_' % (i + 1)) + file_name))
Example #21
def calculate_feature_for_all_audio_files(args):
    '''Calculate feature of audio files and write out features to a hdf5 file. 
    
    Args:
      dataset_dir: string
      workspace: string
      mini_data: bool, set True for debugging on a small part of data
    '''

    # Arguments & parameters
    dataset_dir = args.dataset_dir
    workspace = args.workspace
    mini_data = args.mini_data

    sample_rate = config.sample_rate
    window_size = config.window_size
    hop_size = config.hop_size
    mel_bins = config.mel_bins
    fmin = config.fmin
    fmax = config.fmax
    frames_per_second = config.frames_per_second
    frames_num = config.frames_num
    total_samples = config.total_samples
    lb_to_idx = config.lb_to_idx
    audio_duration_clip = config.audio_duration_clip
    audio_stride_clip = config.audio_stride_clip
    audio_duration = config.audio_duration
    audio_num = config.audio_num
    total_frames = config.total_frames
    # Paths
    if mini_data:
        prefix = 'minidata_'
    else:
        prefix = ''

    audios_dir = os.path.join(dataset_dir, 'audio')
    metadata_path = os.path.join(dataset_dir, 'meta', 'esc50.csv')
    feature_path = os.path.join(
        workspace, 'features',
        '{}logmel_{}frames_{}melbins.h5'.format(prefix, frames_per_second,
                                                mel_bins))
    create_folder(os.path.dirname(feature_path))
    # Feature extractor
    feature_extractor = LogMelExtractor(sample_rate=sample_rate,
                                        window_size=window_size,
                                        hop_size=hop_size,
                                        mel_bins=mel_bins,
                                        fmin=fmin,
                                        fmax=fmax)

    # Read metadata
    meta_dict = read_metadata(metadata_path)

    # Extract features and targets
    if mini_data:
        mini_num = 10
        total_num = len(meta_dict['filename'])
        random_state = np.random.RandomState(1234)
        indexes = random_state.choice(total_num, size=mini_num, replace=False)
        for key in meta_dict.keys():
            meta_dict[key] = meta_dict[key][indexes]

    print('Extracting features of all audio files ...')
    extract_time = time.time()
    # Hdf5 file for storing features and targets
    hf = h5py.File(feature_path, 'w')

    hf.create_dataset(
        name='filename',
        data=[filename.encode() for filename in meta_dict['filename']],
        dtype='S80')

    if 'fold' in meta_dict.keys():
        hf.create_dataset(name='fold',
                          data=[fold for fold in meta_dict['fold']],
                          dtype=np.int64)

    if 'target' in meta_dict.keys():
        hf.create_dataset(name='target',
                          data=[target for target in meta_dict['target']],
                          dtype=np.int64)

    if 'category' in meta_dict.keys():
        hf.create_dataset(
            name='category',
            data=[category.encode() for category in meta_dict['category']],
            dtype='S80')
    if 'esc10' in meta_dict.keys():
        hf.create_dataset(name='esc10',
                          data=[esc10 for esc10 in meta_dict['esc10']],
                          dtype=bool)  # np.bool was removed in NumPy 1.24
    if 'src_file' in meta_dict.keys():
        hf.create_dataset(
            name='src_file',
            data=[src_file for src_file in meta_dict['src_file']],
            dtype=np.int64)
    if 'take' in meta_dict.keys():
        hf.create_dataset(name='take',
                          data=[take.encode() for take in meta_dict['take']],
                          dtype='S24')

    hf.create_dataset(name='feature',
                      shape=(0, audio_num, frames_num, mel_bins),
                      maxshape=(None, audio_num, frames_num, mel_bins),
                      dtype=np.float32)

    for (n, filename) in enumerate(meta_dict['filename']):
        audio_path = os.path.join(audios_dir, filename)
        print(n, audio_path)

        # Read audio
        (audio, _) = read_audio(audio_path=audio_path, target_fs=sample_rate)

        # Pad or truncate audio recording to the same length
        audio = pad_truncate_sequence(audio, total_samples)
        # Extract feature
        fea_list = []
        #         for i in range(audio_num):
        #             audio_clip = audio[i*sample_rate*audio_stride_clip: (i+2)*sample_rate*audio_stride_clip]
        #             feature = feature_extractor.transform(audio_clip)
        #             feature = feature[0 : frames_per_second*audio_duration_clip]
        #             fea_list.append(feature)
        feature = feature_extractor.transform(audio)
        # Remove the extra log mel spectrogram frames caused by padding zero
        feature = feature[0:total_frames]
        for i in range(audio_num):
            feature_clip = feature[i * frames_per_second *
                                   audio_stride_clip:(i +
                                                      audio_duration_clip) *
                                   frames_per_second * audio_stride_clip]
            fea_list.append(feature_clip)

        hf['feature'].resize((n + 1, audio_num, frames_num, mel_bins))
        hf['feature'][n] = fea_list

    hf.close()

    print('Write hdf5 file to {} using {:.3f} s'.format(
        feature_path,
        time.time() - extract_time))
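
The stored features can be read back with h5py; a minimal sketch (the path mirrors the format string above, values illustrative):

import h5py

with h5py.File('workspace/features/logmel_100frames_64melbins.h5', 'r') as hf:
    filenames = [name.decode() for name in hf['filename'][:]]
    features = hf['feature'][:]  # (n_clips, audio_num, frames_num, mel_bins)
print(features.shape, filenames[:3])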
Example #22
     file_list = in_file.read().split('\n')
 del file_list[-1]
 perm = np.random.permutation(len(file_list))
 bno = 0
 for i in tqdm(range(0, len(file_list), args.batch_size),
               desc='Generating data for {}'.format(data_type)):
     bno = bno + 1
     s_n_abs_list = []
     s_x_abs_list = []
     for bid in range(0, args.batch_size):
         if i + bid < len(file_list):
             f_template = file_list[perm[i + bid]]
             for ch in range(1, 9):
                 f = f_template + '_ch{}.wav'.format(ch)
                 f_no_ltr = f_template + '_ch{}.NLR.wav'.format(ch)
                 ltr_audio = read_audio(f)
                 no_ltr_audio = read_audio(f_no_ltr)
                 fx, tx, s_x = signal.stft(no_ltr_audio,
                                           fs=16000,
                                           nperseg=512,
                                           noverlap=512 - 128,
                                           nfft=512)
                 fn, tn, s_n = signal.stft(ltr_audio,
                                           fs=16000,
                                           nperseg=512,
                                           noverlap=512 - 128,
                                           nfft=512)
                 s_x = np.transpose(s_x)
                 s_n = np.transpose(s_n)
                 s_x_abs = 20 * log_sp(np.abs(s_x))
                 s_n_abs = 20 * log_sp(np.abs(s_n))