Code example #1
import numpy as np

def get_raw_data():
    # Round the frame size up to the next power of two for the FFT
    NFFT = 2**int(np.ceil(np.log2(cfg.frame_size)))

    audio_paths_labels_and_names = get_film_clap_paths_and_labels(
        "../../data/FilmClap", time_margin=cfg.time_margin)

    features = []
    label_sets = []
    file_names = []
    for i, (audio_path, start_times, end_times,
            audio_name) in enumerate(audio_paths_labels_and_names):
        assert "_".join(audio_name.split("_")[1:]) in audio_path
        waveform = read_multichannel_audio(audio_path,
                                           target_fs=cfg.working_sample_rate)
        waveform = waveform.T  # -> (channels, samples)
        # Split the waveform into overlapping frames and create a label for each
        frames, labels = split_to_frames_with_hop_size(waveform, start_times,
                                                       end_times)
        frames = np.concatenate(frames, axis=0)
        frames *= np.hanning(frames.shape[1])  # window every frame via broadcasting
        complex_spectogram = np.fft.rfft(frames, NFFT)  # zero-padded rFFT to NFFT bins
        mel_features = multichannel_complex_to_log_mel(complex_spectogram)

        features.append(mel_features)
        label_sets.append(np.array(labels))
        file_names.append(audio_name)

    data = list(zip(features, label_sets, file_names))
    return data
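
The framing helper above is defined elsewhere in the repository. A minimal sketch of what split_to_frames_with_hop_size plausibly does, assuming a (channels, samples) input, binary overlap labels, and defaults that stand in for cfg.frame_size, cfg.hop_size and cfg.working_sample_rate (all assumptions, not the repository's exact code):

def split_to_frames_with_hop_size(waveform, start_times, end_times,
                                  frame_size=2048, hop_size=512, fs=48000):
    # Sketch only: slice a (channels, samples) waveform into overlapping
    # frames and mark a frame positive if it overlaps any annotated event
    _, num_samples = waveform.shape
    frames, labels = [], []
    for start in range(0, num_samples - frame_size + 1, hop_size):
        frames.append(waveform[:, start:start + frame_size])
        t0, t1 = start / fs, (start + frame_size) / fs
        labels.append(int(any(s < t1 and e > t0
                              for s, e in zip(start_times, end_times))))
    return frames, labels
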
Code example #2
import os
import pickle
import random
import numpy as np
from tqdm import tqdm

def preprocess_data(audio_path_and_labels,
                    output_dir,
                    output_mean_std_file,
                    preprocess_mode='logMel'):
    print("Preprocessing collected data")
    os.makedirs(output_dir, exist_ok=True)

    all_features = []

    for (audio_path, start_times, end_times,
         audio_name) in tqdm(audio_path_and_labels):
        multichannel_waveform = read_multichannel_audio(
            audio_path=audio_path, target_fs=cfg.working_sample_rate)
        feature = multichannel_stft(multichannel_waveform)
        if preprocess_mode == 'logMel':
            feature = multichannel_complex_to_log_mel(feature)
        all_features.append(feature)

        # Save this file's features together with its event annotations
        output_path = os.path.join(
            output_dir,
            audio_name + f"_{preprocess_mode}_features_and_labels.pkl")
        with open(output_path, 'wb') as f:
            pickle.dump(
                {
                    'features': feature,
                    'start_times': start_times,
                    'end_times': end_times
                }, f)

    # Stack all (channels, frames, mel_bins) features along the frame axis
    # and compute a dataset-wide mean/std for later normalization
    all_features = np.concatenate(all_features, axis=1)
    mean, std = calculate_scalar_of_tensor(all_features)
    with open(output_mean_std_file, 'wb') as f:
        pickle.dump({'mean': mean, 'std': std}, f)

    # Visualize a single random data sample
    (audio_path, start_times, end_times,
     audio_name) = random.choice(audio_path_and_labels)
    analyze_data_sample(
        audio_path, start_times, end_times, audio_name,
        os.path.join(os.path.dirname(output_mean_std_file), "data_sample.png"))
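
Both examples lean on repository helpers that are not reproduced on this page. Minimal sketches of read_multichannel_audio and calculate_scalar_of_tensor, assuming soundfile and librosa handle I/O and resampling and that statistics are taken per mel bin (the exact signatures and behavior are assumptions):

import numpy as np
import soundfile
import librosa

def read_multichannel_audio(audio_path, target_fs=None):
    # Sketch: load audio as (samples, channels) and resample each channel
    # to target_fs if the file's native rate differs
    audio, fs = soundfile.read(audio_path)
    if audio.ndim == 1:
        audio = audio[:, None]  # treat mono as a single channel
    if target_fs is not None and fs != target_fs:
        audio = np.stack([
            librosa.resample(audio[:, c], orig_sr=fs, target_sr=target_fs)
            for c in range(audio.shape[1])
        ], axis=1)
    return audio

def calculate_scalar_of_tensor(features):
    # Sketch: per-mel-bin mean/std over all channels and frames of a
    # (channels, frames, mel_bins) tensor
    flat = features.reshape(-1, features.shape[-1])
    return flat.mean(axis=0), flat.std(axis=0)
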
Code example #3
import soundfile

def analyze_data_sample(audio_path, start_times, end_times, audio_name,
                        plot_path):
    """
    A debug function that plots a single sample and analyzes how the spectogram configuration affect the feature final size
    """
    from dataset.spectogram.spectograms_dataset import create_event_matrix
    org_multichannel_audio, org_sample_rate = soundfile.read(audio_path)

    multichannel_audio = read_multichannel_audio(
        audio_path=audio_path, target_fs=cfg.working_sample_rate)
    feature = multichannel_stft(multichannel_audio)
    feature = multichannel_complex_to_log_mel(
        feature)  # (channels, frames, mel_bins)
    event_matrix = create_event_matrix(feature.shape[1], start_times,
                                       end_times)
    plot_sample_features(feature,
                         mode='spectogram',
                         target=event_matrix,
                         plot_path=plot_path,
                         file_name=audio_name)

    signal_time = multichannel_audio.shape[0] / cfg.working_sample_rate
    FPS = cfg.working_sample_rate / cfg.hop_size
    print(f"Data sample analysis: {audio_name}")
    print(
        f"\tOriginal audio: {org_multichannel_audio.shape} sample_rate={org_sample_rate}"
    )
    print(
        f"\tResampled audio: {multichannel_audio.shape}, sample_rate={cfg.working_sample_rate}"
    )
    print(f"\tSignal time is (num_samples/sample_rate)={signal_time:.1f}s")
    print(f"\tSIFT FPS is (sample_rate/hop_size)={FPS}")
    print(
        f"\tTotal number of frames is (FPS*signal_time)={FPS*signal_time:.1f}")
    print(
        f"\tEach frame covers {cfg.frame_size} samples or {cfg.frame_size / cfg.working_sample_rate:.3f} seconds, "
        f"padded to {cfg.NFFT} samples, allowing ({cfg.NFFT}//2+1)={cfg.NFFT // 2 + 1} frequency bins"
    )
    print(f"\tFeatures shape: {feature.shape}")
Code example #4
    parser = argparse.ArgumentParser()
    # Argument setup reconstructed from the args.* usages below; the original
    # excerpt began mid-way through the --outputs_dir call
    parser.add_argument('--audio_file', type=str,
                        help='Path to the audio file to run inference on.')
    parser.add_argument('--ckpt', type=str,
                        help='Path to a trained model checkpoint.')
    parser.add_argument('--outputs_dir',
                        type=str,
                        default='inference_outputs',
                        help='Directory of your workspace.')
    parser.add_argument('--device', default='cuda:0', type=str)
    args = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available()
                          and args.device == "cuda:0" else "cpu")

    model = Cnn_AvgPooling(cfg.classes_num).to(device)
    # checkpoint = torch.load(args.ckpt, map_location=device)
    # model.load_state_dict(checkpoint['model'])

    print("Preprocessing audio file..")

    multichannel_audio = read_multichannel_audio(
        audio_path=args.audio_file, target_fs=cfg.working_sample_rate)

    log_mel_features = multichannel_complex_to_log_mel(
        multichannel_stft(multichannel_audio))[0]  # keep only the first channel

    print("Inference..")
    with torch.no_grad():
        output_event = model(
            torch.from_numpy(log_mel_features).to(device).float().unsqueeze(1))
    output_event = output_event.cpu()
    os.makedirs(args.outputs_dir, exist_ok=True)

    plot_debug_image(
        log_mel_features,
        output=output_event[0],
        plot_path=os.path.join(
            args.outputs_dir, "inference_debug.png"))  # file name is a placeholder
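
The frame-wise output can also be mapped back to time stamps with the FPS relation from code example #3. A minimal sketch, assuming numpy is imported as np, that output_event[0] holds per-frame clap probabilities, and an illustrative 0.5 threshold (all assumptions):

    # Sketch: threshold frame-wise probabilities and convert frame indices
    # to seconds; the output shape and the 0.5 threshold are assumptions
    probs = output_event[0].numpy().squeeze()
    fps = cfg.working_sample_rate / cfg.hop_size  # frames per second
    event_times = np.flatnonzero(probs > 0.5) / fps
    print(f"Frames above threshold at t={np.round(event_times, 2)}s")
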
Code example #5
import matplotlib
matplotlib.use('TkAgg')  # select the backend before importing pyplot
import matplotlib.pyplot as plt
import numpy as np
import soundfile
# read_multichannel_audio, multichannel_stft, multichannel_complex_to_log_mel
# and cfg come from the surrounding repository

if __name__ == '__main__':
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/Meron/S005-S004T1.WAV'
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/StillJames/2C-T001.WAV'
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/JackRinger-05/161019_1233.wav'
    audio_path = '/home/ariel/projects/sound/data/FilmClap/original/StillJames/8D-T001.WAV'

    sec_start = 35.45
    sec_end = 35.65

    multichannel_waveform = read_multichannel_audio(
        audio_path=audio_path, target_fs=cfg.working_sample_rate)

    # Cut the [sec_start, sec_end] segment, slicing along the samples axis
    start_sample = int(cfg.working_sample_rate * sec_start)
    end_sample = int(cfg.working_sample_rate * sec_end)
    multichannel_waveform = multichannel_waveform[start_sample:end_sample]
    soundfile.write("tmp_file.WAV", multichannel_waveform,
                    cfg.working_sample_rate)
    feature = multichannel_stft(multichannel_waveform)
    feature = multichannel_complex_to_log_mel(feature)

    # Place a tick roughly every 1/20 of the spectrogram width, labeled in seconds
    frames_num = feature.shape[1]
    tick_hop = max(1, frames_num // 20)
    xticks = np.concatenate((np.arange(0, frames_num - tick_hop,
                                       tick_hop), [frames_num]))
    xlabels = [f"{x / cfg.frames_per_second:.3f}s" for x in xticks]
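
The excerpt stops before any plotting call. A minimal sketch of how the ticks computed above might be used to display the first channel's log-mel features (the plotting code is an assumption, not the original script's):

    # Sketch: render channel 0 as (mel_bins, frames) with time ticks
    plt.imshow(feature[0].T, origin='lower', aspect='auto')
    plt.xticks(xticks, xlabels, rotation=90)
    plt.xlabel('time')
    plt.ylabel('mel bin')
    plt.show()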