def get_raw_data():
    NFFT = 2 ** int(np.ceil(np.log2(cfg.frame_size)))
    audio_paths_labels_and_names = get_film_clap_paths_and_labels(
        "../../data/FilmClap", time_margin=cfg.time_margin)
    features = []
    label_sets = []
    file_names = []
    for i, (audio_path, start_times, end_times, audio_name) in enumerate(audio_paths_labels_and_names):
        assert "_".join(audio_name.split("_")[1:]) in audio_path
        waveform = read_multichannel_audio(audio_path, target_fs=cfg.working_sample_rate)
        waveform = waveform.T  # (samples, channels) -> (channels, samples)

        # Split the waveform into overlapping frames and create a label for each frame
        frames, labels = split_to_frames_with_hop_size(waveform, start_times, end_times)
        frames = np.concatenate(frames, axis=0)

        # Window each frame, compute its spectrum and convert to log-mel features
        frames *= np.hanning(frames.shape[1])
        complex_spectogram = np.fft.rfft(frames, NFFT)
        mel_features = multichannel_complex_to_log_mel(complex_spectogram)

        features.append(mel_features)
        label_sets.append(np.array(labels))
        file_names.append(audio_name)

    data = list(zip(features, label_sets, file_names))
    return data
def preprocess_data(audio_path_and_labels, output_dir, output_mean_std_file, preprocess_mode='logMel'):
    print("Preprocessing collected data")
    os.makedirs(output_dir, exist_ok=True)
    all_features = []
    for (audio_path, start_times, end_times, audio_name) in tqdm(audio_path_and_labels):
        multichannel_waveform = read_multichannel_audio(
            audio_path=audio_path, target_fs=cfg.working_sample_rate)
        feature = multichannel_stft(multichannel_waveform)
        if preprocess_mode == 'logMel':
            feature = multichannel_complex_to_log_mel(feature)
        all_features.append(feature)

        # Dump the per-file features together with their event annotations
        output_path = os.path.join(
            output_dir, audio_name + f"_{preprocess_mode}_features_and_labels.pkl")
        with open(output_path, 'wb') as f:
            pickle.dump(
                {
                    'features': feature,
                    'start_times': start_times,
                    'end_times': end_times
                }, f)

    # Compute a global mean/std over all frames for later feature normalization
    all_features = np.concatenate(all_features, axis=1)
    mean, std = calculate_scalar_of_tensor(all_features)
    with open(output_mean_std_file, 'wb') as f:
        pickle.dump({'mean': mean, 'std': std}, f)

    # Visualize a single data sample
    (audio_path, start_times, end_times, audio_name) = random.choice(audio_path_and_labels)
    analyze_data_sample(
        audio_path, start_times, end_times, audio_name,
        os.path.join(os.path.dirname(output_mean_std_file), "data_sample.png"))
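# --- Illustrative sketch (not part of the repository) ---
# A minimal example of how the per-file feature pickles and the global mean/std
# pickle written by preprocess_data above might be loaded and used to normalize
# features before training. The dict layouts match the pickle.dump calls above;
# the helper name and its usage are assumptions.
def load_and_normalize(features_pkl_path, mean_std_pkl_path):
    with open(features_pkl_path, 'rb') as f:
        sample = pickle.load(f)   # {'features', 'start_times', 'end_times'}
    with open(mean_std_pkl_path, 'rb') as f:
        scaler = pickle.load(f)   # {'mean', 'std'}
    normalized = (sample['features'] - scaler['mean']) / scaler['std']
    return normalized, sample['start_times'], sample['end_times']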
def analyze_data_sample(audio_path, start_times, end_times, audio_name, plot_path):
    """
    A debug function that plots a single sample and analyzes how the
    spectrogram configuration affects the final feature size
    """
    from dataset.spectogram.spectograms_dataset import create_event_matrix

    org_multichannel_audio, org_sample_rate = soundfile.read(audio_path)
    multichannel_audio = read_multichannel_audio(
        audio_path=audio_path, target_fs=cfg.working_sample_rate)
    feature = multichannel_stft(multichannel_audio)
    feature = multichannel_complex_to_log_mel(feature)  # (channels, frames, mel_bins)
    event_matrix = create_event_matrix(feature.shape[1], start_times, end_times)
    plot_sample_features(feature,
                         mode='spectogram',
                         target=event_matrix,
                         plot_path=plot_path,
                         file_name=audio_name)

    signal_time = multichannel_audio.shape[0] / cfg.working_sample_rate
    FPS = cfg.working_sample_rate / cfg.hop_size
    print(f"Data sample analysis: {audio_name}")
    print(f"\tOriginal audio: {org_multichannel_audio.shape} sample_rate={org_sample_rate}")
    print(f"\tResampled audio: {multichannel_audio.shape}, sample_rate={cfg.working_sample_rate}")
    print(f"\tSignal time is (num_samples/sample_rate)={signal_time:.1f}s")
    print(f"\tSTFT FPS is (sample_rate/hop_size)={FPS}")
    print(f"\tTotal number of frames is (FPS*signal_time)={FPS*signal_time:.1f}")
    print(f"\tEach frame covers {cfg.frame_size} samples or {cfg.frame_size / cfg.working_sample_rate:.3f} seconds, "
          f"padded to {cfg.NFFT} samples, allowing ({cfg.NFFT}//2+1)={cfg.NFFT // 2 + 1} frequency bins")
    print(f"\tFeatures shape: {feature.shape}")
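# --- Worked example (illustrative only; the real cfg constants may differ) ---
# Reproduces the arithmetic printed by analyze_data_sample with assumed values:
# 48 kHz sample rate, 1024-sample frames, 256-sample hop.
def _example_feature_size_arithmetic():
    sample_rate, frame_size, hop_size = 48000, 1024, 256
    fps = sample_rate / hop_size                   # 187.5 spectrogram frames per second
    frame_seconds = frame_size / sample_rate       # ~0.021 s of audio per frame
    nfft = 2 ** int(np.ceil(np.log2(frame_size)))  # 1024 padded samples per frame
    freq_bins = nfft // 2 + 1                      # 513 frequency bins from np.fft.rfft
    return fps, frame_seconds, nfft, freq_bins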
    type=str, default='inference_outputs', help='Directory of your workspace.')
parser.add_argument('--device', default='cuda:0', type=str)
args = parser.parse_args()

device = torch.device("cuda:0" if torch.cuda.is_available()
                      and args.device == "cuda:0" else "cpu")

model = Cnn_AvgPooling(cfg.classes_num).to(device)
# checkpoint = torch.load(args.ckpt, map_location=device)
# model.load_state_dict(checkpoint['model'])

print("Preprocessing audio file..")
multichannel_audio = read_multichannel_audio(
    audio_path=args.audio_file, target_fs=cfg.working_sample_rate)
log_mel_features = multichannel_complex_to_log_mel(
    multichannel_stft(multichannel_audio))[0]

print("Inference..")
with torch.no_grad():
    output_event = model(
        torch.from_numpy(log_mel_features).to(device).float().unsqueeze(1))
    output_event = output_event.cpu()

os.makedirs(args.outputs_dir, exist_ok=True)
plot_debug_image(
    log_mel_features,
    output=output_event[0],
    plot_path=os.path.join(
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
import soundfile

if __name__ == '__main__':
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/Meron/S005-S004T1.WAV'
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/StillJames/2C-T001.WAV'
    # audio_path = '/home/ariel/projects/sound/data/FilmClap/original/JackRinger-05/161019_1233.wav'
    audio_path = '/home/ariel/projects/sound/data/FilmClap/original/StillJames/8D-T001.WAV'
    sec_start = 35.45
    sec_end = 35.65

    # Load the audio, cut out the requested time window and write it to a temporary file
    multichannel_waveform = read_multichannel_audio(
        audio_path=audio_path, target_fs=cfg.working_sample_rate)
    multichannel_waveform = multichannel_waveform[
        int(cfg.working_sample_rate * sec_start):int(cfg.working_sample_rate * sec_end)]
    soundfile.write("tmp_file.WAV", multichannel_waveform, cfg.working_sample_rate)

    # Compute the log-mel spectrogram of the selected window
    feature = multichannel_stft(multichannel_waveform)
    feature = multichannel_complex_to_log_mel(feature)

    # Build x-axis ticks (in seconds) for the spectrogram plot
    frames_num = feature.shape[1]
    tick_hop = max(1, frames_num // 20)
    xticks = np.concatenate((np.arange(0, frames_num - tick_hop, tick_hop), [frames_num]))
    xlabels = [f"{x / cfg.frames_per_second:.3f}s" for x in xticks]