def add_tone(
        audio, start_time, duration, amplitude, frequency, channel_num=0,
        taper_duration=0):
    """
    Adds a sine tone to one channel of `audio`, modifying it in place.

    The tone is `amplitude * sin(2 * pi * frequency * k / fs)` for frame
    numbers `k` in `range(length)`, where `fs` is `audio.sample_rate`
    and `length` is `duration` converted to frames with
    `signal_utils.seconds_to_frames`.

    :param audio: object with `sample_rate` and `samples` attributes.
        `samples` is indexed as `[channel_num, frame_slice]`.
    :param start_time: tone start time in seconds.
    :param duration: tone duration in seconds.
    :param amplitude: peak amplitude of the tone.
    :param frequency: tone frequency (hertz, assuming `sample_rate` is
        in hertz).
    :param channel_num: index of the channel to add the tone to.
    :param taper_duration: duration in seconds of the linear ramps
        applied to the start and end of the tone, or zero for none.
    """

    fs = audio.sample_rate

    # Create tone.
    length = signal_utils.seconds_to_frames(duration, fs)
    phases = np.arange(length) * 2 * np.pi * frequency / fs
    tone = amplitude * np.sin(phases)

    # Taper ends if specified.
    if taper_duration != 0:

        n = signal_utils.seconds_to_frames(taper_duration, fs)

        # A nonzero taper duration may still convert to zero frames.
        # In that case `tone[-n:]` would be `tone[0:]`, i.e. the whole
        # tone, and multiplying it by an empty ramp would fail to
        # broadcast, so we skip tapering entirely.
        if n != 0:
            ramp = np.arange(n) / n
            tone[:n] *= ramp
            tone[-n:] *= 1 - ramp

    # Add tone to audio.
    start_index = signal_utils.seconds_to_frames(start_time, fs)
    audio.samples[channel_num, start_index:start_index + length] += tone
def __init__(
        self, file_path, sample_rate, score_scale_factor,
        score_repetition_factor, output_start_offset=0,
        output_duration=None):
    """
    Initializes this writer and opens its output wave file.

    `output_start_offset` and `output_duration` are in seconds and are
    converted to frame indices at `sample_rate`. When `output_duration`
    is `None` the output end index is left as `None`.
    """

    to_frames = signal_utils.seconds_to_frames

    self._sample_rate = sample_rate
    self._score_scale_factor = score_scale_factor
    self._score_repetition_factor = score_repetition_factor

    # Output interval, as frame indices.
    start_index = to_frames(output_start_offset, sample_rate)
    self._output_start_index = start_index
    self._output_end_index = (
        None if output_duration is None
        else start_index + to_frames(output_duration, sample_rate))

    # Open the wave file for writing: two channels, two bytes per
    # sample, no compression.
    self._writer = wave.open(file_path, 'wb')
    self._writer.setparams((2, 2, sample_rate, 0, 'NONE', None))

    self._samples_start_index = 0
def get_partition_read_interval(night_interval, first_input_file_start):
    """
    Gets the frame index interval of `night_interval`, with indices
    counted from `first_input_file_start` at `INPUT_SAMPLE_RATE`.
    """

    def to_frames(delta):
        # `delta` is the difference of two of the interval's time values.
        return signal_utils.seconds_to_frames(
            delta.total_seconds(), INPUT_SAMPLE_RATE)

    first_index = to_frames(night_interval.start - first_input_file_start)
    frame_count = to_frames(night_interval.end - night_interval.start)

    return Interval(first_index, first_index + frame_count)
def _get_index_interval(time_interval, start_time, sample_rate):

    """
    Gets the audio file index interval corresponding to the specified
    time interval.

    Indices are counted from `start_time`. The interval's end index is
    its start index plus its duration in frames, so the start offset
    and the duration are each converted to frames separately.
    """

    interval_start = time_interval.start

    seconds_before = (interval_start - start_time).total_seconds()
    first = signal_utils.seconds_to_frames(seconds_before, sample_rate)

    seconds_long = (time_interval.end - interval_start).total_seconds()
    count = signal_utils.seconds_to_frames(seconds_long, sample_rate)

    return Interval(start=first, end=first + count)
def _get_segment_source(clip, segment_source, source_duration):
    """
    Gets the part of `clip` from which a segment may be extracted.

    Returns a `(start_index, length)` pair for the whole-clip and
    clip-center sources, and `clip.selection` for the selection source.
    `source_duration` (seconds) is used only for the clip-center source.

    Raises `ValueError` if `segment_source` is not recognized.
    """

    clip_length = clip.length
    whole_clip = (0, clip_length)

    if segment_source == SEGMENT_SOURCE_CLIP:
        return whole_clip

    if segment_source == SEGMENT_SOURCE_CLIP_CENTER:

        centered_length = signal_utils.seconds_to_frames(
            source_duration, clip.sample_rate)

        # A requested source at least as long as the clip is just the
        # whole clip.
        if centered_length >= clip_length:
            return whole_clip

        margin = int((clip_length - centered_length) // 2)
        return (margin, centered_length)

    if segment_source == SEGMENT_SOURCE_SELECTION:
        return clip.selection

    raise ValueError(
        'Unrecognized clip segment source "{}".'.format(segment_source))
def _process_detector_output(self, output_file_path):
    """
    Reads a detector output CSV file and reports one clip per row to
    the listener.

    The first row is skipped as a header. In each remaining row,
    column 0 is the peak time (parsed with `self._parse_time`),
    column 1 is the classification, and column 2 is the detector score.
    Each clip has length `self._clip_length` and is centered on the
    peak.
    """

    # Loop-invariant offset from a peak back to the clip start.
    half_clip_length = self._clip_length // 2

    # The `csv` module documentation says that files passed to
    # `csv.reader` should be opened with `newline=''`.
    with open(output_file_path, newline='') as output_file:

        reader = csv.reader(output_file)

        # Skip header.
        next(reader)

        for row in reader:

            # Get clip start index from peak time.
            peak_time = self._parse_time(row[0])
            peak_index = signal_utils.seconds_to_frames(
                peak_time, self._input_sample_rate)
            start_index = peak_index - half_clip_length

            # Get detector score.
            annotations = {'Detector Score': float(row[2])}

            # Get classification. Clips classified as 'OTHE' get no
            # classification annotation.
            classification = row[1]
            if classification != 'OTHE':
                annotations['Classification'] = 'Call.' + classification

            self._listener.process_clip(
                start_index, self._clip_length, annotations=annotations)
def _process_timestamps(self, timestamp_file_path):
    """
    Reads a timestamp CSV file and reports one clip per row to the
    listener.

    The first row is skipped as a header. In each remaining row,
    column 1 is the peak time in seconds and column 2 is the detector
    score. Each clip has length `self._clip_length` and is centered on
    the peak.
    """

    # Loop-invariant offset from a peak back to the clip start.
    half_clip_length = self._clip_length // 2

    # The `csv` module documentation says that files passed to
    # `csv.reader` should be opened with `newline=''`.
    with open(timestamp_file_path, newline='') as timestamp_file:

        reader = csv.reader(timestamp_file)

        # Skip header.
        next(reader)

        for row in reader:

            peak_time = float(row[1])

            # Get clip start index from peak time.
            peak_index = signal_utils.seconds_to_frames(
                peak_time, self._input_sample_rate)
            start_index = peak_index - half_clip_length

            score = float(row[2])
            annotations = {'Detector Score': score}

            self._listener.process_clip(
                start_index, self._clip_length, annotations=annotations)
def extract_call(audio, selection, config):
    """
    Extracts a fixed-duration segment of `audio` centered on `selection`.

    `selection` is a `(start_index, end_index)` pair. The extracted
    segment has duration `config.call_segment_duration`. Returns `None`
    if the segment would start before index zero or end past
    `len(audio.samples)`.
    """

    samples = audio.samples
    sample_rate = audio.sample_rate

    selection_start, selection_end = selection
    center = (selection_start + selection_end - 1) // 2

    length = seconds_to_frames(config.call_segment_duration, sample_rate)

    first = center - length // 2
    last = first + length

    # Reject segments that do not lie entirely within the samples.
    if first < 0 or last > len(samples):
        return None

    return Bunch(samples=samples[first:last], sample_rate=sample_rate)
def __init__(self, settings, input_sample_rate, listener):
    """
    Initializes this detector and opens the temporary wave file that
    input audio will be written to.
    """

    open_mp_utils.work_around_multiple_copies_issue()

    # Only show TensorFlow log messages at WARN level and above.
    tf.logging.set_verbosity(tf.logging.WARN)

    self._settings = settings
    self._input_sample_rate = input_sample_rate
    self._listener = listener

    self._clip_length = signal_utils.seconds_to_frames(
        _CLIP_DURATION, self._input_sample_rate)

    # The temporary wave file is created with `delete=False` so it
    # survives being closed: we close it once we have finished writing
    # it, BirdVoxDetect then reopens it for reading, and we delete it
    # ourselves after BirdVoxDetect has finished processing it.
    audio_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    self._audio_file = audio_file

    # All writes to the wave file go through this writer.
    # NOTE(review): the `1` is presumably the number of channels --
    # confirm against `WaveFileWriter`.
    self._audio_file_writer = WaveFileWriter(
        audio_file, 1, self._input_sample_rate)
def _get_segment_source(clip, segment_source, source_duration):
    """
    Determines which part of `clip` a segment may be drawn from.

    For `SEGMENT_SOURCE_CLIP` the result is `(0, clip.length)`. For
    `SEGMENT_SOURCE_CLIP_CENTER` it is a `(start_index, length)` pair
    describing a span of `source_duration` seconds centered in the
    clip, or the whole clip if that span is not shorter than the clip.
    For `SEGMENT_SOURCE_SELECTION` it is `clip.selection`.

    Raises `ValueError` for any other `segment_source`.
    """

    length = clip.length

    if segment_source == SEGMENT_SOURCE_CLIP:
        start_index, source_length = 0, length

    elif segment_source == SEGMENT_SOURCE_CLIP_CENTER:

        source_length = signal_utils.seconds_to_frames(
            source_duration, clip.sample_rate)

        if source_length < length:
            # Center the source within the clip.
            start_index = int((length - source_length) // 2)
        else:
            # Source would cover the entire clip or more.
            start_index, source_length = 0, length

    elif segment_source == SEGMENT_SOURCE_SELECTION:
        return clip.selection

    else:
        message = 'Unrecognized clip segment source "{}".'.format(
            segment_source)
        raise ValueError(message)

    return (start_index, source_length)
def find_call(audio, config):
    """
    Finds the longest detected selection in `audio`.

    Returns a `(start_index, end_index)` pair of frame indices, or
    `None` if `get_longest_selection` yields no selection. `config` is
    not used.
    """

    # TODO: Why does `detect_tseeps` return selections in seconds?
    # TODO: We're tied to tseeps here since we call `detect_tseeps`.
    # Perhaps we should call `detect_events` with an appropriate
    # detector configuration instead.
    detected = nfc_detection_utils.detect_tseeps(audio)
    longest = nfc_detection_utils.get_longest_selection(detected)

    if longest is None:
        return None

    # The selection bounds are times in seconds, so convert them to
    # frame indices.
    start_time, end_time = longest
    rate = float(audio.sample_rate)
    start_index = seconds_to_frames(start_time, rate)
    end_index = seconds_to_frames(end_time, rate)

    return (start_index, end_index)
def __init__(self, settings):
    """
    Precomputes waveform trimming indices, spectrogram settings, and
    spectrogram shape and trimming indices from `settings`.

    All durations in `settings` are converted to frames at
    `settings.waveform_sample_rate`.
    """

    self._settings = settings

    rate = settings.waveform_sample_rate
    to_frames = signal_utils.seconds_to_frames

    # Waveform trimming interval.
    start_index = to_frames(settings.waveform_start_time, rate)
    waveform_length = to_frames(settings.waveform_duration, rate)
    self._start_time_index = start_index
    self._end_time_index = start_index + waveform_length

    # Spectrogram settings.
    window_size = to_frames(settings.spectrogram_window_size, rate)
    hop_size = to_frames(settings.spectrogram_hop_size, rate)
    dft_size = tfa_utils.get_dft_size(window_size)
    window = data_windows.create_window('Hann', window_size)
    self._spectrogram_settings = Settings(
        window=window,
        hop_size=hop_size,
        dft_size=dft_size,
        reference_power=1)

    # Spectrogram shape: number of spectra by number of bins.
    num_spectra = tfa_utils.get_num_analysis_records(
        waveform_length, window_size, hop_size)
    shape = (num_spectra, dft_size // 2 + 1)
    self._spectrogram_shape = shape
    self._augmented_spectrogram_shape = (1,) + shape

    # Spectrogram trimming interval. The end index is one more than
    # the bin number of the end frequency.
    start_bin = _freq_to_dft_bin_num(
        settings.spectrogram_start_freq, rate, dft_size)
    end_bin = _freq_to_dft_bin_num(
        settings.spectrogram_end_freq, rate, dft_size)
    self._start_freq_index = start_bin
    self._end_freq_index = end_bin + 1
def __init__(self, settings):
    """
    Derives the frame-based quantities this object needs from
    `settings`: the waveform trimming interval, the spectrogram
    settings, the spectrogram shape, and the spectrogram frequency
    trimming interval.
    """

    self._settings = settings

    sample_rate = settings.waveform_sample_rate

    def frames(seconds):
        return signal_utils.seconds_to_frames(seconds, sample_rate)

    def bin_num(freq):
        return _freq_to_dft_bin_num(freq, sample_rate, dft_size)

    # Get waveform trimming start and end indices.
    self._start_time_index = frames(settings.waveform_start_time)
    waveform_length = frames(settings.waveform_duration)
    self._end_time_index = self._start_time_index + waveform_length

    # Get spectrogram settings.
    window_size = frames(settings.spectrogram_window_size)
    hop_size = frames(settings.spectrogram_hop_size)
    dft_size = tfa_utils.get_dft_size(window_size)
    self._spectrogram_settings = Settings(
        window=data_windows.create_window('Hann', window_size),
        hop_size=hop_size,
        dft_size=dft_size,
        reference_power=1)

    # Get spectrogram shape, with and without a leading axis of
    # length one.
    num_spectra = tfa_utils.get_num_analysis_records(
        waveform_length, window_size, hop_size)
    num_bins = dft_size // 2 + 1
    self._spectrogram_shape = (num_spectra, num_bins)
    self._augmented_spectrogram_shape = (1, num_spectra, num_bins)

    # Get spectrogram trimming start and end indices. The end index
    # is one past the bin number of the end frequency.
    self._start_freq_index = bin_num(settings.spectrogram_start_freq)
    self._end_freq_index = bin_num(settings.spectrogram_end_freq) + 1
def __init__(
        self, file_path, sample_rate, score_scale_factor,
        score_repetition_factor, output_start_offset=0,
        output_duration=None):
    """
    Stores the score scaling settings, computes the output frame index
    interval, and opens the wave file at `file_path` for writing.

    `output_start_offset` and `output_duration` are in seconds. If
    `output_duration` is `None`, the output end index is `None`.
    """

    self._sample_rate = sample_rate
    self._score_scale_factor = score_scale_factor
    self._score_repetition_factor = score_repetition_factor

    self._output_start_index = signal_utils.seconds_to_frames(
        output_start_offset, sample_rate)

    if output_duration is not None:
        output_length = signal_utils.seconds_to_frames(
            output_duration, sample_rate)
        self._output_end_index = self._output_start_index + output_length
    else:
        self._output_end_index = None

    # Open wave file. The parameters are two channels, two bytes per
    # sample, the sample rate, zero frames so far, and no compression.
    wave_params = (2, 2, sample_rate, 0, 'NONE', None)
    self._writer = wave.open(file_path, 'wb')
    self._writer.setparams(wave_params)

    self._samples_start_index = 0
def _notify_listener_of_clips(
        self, peak_indices, peak_scores, input_length, threshold):
    """
    Notifies the listener of the clip for each score peak.

    :param peak_indices: indices of score peaks, in units of hops.
        Not modified by this method.
    :param peak_scores: scores at the peaks, parallel to
        `peak_indices`.
    :param input_length: length of the current input chunk.
    :param threshold: passed through to the listener.

    A clip that would start before index zero, or end after the end
    of the current chunk, is rejected with a logged warning instead of
    being passed to the listener.
    """

    start_offset = self._input_chunk_start_index + self._clip_start_offset

    # Scale the peak indices by the hop size into a new object rather
    # than with `*=`, which would modify the caller's array in place.
    peak_indices = peak_indices * self._hop_size

    # The following do not vary from peak to peak, so compute them
    # once, outside of the loop.
    f = self._input_sample_rate / self._purported_input_sample_rate
    classification_sample_rate = f * self._classifier_sample_rate
    chunk_end_index = self._input_chunk_start_index + input_length

    for i, score in zip(peak_indices, peak_scores):

        # Convert classification index to input index, accounting for
        # any difference between classification sample rate and input
        # rate.
        t = signal_utils.get_duration(i, classification_sample_rate)
        i = signal_utils.seconds_to_frames(t, self._input_sample_rate)

        clip_start_index = i + start_offset
        clip_end_index = clip_start_index + self._clip_length

        if clip_start_index < 0:
            logging.warning(
                'Rejected clip that started before beginning of '
                'recording.')

        elif clip_end_index > chunk_end_index:
            # clip might extend past end of recording, since it extends
            # past the end of this chunk (we do not know whether or
            # not the current chunk is the last)

            logging.warning(
                'Rejected clip that ended after end of recording chunk.')

        else:
            # all clip samples are in the recording interval extending
            # from the beginning of the recording to the end of the
            # current chunk

            annotations = {'Detector Score': 100 * score}

            self._listener.process_clip(
                clip_start_index, self._clip_length, threshold,
                annotations)
def _notify_listener_of_clips(
        self, peak_indices, peak_scores, input_length, threshold):
    """
    Notifies the listener of the clip for each score peak.

    :param peak_indices: indices of score peaks, in units of hops.
        Not modified by this method.
    :param peak_scores: scores at the peaks, parallel to
        `peak_indices`.
    :param input_length: length of the current input chunk.
    :param threshold: passed through to the listener.

    A clip that would start before index zero, or end after the end
    of the current chunk, is rejected with a logged warning instead of
    being passed to the listener.
    """

    start_offset = self._input_chunk_start_index + self._clip_start_offset

    # Scale the peak indices by the hop size into a new object rather
    # than with `*=`, which would modify the caller's array in place.
    peak_indices = peak_indices * self._hop_size

    # The chunk end index is the same for every peak, so compute it
    # once, outside of the loop.
    chunk_end_index = self._input_chunk_start_index + input_length

    for i, score in zip(peak_indices, peak_scores):

        # Convert classification index to input index, accounting
        # for difference between classifier sample rate and input
        # sample rate.
        t = signal_utils.get_duration(i, self._classifier_sample_rate)
        i = signal_utils.seconds_to_frames(t, self._input_sample_rate)

        clip_start_index = i + start_offset
        clip_end_index = clip_start_index + self._clip_length

        if clip_start_index < 0:
            logging.warning(
                'Rejected clip that started before beginning of '
                'recording.')

        elif clip_end_index > chunk_end_index:
            # clip might extend past end of recording, since it extends
            # past the end of this chunk (we do not know whether or
            # not the current chunk is the last)

            logging.warning(
                'Rejected clip that ended after end of recording chunk.')

        else:
            # all clip samples are in the recording interval extending
            # from the beginning of the recording to the end of the
            # current chunk

            annotations = {'Detector Score': 100 * score}

            self._listener.process_clip(
                clip_start_index, self._clip_length, threshold,
                annotations)
def __init__(self, mode, settings, output_feature_name='spectrogram'):
    """
    Initializes this preprocessor for the specified dataset mode.

    `mode` is one of `DATASET_MODE_TRAINING`, `DATASET_MODE_EVALUATION`,
    and `DATASET_MODE_INFERENCE`:

    * In training mode, dataset examples are preprocessed according
      to settings that control waveform slicing and data augmentation.

    * Evaluation mode is like training mode, except that data
      augmentation can be switched on or off with the
      `evaluation_data_augmentation_enabled` setting.

    * In inference mode, waveforms are not sliced as in the other two
      modes: the slicing start index is always zero. Data augmentation
      is disabled.
    """

    self.settings = settings
    self.output_feature_name = output_feature_name

    # Low-level settings arrive as a seven-element sequence.
    low_level = _get_low_level_preprocessing_settings(mode, settings)
    (self.time_start_index, self.time_end_index,
     self.window_size, self.hop_size, self.dft_size,
     self.freq_start_index, self.freq_end_index) = low_level

    self.waveform_length = self.time_end_index - self.time_start_index

    self.window_fn = functools.partial(
        tf.contrib.signal.hann_window, periodic=True)

    # Random time shifting requires both that augmentation be enabled
    # for this mode and that the shifting setting itself be on.
    shifting_enabled = (
        _is_data_augmentation_enabled(mode, settings) and
        settings.random_waveform_time_shifting_enabled)
    self.random_waveform_time_shifting_enabled = shifting_enabled

    # The maximum shift attribute is set only when shifting is enabled.
    if shifting_enabled:
        self.max_waveform_time_shift = signal_utils.seconds_to_frames(
            settings.max_waveform_time_shift,
            settings.waveform_sample_rate)
def __init__(self, mode, settings, output_feature_name='spectrogram'):

    # `mode` is `DATASET_MODE_TRAINING`, `DATASET_MODE_EVALUATION`, or
    # `DATASET_MODE_INFERENCE`.
    #
    # Training: dataset examples are preprocessed according to
    # settings that control waveform slicing and data augmentation.
    #
    # Evaluation: as for training, except that data augmentation can
    # be turned on or off via the
    # `evaluation_data_augmentation_enabled` setting.
    #
    # Inference: waveforms are not sliced as they are for training and
    # evaluation. The slicing start index is always zero, and data
    # augmentation is disabled.

    self.settings = settings
    self.output_feature_name = output_feature_name

    (time_start, time_end, window_size, hop_size, dft_size, freq_start,
     freq_end) = _get_low_level_preprocessing_settings(mode, settings)

    self.time_start_index = time_start
    self.time_end_index = time_end
    self.window_size = window_size
    self.hop_size = hop_size
    self.dft_size = dft_size
    self.freq_start_index = freq_start
    self.freq_end_index = freq_end

    self.waveform_length = self.time_end_index - self.time_start_index

    hann_window = tf.contrib.signal.hann_window
    self.window_fn = functools.partial(hann_window, periodic=True)

    augmentation_enabled = _is_data_augmentation_enabled(mode, settings)

    # Time shifting is on only if augmentation is enabled for this
    # mode and the time shifting setting is on as well.
    self.random_waveform_time_shifting_enabled = \
        augmentation_enabled and \
        settings.random_waveform_time_shifting_enabled

    if self.random_waveform_time_shifting_enabled:
        # Maximum shift, converted from seconds to frames.
        max_shift = signal_utils.seconds_to_frames(
            settings.max_waveform_time_shift,
            settings.waveform_sample_rate)
        self.max_waveform_time_shift = max_shift
def extract_clip_segment(
        clip, segment_duration, segment_source, source_duration=None):
    """
    Extracts a randomly positioned segment of the specified duration
    from part of a clip.

    :param clip: the clip to extract a segment from.
    :param segment_duration: duration of the segment, in seconds.
    :param segment_source: which part of the clip to extract from, as
        understood by `_get_segment_source`.
    :param source_duration: source duration in seconds, passed through
        to `_get_segment_source`.

    :returns: a `Bunch` with `samples`, `sample_rate`, and
        `start_index` attributes, or `None` if there is no source or
        the source is shorter than the segment.
    """

    source = _get_segment_source(clip, segment_source, source_duration)

    if source is None:
        return None

    source_start_index, source_length = source

    sample_rate = clip.sample_rate
    segment_length = signal_utils.seconds_to_frames(
        segment_duration, sample_rate)

    if source_length < segment_length:
        # source not long enough to extract segment from
        return None

    # The segment fits in the source at every offset from zero through
    # `source_length - segment_length`, inclusive. `random.randrange`
    # excludes its stop value, so we add one to make the last valid
    # offset selectable. This also covers the case of equal lengths,
    # for which the only possible offset is zero.
    offset = random.randrange(source_length - segment_length + 1)

    # Extract samples from source.
    start_index = source_start_index + offset
    end_index = start_index + segment_length
    samples = clip_manager.instance.get_samples(clip)
    samples = samples[start_index:end_index]

    return Bunch(
        samples=samples,
        sample_rate=clip.sample_rate,
        start_index=start_index)
def extract_clip_segment(
        clip, segment_duration, segment_source, source_duration=None):
    """
    Extracts a randomly positioned segment of the specified duration
    from part of a clip.

    :param clip: the clip to extract a segment from.
    :param segment_duration: duration of the segment, in seconds.
    :param segment_source: which part of the clip to extract from, as
        understood by `_get_segment_source`.
    :param source_duration: source duration in seconds, passed through
        to `_get_segment_source`.

    :returns: a `Bunch` with `samples`, `sample_rate`, and
        `start_index` attributes, or `None` if there is no source or
        the source is shorter than the segment.
    """

    source = _get_segment_source(clip, segment_source, source_duration)

    if source is None:
        return None

    source_start_index, source_length = source

    sample_rate = clip.sample_rate
    segment_length = signal_utils.seconds_to_frames(
        segment_duration, sample_rate)

    if source_length < segment_length:
        # source not long enough to extract segment from
        return None

    # The segment fits in the source at every offset from zero through
    # `source_length - segment_length`, inclusive. `random.randrange`
    # excludes its stop value, so we add one to make the last valid
    # offset selectable. This also covers the case of equal lengths,
    # for which the only possible offset is zero.
    offset = random.randrange(source_length - segment_length + 1)

    # Extract samples from source.
    start_index = source_start_index + offset
    end_index = start_index + segment_length
    samples = clip_manager.instance.get_samples(clip)
    samples = samples[start_index:end_index]

    return Bunch(
        samples=samples,
        sample_rate=clip.sample_rate,
        start_index=start_index)
def __init__(self, mode, settings, output_feature_name='spectrogram'):

    # `mode` is `DATASET_MODE_TRAINING`, `DATASET_MODE_EVALUATION`, or
    # `DATASET_MODE_INFERENCE`.
    #
    # In the training and evaluation modes, dataset examples are
    # preprocessed according to settings that control waveform
    # modification and slicing.
    #
    # In the inference mode, waveform modification is disabled and the
    # slicing start index is always zero.

    self.settings = settings
    self.output_feature_name = output_feature_name

    s = settings

    low_level = _get_low_level_preprocessing_settings(mode, s)
    (self.time_start_index, self.time_end_index,
     self.window_size, self.hop_size, self.dft_size,
     self.freq_start_index, self.freq_end_index) = low_level

    self.waveform_length = self.time_end_index - self.time_start_index

    self.window_fn = functools.partial(
        tf.contrib.signal.hann_window, periodic=True)

    in_training = mode == DATASET_MODE_TRAINING
    in_evaluation = mode == DATASET_MODE_EVALUATION

    # Random waveform time shifting is performed during training, and
    # also during evaluation of a classifier that will be deployed in
    # a detector. The clips created by the Old Bird detectors (the
    # current source of our training data) have a less uniform
    # distribution of event onset times than the more or less flat
    # distribution a classifier sees in the recording segments
    # presented to it when it is deployed in a detector. Random
    # shifting flattens and widens the dataset's onset time
    # distribution, making it more like the one seen in deployment.
    self.random_waveform_time_shifting_enabled = (
        s.random_waveform_time_shifting_enabled and (
            in_training or (
                in_evaluation and s.target_use == TARGET_USE_DETECTOR)))

    # The maximum shift attribute is set only when shifting is enabled.
    if self.random_waveform_time_shifting_enabled:
        self.max_waveform_time_shift = signal_utils.seconds_to_frames(
            s.max_waveform_time_shift, s.waveform_sample_rate)

    # Random waveform amplitude scaling is performed during training
    # to make the distribution of input amplitudes wider and more
    # uniform, with the intent of making the classifier less sensitive
    # to variation in input amplitude. The same scaling is performed
    # during evaluation in order to gauge the classifier's performance
    # on a similar input amplitude distribution. If some sort of
    # amplitude normalization is performed in the future (e.g. PCEN or
    # normalization based on order statistical background noise power
    # estimates), the random scaling may no longer be needed.
    self.random_waveform_amplitude_scaling_enabled = (
        s.random_waveform_amplitude_scaling_enabled and (
            in_training or in_evaluation))
def create_output_files(night_file_infos):
    """
    Creates one resampled output wave file for each partition of each
    night.

    `night_file_infos` maps night intervals to sequences of partitions.
    Each partition is a sequence of file infos, and each file info is
    a triple whose first element is an input file path, whose second
    element has a `start` attribute, and whose third element is the
    input file length.

    For each partition, the samples of the input files that fall within
    the night interval are read into one buffer, resampled with
    `resampling_utils.resample_to_24000_hz`, and written to a wave file
    in `OUTPUT_DIR_PATH`. Progress and timing messages are printed
    along the way.
    """

    # Allocate one buffer, sized for `MAX_NIGHT_DURATION` hours of
    # input, that is reused for every partition.
    max_length = int(
        signal_utils.seconds_to_frames(MAX_NIGHT_DURATION * 3600,
                                       INPUT_SAMPLE_RATE))
    input_samples = np.empty(max_length, dtype='int16')

    night_intervals = sorted(night_file_infos.keys())

    for night_interval in night_intervals:

        partitions = night_file_infos[night_interval]

        for file_infos in partitions:

            # Wall clock time, for the processing rate report below.
            start_time = time.time()

            # Number of samples read into `input_samples` so far for
            # this partition.
            partition_input_length = 0

            first_input_start = file_infos[0][1].start

            # Get output file start time.
            if night_interval.start >= first_input_start:
                output_start_time = night_interval.start
            else:
                output_start_time = first_input_start

            output_file_name = create_output_file_name(output_start_time)

            print('Creating recording {}...'.format(output_file_name))

            partition_read_interval = get_partition_read_interval(
                night_interval, first_input_start)

            # Index within the partition of the first sample of the
            # current input file.
            input_start_index = 0

            for i, (input_file_path, _, input_length) in enumerate(file_infos):

                input_end_index = input_start_index + input_length
                input_interval = Interval(input_start_index, input_end_index)

                # Get read interval as partition indices.
                read_interval = intersect_intervals(input_interval,
                                                    partition_read_interval)

                # Get read interval as input file indices.
                read_interval = Interval(
                    read_interval.start - input_start_index,
                    read_interval.end - input_start_index)

                read_size = read_interval.end - read_interval.start

                with soundfile.SoundFile(str(input_file_path)) as sound_file:

                    # Seek only when the read does not start at the
                    # beginning of the file.
                    if read_interval.start != 0:
                        sound_file.seek(read_interval.start)

                    samples = sound_file.read(read_size, dtype='int16')

                # Append the samples just read to the partition buffer.
                start = partition_input_length
                end = partition_input_length + read_size
                input_samples[start:end] = samples

                print(' Reading {} {} {} {} {} {}...'.format(
                    i, input_file_path.name, input_length,
                    read_interval.start, read_interval.end, read_size))

                partition_input_length += read_size
                input_start_index += input_length

            # Partition duration in hours, for the progress message.
            duration = partition_input_length / INPUT_SAMPLE_RATE / 3600
            print(' Resampling {:.1f} hours of audio...'.format(duration))

            output_samples = resampling_utils.resample_to_24000_hz(
                input_samples[:partition_input_length], INPUT_SAMPLE_RATE)

            # Reshape the output samples into a single row.
            output_samples.shape = (1, len(output_samples))

            output_file_path = OUTPUT_DIR_PATH / output_file_name

            audio_file_utils.write_wave_file(str(output_file_path),
                                             output_samples,
                                             OUTPUT_SAMPLE_RATE)

            # Report processing rate relative to real time.
            end_time = time.time()
            elapsed_time = end_time - start_time
            partition_duration = partition_input_length / INPUT_SAMPLE_RATE
            rate = partition_duration / elapsed_time
            print(
                (' Processed {:.1f} seconds of audio in {:.1f} seconds, or '
                 '{:.1f} times faster than real time.').format(
                     partition_duration, elapsed_time, rate))
def create_silence(duration, sample_rate):
    """
    Creates silent audio of the specified duration in seconds.

    Returns a `Bunch` whose `samples` attribute is a NumPy array of
    zeros with one row, and whose `sample_rate` attribute is
    `sample_rate`.
    """

    frame_count = signal_utils.seconds_to_frames(duration, sample_rate)
    silence = np.zeros((1, frame_count))
    return Bunch(samples=silence, sample_rate=sample_rate)
def _s2f(seconds, sample_rate):
    """
    Converts seconds to frames with `signal_utils.seconds_to_frames`
    and casts the result to `tf.int64`.
    """
    return tf.cast(
        signal_utils.seconds_to_frames(seconds, sample_rate), tf.int64)