def run(self):
    """
    Run deep clustering on the mixture and wrap the resulting masks.

    Computes the spectrograms, runs :func:`deep_clustering`, gathers whatever
    :func:`_extract_masks` yields for each channel and stacks the per-channel
    arrays of every source along a third axis (``np.dstack``).

    Returns:
        masks (list): ``self.masks``. One entry per source: a ``masks.BinaryMask``
        (built from the ``np.round``-ed array) when ``self.mask_type`` is
        ``self.BINARY_MASK``, a ``masks.SoftMask`` when it is ``self.SOFT_MASK``.

    Raises:
        ValueError: If ``self.mask_type`` is neither of the two known mask types.

    """
    self._compute_spectrograms()
    self.deep_clustering()

    n_channels = self.audio_signal.num_channels

    # Flat, channel-major list. The indexing further down assumes that every call
    # to _extract_masks() contributes exactly self.num_sources entries.
    flat_masks = []
    for channel in range(n_channels):
        flat_masks.extend(self._extract_masks(channel))

    # Regroup: for each source, pick its mask from every channel and stack them.
    n_sources = self.num_sources
    stacked_masks = []
    for source in range(n_sources):
        per_channel = [flat_masks[source + channel * n_sources]
                       for channel in range(n_channels)]
        stacked_masks.append(np.dstack(per_channel))

    self.masks = []
    for stacked in stacked_masks:
        if self.mask_type == self.BINARY_MASK:
            wrapped = masks.BinaryMask(np.round(stacked))
        elif self.mask_type == self.SOFT_MASK:
            wrapped = masks.SoftMask(stacked)
        else:
            raise ValueError('Unknown mask type {}!'.format(self.mask_type))
        self.masks.append(wrapped)

    return self.masks
def run(self):
    """
    Build background/foreground masks from the per-channel 2D Fourier transform.

    For every channel a mask is obtained from :func:`compute_ft2d_mask`, its
    frequency bins below the cutoff are forced to 1, and the mask is multiplied
    with that channel's STFT. The masked STFTs are stored as ``self.background``
    (an ``AudioSignal`` on which ``istft`` is then called).

    Note:
        ``self.high_pass_cutoff`` is overwritten in place with the converted bin
        index, so a second call converts the already converted value again.

    Returns:
        result_masks (list): ``self.result_masks``, i.e. the background mask followed
        by its ``inverse_mask()``. The background mask is a ``masks.SoftMask``, or the
        result of ``mask_to_binary(self.mask_threshold)`` when ``self.mask_type`` is
        ``self.BINARY_MASK``.

    """
    # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
    scaled = (self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) /
              self.audio_signal.sample_rate)
    cutoff_bin = int(np.ceil(scaled)) + 1
    self.high_pass_cutoff = cutoff_bin

    self._compute_spectrograms()

    # separate the mixture background by masking
    masked_stfts = []
    channel_masks = []
    for channel in range(self.audio_signal.num_channels):
        channel_mask = self.compute_ft2d_mask(self.ft2d[:, :, channel])
        channel_mask[:cutoff_bin, :] = 1  # high-pass filter the foreground
        channel_masks.append(channel_mask)
        masked_stfts.append(channel_mask * self.stft[:, :, channel])

    # Move the channel axis from the front to the back.
    background_stft = np.transpose(np.array(masked_stfts), (1, 2, 0))
    self.background = AudioSignal(stft=background_stft,
                                  sample_rate=self.audio_signal.sample_rate)
    params = self.stft_params
    self.background.istft(params.window_length, params.hop_length, params.window_type,
                          overwrite=True,
                          use_librosa=self.use_librosa_stft,
                          truncate_to_length=self.audio_signal.signal_length)

    mask_array = np.transpose(np.array(channel_masks), (1, 2, 0)).astype('float')
    soft = masks.SoftMask(mask_array)
    if self.mask_type == self.BINARY_MASK:
        background_mask = soft.mask_to_binary(self.mask_threshold)
    else:
        background_mask = soft

    self.result_masks = [background_mask, background_mask.inverse_mask()]
    return self.result_masks
def run(self):
    """
    Runs REPET-SIM, a variant of REPET using the cosine similarity matrix to find
    similar frames to do median filtering.

    The similarity indices are stored in ``self.similarity_indices``; a mask is then
    computed per channel with :func:`_compute_mask`, its low-frequency bins are forced
    to 1, and the masked STFT is handed to :func:`_make_background_signal`.

    Note:
        ``self.high_pass_cutoff`` is overwritten in place with the converted bin
        index, so a second call converts the already converted value again.

    Returns:
        result_masks (list): ``self.result_masks``, i.e. the background mask followed
        by its ``inverse_mask()``. The background mask is a ``masks.SoftMask``, or the
        result of ``mask_to_binary(self.mask_threshold)`` when ``self.mask_type`` is
        ``self.BINARY_MASK``.

    """
    # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
    scaled = (float(self.high_pass_cutoff) * (self.stft_params.n_fft_bins - 1) /
              self.audio_signal.sample_rate)
    cutoff_bin = int(np.ceil(scaled) + 1)
    self.high_pass_cutoff = cutoff_bin

    # With matlab_fidelity set, bin 0 is left out of the forced-to-one range.
    if self.matlab_fidelity:
        first_bin = 1
    else:
        first_bin = 0

    self._compute_spectrograms()
    self.similarity_indices = self._get_similarity_indices()

    masked_stfts = []
    channel_masks = []
    for channel in range(self.audio_signal.num_channels):
        channel_mask = self._compute_mask(self.magnitude_spectrogram[:, :, channel])
        channel_mask[first_bin:cutoff_bin, :] = 1  # high-pass filter the foreground
        channel_masks.append(channel_mask)
        masked_stfts.append(channel_mask * self.stft[:, :, channel])

    # Set STFT in correct order (channel axis last)
    self._make_background_signal(np.transpose(np.array(masked_stfts), (1, 2, 0)))

    # make a mask and return
    soft = masks.SoftMask(np.transpose(np.array(channel_masks), (1, 2, 0)))
    if self.mask_type == self.BINARY_MASK:
        background_mask = soft.mask_to_binary(self.mask_threshold)
    else:
        background_mask = soft

    self.result_masks = [background_mask, background_mask.inverse_mask()]
    return self.result_masks
def run(self):
    """
    Build foreground (melody) and background masks from the harmonic mask of the melody.

    The harmonic mask returned by :func:`create_harmonic_mask` has its frequency bins
    below ``self.high_pass_cutoff`` zeroed, is wrapped in a ``masks.SoftMask`` (and
    binarised with ``self.mask_threshold`` when ``self.mask_type`` is
    ``self.BINARY_MASK``). It is stored as ``self.foreground_mask``, its
    ``invert_mask()`` as ``self.background_mask``, and the mixture with the foreground
    mask applied as ``self.foreground`` (on which ``istft`` is then called).

    Note:
        ``self.high_pass_cutoff`` is overwritten in place with the converted bin
        index, so a second call converts the already converted value again.

    Returns:
        list: ``[self.background_mask, self.foreground_mask]`` (background first).

    """
    # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
    scaled = (self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) /
              self.audio_signal.sample_rate)
    self.high_pass_cutoff = int(np.ceil(scaled)) + 1
    self._compute_spectrum()

    # separate the mixture foreground melody by masking
    # NOTE(review): both calls are assumed to belong to this guard; confirm against
    # the upstream formatting of this method.
    melody_missing = self.melody_signal is None
    if melody_missing:
        self.extract_melody()
        self.create_melody_signal(100)

    harmonic = self.create_harmonic_mask(self.melody_signal)
    harmonic[:self.high_pass_cutoff, :] = 0

    fg_mask = masks.SoftMask(harmonic)
    if self.mask_type == self.BINARY_MASK:
        fg_mask = fg_mask.mask_to_binary(self.mask_threshold)

    self.foreground_mask = fg_mask
    self.background_mask = fg_mask.invert_mask()

    self.foreground = self.audio_signal.apply_mask(fg_mask)
    params = self.stft_params
    self.foreground.istft(params.window_length, params.hop_length, params.window_type,
                          overwrite=True,
                          use_librosa=self.use_librosa_stft,
                          truncate_to_length=self.audio_signal.signal_length)

    return [self.background_mask, self.foreground_mask]
def run(self):
    """
    Compute harmonic and percussive masks with ``librosa.decompose.hpss``.

    HPSS is run on each channel of ``self.stft`` with ``kernel_size=self.kernel_size``
    and ``mask=True``; the per-channel results are stacked so that the channel axis
    comes last.

    Returns:
        masks (list): ``self.masks``, harmonic mask first and percussive mask second.
        Each is a ``masks.BinaryMask`` (built from the ``np.round``-ed array) when
        ``self.mask_type`` is ``self.BINARY_MASK``, or a ``masks.SoftMask`` when it is
        ``self.SOFT_MASK``.

    Raises:
        ValueError: If ``self.mask_type`` is neither of the two known mask types.

    """
    self._compute_spectrograms()

    harmonic_per_channel = []
    percussive_per_channel = []
    for channel in range(self.audio_signal.num_channels):
        separated = librosa.decompose.hpss(self.stft[:, :, channel],
                                           kernel_size=self.kernel_size,
                                           mask=True)
        harmonic, percussive = separated
        harmonic_per_channel.append(harmonic)
        percussive_per_channel.append(percussive)

    # Move the channel axis from the front to the back for both mask stacks.
    stacked_masks = [
        np.transpose(np.array(harmonic_per_channel), (1, 2, 0)),
        np.transpose(np.array(percussive_per_channel), (1, 2, 0)),
    ]

    self.masks = []
    for stacked in stacked_masks:
        if self.mask_type == self.BINARY_MASK:
            wrapped = masks.BinaryMask(np.round(stacked))
        elif self.mask_type == self.SOFT_MASK:
            wrapped = masks.SoftMask(stacked)
        else:
            raise ValueError('Unknown mask type {}!'.format(self.mask_type))
        self.masks.append(wrapped)

    return self.masks
def run(self):
    """
    Creates a list of masks (as :class:`separation.masks.mask_base.MaskBase` objects,
    either :class:`separation.masks.binary_mask.BinaryMask` or
    :class:`separation.masks.soft_mask.SoftMask` depending on ``self.mask_type``), one
    for each entry of ``self.sources`` and in the same order.

    Binary masks are created from the magnitude spectrograms as

    ``mask = 20 * np.log10((source_mag + EPSILON) / (mixture_mag + EPSILON)) > binary_db_threshold``

    where the division and the comparison are element-wise and ``EPSILON`` is
    ``constants.EPSILON``.

    Soft masks are created with

    ``librosa.util.softmask(mixture_mag, source_mag, power=self.power, split_zeros=self.split_zeros)``

    Returns:
        estimated_masks (list): ``self.result_masks``, the mask objects created, in the
        same order as ``self.sources``.

    Raises:
        RuntimeError: If ``self.mask_type`` is neither ``self.BINARY_MASK`` nor
        ``self.SOFT_MASK``.

    """
    self._compute_spectrograms()
    self.result_masks = []

    for source in self.sources:
        source_mag = source.magnitude_spectrogram_data

        if self.mask_type == self.BINARY_MASK:
            # Source-to-mixture level in dB, thresholded element-wise.
            ratio = np.divide(source_mag + constants.EPSILON,
                              self._mixture_mag_spec + constants.EPSILON)
            ratio_db = 20 * np.log10(ratio)
            estimated = masks.BinaryMask(ratio_db > self.binary_db_threshold)
        elif self.mask_type == self.SOFT_MASK:
            # NOTE(review): the mixture is passed as softmask's first argument and the
            # source as its reference; confirm this ordering is the intended one.
            mixture_mag = self.audio_signal.magnitude_spectrogram_data
            ratio_mask = librosa.util.softmask(mixture_mag, source_mag,
                                               power=self.power,
                                               split_zeros=self.split_zeros)
            estimated = masks.SoftMask(ratio_mask)
        else:
            raise RuntimeError('Unknown mask type: {}'.format(self.mask_type))

        self.result_masks.append(estimated)

    return self.result_masks
def run(self):
    """
    Runs TransformerNMF on the magnitude spectrogram of each channel of the input audio
    signal. MFCCs of the returned templates matrix are clustered with ``self.clusterer``
    and the resulting labels are used by :func:`_extract_masks` to build the masks for
    each source.

    Note:
        The masks in ``self.result_masks`` are not returned in a particular order
        corresponding to the sources, but they are in the same order for each channel.
        ``self.labeled_templates`` holds the labels of the last processed channel.

    Returns:
        result_masks (list): A list of :obj:`MaskBase`-derived objects for each source.
        (to get a list of :obj:`AudioSignal`-derived objects run :func:`make_audio_signals`)

    Raises:
        ValueError: If ``self.mask_type`` is neither ``self.BINARY_MASK`` nor
        ``self.SOFT_MASK``.

    Example:

    .. code-block:: python
        :linenos:

        signal = nussl.AudioSignal(path_to_input_file='input_name.wav')

        # Set up and run NMF MFCC
        nmf_mfcc = nussl.NMF_MFCC(signal, num_sources=2)  # Returns a binary mask by default
        masks = nmf_mfcc.run()

        # Get audio signals
        sources = nmf_mfcc.make_audio_signals()

        # Output the sources
        for i, source in enumerate(sources):
            output_file_name = str(i) + '.wav'
            source.write_audio_to_file(output_file_name)

    """
    self.audio_signal.stft_params = self.stft_params
    self.audio_signal.stft()

    n_channels = self.audio_signal.num_channels
    flat_masks = []
    for channel in range(n_channels):
        magnitude = self.audio_signal.get_magnitude_spectrogram_channel(channel)

        # Set up NMF and run
        factoriser = transformer_nmf.TransformerNMF(
            input_matrix=magnitude,
            num_components=self.num_templates,
            seed=self.random_seed,
            should_do_epsilon=False,
            max_num_iterations=self.num_iterations,
            distance_measure=self.distance_measure)
        activations, templates = factoriser.transform()

        # Cluster the MFCCs of the templates matrix and retrieve the labels
        mfccs = librosa.feature.mfcc(S=templates, n_mfcc=self.n_mfcc)
        selected_mfccs = mfccs[self.mfcc_start:self.mfcc_end]
        self.clusterer.fit_transform(selected_mfccs.T)
        self.labeled_templates = self.clusterer.labels_

        # Extract sources from signal
        flat_masks.extend(self._extract_masks(templates, activations, channel))

    # Reorder mask arrays so that the channels are collated correctly (this allows for
    # multichannel signals). Assumes each channel contributed self.num_sources masks.
    n_sources = self.num_sources
    stacked_masks = []
    for source in range(n_sources):
        per_channel = [flat_masks[source + channel * n_sources]
                       for channel in range(n_channels)]
        stacked_masks.append(np.dstack(per_channel))

    # Put each numpy array mask into a MaskBase object
    self.result_masks = []
    for stacked in stacked_masks:
        if self.mask_type == self.BINARY_MASK:
            wrapped = masks.BinaryMask(np.round(stacked))
        elif self.mask_type == self.SOFT_MASK:
            wrapped = masks.SoftMask(stacked)
        else:
            raise ValueError('Unknown mask type {}!'.format(self.mask_type))
        self.result_masks.append(wrapped)

    return self.result_masks
def run(self):
    """
    Runs the original REPET algorithm

    The repeating period is stored in ``self.repeating_period``; a mask is then computed
    per channel with :func:`_compute_repeating_mask`, its low-frequency bins are forced
    to 1, and the masked STFT is handed to :func:`_make_background_signal`.

    Note:
        ``self.high_pass_cutoff`` is overwritten in place with the converted bin
        index, so a second call converts the already converted value again.

    Returns:
        result_masks (list): ``self.result_masks``, i.e. the :obj:`MaskBase`-derived
        mask of the repeating background followed by its ``inverse_mask()``.
        (to get the corresponding audio signals run :func:`make_audio_signals`)

    Example:

    .. code-block:: python
        :linenos:

        signal = nussl.AudioSignal(path_to_input_file='input_name.wav')

        # Set up and run Repet
        repet = nussl.Repet(signal)  # Returns a soft mask by default
        masks = repet.run()  # or repet()

        # Get audio signals
        background, foreground = repet.make_audio_signals()

        # output the background
        background.write_audio_to_file('background.wav')

    """
    # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
    scaled = (self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) /
              self.audio_signal.sample_rate)
    cutoff_bin = int(np.ceil(scaled)) + 1
    self.high_pass_cutoff = cutoff_bin

    # With matlab_fidelity set, bin 0 is left out of the forced-to-one range.
    if self.matlab_fidelity:
        first_bin = 1
    else:
        first_bin = 0

    self._compute_spectrograms()
    self.repeating_period = self._calculate_repeating_period()

    # separate the mixture background by masking
    masked_stfts = []
    channel_masks = []
    for channel in range(self.audio_signal.num_channels):
        channel_mask = self._compute_repeating_mask(
            self.magnitude_spectrogram[:, :, channel])
        channel_mask[first_bin:cutoff_bin, :] = 1  # high-pass filter the foreground
        channel_masks.append(channel_mask)
        masked_stfts.append(channel_mask * self.stft[:, :, channel])

    # make a new audio signal for the background (channel axis moved last)
    self._make_background_signal(np.transpose(np.array(masked_stfts), (1, 2, 0)))

    # make a mask and return
    soft = masks.SoftMask(np.transpose(np.array(channel_masks), (1, 2, 0)))
    if self.mask_type == self.BINARY_MASK:
        background_mask = soft.mask_to_binary(self.mask_threshold)
    else:
        background_mask = soft

    self.result_masks = [background_mask, background_mask.inverse_mask()]
    return self.result_masks
def run(self):
    """
    Creates a list of masks (as :class:`separation.masks.mask_base.MaskBase` objects,
    either :class:`separation.masks.binary_mask.BinaryMask` or
    :class:`separation.masks.soft_mask.SoftMask` depending on ``self.mask_type``), one
    for each entry of ``self.sources`` and in the same order.

    Binary masks are created based on the magnitude spectrogram using the following
    formula:

            ``mask = (provided_source.mag_spec >= (mixture_mag_spec - provided_source.mag_spec)``

    Where '``-``' is an element-wise subtraction and '``>=``' is an element-wise
    greater-than-or-equal comparison.

    Soft masks are also created based on the magnitude spectrogram but use the
    following formula:

            1) ``mask = mixture_mag_spec / provided_source.mag_spec``

            2) ``mask = log(mask)``

            3) ``mask = (mask + abs(min(mask))) / max(mask)``

    Where all arithmetic operations and log are element-wise, and the ``max`` in step 3
    is taken after the shift. The normalised result of step 3 is what is wrapped in the
    ``SoftMask``.

    Note:
        Bins where a magnitude is zero are not guarded against in the soft-mask
        branch (the division and the log are applied as-is).

    Returns:
        estimated_masks (list): ``self.result_masks``, the mask objects created, in the
        same order as ``self.sources``.

    Raises:
        RuntimeError: If ``self.mask_type`` is neither ``self.BINARY_MASK`` nor
        ``self.SOFT_MASK``.

    """
    self._compute_spectrograms()
    self.result_masks = []

    for source in self.sources:
        if self.mask_type == self.BINARY_MASK:
            mag = source.magnitude_spectrogram_data  # Alias this variable, for easy reading
            cur_mask = (mag >= (self._mixture_mag_spec - mag))
            mask = masks.BinaryMask(cur_mask)

        elif self.mask_type == self.SOFT_MASK:
            # TODO: This is a kludge. What is the actual right way to do this?
            sm = np.divide(self.audio_signal.magnitude_spectrogram_data,
                           source.magnitude_spectrogram_data)
            log_sm = np.log(sm)
            log_sm += np.abs(np.min(log_sm))
            log_sm /= np.max(log_sm)
            # Wrap the normalised log-ratio. The raw ratio `sm` was passed here before,
            # which threw away the scaling computed just above.
            mask = masks.SoftMask(log_sm)

        else:
            raise RuntimeError('Unknown mask type: {}'.format(self.mask_type))

        self.result_masks.append(mask)

    return self.result_masks