Ejemplo n.º 1
0
    def run(self):
        """Run the separation and build one mask object per source.

        Computes the spectrograms, runs the deep clustering step, and then
        collects the masks that ``_extract_masks`` produces for every channel.
        The per-channel results are regrouped so that each source ends up with
        a single array whose channels are stacked along the third axis
        (``np.dstack``). Each stacked array is wrapped in a
        ``masks.BinaryMask`` (after rounding) or a ``masks.SoftMask``,
        depending on ``self.mask_type``.

        Returns:
            list: The mask objects, one per source. The same list is stored in
            ``self.masks``.

        Raises:
            ValueError: If ``self.mask_type`` is neither ``self.BINARY_MASK``
                nor ``self.SOFT_MASK``.
        """
        self._compute_spectrograms()
        self.deep_clustering()

        channel_count = self.audio_signal.num_channels

        # Flat list laid out channel by channel. The indexing used below
        # assumes _extract_masks yields self.num_sources masks per channel.
        flat_masks = []
        for channel in range(channel_count):
            flat_masks.extend(self._extract_masks(channel))

        # Regroup: for every source, gather its mask from each channel.
        per_source = []
        for source in range(self.num_sources):
            channel_masks = [flat_masks[source + channel * self.num_sources]
                             for channel in range(channel_count)]
            per_source.append(np.dstack(channel_masks))

        self.masks = []
        for stacked in per_source:
            if self.mask_type == self.BINARY_MASK:
                wrapped = masks.BinaryMask(np.round(stacked))
            elif self.mask_type == self.SOFT_MASK:
                wrapped = masks.SoftMask(stacked)
            else:
                raise ValueError('Unknown mask type {}!'.format(self.mask_type))
            self.masks.append(wrapped)

        return self.masks
Ejemplo n.º 2
0
    def generate_mask(self, ch, assignments):
        """Turn Mel-domain assignments for one channel into a binary mask.

        The assignments are gated by ``self.silence_mask[ch]``, projected with
        ``self.inverse_mel_filter_bank`` (``np.dot``) and transposed. The result
        is then shifted by the absolute value of its minimum, scaled by its
        maximum (plus ``1e-7`` to avoid dividing by zero) and rounded.

        Args:
            ch: Index into the first axis of ``self.silence_mask``.
            assignments: Array multiplied element-wise with
                ``self.silence_mask[ch, :, :]`` (presumably the binary Mel
                spectrogram assignments; confirm the expected shape with the
                caller).

        Returns:
            masks.BinaryMask: Mask object wrapping the rounded array.

        Raises:
            ValueError: If ``self.audio_signal.stft_data`` is ``None``.
        """
        if self.audio_signal.stft_data is None:
            raise ValueError('Cannot extract masks with no signal_stft data')

        gated = self.silence_mask[ch, :, :] * assignments
        projected = np.dot(gated, self.inverse_mel_filter_bank).T

        # Normalise in place: shift by |min|, then scale by the new maximum.
        projected += np.abs(projected.min())
        projected /= (np.max(projected) + 1e-7)

        return masks.BinaryMask(np.round(projected))
Ejemplo n.º 3
0
    def run(self):
        """Build harmonic and percussive masks for every channel.

        After computing the spectrograms, each channel of ``self.stft`` is passed
        to ``librosa.decompose.hpss`` with ``mask=True`` and
        ``kernel_size=self.kernel_size``. The per-channel masks are stacked and
        transposed so that the channel axis comes last, then wrapped in a
        ``masks.BinaryMask`` (after rounding) or a ``masks.SoftMask`` according
        to ``self.mask_type``.

        Returns:
            list: Two mask objects, harmonic first and percussive second. The
            same list is stored in ``self.masks``.

        Raises:
            ValueError: If ``self.mask_type`` is neither ``self.BINARY_MASK``
                nor ``self.SOFT_MASK``.
        """
        self._compute_spectrograms()

        # One (harmonic, percussive) pair of masks per channel.
        channel_pairs = [
            librosa.decompose.hpss(self.stft[:, :, channel],
                                   kernel_size=self.kernel_size,
                                   mask=True)
            for channel in range(self.audio_signal.num_channels)
        ]

        # Stack along a new leading axis, then move that channel axis to the end.
        harmonic_stack = np.array(
            [harmonic for harmonic, _ in channel_pairs]).transpose((1, 2, 0))
        percussive_stack = np.array(
            [percussive for _, percussive in channel_pairs]).transpose((1, 2, 0))

        self.masks = []
        for stacked in (harmonic_stack, percussive_stack):
            if self.mask_type == self.BINARY_MASK:
                wrapped = masks.BinaryMask(np.round(stacked))
            elif self.mask_type == self.SOFT_MASK:
                wrapped = masks.SoftMask(stacked)
            else:
                raise ValueError('Unknown mask type {}!'.format(self.mask_type))
            self.masks.append(wrapped)

        return self.masks
Ejemplo n.º 4
0
    def _compute_masks(self):
        """Build one binary mask per source from the attenuation and delay peaks.

        For each source ``i`` a per-bin score is computed from ``self.atn_peak[i]``,
        ``self.delay_peak[i]``, ``self.frequency_matrix`` and the two channel STFTs
        (``self.stft_ch0`` and ``self.stft_ch1``). A bin is marked for source ``i``
        when its score is strictly lower than the lowest score recorded so far for
        that bin. Each mask is wrapped in a ``masks.BinaryMask`` and appended to
        ``self.result_masks``. The first mask is finally recomputed from the other
        masks (XOR accumulation inside the loop, logical NOT after it).

        Returns:
            list: ``self.result_masks``.

        """
        # compute masks for separation
        # Lowest score seen so far for every bin; starts at +inf so that any
        # finite score from the first source is lower.
        best_so_far = np.inf * np.ones_like(self.stft_ch0, dtype=float)

        # NOTE(review): the loop indexes self.result_masks[i] right after appending,
        # so it appears to assume self.result_masks is empty on entry -- confirm.
        for i in range(0, self.num_sources):
            mask_array = np.zeros_like(self.stft_ch0, dtype=bool)
            # Phase term built from this source's delay peak.
            phase = np.exp(-1j * self.frequency_matrix * self.delay_peak[i])
            # Squared magnitude of the residual between the attenuated, phase-shifted
            # channel 0 and channel 1, normalised by (1 + attenuation ** 2).
            score = np.abs(self.atn_peak[i] * phase * self.stft_ch0 - self.stft_ch1) ** 2 / (1 + self.atn_peak[i] ** 2)
            # Bins where this source beats every source processed before it.
            mask = (score < best_so_far)
            mask_array[mask] = True
            background_mask = masks.BinaryMask(np.array(mask_array))
            self.result_masks.append(background_mask)
            # Fold mask i into mask 0 with XOR. When i == 0 both operands are the
            # same mask, so (presumably) mask 0 is cleared on the first pass.
            self.result_masks[0].mask = np.logical_xor(self.result_masks[i].mask, self.result_masks[0].mask)
            # Remember the new best score only for the bins this source won.
            best_so_far[mask] = score[mask]
            # NOTE(review): masks of earlier sources (i >= 1) are not revisited when a
            # later source wins one of their bins -- confirm this overlap is intended.

        # Compute first mask based on what the other masks left remaining
        self.result_masks[0].mask = np.logical_not(self.result_masks[0].mask)
        return self.result_masks
Ejemplo n.º 5
0
    def run(self):
        """
        Creates one mask per known source signal (the entries of ``self.sources``) and returns them
        in the same order.

        Binary masks compare the source level against the mixture level in decibels, element-wise:

                ``ratio = (source_mag + EPSILON) / (mixture_mag + EPSILON)``

                ``mask = (20 * np.log10(ratio)) > binary_db_threshold``

        where ``EPSILON`` is ``constants.EPSILON`` and ``mixture_mag`` is ``self._mixture_mag_spec``.

        Soft masks are computed by ``librosa.util.softmask``, called with
        ``self.audio_signal.magnitude_spectrogram_data`` as its first argument, the source magnitude
        spectrogram as its second, and ``power=self.power``, ``split_zeros=self.split_zeros``.

        Returns:
            estimated_masks (list): List of the resulting ``masks.BinaryMask`` or ``masks.SoftMask``
            objects, in the same order as ``self.sources``. The list is also stored in
            ``self.result_masks``.

        Raises:
            RuntimeError if ``self.mask_type`` is neither ``self.BINARY_MASK`` nor ``self.SOFT_MASK``.

        """
        self._compute_spectrograms()
        self.result_masks = []

        for source in self.sources:
            mag = source.magnitude_spectrogram_data

            if self.mask_type == self.BINARY_MASK:
                # EPSILON on both sides keeps the ratio (and its log) finite for silent bins.
                source_level = mag + constants.EPSILON
                mixture_level = self._mixture_mag_spec + constants.EPSILON
                ratio_db = 20 * np.log10(np.divide(source_level, mixture_level))
                mask_object = masks.BinaryMask(ratio_db > self.binary_db_threshold)

            elif self.mask_type == self.SOFT_MASK:
                # NOTE(review): librosa documents softmask(X, X_ref) as
                # X**power / (X**power + X_ref**power); here X is the mixture and X_ref the
                # source -- confirm this argument order is intended.
                ratio_mask = librosa.util.softmask(
                    self.audio_signal.magnitude_spectrogram_data,
                    mag,
                    power=self.power,
                    split_zeros=self.split_zeros)
                mask_object = masks.SoftMask(ratio_mask)

            else:
                raise RuntimeError('Unknown mask type: {}'.format(
                    self.mask_type))

            self.result_masks.append(mask_object)

        return self.result_masks
Ejemplo n.º 6
0
    def run(self):
        """ Factorizes each channel's magnitude spectrogram with TransformerNMF, clusters the resulting templates on
        their MFCCs with ``self.clusterer`` and turns the clusters into one mask object per source.

        For every channel, the cluster labels are stored in ``self.labeled_templates`` before ``_extract_masks`` is
        called for that channel. The per-channel masks are then regrouped so that each source gets a single array
        with its channels stacked along the third axis.

        The original documentation notes that the masks are not returned in a particular order corresponding to the
        sources, but that they are in the same order for each channel.

        Returns:
            result_masks (list): A list of ``masks.BinaryMask`` or ``masks.SoftMask`` objects, one per source, also
            stored in ``self.result_masks``.
            (to get a list of :obj:`AudioSignal`-derived objects run :func:`make_audio_signals`)

        Raises:
            ValueError: If ``self.mask_type`` is neither ``self.BINARY_MASK`` nor ``self.SOFT_MASK``.

        Example:

        .. code-block:: python
            :linenos:

            signal = nussl.AudioSignal(path_to_input_file='input_name.wav')

            # Set up and run NMF MFCC
            nmf_mfcc = nussl.NMF_MFCC(signal, num_sources=2)
            masks = nmf_mfcc.run()

            # Get audio signals
            sources = nmf_mfcc.make_audio_signals()

            # Output the sources
            for i, source in enumerate(sources):
                output_file_name = str(i) + '.wav'
                source.write_audio_to_file(output_file_name)
        """
        self.audio_signal.stft_params = self.stft_params
        self.audio_signal.stft()

        channel_count = self.audio_signal.num_channels
        flat_masks = []

        for channel in range(channel_count):
            magnitude = self.audio_signal.get_magnitude_spectrogram_channel(channel)

            # Factorize this channel's magnitude spectrogram.
            factorizer = transformer_nmf.TransformerNMF(
                input_matrix=magnitude,
                num_components=self.num_templates,
                seed=self.random_seed,
                should_do_epsilon=False,
                max_num_iterations=self.num_iterations,
                distance_measure=self.distance_measure)
            activations, templates = factorizer.transform()

            # Cluster the templates on the selected range of MFCC coefficients.
            template_mfcc = librosa.feature.mfcc(S=templates, n_mfcc=self.n_mfcc)
            clustering_features = template_mfcc[self.mfcc_start:self.mfcc_end]
            self.clusterer.fit_transform(clustering_features.T)
            self.labeled_templates = self.clusterer.labels_

            flat_masks.extend(self._extract_masks(templates, activations, channel))

        # Regroup the flat, channel-by-channel list into one stacked array per source
        # (this allows for multichannel signals).
        per_source = []
        for source in range(self.num_sources):
            channel_masks = [flat_masks[source + channel * self.num_sources]
                             for channel in range(channel_count)]
            per_source.append(np.dstack(channel_masks))

        # Wrap each numpy array in the requested mask class.
        self.result_masks = []
        for stacked in per_source:
            if self.mask_type == self.BINARY_MASK:
                wrapped = masks.BinaryMask(np.round(stacked))
            elif self.mask_type == self.SOFT_MASK:
                wrapped = masks.SoftMask(stacked)
            else:
                raise ValueError('Unknown mask type {}!'.format(
                    self.mask_type))
            self.result_masks.append(wrapped)

        return self.result_masks
Ejemplo n.º 7
0
    def run(self):
        """
        Creates one mask per known source signal (the entries of ``self.sources``) and returns them
        in the same order.

        Binary masks are created from the magnitude spectrograms, element-wise:

                ``mask = source_mag >= (mixture_mag - source_mag)``

        where ``mixture_mag`` is ``self._mixture_mag_spec``.

        Soft masks are also created from the magnitude spectrograms, using element-wise operations:

                1) ``mask = mixture_mag / source_mag``

                2) ``mask = log(mask)``

                3) ``mask = mask + abs(min(mask))``

                4) ``mask = mask / max(mask)``

        which gives a logarithmically scaled mask whose values lie in [0.0, 1.0] as long as every
        intermediate value is finite. Zero-valued bins are not guarded against here.

        Returns:
            estimated_masks (list): List of the resulting ``masks.BinaryMask`` or ``masks.SoftMask``
            objects, in the same order as ``self.sources``. The list is also stored in
            ``self.result_masks``.

        Raises:
            RuntimeError if ``self.mask_type`` is neither ``self.BINARY_MASK`` nor ``self.SOFT_MASK``.

        """
        self._compute_spectrograms()
        self.result_masks = []

        for source in self.sources:
            if self.mask_type == self.BINARY_MASK:
                mag = source.magnitude_spectrogram_data  # Alias this variable, for easy reading
                cur_mask = (mag >= (self._mixture_mag_spec - mag))
                mask = masks.BinaryMask(cur_mask)

            elif self.mask_type == self.SOFT_MASK:
                # TODO: This is a kludge. What is the actual right way to do this?
                sm = np.divide(self.audio_signal.magnitude_spectrogram_data,
                               source.magnitude_spectrogram_data)
                log_sm = np.log(sm)
                log_sm += np.abs(np.min(log_sm))
                log_sm /= np.max(log_sm)
                # Wrap the normalised log mask. The raw ratio `sm` was previously passed
                # here, which discarded the normalisation computed just above.
                mask = masks.SoftMask(log_sm)
            else:
                raise RuntimeError('Unknown mask type: {}'.format(
                    self.mask_type))

            self.result_masks.append(mask)

        return self.result_masks