Ejemplo n.º 1
0
def multinmf_conv_mu_wrapper(x,
                             n_src,
                             n_latent_var,
                             stft_win_len,
                             partial_rirs=None,
                             W_dict=None,
                             n_iter=500,
                             l1_reg=0.,
                             random_seed=0,
                             verbose=False):
    '''
    A wrapper around multichannel NMF using MU updates to use with pyroomacoustics.
    Performs the STFT, initializes the NMF parameters, runs the NMF and
    brings the separated source images back to the time domain.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_channel) array of time domain samples
    n_src: int
        The number of sources
    n_latent_var: int
        The number of latent variables in the NMF (per source)
    stft_win_len: int
        The length of the STFT window (the hop size is half of it)
    partial_rirs: array_like, optional
        (n_channel x n_src x n_bins) array of partial TF. If provided, Q is not optimized.
    W_dict: ndarray, optional
        A dictionary of atoms that can be used in the NMF. It must have either
        n_latent_var columns (it is then tiled once per source) or
        n_src * n_latent_var columns. If provided, W is not optimized.
    n_iter: int, optional
        The number of iterations of NMF (default 500)
    l1_reg: float, optional
        The weight of the l1 regularization term for the activations (default 0., not regularized)
    random_seed: unsigned int, optional
        The seed to provide to the RNG prior to initialization of NMF parameters. This allows to use
        repeatable initialization. Note that this seeds the global numpy RNG.
    verbose: bool, optional
        When true, prints convergence info of NMF (default False)

    Returns
    -------
    ndarray
        The separated source images stacked along the first axis; each entry
        is the (n_samples_out x n_channel) time domain image of one source
        (assuming ``pra.istft`` returns a 1D signal per channel).

    Raises
    ------
    ValueError
        If the number of columns of W_dict is neither n_latent_var nor
        n_src * n_latent_var.
    '''

    n_channel = x.shape[1]
    hop = stft_win_len // 2

    # STFT
    # The same window is applied at analysis and at synthesis, hence the sqrt
    window = np.sqrt(pra.cosine(stft_win_len))
    # X is (n_channel, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch],
                 stft_win_len,
                 hop,
                 win=window,
                 transform=np.fft.rfft) for ch in range(n_channel)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_channel)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    # Squared magnitude of the mixture, computed once
    power = np.abs(X)**2
    # Copy normalized to unit mean over all bins, frames and channels.
    # It is only used to scale the random initializations below.
    V = power / np.mean(power)

    # Random initialization of multichannel NMF parameters
    np.random.seed(random_seed)

    K = n_latent_var * n_src
    # Row j lists the indices of the NMF components assigned to source j.
    # (builtin int: the np.int alias was removed from recent NumPy releases)
    source_NMF_ind = np.arange(K, dtype=int).reshape(n_src, n_latent_var)

    mix_psd = np.mean(V, axis=(1, 2))
    # W is initialized so that its energy follows the mixture PSD
    if W_dict is None:
        W_init = 0.5 * ((np.abs(np.random.randn(n_bin, K)) + np.ones(
            (n_bin, K))) * (mix_psd[:, np.newaxis] * np.ones((1, K))))
        fix_W = False
    else:
        if W_dict.shape[1] == n_latent_var:
            # one copy of the dictionary per source
            W_init = np.tile(W_dict, n_src)
        elif W_dict.shape[1] == n_src * n_latent_var:
            W_init = W_dict
        else:
            raise ValueError(
                'Mismatch between dictionary size and latent variables')
        fix_W = True

    # H is initialized so that it follows the average activation per frame
    mix_act = np.mean(V, axis=(0, 2))
    H_init = 0.5 * (np.abs(np.random.randn(K, n_frame)) + np.ones(
        (K, n_frame))) * mix_act[np.newaxis, :]

    if partial_rirs is not None:
        # squared mag partial rirs (n_bin, n_channel, n_src)
        Q_init = np.moveaxis(np.abs(partial_rirs)**2, [2], [0])
        # normalize by the maximum over the frequency bins
        Q_init /= np.max(Q_init, axis=0)[None, :, :]
        fix_Q = True
    else:
        # random initialization
        Q_shape = (n_bin, n_channel, n_src)
        Q_init = (0.5 * (1.9 * np.abs(np.random.randn(*Q_shape)) +
                         0.1 * np.ones(Q_shape)))**2
        fix_Q = False

    # RUN NMF
    # NOTE(review): the un-normalized power is factorized while W_init and
    # H_init are scaled from the normalized V; kept as in the original code,
    # confirm that this is intended.
    W_MU, H_MU, Q_MU, cost = \
        multinmf_conv_mu(
                power, W_init, H_init, Q_init, source_NMF_ind,
                n_iter=n_iter, fix_Q=fix_Q, fix_W=fix_W,
                H_l1_reg=l1_reg,
                verbose=verbose)

    # Computation of the spatial source images
    # Im is indexed below as (n_bin, n_frame, n_src, n_channel)
    Im = multinmf_recons_im(X, W_MU, H_MU, Q_MU, source_NMF_ind)

    # Inverse STFT: channel-wise istft with the synthesis window
    sep_sources = []
    for j in range(n_src):
        channels = [
            pra.istft(Im[:, :, j, ch].T,
                      stft_win_len,
                      hop,
                      win=window,
                      transform=np.fft.irfft) for ch in range(n_channel)
        ]
        sep_sources.append(np.array(channels).T)

    return np.array(sep_sources)
Ejemplo n.º 2
0
def multinmf_conv_em_wrapper(
        x, n_src, stft_win_len, n_latent_var, n_iter=500,
        A_init=None, W_init=None, H_init=None,
        update_a=True, update_w=True, update_h=True,
        verbose=False):
    '''
    A wrapper around multichannel NMF using EM updates to use with pyroomacoustics.
    Performs the STFT, initializes the missing parameters at random, runs the
    EM algorithm and brings the separated source images back to the time domain.

    Parameters
    ----------
    x: ndarray
        (n_samples x n_chan) array of time domain samples
    n_src: int
        number of sources
    stft_win_len: int
        length of the STFT window (the hop size is half of it)
    n_latent_var: int
        number of latent variables in the NMF (per source); only used to
        choose the number of components when W_init is not provided
    n_iter: int, optional
        number of EM iterations (default 500)
    A_init: ndarray, optional
        initial mixing parameters whose last axis is moved to the front before
        use, i.e. presumably (n_chan x n_src x n_bin). When omitted, a random
        initialization is used and update_a is forced to True.
    W_init: ndarray, optional
        initial dictionary; its last dimension fixes the total number of
        components. Randomly initialized following the mixture PSD when omitted.
    H_init: ndarray, optional
        initial activations, randomly initialized when omitted
    update_a, update_w, update_h: bool, optional
        forwarded to ``multinmf_conv_em`` (default True for all three)
    verbose: bool, optional
        when True, prints progress information (default False)

    Returns
    -------
    ndarray
        The separated source images stacked along the first axis; each entry
        is the (n_samples_out x n_chan) time domain image of one source
        (assuming ``pra.istft`` returns a 1D signal per channel).
    '''

    n_chan = x.shape[1]
    hop = stft_win_len // 2

    # STFT
    # The same window is applied at analysis and at synthesis, hence the sqrt
    window = np.sqrt(pra.cosine(stft_win_len))
    # X is (n_chan, n_frame, n_bin)
    X = np.array([
        pra.stft(x[:, ch], stft_win_len, hop, win=window,
                 transform=np.fft.rfft)
        for ch in range(n_chan)
    ])
    # move axes to match Ozerov's order (n_bin, n_frame, n_chan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    n_bin = X.shape[0]
    n_frame = X.shape[1]

    if W_init is None:
        K = n_latent_var * n_src
    else:
        K = W_init.shape[-1]

    # Row j lists the indices of the NMF components assigned to source j.
    # (builtin int: the np.int alias was removed from recent NumPy releases)
    source_NMF_ind = np.reshape(np.arange(K, dtype=int), (n_src, -1))

    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))

    if A_init is None:
        # random initialization: random magnitude times a random phase
        update_a = True
        a_shape = (n_bin, n_chan, n_src)
        magnitude = 0.5 * (1.9 * np.abs(np.random.randn(*a_shape))
                           + 0.1 * np.ones(a_shape))
        # Normalize explicitly to unit modulus. np.sign() of a complex array
        # only returns z / |z| from NumPy 2.0 on; older versions return the
        # sign of the real part, which would give +/-1 instead of a phase.
        phase = np.random.randn(*a_shape) + 1j * np.random.randn(*a_shape)
        phase /= np.abs(phase)
        A_init = magnitude * phase
    else:
        # reshape the partial rir input (n_bin, n_chan, n_src)
        A_init = np.moveaxis(A_init, [2], [0])

    # W is initialized so that its energy follows the mixture PSD
    if W_init is None:
        W_init = 0.5 * (
                (np.abs(np.random.randn(n_bin, K)) + np.ones((n_bin, K)))
                * (mix_psd[:, np.newaxis] * np.ones((1, K)))
                )
    if H_init is None:
        H_init = 0.5 * (np.abs(np.random.randn(K, n_frame))
                        + np.ones((K, n_frame)))

    # initial noise variance: a hundredth of the mixture PSD
    Sigma_b_init = mix_psd / 100

    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init, source_NMF_ind,
            iter_num=n_iter, update_a=update_a, update_w=update_w, update_h=update_h, verbose=verbose)

    # back to the (n_chan, n_src, n_bin) ordering
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    if verbose:
        print('Computation of the spatial source images\n')
    # builtin complex (complex128): the np.complex alias was removed from NumPy
    Ie_EM = np.zeros((n_bin, n_frame, n_src, n_chan), dtype=complex)
    for j in range(n_src):
        for f in range(n_bin):
            # (n_frame x n_chan) image of source j at frequency bin f
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    # Inverse STFT: channel-wise istft with the synthesis window
    sep_sources = []
    for j in range(n_src):
        channels = [
            pra.istft(Ie_EM[:, :, j, ch].T, stft_win_len, hop, win=window,
                      transform=np.fft.irfft)
            for ch in range(n_chan)
        ]
        sep_sources.append(np.array(channels).T)

    return np.array(sep_sources)
Ejemplo n.º 3
0
        with open(cache_file, 'wb') as f:
            pickle.dump(corpus, f)

    # let's find all the sentences from male speakers in the training set
    male_speakers_test = list(set([s.speaker for s in filter(lambda x: x.sex == 'M', corpus.sentence_corpus['TEST'])]))
    male_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'M', corpus.sentence_corpus['TRAIN'])]))
    female_speakers_test = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TEST'])]))
    female_speakers_train = list(set([s.speaker for s in filter(lambda x: x.sex == 'F', corpus.sentence_corpus['TRAIN'])]))

    print('Pick a subset of', n_speakers, 'speakers')
    training_set_speakers = male_speakers_train[:n_speakers] + female_speakers_train[:n_speakers]
    print(training_set_speakers)

    # compute all the spectrograms
    print('Compute all the spectrograms')
    window = np.sqrt(pra.cosine(stft_win_len))  # use sqrt because of synthesis
    training_set = dict()
    testing_set = dict()
    for speaker in training_set_speakers:
        training_set_sentences = filter(lambda x: x.speaker == speaker, corpus.sentence_corpus['TRAIN'])
        # X is (n_sentences, n_channel, n_frame)
        x = list()
        X = list()
        for sentence in training_set_sentences:
            print(sentence.speaker, sentence.id,)
            x.append(sentence.samples)
            X.append(pra.stft(sentence.samples, stft_win_len, stft_win_len // 2, win=window, transform=np.fft.rfft).T)
        # TRAIN:
        # Dalia says the magnitude works better...
        training_set[speaker] = np.concatenate([np.abs(spectrogram)**2 for spectrogram in X[0:9]], axis=1)
        # TEST:
Ejemplo n.º 4
0
def example_usage_multinmf_conv_em():
    '''
    Example of usage of the EM algorithm for multichannel NMF decomposition
    of a convolutive mixture.

    Reads the mixture ``data/Speech/3sources_3channels_mix.wav``, runs 300
    iterations of ``multinmf_conv_em`` from a random initialization with
    3 sources and 4 NMF components per source, writes one wav file per
    estimated source image in the results directory, then plots the
    estimated W and H and the array of log-likelihoods returned by the
    algorithm.

    The function takes no argument and returns nothing.
    '''
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    # Copyright 2017 Robin Scheibler, adapted to Python
    # Copyright 2010 Alexey Ozerov
    # (alexey.ozerov -at- irisa.fr)
    #
    # This software is distributed under the terms of the GNU Public License
    # version 3 (http://www.gnu.org/licenses/gpl.txt)
    #
    # If you use this code please cite this paper
    #
    # A. Ozerov and C. Fevotte,
    # "Multichannel nonnegative matrix factorization in convolutive mixtures for audio source separation,"
    # IEEE Trans. on Audio, Speech and Lang. Proc. special issue on Signal Models and Representations
    # of Musical and Environmental Sounds, vol. 18, no. 3, pp. 550-563, March 2010.
    # Available: http://www.irisa.fr/metiss/ozerov/Publications/OzerovFevotte_IEEE_TASLP10.pdf
    ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    NMF_CompPerSrcNum = 4
    nsrc = 3
    stft_win_len = 2048
    hop = stft_win_len // 2

    data_dir = 'data/Speech/'
    results_dir = 'data/Speech/'
    file_prefix = '3sources_3channels'

    # Input time-frequency representation
    print('Input time-frequency representation')
    fs, x = wavfile.read(data_dir + file_prefix + '_mix.wav')
    # scale to floating point (presumably the file holds 16-bit PCM samples)
    x = x / (2**15)
    nchan = x.shape[1]

    # The same window is applied at analysis and at synthesis, hence the
    # sqrt (consistent with the multinmf_conv_*_wrapper functions)
    window = np.sqrt(pra.cosine(stft_win_len))
    # X is (nchan, nframe, nbin)
    X = np.array([
        pra.stft(x[:, ch],
                 stft_win_len,
                 hop,
                 win=window,
                 transform=np.fft.rfft) for ch in range(nchan)
    ])
    # move axes to match Ozerov's order (nbin, nfram, nchan)
    X = np.moveaxis(X, [0, 1, 2], [2, 1, 0])
    nbin = X.shape[0]
    nfram = X.shape[1]

    # Random initialization of multichannel NMF parameters
    print('Random initialization of multichannel NMF parameters')
    K = NMF_CompPerSrcNum * nsrc
    # entry j lists the indices of the NMF components assigned to source j
    source_NMF_ind = [
        np.arange(NMF_CompPerSrcNum) + j * NMF_CompPerSrcNum
        for j in range(nsrc)
    ]
    # Mixture PSD: the power |X|^2 is summed over the channels and averaged
    # over the frames (the square must be taken on the magnitude, not on the
    # complex STFT coefficients)
    mix_psd = 0.5 * (np.mean(np.sum(np.abs(X)**2, axis=2), axis=1))
    # unit-modulus random phases
    random_phases = (np.random.randn(nchan, nsrc, nbin)
                     + 1j * np.random.randn(nchan, nsrc, nbin))
    random_phases /= np.abs(random_phases)
    A_init = (0.5 *
              (1.9 * np.abs(np.random.randn(nchan, nsrc, nbin)) + 0.1 * np.ones(
                  (nchan, nsrc, nbin))) * random_phases)
    # W is initialized so that its energy follows the mixture PSD
    W_init = 0.5 * ((np.abs(np.random.randn(nbin, K)) + np.ones(
        (nbin, K))) * (mix_psd[:, np.newaxis] * np.ones((1, K))))
    # W_init = np.load("W_dictionary_em.npy")
    # print(W_init.shape)
    # K = W_init.shape[1]
    H_init = 0.5 * (np.abs(np.random.randn(K, nfram)) + np.ones((K, nfram)))
    # initial noise variance: a hundredth of the mixture PSD
    Sigma_b_init = mix_psd / 100

    # multinmf_conv_em is given A with the frequency axis first
    A_init = np.moveaxis(A_init, [2], [0])

    # run 300 iterations of the multichannel NMF EM algorithm
    W_EM, H_EM, Ae_EM, Sigma_b_EM, Se_EM, log_like_arr = \
        multinmf_conv_em(X, W_init, H_init, A_init, Sigma_b_init, source_NMF_ind, iter_num=300)

    # back to the (nchan, nsrc, nbin) ordering
    Ae_EM = np.moveaxis(Ae_EM, [0], [2])

    # Computation of the spatial source images
    print('Computation of the spatial source images\n')
    # builtin complex (complex128): the np.complex alias was removed from NumPy
    Ie_EM = np.zeros((nbin, nfram, nsrc, nchan), dtype=complex)
    for j in range(nsrc):
        for f in range(nbin):
            # (nfram x nchan) image of source j at frequency bin f
            Ie_EM[f, :, j, :] = np.outer(Se_EM[f, :, j], Ae_EM[:, j, f])

    # Inverse STFT
    for j in range(nsrc):
        # channel-wise istft with synthesis window
        ie_EM = [
            pra.istft(Ie_EM[:, :, j, ch].T,
                      stft_win_len,
                      hop,
                      win=window,
                      transform=np.fft.irfft) for ch in range(nchan)
        ]
        # write the separated source to a wav file
        # NOTE(review): file_prefix is not part of the output name, although
        # it is used for the input file; kept as is, confirm it is intended.
        out_filename = results_dir + '_sim_EM_' + str(j) + '.wav'
        wavfile.write(out_filename, fs, np.array(ie_EM).T)

    # Plot estimated W and H
    print('Plot estimated W and H')
    # one subplot per (component, source) pair, log-spectrum of each atom
    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            # floor at 1e-40 to avoid taking the log of zero
            plt.plot(np.log10(np.maximum(W_EM[:, source_NMF_ind[j][k]],
                                         1e-40)))
            plt.title('Source_{}, log10(W_{})'.format(j, k))
            plot_ind += 1
    plt.tight_layout()

    # same layout for the activations
    plt.figure()
    plot_ind = 1
    for k in range(NMF_CompPerSrcNum):
        for j in range(nsrc):
            plt.subplot(NMF_CompPerSrcNum, nsrc, plot_ind)
            plt.plot(H_EM[source_NMF_ind[j][k], :])
            plt.title('Source_{}, H_{}'.format(j, k))
            plot_ind += 1
    plt.tight_layout()

    plt.show()

    # log-likelihood values returned by the EM algorithm
    plt.figure()
    plt.plot(log_like_arr)
    plt.show()