Example #1
def separate(fileIn, fileOut, modelname):
    # Load the mixture, split it into snippets and compute their
    # spectrograms / complex STFTs (snipify and SPECify are project helpers).
    audio, samplerate = librosa.load(fileIn, sr=22050)
    snips = snipify(audio)
    specs, stfts = SPECify(snips)

    # Load the trained autoencoder and predict the source spectrograms.
    model = autoencoder.loadModel()
    model.load_weights(modelname)
    sourceSpecs = model.predict(specs)
    sourceaudio = np.array([])

    for i in range(sourceSpecs.shape[0]):
        sourceSpec = sourceSpecs[i].T
        stft = stfts[i]
        # Add the channel/source axes that norbert expects.
        stft = np.expand_dims(stft, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=3)
        print(sourceSpec.shape, stft.shape)
        # Build a residual (accompaniment) model, then refine the estimate
        # with one EM iteration of the multichannel Wiener filter.
        resi = norbert.residual_model(sourceSpec, stft.astype(np.complex128),
                                      1)
        sourceSpecNorbert = norbert.wiener(resi,
                                           stft.astype(np.complex128),
                                           1,
                                           use_softmask=False)
        # Keep the first source estimate and invert its STFT back to the time domain.
        sourceSpecNorbert1 = sourceSpecNorbert[:, ..., 0, 0]
        sourceaudio = np.append(sourceaudio, librosa.istft(sourceSpecNorbert1))

    soundfile.write(fileOut, sourceaudio, samplerate)
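A hypothetical call to this function; the file names and weights path below are placeholders, not from the original project:

separate('mixture.wav', 'separated.wav', 'autoencoder_weights.h5')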
Example #2
def test_residual_copy(X, V):
    X0 = X.clone()
    V0 = V.clone()

    _ = norbert.residual_model(V, X)

    assert torch.allclose(X0, X)
    assert torch.allclose(V0, V)
Example #3
def test_residual_copy(X, V):
    X0 = np.copy(X)
    V0 = np.copy(V)

    _ = norbert.residual_model(V, X)

    assert np.allclose(X0, X)
    assert np.allclose(V0, V)
Example #4
def test_shapes(V, X):
    Y = norbert.residual_model(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]
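The X and V arguments in these tests are presumably pytest fixtures. A minimal sketch of what the NumPy fixtures could look like, assuming norbert's documented layout (X: complex mixture STFT of shape (nb_frames, nb_bins, nb_channels); V: non-negative source spectrograms of shape (nb_frames, nb_bins, nb_channels, nb_sources)); the sizes below are arbitrary:

import numpy as np
import pytest

nb_frames, nb_bins, nb_channels, nb_sources = 10, 33, 2, 3


@pytest.fixture
def X():
    # Random complex mixture STFT, shape (nb_frames, nb_bins, nb_channels).
    rng = np.random.default_rng(0)
    return (rng.standard_normal((nb_frames, nb_bins, nb_channels)) +
            1j * rng.standard_normal((nb_frames, nb_bins, nb_channels)))


@pytest.fixture
def V():
    # Non-negative source spectrograms, shape (nb_frames, nb_bins, nb_channels, nb_sources).
    rng = np.random.default_rng(1)
    return rng.random((nb_frames, nb_bins, nb_channels, nb_sources))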
Example #5
def PostProcess(Y, stft):
    # Add the channel/source axes that norbert expects.
    stft = np.expand_dims(stft, axis=2)
    Y = np.expand_dims(Y.T, axis=3)
    # Build a residual (accompaniment) model, then refine the estimate
    # with one EM iteration of the multichannel Wiener filter.
    resi = norbert.residual_model(Y, stft.astype(np.complex128), 1)
    YNorbert = norbert.wiener(resi,
                              stft.astype(np.complex128),
                              1,
                              use_softmask=False)
    # Keep the first source estimate and invert its STFT back to the time domain.
    YNorbert1 = YNorbert[:, ..., 0, 0]
    Yaudio = librosa.istft(YNorbert1)
    return Yaudio
Example #6
def separate(audio,
             targets,
             model_name='umxhq',
             niter=1,
             softmask=False,
             alpha=1,
             residual_model=False,
             device='cpu'):

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual']
                         if len(targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T

    return estimates
Example #7
def separate(
    audio,
    targets,
    model_name='umxhq',
    niter=1, softmask=False, alpha=1.0,
    residual_model=False, device='cpu'
):
    """
    Performs the separation on the audio input.

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    targets: list of str
        a list of the separation targets.
        Note that for each target a separate model is expected
        to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.

    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(
            target=target,
            model_name=model_name,
            device=device
        )
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1]*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.stft.n_fft,
            n_hopsize=unmix_target.stft.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
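A hedged usage sketch for the function above, assuming a stereo mixture that is already at the model's sample rate; the file names and target list are placeholders:

import soundfile as sf

# Load a stereo mixture; sf.read returns (nb_timesteps, nb_channels).
audio, rate = sf.read('mixture.wav', always_2d=True)
estimates = separate(audio,
                     targets=['vocals', 'drums', 'bass', 'other'],
                     niter=1,
                     residual_model=False)
# Each estimate comes back as (nb_timesteps, nb_channels) audio.
sf.write('vocals_estimate.wav', estimates['vocals'], rate)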
Example #8
def separate(
    audio,
    model_path='models/x-umx.h5',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False
):
    """
    Performs the separation on the audio input.
    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio
    model_path: str
        path to the model file, defaults to `models/x-umx.h5`
    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.
    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False
    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0
    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False
    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []

    sources = ['bass', 'drums', 'vocals', 'other']
    for j, target in enumerate(sources):
        if j == 0:
            unmix_target = model.OpenUnmix_CrossNet(max_bin=1487)
            unmix_target.is_predict = True
            nn.load_parameters(model_path)
            mix_spec, msk, _ = unmix_target(audio_nn, test=True)
        Vj = msk[Ellipsis, j*2:j*2+2, :] * mix_spec
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj.d[:, 0, ...])  # remove sample dim
        source_names += [target]
    V = np.transpose(np.array(V), (1, 3, 2, 0))

    real, imag = model.STFT(audio_nn, center=True)

    # convert to complex numpy type
    X = real.d + imag.d*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(sources) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(sources) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.n_fft,
            n_hopsize=unmix_target.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
Example #9
def separate(audio, args):
    """
    Performs the separation on the audio input.
    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio
    args : ArgumentParser
        ArgumentParser for OpenUnmix_CrossNet(X-UMX)/OpenUnmix(UMX) Inference

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """

    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []
    max_bin = bandwidth_to_max_bin(sample_rate=44100,
                                   n_fft=4096,
                                   bandwidth=16000)

    if not args.umx_infer:
        # Run X-UMX Inference
        nn.load_parameters(args.model)
        for j, target in enumerate(args.targets):
            if j == 0:
                unmix_target = model.OpenUnmix_CrossNet(max_bin=max_bin,
                                                        is_predict=True)
                mix_spec, msk, _ = unmix_target(audio_nn, test=True)
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
            Vj = (msk[Ellipsis, j * 2:j * 2 + 2, :] * mix_spec).d
            V.append(Vj[:, 0, ...])  # remove sample dim
            source_names += [target]
    else:
        # Run UMX Inference
        for j, target in enumerate(args.targets):
            with nn.parameter_scope(target):
                unmix_target = model.OpenUnmix(max_bin=max_bin)
                nn.load_parameters(f"{os.path.join(args.model, target)}.h5")
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
                V.append(unmix_target(audio_nn, test=True).d[:, 0, ...])
            source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))
    if args.softmask:
        # only exponentiate the model if we use softmask
        V = V**args.alpha

    real, imag = model.get_stft(audio_nn, center=True)

    # convert to complex numpy type
    X = real.d + imag.d * 1j
    X = X[0].transpose(2, 1, 0)

    if args.residual_model or len(args.targets) == 1:
        V = norbert.residual_model(V, X, args.alpha if args.softmask else 1)
        source_names += (['residual']
                         if len(args.targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       args.niter,
                       use_softmask=args.softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.n_fft,
                          n_hopsize=unmix_target.n_hop)
        estimates[name] = audio_hat.T

    return estimates
Example #10
# Split the mixture spectrogram into chunks and estimate the vocals chunk by chunk.
X = torch.split(x, num_frames, dim=3)
Vj = []  # holds vocals' spectrograms.
for i in tqdm.tqdm(range(len(X)), desc='Estimating vocals..'):
    Vj.append(model(X[i]))
Vj = torch.cat(Vj, dim=3).cpu().detach().numpy()

# Prepare input for MWF.
print('Calculating MWF..')
V = []  # per-source spectrograms (here: vocals only)
V_vox = np.transpose(Vj, [3, 0, 1, 2])
V.append(V_vox[:, 0, ...])  # remove sample dim
V = np.transpose(np.array(V), (1, 3, 2, 0))

# Complex mixture STFT for the Wiener filter.
X = model.mdensenet.stft(audio).detach().cpu().numpy()
X = X[..., 0] + X[..., 1] * 1j
X = X[0].transpose(2, 1, 0)
V = norbert.residual_model(V, X, 1)
Y = norbert.wiener(V, X.astype(np.complex128), 1, use_softmask=False)

# Extract source estimates in the time domain.
s = []
estimates = {}
for j in range(Y.shape[-1]):
    audio_hat = istft(Y[..., j].T, n_fft=n_fft, n_hop=n_hop, sr=sr)
    s.append(audio_hat.T)

end_time = time.time()
print(f'Separation duration: {end_time - start_time:.2f} sec.')

print('Saving track..')
out_name = Path(args.out_name).expanduser()
out_name.parent.mkdir(parents=True, exist_ok=True)
Example #11
def separate(
    audio,
    x_umx_target,
    instruments,
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device="cpu",
):
    """
    Performs the separation on the audio input.

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    x_umx_target: asteroid.models
        X-UMX model used for separating

    instruments: list
        The list of instruments, e.g., ["bass", "drums", "vocals"]

    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary with all estimates obtained by the separation model.
    """

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    masked_tf_rep, _ = x_umx_target(audio_torch)
    # shape: (Sources, frames, batch, channels, fbin)

    for j, target in enumerate(instruments):
        Vj = masked_tf_rep[j, Ellipsis].cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, Ellipsis])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    # convert to complex numpy type
    tmp = x_umx_target.encoder(audio_torch)
    X = torch_complex_from_magphase(tmp[0].permute(1, 2, 3, 0), tmp[1])
    X = X.detach().cpu().numpy()
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(instruments) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += ["residual"
                         ] if len(instruments) > 1 else ["accompaniment"]

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            rate=x_umx_target.sample_rate,
            n_fft=x_umx_target.in_chan,
            n_hopsize=x_umx_target.n_hop,
        )
        estimates[name] = audio_hat.T

    return estimates
Example #12
def separate(input_path,
             output_path,
             model_name='umxhq',
             targets=('vocals', 'drums', 'bass', 'other'),
             samplerate=44100,
             device='cpu',
             softmask=False,
             residual_model=False,
             alpha=1.0,
             niter=1):
    """
    generate 4 subtargets
    """

    # ENTREE : input path
    # SORTIE : OUTPUT PATH NOM DE DOSSIER ECRIT LES SUBTARGETS EN .WAV DANS CE PATH

    # handling an input audio path
    audio, rate = sf.read(
        input_path,
        always_2d=True,
    )

    if audio.shape[1] > 2:
        warnings.warn('Channel count > 2! '
                      'Only the first two channels will be processed!')
        audio = audio[:, :2]

    if rate != samplerate:
        # resample to model samplerate if needed
        audio = resampy.resample(audio, rate, samplerate, axis=0)

    if audio.shape[1] == 1:
        # if we have mono, let's duplicate it
        # as the input of OpenUnmix is always stereo
        audio = np.repeat(audio, 2, axis=1)
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual']
                         if len(targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T

        # write wav file in output_path
        subtarget_path = output_path.joinpath(name + '.wav')
        sf.write(subtarget_path, estimates[name], samplerate)
    return estimates
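All of these examples follow the same norbert recipe: stack the per-source magnitude estimates into V, optionally append a residual source with residual_model, then refine everything with wiener. A minimal, self-contained sketch of that recipe with random placeholder data (the shapes and source count are assumptions):

import numpy as np
import norbert

nb_frames, nb_bins, nb_channels, nb_sources = 100, 1025, 2, 2
rng = np.random.default_rng(0)

# Complex mixture STFT, shape (nb_frames, nb_bins, nb_channels).
X = (rng.standard_normal((nb_frames, nb_bins, nb_channels)) +
     1j * rng.standard_normal((nb_frames, nb_bins, nb_channels)))

# Non-negative spectrogram estimates from some separation model,
# shape (nb_frames, nb_bins, nb_channels, nb_sources).
V = rng.random((nb_frames, nb_bins, nb_channels, nb_sources))

# Append a residual source covering whatever the estimates miss ...
V = norbert.residual_model(V, X)
# ... and refine with one EM iteration of the multichannel Wiener filter.
Y = norbert.wiener(V, X, 1, use_softmask=False)

print(Y.shape)  # (nb_frames, nb_bins, nb_channels, nb_sources + 1)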