Example #1
def IBM(track, alpha=1, theta=0.5, eval_dir=None):
    """Ideal Binary Mask:
    processing all channels independently with the ideal binary mask.

    The mix is assigned to a given source wherever the ratio of that source's
    spectrogram to the mix spectrogram exceeds theta, where the spectrograms
    are taken as the magnitude of the STFT raised to the power alpha. Typical
    parameters are a ratio of magnitudes (alpha=1) and a majority vote
    (theta=0.5).
    """

    # parameters for STFT
    nfft = 2048

    # small epsilon to avoid dividing by zero
    eps = np.finfo(float).eps

    # compute STFT of Mixture
    N = track.audio.shape[0]  # remember number of samples for future use
    X = stft(track.audio.T, nperseg=nfft)[-1]
    (I, F, T) = X.shape

    # perform separation
    estimates = {}
    accompaniment_source = 0
    for name, source in track.sources.items():

        # compute STFT of target source
        Yj = stft(source.audio.T, nperseg=nfft)[-1]

        # Create Binary Mask
        Mask = np.divide(np.abs(Yj)**alpha, (eps + np.abs(X)**alpha))
        Mask[np.where(Mask >= theta)] = 1
        Mask[np.where(Mask < theta)] = 0

        # multiply mask
        Yj = np.multiply(X, Mask)

        # invert to time domain and trim to the original mixture length
        target_estimate = istft(Yj)[1].T[:N, :]

        # set this as the source estimate
        estimates[name] = target_estimate

        # accumulate to the accompaniment if this is not vocals
        if name != 'vocals':
            accompaniment_source += target_estimate

    # set accompaniment source
    estimates['accompaniment'] = accompaniment_source

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
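
A minimal driver for the IBM oracle above, sketched under the assumption of a local MUSDB18 copy; the dataset path and eval directory are placeholders, and the SciPy STFT helpers the function relies on are imported explicitly.

import numpy as np
import musdb
import museval
from scipy.signal import stft, istft

# placeholder location of the MUSDB18 dataset
mus = musdb.DB(root='path/to/MUSDB18', subsets='test')

for track in mus.tracks:
    # majority-vote binary mask on magnitude spectrograms,
    # writing per-track evaluation JSON into ./eval/IBM
    IBM(track, alpha=1, theta=0.5, eval_dir='./eval/IBM')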
Example #2
def IRM(track, alpha=2, eval_dir=None):
    """Ideal Ratio Mask:
    processing all channels independently with the ideal ratio mask.
    The mask is the ratio of each source spectrogram to the sum of all source
    spectrograms, where alpha is the exponent applied to the spectrograms.
    Usual values are 1 (magnitude) and 2 (power)."""

    # STFT parameters
    nfft = 2048

    # small epsilon to avoid dividing by zero
    eps = np.finfo(float).eps

    # compute STFT of Mixture
    N = track.audio.shape[0]  # remember number of samples for future use
    X = stft(track.audio.T, nperseg=nfft)[-1]
    (I, F, T) = X.shape

    # Compute sources spectrograms
    P = {}
    # compute model as the sum of spectrograms
    model = eps

    for name, source in track.sources.items():
        # compute spectrogram of target source:
        # magnitude of STFT to the power alpha
        P[name] = np.abs(stft(source.audio.T, nperseg=nfft)[-1])**alpha
        model += P[name]

    # now perform separation
    estimates = {}
    accompaniment_source = 0
    for name, source in track.sources.items():
        # compute soft mask as the ratio between source spectrogram and total
        Mask = np.divide(np.abs(P[name]), model)

        # multiply the mix by the mask
        Yj = np.multiply(X, Mask)

        # invert to time domain
        target_estimate = istft(Yj)[1].T[:N, :]

        # set this as the source estimate
        estimates[name] = target_estimate

        # accumulate to the accompaniment if this is not vocals
        if name != 'vocals':
            accompaniment_source += target_estimate

    estimates['accompaniment'] = accompaniment_source

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
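
As a sanity check on the ratio-mask description above: the soft masks built this way sum to (nearly) one in every time-frequency bin, so the source estimates add back up to the mixture. A small self-contained sketch with made-up spectrograms:

import numpy as np

rng = np.random.default_rng(0)
eps = np.finfo(float).eps

# fake power spectrograms for three sources (bins x frames)
P = {name: rng.random((5, 4)) for name in ('vocals', 'drums', 'bass')}
model = eps + sum(P.values())

masks = {name: P[name] / model for name in P}
assert np.allclose(sum(masks.values()), 1.0)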
Example #3
def estimate_and_evaluate(track):
    # return any number of targets
    estimates = {
        'vocals': track.audio,
        'accompaniment': track.audio
    }

    museval.eval_mus_track(
        track, estimates, output_dir=output_dir
    )

    return estimates
Example #4
def test_track_scores(reference):
    track, ref_scores = reference

    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    est_scores = json.loads(scores.json)
    for target in ref_scores['targets']:
        for metric in ['SDR', 'SIR', 'SAR', 'ISR']:

            ref = np.array([d['metrics'][metric] for d in target['frames']])
            idx = [t['name']
                   for t in est_scores['targets']].index(target['name'])
            est = np.array([
                d['metrics'][metric]
                for d in est_scores['targets'][idx]['frames']
            ])

            assert np.allclose(ref, est, atol=1e-02, equal_nan=True)
Example #5
def test_estimate_and_evaluate(mus):
    # return any number of targets
    with open(json_path) as json_file:
        ref = json.loads(json_file.read())

    print(os.path.basename(json_path))
    track = mus.load_mus_tracks(
        tracknames=[os.path.splitext(os.path.basename(json_path))[0]])[0]

    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    assert scores.validate() is None

    with open(os.path.join('.', track.name) + '.json', 'w+') as f:
        f.write(scores.json)

    scores = json.loads(scores.json)

    for target in ref['targets']:
        for metric in ['SDR', 'SIR', 'SAR', 'ISR']:

            ref = np.array([d['metrics'][metric] for d in target['frames']])
            idx = [t['name'] for t in scores['targets']].index(target['name'])
            est = np.array([
                d['metrics'][metric] for d in scores['targets'][idx]['frames']
            ])

            assert np.allclose(ref, est)
Example #6
def separate_and_evaluate(
    track,
    targets,
    model_name,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
    device='cpu'
):
    estimates = test.separate(
        audio=track.audio,
        targets=targets,
        model_name=model_name,
        niter=niter,
        alpha=alpha,
        softmask=softmask,
        device=device
    )
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(
        track, estimates, output_dir=eval_dir
    )
    return scores
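
One way a separate_and_evaluate helper like this is usually driven, sketched here on the assumption of a local MUSDB path and an open-unmix-style pretrained model name; the EvalStore aggregation matches the API used in the later examples.

import musdb
import museval

mus = musdb.DB(root='path/to/MUSDB18', subsets='test')  # placeholder path
results = museval.EvalStore()

for track in mus.tracks:
    scores = separate_and_evaluate(
        track,
        targets=['vocals', 'drums', 'bass', 'other'],
        model_name='umxhq',      # assumed pretrained model identifier
        niter=1,
        alpha=1.0,
        softmask=False,
        output_dir=None,         # skip saving estimates to disk
        eval_dir='./eval',
        device='cpu',
    )
    results.add_track(scores)

print(results)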
Example #7
def oracle(track, separation_fn):
    # set (trackwise) norbert objects
    tf = norbert.TF()

    # compute the mixture complex tf transform
    x = tf.transform(track.audio)

    v = []
    for name, value in track.sources.items():
        v_j = np.sum(np.abs(tf.transform(value.audio))**2,
                     axis=-1, keepdims=True)

        v += [np.squeeze(v_j)]

    v = np.moveaxis(np.array(v), 0, 2)

    y = separation_fn(v, x)

    estimates = {}
    for j, (name, value) in enumerate(track.sources.items()):
        audio_hat = tf.inverse_transform(y[..., j])
        estimates[name] = audio_hat

    # Evaluate using museval
    scores = museval.eval_mus_track(
        track, estimates, output_dir=None
    )

    print(scores)

    return estimates
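
The oracle above accepts any norbert separation function; a hedged usage sketch, assuming a local MUSDB path and an older norbert release that still ships the TF helper used inside the function:

import musdb
import norbert

mus = musdb.DB(root='path/to/MUSDB18', subsets='test')  # placeholder path

for track in mus.tracks:
    # single-channel ratio masking; norbert.wiener could be passed instead
    oracle(track, norbert.softmask)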
Example #8
def model_separate_and_evaluate(
    components: Dict, track: musdb.MultiTrack, evaldir, n_nonzero_coeffs: int = 40
):
    mixture = track.audio

    print("Separating")
    separated_sources = model_separate(
        components, mixture, n_nonzero_coeffs=n_nonzero_coeffs
    )
    estimates = {
        "vocals": separated_sources[0],
        "drums": separated_sources[1],
        "bass": separated_sources[2],
        "other": separated_sources[3]
        # "accompaniment": separated_sources[1]
    }

    scores = None

    try:
        print("Evaluating")
        scores = museval.eval_mus_track(track, estimates, output_dir=evaldir)

        print(scores)
        print("Done")
    except ValueError as e:
        pass
    else:
        print("Evaluation Success")

    return separated_sources, scores
Example #9
def GT(track, eval_dir=None):
    """Ground Truth Signals
    """

    # perform separation
    estimates = {}
    for name, target in track.targets.items():
        # use the ground-truth target audio as the estimate
        estimates[name] = target.audio

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
Example #10
def separate_and_evaluate(track, args, ext):
    estimates = test.separate(track.audio, args)

    if args.out_dir:
        mus.save_estimates(estimates, track, args.out_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=args.out_dir)
    # clear cache memory
    ext.clear_memory_cache()
    return scores
Example #11
def eval_dataset(_dataset, _predictor):

    config = {
        'use_mixer': _predictor.use_mixer,
        'use_demucs': _predictor.use_demucs,
        'dataset': _dataset.root
    }

    wandb.init(project="KUIELab-MDX-Net", entity="ielab", config=config)

    sources = ['bass', 'drums', 'other', 'vocals']

    # accumulate per-track scores across the whole dataset
    results = museval.EvalStore(frames_agg='median', tracks_agg='median')

    for idx in range(len(_dataset)):
        track = _dataset[idx]
        estimation = _predictor.demix(track.audio.T)

        # Real SDR
        if len(estimation) == len(sources):
            track_length = _dataset[idx].samples
            if track_length > estimation.shape[-1]:
                raise NotImplementedError
            else:
                estimated_targets_dict = {
                    source: estimated.T
                    for source, estimated in zip(sources, estimation)
                }
                track_score = museval.eval_mus_track(_dataset[idx],
                                                     estimated_targets_dict)
                score_dict = track_score.df.loc[:, ['target', 'metric', 'score']].groupby(
                    ['target', 'metric'])['score'] \
                    .median().to_dict()
                wandb.log(
                    {
                        'test_result/{}_{}'.format(k1, k2):
                        score_dict[(k1, k2)]
                        for k1, k2 in score_dict.keys()
                    },
                    step=idx)

                print(track_score)

                results.add_track(track_score)

    result_dict = results.df.groupby([
        'track', 'target', 'metric'
    ])['score'].median().reset_index().groupby(['target', 'metric'
                                                ])['score'].median().to_dict()
    wandb.log({
        'test_result/agg/{}_{}'.format(k1, k2): result_dict[(k1, k2)]
        for k1, k2 in result_dict.keys()
    })

    wandb.finish()

    print(results)
Example #12
def MIX(track, eval_dir=None):
    """Mixture as Estimate
    """

    # perform separation
    estimates = {}
    for name, target in track.sources.items():
        # assign an equal share of the mixture to every source
        estimates[name] = track.audio / len(track.sources)

    estimates['accompaniment'] = estimates['bass'] + \
        estimates['drums'] + estimates['other']

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
Example #13
def predict(track, model_config, model, model_noise, results_dir=None):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track object, computes the corresponding source estimates, and calls the evaluation script.
    The model has to be saved beforehand into a pickle file containing the model configuration dictionary and the checkpoint path!
    :param track: Track object
    :param results_dir: Directory where SDR etc. values should be saved
    :return: Source estimates dictionary
    '''

    # Get noise once, use that for all predictions to keep consistency
    noise = model_noise.sample()

    # Determine input and output shapes, if we use U-net as separator
    sep_input_shape = [
        1, 1, model_config.input_height, model_config.input_width
    ]  # [N, C, H, W]

    print("Testing...")

    mix_audio, orig_sr, mix_channels = track.audio, track.rate, track.audio.shape[
        1]  # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, model, noise, mix_audio,
                                    orig_sr, sep_input_shape, sep_input_shape)

    # Upsample predicted source audio and convert to stereo. Make sure to resample back to the exact number of samples in the original input (with fractional orig_sr/new_sr this causes issues otherwise)
    pred_audio = {
        name: librosa.resample(separator_preds[name], model_config.sample_rate,
                               orig_sr)[:len(mix_audio)]
        for name in separator_preds.keys()
    }

    if mix_channels > 1:  # Convert to multichannel if mixture input was multichannel by duplicating mono estimate
        pred_audio = {
            name: np.repeat(np.expand_dims(pred_audio[name], 1),
                            mix_channels,
                            axis=1)
            for name in pred_audio.keys()
        }

    # Evaluate using museval, if we are currently evaluating MUSDB
    if results_dir is not None:
        scores = museval.eval_mus_track(track,
                                        pred_audio,
                                        output_dir=results_dir,
                                        win=15,
                                        hop=15.0)

        # print nicely formatted mean scores
        print(scores)

    return pred_audio
Example #14
def load_and_eval_estimates(track):
    # load estimates from disk instead of processing
    user_results = {}
    track.name = track.filename
    track_estimate_dir = os.path.join(user_estimates_dir, track.subset,
                                      track.filename)
    for target in glob.glob(track_estimate_dir + '/*.wav'):

        target_name = op.splitext(os.path.basename(target))[0]
        try:
            target_audio, rate = sf.read(target, always_2d=True)
            user_results[target_name] = target_audio
        except RuntimeError:
            pass

    museval.eval_mus_track(
        track,
        user_results,
        output_dir=output_dir,
        mode='v3'  # use bss_eval v3 to reproduce sisec 2016 results
    )

    return None
Example #15
def test_one_estimate(reference):
    track, _ = reference

    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)

    estimates = {'vocals': random_voc}

    with pytest.warns(UserWarning):
        est_scores = museval.eval_mus_track(track, estimates)

    est_json = json.loads(est_scores.json)

    assert len(est_json['targets']) == 0
Example #16
    def evaluate(self):
        track = utils.audio_signals_to_musdb_track(self.mixture,
                                                   self.true_sources,
                                                   self.target_dict)

        bss_output = museval.eval_mus_track(track,
                                            self.estimates,
                                            output_dir=self.output_dir,
                                            mode=self.mode,
                                            win=self.win,
                                            hop=self.hop)

        self._populate_scores_dict(bss_output)

        return self.scores
Example #17
    def add_new_predictions(self, output, track, instrument_ohe):
        # First, validate that the current instrument is initialized.
        self.validate_current_track(track, instrument_ohe)
        # If the received output has a different track and instrument when compared to the last track,
        # this means that the previous track has finished and we should calculate its results with museval
        same_instrument_mask = (
            instrument_ohe == self.current_instrument_ohe).all(dim=1)
        same_track_mask = (track == self.current_track)
        if (~same_instrument_mask.any() or ~same_track_mask.any()
            ) and self.get_current_output_length() == 0:
            raise Exception(
                'Current track is empty, but received a different(new) instrument/track.'
            )
        elif ~same_track_mask.any():
            # Get the observations that belong to the current instrument (and track) and append them to the results
            if same_track_mask.any():
                self.append_results_to_current_track(output[same_track_mask])
            # Retrieve the original track from musdb, generate the estimates from the model's output
            # and Evaluate the finished track
            track = self.mus_db.tracks[self.current_track]
            estimates = {
                instru: torch.cat(output_list).numpy()
                for instru, output_list in self.current_output.items()
            }
            self.results.add_track(museval.eval_mus_track(track, estimates))
            # Reset current results
            self.reset_current_results()
            self.validate_current_track(track[~same_track_mask],
                                        instrument_ohe[~same_track_mask])
            # Get the observations that belong to the new instrument and append them to the results
            self.append_results_to_current_track(output[~same_track_mask])
        elif ~same_instrument_mask.any():
            # Get the observations that belong to the current instrument and append them to the results
            if same_instrument_mask.any():
                self.append_results_to_current_track(
                    output[same_instrument_mask])

            # Update current instrument and
            # Get the observations that belong to the new instrument and append them to the new results.
            self.update_current_instrument(
                instrument_ohe[~same_instrument_mask][0])
            self.append_results_to_current_track(output[~same_instrument_mask])

        # Add the received outputs into the current_results dictionary
        # if there is no change in instrument or track
        else:
            self.append_results_to_current_track(output)
Example #18
    def on_test_epoch_end(self):

        results = museval.EvalStore(frames_agg='median', tracks_agg='median')

        for idx in range(self.musdb_test.num_tracks):
            estimation = {}
            for target_name in self.target_names:
                estimation[target_name] = get_estimation(idx, target_name, self.test_estimation_dict)
                if estimation[target_name] is not None:
                    estimation[target_name] = estimation[target_name].astype(np.float32)
            # Real SDR
            if len(estimation) == len(self.target_names):
                track_length = self.musdb_test.musdb_reference[idx].samples
                estimated_targets = [estimation[target_name][:track_length] for target_name in self.target_names]
                if track_length > estimated_targets[0].shape[0]:
                    raise NotImplementedError
                else:
                    estimated_targets_dict = {target_name: estimation[target_name][:track_length] for target_name in
                                              self.target_names}
                    track_score = museval.eval_mus_track(
                        self.musdb_test.musdb_reference[idx],
                        estimated_targets_dict
                    )
                    score_dict = track_score.df.loc[:, ['target', 'metric', 'score']].groupby(
                        ['target', 'metric'])['score'] \
                        .median().to_dict()
                    if isinstance(self.logger, WandbLogger):
                        self.logger.experiment.log(
                            {'test_result/{}_{}'.format(k1, k2): score_dict[(k1, k2)] for k1, k2 in score_dict.keys()})
                    else:
                        print(track_score)
                    results.add_track(track_score)
            if idx == 1 and isinstance(self.logger, WandbLogger):
                self.logger.experiment.log({'result_sample_{}_{}'.format(self.current_epoch, target_name): [
                    wandb.Audio(estimation[target_name], caption='{}_{}'.format(idx, target_name), sample_rate=44100)]})

        if isinstance(self.logger, WandbLogger):
            result_dict = results.df.groupby(
                ['track', 'target', 'metric']
            )['score'].median().reset_index().groupby(
                ['target', 'metric']
            )['score'].median().to_dict()
            self.logger.experiment.log(
                {'test_result/agg/{}_{}'.format(k1, k2): result_dict[(k1, k2)] for k1, k2 in result_dict.keys()}
            )
        else:
            print(results)
Example #19
def test_random_estimate(reference):
    track, _ = reference
    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    # save json
    with open(os.path.join('.', track.name) + '.json', 'w+') as f:
        f.write(scores.json)

    # validate json
    assert scores.validate() is None
Example #20
def separate_and_evaluate(track, model_dir, targets, output_dir):

    fft_size, hop_size, n_channels = 4096, 1024, 2
    audio = track.audio
    for i in range(audio.shape[1]):
        stft = librosa.stft(audio[:, i].flatten(),
                            n_fft=fft_size,
                            hop_length=hop_size).transpose()
        if i == 0:
            data = np.ndarray(shape=(stft.shape[0], n_channels,
                                     fft_size // 2 + 1),
                              dtype=np.complex64)
        data[:, i, :] = stft

    if n_channels == 2 and audio.shape[1] == 1:
        data[:, 1] = data[:, 0]

    inp_stft = data

    out_stfts = {}
    inp_stft_contiguous = np.abs(np.ascontiguousarray(inp_stft))

    for target in targets:
        # Load the model weights for corresponding target
        nn.load_parameters(f"{os.path.join(model_dir, target)}.h5")
        with open(f"./configs/{target}.yaml") as file:
            # Load target specific Hyper parameters
            hparams = yaml.load(file, Loader=yaml.FullLoader)
        with nn.parameter_scope(target):
            out_sep = model_separate(inp_stft_contiguous,
                                     hparams,
                                     ch_flip_average=True)
            out_stfts[target] = out_sep * np.exp(1j * np.angle(inp_stft))

    out_stfts = apply_mwf(out_stfts, inp_stft)

    estimates = {}
    for target in targets:
        estimates[target] = stft2time_domain(out_stfts[target], hop_size)

    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=output_dir)
    return scores
Example #21
def test_aggregate(reference):
    track, _ = reference

    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    print(scores.df)

    results = museval.EvalStore()
    results.add_track(scores)
    agg = results.agg_frames_scores()
    print(results)
Example #22
def separate_and_evaluate(
    track: musdb.MultiTrack,
    targets: list,
    model_str_or_path: str,
    niter: int,
    output_dir: str,
    eval_dir: str,
    residual: bool,
    mus,
    aggregate_dict: dict = None,
    device: Union[str, torch.device] = "cpu",
    wiener_win_len: Optional[int] = None,
    filterbank="torch",
) -> str:

    separator = utils.load_separator(
        model_str_or_path=model_str_or_path,
        targets=targets,
        niter=niter,
        residual=residual,
        wiener_win_len=wiener_win_len,
        device=device,
        pretrained=True,
        filterbank=filterbank,
    )

    separator.freeze()
    separator.to(device)

    audio = torch.as_tensor(track.audio, dtype=torch.float32, device=device)
    audio = utils.preprocess(audio, track.rate, separator.sample_rate)

    estimates = separator(audio)
    estimates = separator.to_dict(estimates, aggregate_dict=aggregate_dict)

    for key in estimates:
        estimates[key] = estimates[key][0].cpu().detach().numpy().T
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=eval_dir)
    return scores
Example #23
def separate_and_evaluate(
    track,
    model,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
):
    estimates = test.separate(audio=track.audio,
                              model_path=model,
                              niter=niter,
                              alpha=alpha,
                              softmask=softmask)

    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=eval_dir)
    return scores
Example #24
def model_separate_and_evaluate(components: NMFResults,
                                track: musdb.MultiTrack, evaldir):
    mixture = track.audio

    print("Separating")
    separated_sources = model_separate(components, mixture)
    estimates = {
        "vocals": separated_sources[0],
        "drums": separated_sources[1],
        "bass": separated_sources[2],
        "other": separated_sources[3]
        # "accompaniment": separated_sources[1]
    }

    print("Evaluating")
    scores = museval.eval_mus_track(track, estimates, output_dir=evaldir)

    print(scores)
    print("Done")

    return separated_sources, scores
Example #25
def oracle(track):
    # compute the mixture complex tf transform
    x = stft(torch.from_numpy(track.audio.T)).transpose(0, 2)
    v = []
    for name, value in track.sources.items():
        v_j = stft(torch.from_numpy(value.audio.T)).transpose(0, 2).abs()**2
        v += [v_j]
    v = torch.stack(v, 3)

    y = norbert.softmask(v, x).permute(3, 2, 1, 0)

    estimates = {}
    for j, (name, value) in enumerate(track.sources.items()):
        audio_hat = istft(y[j]).numpy().T
        estimates[name] = audio_hat

    # Evaluate using museval
    scores = museval.eval_mus_track(track, estimates, output_dir=None)

    print(scores)

    return estimates
Example #26
def main():
    # Only musdb is supported for now!

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoints",
                        type=str,
                        help="Path of the model to evaluate")
    parser.add_argument("--end",
                        type=int,
                        default=49,
                        choices=range(50),
                        help="Index of the last song")
    parser.add_argument("--init",
                        type=int,
                        default=0,
                        choices=range(50),
                        help="Index of the first song")
    parser.add_argument("--other",
                        action="store_true",
                        help="Use the dedicated model for 'other'")
    parser.add_argument("--output",
                        type=str,
                        help="Path where the evaluation is saved")
    parser.add_argument("--partitions",
                        type=int,
                        default=1,
                        help="Number of parts to split each test song into")
    parser.add_argument("--root", type=str, help="Dataset path")
    parser.add_argument("--vocals",
                        action="store_true",
                        help="Subtract vocals to compute the accompaniment")

    subparsers = parser.add_subparsers(help="Model type", dest="model")
    parser_spec = subparsers.add_parser("spectrogram",
                                        help="Spectrogram model")

    parser_wave = subparsers.add_parser("wave", help="Wave model")

    parser_blend = subparsers.add_parser("blend", help="Blend model")
    parser_blend.add_argument("--checkpoints-stft",
                              type=str,
                              help="Path of the spectrogram model")
    parser_blend.add_argument("--checkpoints-wave",
                              type=str,
                              help="Path of the wave model")
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()
    print("GPU disponible:", use_cuda)
    device = torch.device("cuda:0" if use_cuda else "cpu")

    if args.model == "spectrogram":
        separator = SpectrogramSeparator(args.checkpoints, args.other,
                                         args.vocals, device)
    elif args.model == "wave":
        separator = WaveSeparator(args.checkpoints, args.other, args.vocals,
                                  device)
    elif args.model == "blend":
        separator = BlendSeparator(args.checkpoints_stft,
                                   args.checkpoints_wave, args.checkpoints,
                                   args.other, args.vocals, device)
    else:
        raise NotImplementedError

    print("Cargando canciones de test")
    mus = musdb.DB(root=args.root, subsets='test')

    os.makedirs(f"{args.output}/test", exist_ok=True)

    for i in tqdm.tqdm(range(args.init, args.end + 1)):
        track = mus.tracks[i]
        print(f"Canción {i}: {track.name}")

        chunk = track.duration // args.partitions
        for i in range(1, args.partitions):
            print(f"Partición {i}")
            track.chunk_start = ((i - 1) % args.partitions) * chunk
            track.chunk_duration = chunk
            signal = torch.as_tensor(track.audio.T,
                                     dtype=torch.float32).to(device)
            result = separator.separate(signal)
            museval.eval_mus_track(track, result, f"{args.output}{i}")

        print(f"Partición {args.partitions}")
        track.chunk_start = (args.partitions - 1) * chunk
        track.chunk_duration = track.duration - track.chunk_start
        signal = torch.as_tensor(track.audio.T, dtype=torch.float32).to(device)
        result = separator.separate(signal)
        museval.eval_mus_track(track, result,
                               f"{args.output}{args.partitions}")

        merge_jsons(args.output, track.name, args.partitions)

    for i in range(1, args.partitions + 1):
        os.rmdir(f"{args.output}{i}/test")
        os.rmdir(f"{args.output}{i}")
Example #27
def evaluate(track_estimates):
    track, estimates = track_estimates
    museval.eval_mus_track(track, estimates, output_dir=output_dir)
Example #28
def predict(track, model_config, load_model, results_dir=None):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track object, computes the corresponding source estimates, and calls the evaluation script.
    The model has to be saved beforehand into a pickle file containing the model configuration dictionary and the checkpoint path!
    :param track: Track object
    :param results_dir: Directory where SDR etc. values should be saved
    :return: Source estimates dictionary
    '''

    # Determine input and output shapes, if we use U-net as separator
    disc_input_shape = [
        model_config["batch_size"], model_config["num_frames"], 0
    ]  # Shape of discriminator input
    if model_config["network"] == "unet":
        separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(
            model_config)
    elif model_config["network"] == "unet_spectrogram":
        separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(
            model_config)
    else:
        raise NotImplementedError

    sep_input_shape, sep_output_shape = separator_class.get_padding(
        np.array(disc_input_shape))
    separator_func = separator_class.get_output

    # Batch size of 1
    sep_input_shape[0] = 1
    sep_output_shape[0] = 1

    mix_ph = tf.placeholder(tf.float32, sep_input_shape)

    print("Testing...")

    # BUILD MODELS
    # Separator
    separator_sources = separator_func(mix_ph,
                                       training=False,
                                       return_spectrogram=False,
                                       reuse=False)

    # Start session and queue input threads
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Load model
    # Load pretrained model to continue training, if we are supposed to
    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
    print("Num of variables: " + str(len(tf.global_variables())))
    restorer.restore(sess, load_model)
    print('Pre-trained model restored for song prediction')

    mix_audio, orig_sr, mix_channels = track.audio, track.rate, track.audio.shape[
        1]  # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, sess, mix_audio, orig_sr,
                                    sep_input_shape, sep_output_shape,
                                    separator_sources, mix_ph)

    # Upsample predicted source audio and convert to stereo. Make sure to resample back to the exact number of samples in the original input (with fractional orig_sr/new_sr this causes issues otherwise)
    pred_audio = {
        name:
        Utils.resample(separator_preds[name], model_config["expected_sr"],
                       orig_sr)[:mix_audio.shape[0], :]
        for name in model_config["source_names"]
    }

    if model_config[
            "mono_downmix"] and mix_channels > 1:  # Convert to multichannel if mixture input was multichannel by duplicating mono estimate
        pred_audio = {
            name: np.tile(pred_audio[name], [1, mix_channels])
            for name in pred_audio.keys()
        }

    # Evaluate using museval, if we are currently evaluating MUSDB
    if results_dir is not None:
        scores = museval.eval_mus_track(track,
                                        pred_audio,
                                        output_dir=results_dir)

        # print nicely formatted mean scores
        print(scores)

    # Close session, clear computational graph
    sess.close()
    tf.reset_default_graph()

    return pred_audio
Example #29
def predict(track):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track object, computes the corresponding source estimates, and calls the evaluation script.
    The model has to be saved beforehand into a pickle file containing the model configuration dictionary and the checkpoint path!
    :param track: Track object
    :return: Source estimates dictionary
    '''
    '''if track.filename[:4] == "test" or int(track.filename[:3]) > 53:
        return {
            'vocals': np.zeros(track.audio.shape),
            'accompaniment': np.zeros(track.audio.shape)
        }'''
    # Load model hyper-parameters and model checkpoint path
    with open("prediction_params.pkl", "r") as file:
        [model_config, load_model] = pickle.load(file)

    # Determine input and output shapes, if we use U-net as separator
    disc_input_shape = [model_config["batch_size"], model_config["num_frames"], 0]  # Shape of discriminator input
    if model_config["network"] == "unet":
        separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(model_config["num_layers"], model_config["num_initial_filters"],
                                                                   output_type=model_config["output_type"],
                                                                   context=model_config["context"],
                                                                   mono=model_config["mono_downmix"],
                                                                   upsampling=model_config["upsampling"],
                                                                   num_sources=model_config["num_sources"],
                                                                   filter_size=model_config["filter_size"],
                                                                   merge_filter_size=model_config["merge_filter_size"])
    elif model_config["network"] == "unet_spectrogram":
        separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(model_config["num_layers"], model_config["num_initial_filters"],
                                                                       mono=model_config["mono_downmix"],
                                                                       num_sources=model_config["num_sources"])
    else:
        raise NotImplementedError

    sep_input_shape, sep_output_shape = separator_class.get_padding(np.array(disc_input_shape))
    separator_func = separator_class.get_output

    # Batch size of 1
    sep_input_shape[0] = 1
    sep_output_shape[0] = 1

    mix_context, sources = Input.get_multitrack_placeholders(sep_output_shape, model_config["num_sources"], sep_input_shape, "input")

    print("Testing...")

    # BUILD MODELS
    # Separator
    separator_sources = separator_func(mix_context, False, reuse=False)

    # Start session and queue input threads
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Load model
    # Load pretrained model to continue training, if we are supposed to
    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
    print("Num of variables" + str(len(tf.global_variables())))
    restorer.restore(sess, load_model)
    print('Pre-trained model restored for song prediction')

    mix_audio, orig_sr, mix_channels = track.audio, track.rate, track.audio.shape[1] # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, sess, mix_audio, orig_sr, sep_input_shape, sep_output_shape, separator_sources, mix_context)

    # Upsample predicted source audio and convert to stereo
    pred_audio = [librosa.resample(pred.T, model_config["expected_sr"], orig_sr).T for pred in separator_preds]

    if model_config["mono_downmix"] and mix_channels > 1: # Convert to multichannel if mixture input was multichannel by duplicating mono estimate
        pred_audio = [np.tile(pred, [1, mix_channels]) for pred in pred_audio]

    # Set estimates depending on estimation task (voice or multi-instrument separation)
    if model_config["task"] == "voice": # [acc, vocals] order
        estimates = {
            'vocals' : pred_audio[1],
            'accompaniment' : pred_audio[0]
        }
    else: # [bass, drums, other, vocals]
        estimates = {
            'bass' : pred_audio[0],
            'drums' : pred_audio[1],
            'other' : pred_audio[2],
            'vocals' : pred_audio[3]
        }

    # Evaluate using museval
    scores = museval.eval_mus_track(
        track, estimates, output_dir="/mnt/daten/Datasets/MUSDB18/eval", # SiSec should use longer win and hop parameters here to make evaluation more stable!
    )

    # print nicely formatted mean scores
    print(scores)

    # Close session, clear computational graph
    sess.close()
    tf.reset_default_graph()

    return estimates
        model += newM[name]

    # now perform separation
    estimates = {}
    for name, source in newM.items():  # iterate over all stems and separate each one with its mask
        # compute soft mask as the ratio between source spectrogram and total
        Mask = newM[name] / model

        # multiply the mix by the mask
        Yj = Mask * X_origin

        # invert to time domain
        target_estimate = istft(Yj, nperseg=4096, noverlap=3072)[1].T

        # set this as the source estimate
        estimates[name] = target_estimate

    return estimates


estimates = estimateSpectro(X_origin, newM)

from IPython.display import Audio, display

for target, estimate in estimates.items():
    display(Audio(estimate.T, rate=track[0].rate))

import museval

track_scores = museval.eval_mus_track(track[0], estimates)
print(track_scores)
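
When several tracks are evaluated this way, the per-track scores can be accumulated and aggregated with museval.EvalStore, as in Example #21 above; a short hedged follow-up:

import json
import museval

results = museval.EvalStore(frames_agg='median', tracks_agg='median')
results.add_track(track_scores)
print(results)

# the frame-wise metrics are also available as JSON, e.g. for regression tests
frames = json.loads(track_scores.json)['targets']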