def IBM(track, alpha=1, theta=0.5, eval_dir=None):
    """Ideal Binary Mask:
    process all channels independently with the ideal binary mask.

    The mix is sent to a given source if the spectrogram of that source over
    that of the mix is greater than theta, where the spectrograms are taken
    as the magnitude of the STFT raised to the power alpha. Typical
    parameters involve a ratio of magnitudes (alpha=1) and a majority vote
    (theta=0.5).
    """
    # parameters for STFT
    nfft = 2048

    # small epsilon to avoid dividing by zero
    eps = np.finfo(np.float64).eps

    # compute STFT of Mixture
    N = track.audio.shape[0]  # remember number of samples for future use
    X = stft(track.audio.T, nperseg=nfft)[-1]
    (I, F, T) = X.shape

    # perform separation
    estimates = {}
    accompaniment_source = 0
    for name, source in track.sources.items():
        # compute STFT of target source
        Yj = stft(source.audio.T, nperseg=nfft)[-1]

        # create binary mask
        Mask = np.divide(np.abs(Yj)**alpha, (eps + np.abs(X)**alpha))
        Mask[np.where(Mask >= theta)] = 1
        Mask[np.where(Mask < theta)] = 0

        # multiply mask
        Yj = np.multiply(X, Mask)

        # invert to time domain and set same length as original mixture
        target_estimate = istft(Yj)[1].T[:N, :]

        # set this as the source estimate
        estimates[name] = target_estimate

        # accumulate to the accompaniment if this is not vocals
        if name != 'vocals':
            accompaniment_source += target_estimate

    # set accompaniment source
    estimates['accompaniment'] = accompaniment_source

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
def IRM(track, alpha=2, eval_dir=None):
    """Ideal Ratio Mask:
    process all channels independently with the ideal ratio mask.

    This is the ratio of spectrograms, where alpha is the exponent to take
    for the spectrograms. Usual values are 1 (magnitude) and 2 (power).
    """
    # STFT parameters
    nfft = 2048

    # small epsilon to avoid dividing by zero
    eps = np.finfo(np.float64).eps

    # compute STFT of Mixture
    N = track.audio.shape[0]  # remember number of samples for future use
    X = stft(track.audio.T, nperseg=nfft)[-1]
    (I, F, T) = X.shape

    # Compute sources spectrograms
    P = {}
    # compute model as the sum of spectrograms
    model = eps

    for name, source in track.sources.items():
        # compute spectrogram of target source:
        # magnitude of STFT to the power alpha
        P[name] = np.abs(stft(source.audio.T, nperseg=nfft)[-1])**alpha
        model += P[name]

    # now perform separation
    estimates = {}
    accompaniment_source = 0
    for name, source in track.sources.items():
        # compute soft mask as the ratio between source spectrogram and total
        Mask = np.divide(np.abs(P[name]), model)

        # multiply the mix by the mask
        Yj = np.multiply(X, Mask)

        # invert to time domain
        target_estimate = istft(Yj)[1].T[:N, :]

        # set this as the source estimate
        estimates[name] = target_estimate

        # accumulate to the accompaniment if this is not vocals
        if name != 'vocals':
            accompaniment_source += target_estimate

    estimates['accompaniment'] = accompaniment_source

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
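# A minimal usage sketch (not part of the snippets above), assuming a local
# MUSDB18 copy: it shows how oracle estimators such as IBM or IRM are
# typically run over the test set and aggregated. The dataset root below is
# a placeholder; the EvalStore and eval_mus_track calls mirror the other
# examples in this collection.
import musdb
import museval

mus = musdb.DB(root='path/to/MUSDB18', subsets='test')  # placeholder path
results = museval.EvalStore(frames_agg='median', tracks_agg='median')
for track in mus.tracks:
    # compute oracle estimates, then score them frame-wise with museval
    estimates = IRM(track, alpha=2)  # or IBM(track, alpha=1, theta=0.5)
    results.add_track(museval.eval_mus_track(track, estimates))
print(results)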
def estimate_and_evaluate(track):
    # return any number of targets
    estimates = {
        'vocals': track.audio,
        'accompaniment': track.audio
    }

    museval.eval_mus_track(
        track,
        estimates,
        output_dir=output_dir
    )

    return estimates
def test_track_scores(reference):
    track, ref_scores = reference
    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    est_scores = json.loads(scores.json)
    for target in ref_scores['targets']:
        for metric in ['SDR', 'SIR', 'SAR', 'ISR']:
            ref = np.array([d['metrics'][metric] for d in target['frames']])
            idx = [t['name'] for t in est_scores['targets']].index(target['name'])
            est = np.array([
                d['metrics'][metric]
                for d in est_scores['targets'][idx]['frames']
            ])

            assert np.allclose(ref, est, atol=1e-02, equal_nan=True)
def test_estimate_and_evaluate(mus):
    # return any number of targets
    with open(json_path) as json_file:
        ref = json.loads(json_file.read())

    print(os.path.basename(json_path))
    track = mus.load_mus_tracks(
        tracknames=[os.path.splitext(os.path.basename(json_path))[0]])[0]

    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}

    scores = museval.eval_mus_track(track, estimates)

    assert scores.validate() is None

    with open(os.path.join('.', track.name) + '.json', 'w+') as f:
        f.write(scores.json)

    scores = json.loads(scores.json)
    for target in ref['targets']:
        for metric in ['SDR', 'SIR', 'SAR', 'ISR']:
            ref = np.array([d['metrics'][metric] for d in target['frames']])
            idx = [t['name'] for t in scores['targets']].index(target['name'])
            est = np.array([
                d['metrics'][metric]
                for d in scores['targets'][idx]['frames']
            ])

            assert np.allclose(ref, est)
def separate_and_evaluate(
    track,
    targets,
    model_name,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
    device='cpu'
):
    estimates = test.separate(
        audio=track.audio,
        targets=targets,
        model_name=model_name,
        niter=niter,
        alpha=alpha,
        softmask=softmask,
        device=device
    )
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(
        track, estimates, output_dir=eval_dir
    )
    return scores
def oracle(track, separation_fn):
    # set (trackwise) norbert objects
    tf = norbert.TF()

    # compute the mixture complex tf transform
    x = tf.transform(track.audio)

    v = []
    for name, value in track.sources.items():
        v_j = np.sum(np.abs(tf.transform(value.audio))**2,
                     axis=-1, keepdims=True)
        v += [np.squeeze(v_j)]

    v = np.moveaxis(np.array(v), 0, 2)

    y = separation_fn(v, x)

    estimates = {}
    for j, (name, value) in enumerate(track.sources.items()):
        audio_hat = tf.inverse_transform(y[..., j])
        estimates[name] = audio_hat

    # Evaluate using museval
    scores = museval.eval_mus_track(
        track, estimates, output_dir=None
    )

    print(scores)

    return estimates
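# A minimal usage sketch for the norbert-based oracle above, assuming the
# caller already holds a musdb `track`. `separation_fn` maps the stacked
# source power spectrograms `v` and the mixture STFT `x` to separated
# complex STFTs; norbert.softmask (used elsewhere in this collection) is one
# such function, and the exact call below is an assumption about the setup.
estimates = oracle(track, norbert.softmask)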
def model_separate_and_evaluate(
    components: Dict,
    track: musdb.MultiTrack,
    evaldir,
    n_nonzero_coeffs: int = 40
):
    mixture = track.audio
    print("Separating")
    separated_sources = model_separate(
        components, mixture, n_nonzero_coeffs=n_nonzero_coeffs
    )
    estimates = {
        "vocals": separated_sources[0],
        "drums": separated_sources[1],
        "bass": separated_sources[2],
        "other": separated_sources[3]
        # "accompaniment": separated_sources[1]
    }
    scores = None
    try:
        print("Evaluating")
        scores = museval.eval_mus_track(track, estimates, output_dir=evaldir)
        print(scores)
        print("Done")
    except ValueError as e:
        pass
    else:
        print("Evaluation Success")
    return separated_sources, scores
def GT(track, eval_dir=None):
    """Ground Truth Signals
    """
    # perform separation
    estimates = {}
    for name, target in track.targets.items():
        # use the ground-truth target audio as the estimate
        estimates[name] = target.audio

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
def separate_and_evaluate(track, args, ext):
    estimates = test.separate(track.audio, args)

    if args.out_dir:
        mus.save_estimates(estimates, track, args.out_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=args.out_dir)

    # clear cache memory
    ext.clear_memory_cache()

    return scores
def eval_dataset(_dataset, _predictor):
    config = {
        'use_mixer': _predictor.use_mixer,
        'use_demucs': _predictor.use_demucs,
        'dataset': _dataset.root
    }

    wandb.init(project="KUIELab-MDX-Net", entity="ielab", config=config)

    sources = ['bass', 'drums', 'other', 'vocals']
    # collect per-track scores for aggregation
    results = museval.EvalStore(frames_agg='median', tracks_agg='median')

    for idx in range(len(_dataset)):
        track = _dataset[idx]
        estimation = _predictor.demix(track.audio.T)

        # Real SDR
        if len(estimation) == len(sources):
            track_length = _dataset[idx].samples
            if track_length > estimation.shape[-1]:
                raise NotImplementedError
            else:
                estimated_targets_dict = {
                    source: estimated.T
                    for source, estimated in zip(sources, estimation)
                }
                track_score = museval.eval_mus_track(
                    _dataset[idx], estimated_targets_dict
                )
                score_dict = track_score.df.loc[
                    :, ['target', 'metric', 'score']
                ].groupby(['target', 'metric'])['score'].median().to_dict()

                wandb.log(
                    {
                        'test_result/{}_{}'.format(k1, k2): score_dict[(k1, k2)]
                        for k1, k2 in score_dict.keys()
                    },
                    step=idx)

                print(track_score)
                results.add_track(track_score)

    result_dict = results.df.groupby(
        ['track', 'target', 'metric']
    )['score'].median().reset_index().groupby(
        ['target', 'metric']
    )['score'].median().to_dict()

    wandb.log({
        'test_result/agg/{}_{}'.format(k1, k2): result_dict[(k1, k2)]
        for k1, k2 in result_dict.keys()
    })

    wandb.finish()
    print(results)
def MIX(track, eval_dir=None):
    """Mixture as Estimate
    """
    # perform separation
    estimates = {}
    for name, target in track.sources.items():
        # use an equal share of the mixture as the estimate for every source
        estimates[name] = track.audio / len(track.sources)

    estimates['accompaniment'] = estimates['bass'] + \
        estimates['drums'] + estimates['other']

    if eval_dir is not None:
        museval.eval_mus_track(
            track,
            estimates,
            output_dir=eval_dir,
        )

    return estimates
def predict(track, model_config, model, model_noise, results_dir=None):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track
    object, computes the corresponding source estimates and calls the
    evaluation script. The model has to be saved beforehand into a pickle
    file containing the model configuration dictionary and checkpoint path!
    :param track: Track object
    :param results_dir: Directory where SDR etc. values should be saved
    :return: Source estimates dictionary
    '''

    # Get noise once, use that for all predictions to keep consistency
    noise = model_noise.sample()

    # Determine input and output shapes, if we use U-net as separator
    sep_input_shape = [
        1, 1, model_config.input_height, model_config.input_width
    ]  # [N, C, H, W]

    print("Testing...")

    mix_audio, orig_sr, mix_channels = (
        track.audio, track.rate, track.audio.shape[1]
    )  # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, model, noise, mix_audio,
                                    orig_sr, sep_input_shape, sep_input_shape)

    # Upsample predicted source audio and convert to stereo. Make sure to
    # resample back to the exact number of samples in the original input
    # (with a fractional orig_sr/new_sr ratio this causes issues otherwise)
    pred_audio = {
        name: librosa.resample(separator_preds[name],
                               model_config.sample_rate,
                               orig_sr)[:len(mix_audio)]
        for name in separator_preds.keys()
    }

    if mix_channels > 1:
        # Convert to multichannel if mixture input was multichannel by
        # duplicating the mono estimate
        pred_audio = {
            name: np.repeat(np.expand_dims(pred_audio[name], 1),
                            mix_channels, axis=1)
            for name in pred_audio.keys()
        }

    # Evaluate using museval, if we are currently evaluating MUSDB
    if results_dir is not None:
        scores = museval.eval_mus_track(track, pred_audio,
                                        output_dir=results_dir,
                                        win=15, hop=15.0)

        # print nicely formatted mean scores
        print(scores)

    return pred_audio
def load_and_eval_estimates(track):
    # load estimates from disk instead of processing
    user_results = {}
    track.name = track.filename
    track_estimate_dir = os.path.join(user_estimates_dir,
                                      track.subset,
                                      track.filename)
    for target in glob.glob(track_estimate_dir + '/*.wav'):
        target_name = op.splitext(os.path.basename(target))[0]
        try:
            target_audio, rate = sf.read(target, always_2d=True)
            user_results[target_name] = target_audio
        except RuntimeError:
            pass

    museval.eval_mus_track(
        track,
        user_results,
        output_dir=output_dir,
        mode='v3'  # use bss_eval v3 to reproduce sisec 2016 results
    )

    return None
def test_one_estimate(reference):
    track, _ = reference
    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)

    estimates = {'vocals': random_voc}

    with pytest.warns(UserWarning):
        est_scores = museval.eval_mus_track(track, estimates)

    est_json = json.loads(est_scores.json)
    assert len(est_json['targets']) == 0
def evaluate(self):
    track = utils.audio_signals_to_musdb_track(self.mixture,
                                               self.true_sources,
                                               self.target_dict)

    bss_output = museval.eval_mus_track(track,
                                        self.estimates,
                                        output_dir=self.output_dir,
                                        mode=self.mode,
                                        win=self.win,
                                        hop=self.hop)

    self._populate_scores_dict(bss_output)

    return self.scores
def add_new_predictions(self, output, track, instrument_ohe):
    # First, validate that the current instrument is initialized.
    self.validate_current_track(track, instrument_ohe)

    # If the received output has a different track and instrument when
    # compared to the last track, this means that the previous track has
    # finished and we should calculate its results with museval
    same_instrument_mask = (
        instrument_ohe == self.current_instrument_ohe).all(dim=1)
    same_track_mask = (track == self.current_track)

    if (~same_instrument_mask.any() or ~same_track_mask.any()
            ) and self.get_current_output_length() == 0:
        raise Exception(
            'Current track is empty, but received a different (new) instrument/track.'
        )
    elif ~same_track_mask.any():
        # Get the observations that belong to the current instrument (and
        # track) and append them to the results
        if same_track_mask.any():
            self.append_results_to_current_track(output[same_track_mask])

        # Retrieve the original track from musdb, generate the estimates
        # from the model's output and evaluate the finished track
        track = self.mus_db.tracks[self.current_track]
        estimates = {
            instru: torch.cat(output_list).numpy()
            for instru, output_list in self.current_output.items()
        }
        self.results.add_track(museval.eval_mus_track(track, estimates))

        # Reset current results
        self.reset_current_results()
        self.validate_current_track(track[~same_track_mask],
                                    instrument_ohe[~same_track_mask])

        # Get the observations that belong to the new instrument and append
        # them to the results
        self.append_results_to_current_track(output[~same_track_mask])
    elif ~same_instrument_mask.any():
        # Get the observations that belong to the current instrument and
        # append them to the results
        if same_instrument_mask.any():
            self.append_results_to_current_track(
                output[same_instrument_mask])

        # Update the current instrument and get the observations that belong
        # to the new instrument and append them to the new results.
        self.update_current_instrument(
            instrument_ohe[~same_instrument_mask][0])
        self.append_results_to_current_track(output[~same_instrument_mask])
    else:
        # Add the received outputs into the current_results dictionary if
        # there are no changes in instrument or track
        self.append_results_to_current_track(output)
def on_test_epoch_end(self):
    results = museval.EvalStore(frames_agg='median', tracks_agg='median')

    for idx in range(self.musdb_test.num_tracks):
        estimation = {}
        for target_name in self.target_names:
            estimation[target_name] = get_estimation(
                idx, target_name, self.test_estimation_dict)
            if estimation[target_name] is not None:
                estimation[target_name] = estimation[target_name].astype(np.float32)

        # Real SDR
        if len(estimation) == len(self.target_names):
            track_length = self.musdb_test.musdb_reference[idx].samples
            estimated_targets = [estimation[target_name][:track_length]
                                 for target_name in self.target_names]

            if track_length > estimated_targets[0].shape[0]:
                raise NotImplementedError
            else:
                estimated_targets_dict = {
                    target_name: estimation[target_name][:track_length]
                    for target_name in self.target_names
                }
                track_score = museval.eval_mus_track(
                    self.musdb_test.musdb_reference[idx],
                    estimated_targets_dict
                )

                score_dict = track_score.df.loc[
                    :, ['target', 'metric', 'score']
                ].groupby(['target', 'metric'])['score'].median().to_dict()

                if isinstance(self.logger, WandbLogger):
                    self.logger.experiment.log(
                        {'test_result/{}_{}'.format(k1, k2): score_dict[(k1, k2)]
                         for k1, k2 in score_dict.keys()})
                else:
                    print(track_score)

                results.add_track(track_score)

        if idx == 1 and isinstance(self.logger, WandbLogger):
            self.logger.experiment.log({
                'result_sample_{}_{}'.format(self.current_epoch, target_name): [
                    wandb.Audio(estimation[target_name],
                                caption='{}_{}'.format(idx, target_name),
                                sample_rate=44100)]
            })

    if isinstance(self.logger, WandbLogger):
        result_dict = results.df.groupby(
            ['track', 'target', 'metric']
        )['score'].median().reset_index().groupby(
            ['target', 'metric']
        )['score'].median().to_dict()
        self.logger.experiment.log(
            {'test_result/agg/{}_{}'.format(k1, k2): result_dict[(k1, k2)]
             for k1, k2 in result_dict.keys()}
        )
    else:
        print(results)
def test_random_estimate(reference):
    track, _ = reference
    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}
    scores = museval.eval_mus_track(track, estimates)

    # save json
    with open(os.path.join('.', track.name) + '.json', 'w+') as f:
        f.write(scores.json)

    # validate json
    assert scores.validate() is None
def separate_and_evaluate(track, model_dir, targets, output_dir):
    fft_size, hop_size, n_channels = 4096, 1024, 2
    audio = track.audio
    for i in range(audio.shape[1]):
        stft = librosa.stft(audio[:, i].flatten(),
                            n_fft=fft_size,
                            hop_length=hop_size).transpose()
        if i == 0:
            data = np.ndarray(shape=(stft.shape[0], n_channels,
                                     fft_size // 2 + 1),
                              dtype=np.complex64)
        data[:, i, :] = stft
    if n_channels == 2 and audio.shape[1] == 1:
        data[:, 1] = data[:, 0]
    inp_stft = data

    out_stfts = {}
    inp_stft_contiguous = np.abs(np.ascontiguousarray(inp_stft))

    for target in targets:
        # Load the model weights for the corresponding target
        nn.load_parameters(f"{os.path.join(model_dir, target)}.h5")
        with open(f"./configs/{target}.yaml") as file:
            # Load target-specific hyperparameters
            hparams = yaml.load(file, Loader=yaml.FullLoader)
        with nn.parameter_scope(target):
            out_sep = model_separate(inp_stft_contiguous,
                                     hparams,
                                     ch_flip_average=True)
            out_stfts[target] = out_sep * np.exp(1j * np.angle(inp_stft))

    out_stfts = apply_mwf(out_stfts, inp_stft)

    estimates = {}
    for target in targets:
        estimates[target] = stft2time_domain(out_stfts[target], hop_size)

    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=output_dir)
    return scores
def test_aggregate(reference):
    track, _ = reference
    np.random.seed(0)
    random_voc = np.random.random(track.audio.shape)
    random_acc = np.random.random(track.audio.shape)

    # create a silly regression test
    estimates = {'vocals': random_voc, 'accompaniment': random_acc}
    scores = museval.eval_mus_track(track, estimates)
    print(scores.df)

    results = museval.EvalStore()
    results.add_track(scores)
    agg = results.agg_frames_scores()
    print(results)
def separate_and_evaluate(
    track: musdb.MultiTrack,
    targets: list,
    model_str_or_path: str,
    niter: int,
    output_dir: str,
    eval_dir: str,
    residual: bool,
    mus,
    aggregate_dict: dict = None,
    device: Union[str, torch.device] = "cpu",
    wiener_win_len: Optional[int] = None,
    filterbank="torch",
) -> str:
    separator = utils.load_separator(
        model_str_or_path=model_str_or_path,
        targets=targets,
        niter=niter,
        residual=residual,
        wiener_win_len=wiener_win_len,
        device=device,
        pretrained=True,
        filterbank=filterbank,
    )

    separator.freeze()
    separator.to(device)

    audio = torch.as_tensor(track.audio, dtype=torch.float32, device=device)
    audio = utils.preprocess(audio, track.rate, separator.sample_rate)

    estimates = separator(audio)
    estimates = separator.to_dict(estimates, aggregate_dict=aggregate_dict)

    for key in estimates:
        estimates[key] = estimates[key][0].cpu().detach().numpy().T
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=eval_dir)
    return scores
def separate_and_evaluate(
    track,
    model,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
):
    estimates = test.separate(audio=track.audio,
                              model_path=model,
                              niter=niter,
                              alpha=alpha,
                              softmask=softmask)
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(track, estimates, output_dir=eval_dir)
    return scores
def model_separate_and_evaluate(components: NMFResults,
                                track: musdb.MultiTrack,
                                evaldir):
    mixture = track.audio
    print("Separating")
    separated_sources = model_separate(components, mixture)
    estimates = {
        "vocals": separated_sources[0],
        "drums": separated_sources[1],
        "bass": separated_sources[2],
        "other": separated_sources[3]
        # "accompaniment": separated_sources[1]
    }
    print("Evaluating")
    scores = museval.eval_mus_track(track, estimates, output_dir=evaldir)
    print(scores)
    print("Done")
    return separated_sources, scores
def oracle(track):
    # compute the mixture complex tf transform
    x = stft(torch.from_numpy(track.audio.T)).transpose(0, 2)

    v = []
    for name, value in track.sources.items():
        v_j = stft(torch.from_numpy(value.audio.T)).transpose(0, 2).abs()**2
        v += [v_j]
    v = torch.stack(v, 3)

    y = norbert.softmask(v, x).permute(3, 2, 1, 0)

    estimates = {}
    for j, (name, value) in enumerate(track.sources.items()):
        audio_hat = istft(y[j]).numpy().T
        estimates[name] = audio_hat

    # Evaluate using museval
    scores = museval.eval_mus_track(track, estimates, output_dir=None)

    print(scores)

    return estimates
def main():
    # MUSDB only for now !!
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoints", type=str,
                        help="Path of the model to evaluate")
    parser.add_argument("--end", type=int, default=49, choices=range(50),
                        help="Index of the last song")
    parser.add_argument("--init", type=int, default=0, choices=range(50),
                        help="Index of the first song")
    parser.add_argument("--other", action="store_true",
                        help="Use the 'other' model")
    parser.add_argument("--output", type=str,
                        help="Path where the evaluation is saved")
    parser.add_argument("--partitions", type=int, default=1,
                        help="Number of parts to split each test song into")
    parser.add_argument("--root", type=str, help="Path of the dataset")
    parser.add_argument("--vocals", action="store_true",
                        help="Subtract vocals to compute the accompaniment")
    subparsers = parser.add_subparsers(help="Model type", dest="model")
    parser_spec = subparsers.add_parser("spectrogram",
                                        help="Spectrogram model")
    parser_wave = subparsers.add_parser("wave", help="Wave model")
    parser_blend = subparsers.add_parser("blend", help="Blend model")
    parser_blend.add_argument("--checkpoints-stft", type=str,
                              help="Path of the spectrogram model")
    parser_blend.add_argument("--checkpoints-wave", type=str,
                              help="Path of the wave model")
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()
    print("GPU available:", use_cuda)
    device = torch.device("cuda:0" if use_cuda else "cpu")

    if args.model == "spectrogram":
        separator = SpectrogramSeparator(args.checkpoints, args.other,
                                         args.vocals, device)
    elif args.model == "wave":
        separator = WaveSeparator(args.checkpoints, args.other,
                                  args.vocals, device)
    elif args.model == "blend":
        separator = BlendSeparator(args.checkpoints_stft,
                                   args.checkpoints_wave,
                                   args.checkpoints, args.other,
                                   args.vocals, device)
    else:
        raise NotImplementedError

    print("Loading test songs")
    mus = musdb.DB(root=args.root, subsets='test')
    os.makedirs(f"{args.output}/test", exist_ok=True)

    for i in tqdm.tqdm(range(args.init, args.end + 1)):
        track = mus.tracks[i]
        print(f"Song {i}: {track.name}")
        chunk = track.duration // args.partitions

        for i in range(1, args.partitions):
            print(f"Partition {i}")
            track.chunk_start = ((i - 1) % args.partitions) * chunk
            track.chunk_duration = chunk
            signal = torch.as_tensor(track.audio.T,
                                     dtype=torch.float32).to(device)
            result = separator.separate(signal)
            museval.eval_mus_track(track, result, f"{args.output}{i}")

        print(f"Partition {args.partitions}")
        track.chunk_start = (args.partitions - 1) * chunk
        track.chunk_duration = track.duration - track.chunk_start
        signal = torch.as_tensor(track.audio.T,
                                 dtype=torch.float32).to(device)
        result = separator.separate(signal)
        museval.eval_mus_track(track, result,
                               f"{args.output}{args.partitions}")

        merge_jsons(args.output, track.name, args.partitions)

    for i in range(1, args.partitions + 1):
        os.rmdir(f"{args.output}{i}/test")
        os.rmdir(f"{args.output}{i}")
def evaluate(track_estimates):
    track, estimates = track_estimates
    museval.eval_mus_track(track, estimates, output_dir=output_dir)
def predict(track, model_config, load_model, results_dir=None):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track
    object, computes the corresponding source estimates and calls the
    evaluation script. The model has to be saved beforehand into a pickle
    file containing the model configuration dictionary and checkpoint path!
    :param track: Track object
    :param results_dir: Directory where SDR etc. values should be saved
    :return: Source estimates dictionary
    '''

    # Determine input and output shapes, if we use U-net as separator
    disc_input_shape = [
        model_config["batch_size"], model_config["num_frames"], 0
    ]  # Shape of discriminator input

    if model_config["network"] == "unet":
        separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(
            model_config)
    elif model_config["network"] == "unet_spectrogram":
        separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(
            model_config)
    else:
        raise NotImplementedError

    sep_input_shape, sep_output_shape = separator_class.get_padding(
        np.array(disc_input_shape))
    separator_func = separator_class.get_output

    # Batch size of 1
    sep_input_shape[0] = 1
    sep_output_shape[0] = 1

    mix_ph = tf.placeholder(tf.float32, sep_input_shape)

    print("Testing...")

    # BUILD MODELS
    # Separator
    separator_sources = separator_func(mix_ph, training=False,
                                       return_spectrogram=False, reuse=False)

    # Start session and queue input threads
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Load model
    # Load pretrained model to continue training, if we are supposed to
    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
    print("Num of variables: " + str(len(tf.global_variables())))
    restorer.restore(sess, load_model)
    print('Pre-trained model restored for song prediction')

    mix_audio, orig_sr, mix_channels = (
        track.audio, track.rate, track.audio.shape[1]
    )  # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, sess, mix_audio, orig_sr,
                                    sep_input_shape, sep_output_shape,
                                    separator_sources, mix_ph)

    # Upsample predicted source audio and convert to stereo. Make sure to
    # resample back to the exact number of samples in the original input
    # (with a fractional orig_sr/new_sr ratio this causes issues otherwise)
    pred_audio = {
        name: Utils.resample(separator_preds[name],
                             model_config["expected_sr"],
                             orig_sr)[:mix_audio.shape[0], :]
        for name in model_config["source_names"]
    }

    if model_config["mono_downmix"] and mix_channels > 1:
        # Convert to multichannel if mixture input was multichannel by
        # duplicating the mono estimate
        pred_audio = {
            name: np.tile(pred_audio[name], [1, mix_channels])
            for name in pred_audio.keys()
        }

    # Evaluate using museval, if we are currently evaluating MUSDB
    if results_dir is not None:
        scores = museval.eval_mus_track(track, pred_audio,
                                        output_dir=results_dir)

        # print nicely formatted mean scores
        print(scores)

    # Close session, clear computational graph
    sess.close()
    tf.reset_default_graph()

    return pred_audio
def predict(track):
    '''
    Function in accordance with the MUSDB evaluation API. Takes a MUSDB track
    object, computes the corresponding source estimates and calls the
    evaluation script. The model has to be saved beforehand into a pickle
    file containing the model configuration dictionary and checkpoint path!
    :param track: Track object
    :return: Source estimates dictionary
    '''

    '''if track.filename[:4] == "test" or int(track.filename[:3]) > 53:
        return {
            'vocals': np.zeros(track.audio.shape),
            'accompaniment': np.zeros(track.audio.shape)
        }'''

    # Load model hyper-parameters and model checkpoint path
    with open("prediction_params.pkl", "r") as file:
        [model_config, load_model] = pickle.load(file)

    # Determine input and output shapes, if we use U-net as separator
    disc_input_shape = [
        model_config["batch_size"], model_config["num_frames"], 0
    ]  # Shape of discriminator input

    if model_config["network"] == "unet":
        separator_class = Models.UnetAudioSeparator.UnetAudioSeparator(
            model_config["num_layers"],
            model_config["num_initial_filters"],
            output_type=model_config["output_type"],
            context=model_config["context"],
            mono=model_config["mono_downmix"],
            upsampling=model_config["upsampling"],
            num_sources=model_config["num_sources"],
            filter_size=model_config["filter_size"],
            merge_filter_size=model_config["merge_filter_size"])
    elif model_config["network"] == "unet_spectrogram":
        separator_class = Models.UnetSpectrogramSeparator.UnetSpectrogramSeparator(
            model_config["num_layers"],
            model_config["num_initial_filters"],
            mono=model_config["mono_downmix"],
            num_sources=model_config["num_sources"])
    else:
        raise NotImplementedError

    sep_input_shape, sep_output_shape = separator_class.get_padding(
        np.array(disc_input_shape))
    separator_func = separator_class.get_output

    # Batch size of 1
    sep_input_shape[0] = 1
    sep_output_shape[0] = 1

    mix_context, sources = Input.get_multitrack_placeholders(
        sep_output_shape, model_config["num_sources"],
        sep_input_shape, "input")

    print("Testing...")

    # BUILD MODELS
    # Separator
    separator_sources = separator_func(mix_context, False, reuse=False)

    # Start session and queue input threads
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Load model
    # Load pretrained model to continue training, if we are supposed to
    restorer = tf.train.Saver(None, write_version=tf.train.SaverDef.V2)
    print("Num of variables" + str(len(tf.global_variables())))
    restorer.restore(sess, load_model)
    print('Pre-trained model restored for song prediction')

    mix_audio, orig_sr, mix_channels = (
        track.audio, track.rate, track.audio.shape[1]
    )  # Audio has (n_samples, n_channels) shape
    separator_preds = predict_track(model_config, sess, mix_audio, orig_sr,
                                    sep_input_shape, sep_output_shape,
                                    separator_sources, mix_context)

    # Upsample predicted source audio and convert to stereo
    pred_audio = [
        librosa.resample(pred.T, model_config["expected_sr"], orig_sr).T
        for pred in separator_preds
    ]

    if model_config["mono_downmix"] and mix_channels > 1:
        # Convert to multichannel if mixture input was multichannel by
        # duplicating the mono estimate
        pred_audio = [np.tile(pred, [1, mix_channels]) for pred in pred_audio]

    # Set estimates depending on estimation task (voice or multi-instrument
    # separation)
    if model_config["task"] == "voice":  # [acc, vocals] order
        estimates = {
            'vocals': pred_audio[1],
            'accompaniment': pred_audio[0]
        }
    else:  # [bass, drums, other, vocals]
        estimates = {
            'bass': pred_audio[0],
            'drums': pred_audio[1],
            'other': pred_audio[2],
            'vocals': pred_audio[3]
        }

    # Evaluate using museval
    scores = museval.eval_mus_track(
        track, estimates,
        output_dir="/mnt/daten/Datasets/MUSDB18/eval",
        # SiSec should use longer win and hop parameters here to make
        # evaluation more stable!
    )

    # print nicely formatted mean scores
    print(scores)

    # Close session, clear computational graph
    sess.close()
    tf.reset_default_graph()

    return estimates
        model += newM[name]

    # now perform separation
    estimates = {}
    # iterate over all stems and separate each one with its mask
    for name, source in newM.items():
        # compute soft mask as the ratio between source spectrogram and total
        Mask = newM[name] / model

        # multiply the mix by the mask
        Yj = Mask * X_origin

        # invert to time domain
        target_estimate = istft(Yj, nperseg=4096, noverlap=3072)[1].T

        # set this as the source estimate
        estimates[name] = target_estimate

    return estimates


estimates = estimateSpectro(X_origin, newM)

from IPython.display import Audio, display

for target, estimate in estimates.items():
    display(Audio(estimate.T, rate=track[0].rate))

import museval

track_scores = museval.eval_mus_track(track[0], estimates)
print(track_scores)