def test_separator_backends(test_file):
    adapter = AudioAdapter.default()
    waveform, _ = adapter.load(test_file)

    separator_lib = Separator(
        "spleeter:2stems", stft_backend="librosa", multiprocess=False)
    separator_tf = Separator(
        "spleeter:2stems", stft_backend="tensorflow", multiprocess=False)

    # Test that the stft and inverse stft provide exact reconstruction
    stft_matrix = separator_lib._stft(waveform)
    reconstructed = separator_lib._stft(
        stft_matrix, inverse=True, length=waveform.shape[0])
    assert np.allclose(reconstructed, waveform, atol=3e-2)

    # Compare both separations, they should be close
    out_tf = separator_tf._separate_tensorflow(waveform, test_file)
    out_lib = separator_lib._separate_librosa(waveform, test_file)

    for instrument in out_lib.keys():
        # Test that both outputs are close everywhere
        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)
def separate_audio(
    self,
    filename: str = "../audio/audio_example.mp3",
    bitrate: str = "128k",
    codec: Codec = Codec.WAV,  # output file extension
    duration: float = 600.0,   # max length in seconds
    offset: float = 0.00,      # starting position
    verbose: bool = False,     # verbose logging
) -> None:
    """ Separate audio file(s) using pretrained 2-stems model """
    from spleeter.audio.adapter import AudioAdapter
    from spleeter.separator import Separator

    configure_logger(verbose)
    audio_adapter: AudioAdapter = AudioAdapter.get(descriptor=self.adapter)
    separator: Separator = Separator(
        self.params_filename,
        MWF=False,
        stft_backend=STFTBackend(STFTBackend.AUTO))
    separator.separate_to_file(
        str(filename),
        str(self.output_path),
        audio_adapter=audio_adapter,
        offset=int(offset),
        duration=duration,
        codec=codec,
        bitrate=bitrate,
        filename_format=self.filename_format,
        synchronous=False,
    )
    separator.join()
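# Hedged sketch of the module-level imports the method above relies on
# (Codec, STFTBackend, configure_logger are not imported inside the excerpt);
# the module paths follow Spleeter 2.x and are an assumption here.
from spleeter.audio import Codec, STFTBackend
from spleeter.utils.logging import configure_logger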
def generate_fake_training_dataset(
    path,
    instrument_list=['vocals', 'other'],
    n_channels=2,
    n_songs=2,
    fs=44100,
    duration=6,
):
    """ Generates a fake training dataset in path:
        - generates audio files
        - generates a csv file describing the dataset
    """
    aa = AudioAdapter.default()
    rng = np.random.RandomState(seed=0)
    dataset_df = pd.DataFrame(
        columns=['mix_path']
        + [f'{instr}_path' for instr in instrument_list]
        + ['duration'])
    for song in range(n_songs):
        song_path = join(path, 'train', f'song{song}')
        makedirs(song_path, exist_ok=True)
        dataset_df.loc[song, 'duration'] = duration
        for instr in instrument_list + ['mix']:
            filename = join(song_path, f'{instr}.wav')
            data = rng.rand(duration * fs, n_channels) - 0.5
            aa.save(filename, data, fs)
            dataset_df.loc[song, f'{instr}_path'] = join(
                'train', f'song{song}', f'{instr}.wav')
    dataset_df.to_csv(join(path, 'train', 'train.csv'), index=False)
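# Hedged usage sketch: write the fake training data into a temporary directory
# (the use of tempfile here is an assumption, not part of the original helper).
import tempfile

train_dir = tempfile.mkdtemp()
generate_fake_training_dataset(train_dir, n_songs=1, duration=2)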
def execute(args):
    try:
        logger.info('Audio separation started: {0}',
                    args.audio_file, decoration=MLogger.DECORATION_BOX)

        if not os.path.exists(args.audio_file):
            logger.error("The specified audio file path does not exist.\n{0}",
                         args.audio_file, decoration=MLogger.DECORATION_BOX)
            return False, None

        # Parent path (if not specified, the directory containing the video;
        # on Colab the output is created locally, so parent_dir is expected to be set)
        base_path = str(pathlib.Path(args.audio_file).parent) if not args.parent_dir else args.parent_dir

        if len(args.parent_dir) > 0:
            process_audio_dir = base_path
        else:
            process_audio_dir = os.path.join(
                base_path,
                "{0}_{1:%Y%m%d_%H%M%S}".format(
                    os.path.basename(args.audio_file).replace('.', '_'),
                    datetime.datetime.now()))

        # Delete any existing output directory
        if os.path.exists(process_audio_dir):
            shutil.rmtree(process_audio_dir)

        # Create the output directory
        os.makedirs(process_audio_dir)

        audio_adapter = AudioAdapter.default()
        waveform, sample_rate = audio_adapter.load(args.audio_file)

        # Separate into vocals and accompaniment
        separator = Separator('spleeter:2stems')

        # Perform the separation
        prediction = separator.separate(waveform)

        # Vocal audio data
        vocals = prediction['vocals']
        vocals_wav_path = f"{process_audio_dir}/vocals.wav"

        # Save temporarily as wav
        audio_adapter.save(vocals_wav_path, vocals, sample_rate, "wav")

        logger.info('Audio separation finished: {0}',
                    process_audio_dir, decoration=MLogger.DECORATION_BOX)

        return True, process_audio_dir
    except Exception as e:
        logger.critical("An unexpected error occurred during audio separation.",
                        e, decoration=MLogger.DECORATION_BOX)
        return False, None
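# Hedged usage sketch (not from the original tool): execute() only needs an
# object exposing audio_file and parent_dir attributes, e.g. an argparse.Namespace;
# "input.wav" is a placeholder path.
import argparse

args = argparse.Namespace(audio_file="input.wav", parent_dir="")
success, output_dir = execute(args)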
def split(input_file, sample_rate=44100, model="2stems"):
    if model.lower() not in ["2stems", "4stems", "5stems"]:
        print(f"Invalid model: '{model}'. Using '2stems' model instead.")
        model = "2stems"

    separator = Separator(f"spleeter:{model}")
    audio_loader = AudioAdapter.default()
    waveform, _ = audio_loader.load(input_file.name, sample_rate=sample_rate)

    # Prediction output is a dictionary whose keys contain instrument/stem names
    # and values the associated waveforms
    prediction = separator.separate(waveform, "_")
    return prediction
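# Hedged usage sketch for split() above: it reads input_file.name, so it expects
# an object with a .name attribute such as an open file handle; "song.mp3" is a
# placeholder path, not part of the original code.
with open("song.mp3", "rb") as input_file:
    stems = split(input_file, model="4stems")
for stem_name, stem_waveform in stems.items():
    print(stem_name, stem_waveform.shape)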
def separate_audio(file, layers=5, sample_rate=44100):
    audio_loader = AudioAdapter.default()
    if layers not in [2, 4, 5]:
        raise Exception("layers must be 2, 4 or 5!")
    waveform, _ = audio_loader.load(file, sample_rate=sample_rate)
    separator = Separator('spleeter:{}stems'.format(layers))
    prediction = separator.separate(waveform)
    return prediction
def get_spleeter_prediction(separator, track_path, source=None):
    # Get Spleeter prediction taking model and source to obtain as input
    audio_loader = AudioAdapter.default()
    waveform, _ = audio_loader.load(track_path, sample_rate=44100)
    prediction = separator.separate(waveform)
    return prediction[source]
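# Hedged usage sketch ("track.wav" is a placeholder path): extract the vocals
# stem with a 2-stems Spleeter model.
from spleeter.separator import Separator

separator = Separator('spleeter:2stems')
vocals = get_spleeter_prediction(separator, 'track.wav', source='vocals')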
def __init__(self, cpu_separation: bool, bitrate=256):
    """Default constructor.
    :param config: Separator config, defaults to None
    """
    self.audio_bitrate = f'{bitrate}k'
    self.audio_format = 'mp3'
    self.sample_rate = 44100
    self.spleeter_stem = 'config/4stems-16kHz.json'
    self.separator = Separator(
        self.spleeter_stem,
        stft_backend=STFTBackend.LIBROSA if cpu_separation else STFTBackend.TENSORFLOW,
        multiprocess=False)
    self.audio_adapter = AudioAdapter.default()
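# Hedged sketch of a hypothetical companion method (not part of the original
# class): it shows how the adapter and separator configured above are typically
# combined to write stems to disk; output_dir is a placeholder.
def separate_to_dir(self, input_path, output_dir):
    waveform, _ = self.audio_adapter.load(input_path, sample_rate=self.sample_rate)
    stems = self.separator.separate(waveform)
    for name, data in stems.items():
        self.audio_adapter.save(
            f'{output_dir}/{name}.{self.audio_format}',
            data,
            self.sample_rate,
            self.audio_format,
            self.audio_bitrate,
        )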
def music_processing(music_pth, ret_value):
    separator = Separator(params_descriptor='spleeter:2stems')
    audio_adapter = AudioAdapter.get(
        'spleeter.audio.ffmpeg.FFMPEGProcessAudioAdapter')
    waveform, _ = audio_adapter.load(
        music_pth, dtype=np.float32, sample_rate=22050)
    sources = separator.separate(waveform=waveform, audio_descriptor=music_pth)
    vocals = sources['vocals']
    ret_value['vocals'] = vocals
    return vocals
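# Hedged usage sketch (an assumption about intent): ret_value looks like a
# shared dict, so the separated vocals can be handed back from a worker process.
import multiprocessing as mp

def run_music_processing(music_pth):
    manager = mp.Manager()
    ret_value = manager.dict()
    worker = mp.Process(target=music_processing, args=(music_pth, ret_value))
    worker.start()
    worker.join()
    return ret_value.get('vocals')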
def __init__(
    self,
    cpu_separation: bool,
    bitrate=256
):
    """Default constructor.
    :param config: Separator config, defaults to None
    """
    self.model_file = 'd3net-mss.h5'
    self.model_dir = Path('pretrained_models')
    self.model_file_path = self.model_dir / self.model_file
    self.context = 'cpu' if cpu_separation else 'cudnn'
    self.bitrate = f'{bitrate}k'
    self.sample_rate = 44100
    self.audio_adapter = AudioAdapter.default()
def musidi():
    link = [x for x in request.form.values()]
    #name = ''.join(random.choice('qwertyuiopasdfghjklzxcvbnm') for i in range(10))
    name = 'song_file'
    local_file_path = name + ".mp3"
    ydl_args = {
        'format': 'bestaudio/best',
        'outtmpl': local_file_path
    }
    ydl = youtube_dl.YoutubeDL(ydl_args)
    ydl.download([link[0]])

    separator = Separator('4stems.json', multiprocess=False)
    audio_loader = AudioAdapter.default()
    sample_rate = 44100
    waveform, _ = audio_loader.load(
        local_file_path, sample_rate=sample_rate, duration=60)
    separator.separate_to_file(local_file_path, 'output/', audio_adapter=audio_loader)

    os.replace(f'./output/{name}/vocals.wav',
               f'Audio-to-midi-master/input/{name}_vocals.wav')
    for x in os.walk('Audio-to-midi-master/input/'):
        print(x)

    print('running model')
    _output = subprocess.run([
        sys.executable, 'Audio-to-midi-master/audio2midi.py',
        '-in', 'Audio-to-midi-master/input/',
        '-out', 'Audio-to-midi-master/output/'])
    os.replace(f'Audio-to-midi-master/output/{name}_vocals.mid',
               f'./{name}_vocals.mid')
    print('moved output. can you see it?')

    '''
    f = open(f'{name}_lyrics.txt', 'w')
    # https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
    r = sr.Recognizer()
    with sr.AudioFile(f'./Audio-to-midi-master/input/{name}_vocals.wav') as source:
        # listen for data (load audio to memory)
        audio_data = r.record(source)
        # recognize (convert from speech/song to text)
        # may only work for files less than ~110 seconds long, need to split otherwise
        text = r.recognize_google(audio_data, language='en-IN', show_all=True)
        f.write(str(text))
    f.close()
    '''
    print('printed lyrics to txt. can you see it?')

    from roll import MidiFile
    mid = MidiFile(name + '_vocals.mid')
    mid.draw_roll()

    return render_template(
        'index.html',
        output='{}/other.wav'.format(name),
        picture=name + '_vocals.png',
        youtube=link[0][-11:])
def generate_fake_eval_dataset(path):
    """ Generate a fake evaluation dataset. """
    aa = AudioAdapter.default()
    n_songs = 2
    fs = 44100
    duration = 3
    n_channels = 2
    rng = np.random.RandomState(seed=0)
    for song in range(n_songs):
        song_path = join(path, 'test', f'song{song}')
        makedirs(song_path, exist_ok=True)
        for instr in ['mixture', 'vocals', 'bass', 'drums', 'other']:
            filename = join(song_path, f'{instr}.wav')
            data = rng.rand(duration * fs, n_channels) - 0.5
            aa.save(filename, data, fs)
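# Hedged usage sketch: write the fake evaluation set into a temporary directory
# (tempfile usage is an assumption, not part of the original helper).
import tempfile

eval_dir = tempfile.mkdtemp()
generate_fake_eval_dataset(eval_dir)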
def __init__(self, model_name='mdx_extra_q', cpu_separation=True, bitrate=256, shifts=5):
    self.device = 'cpu' if cpu_separation else 'cuda'
    self.sample_rate = 44100
    self.model_name = model_name
    self.repo = None
    self.model_dir = Path('pretrained_models')
    self.shifts = shifts
    self.split = True
    self.overlap = 0.25
    self.workers = 0
    self.verbose = True
    self.bitrate = f'{bitrate}k'
    self.audio_adapter = AudioAdapter.default()
def source_seperate_ogg(ogg_list: list):
    separator = Separator('spleeter:4stems')
    audio_loader = AudioAdapter.default()
    sample_rate = 22050
    range_ = 32767
    for ogg in ogg_list:
        waveform, _ = audio_loader.load(ogg, sample_rate=sample_rate)
        prediction = separator.separate(waveform)
        prediction['other'] = prediction['other'] * range_
        save_path = Path(
            str(ogg).replace('Unprocessed', 'source_separated', 1))
        if not os.path.isdir(save_path.parent):
            os.mkdir(save_path.parent)
        print(prediction)
        break
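# Hedged usage sketch (directory names are placeholders taken from the path
# rewrite above): collect .ogg files under "Unprocessed" and pass them in.
from pathlib import Path

ogg_files = sorted(str(p) for p in Path('Unprocessed').rglob('*.ogg'))
source_seperate_ogg(ogg_files)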
def test_separate(test_file, configuration, backend):
    """ Test separation from raw data. """
    instruments = MODEL_TO_INST[configuration]
    adapter = AudioAdapter.default()
    waveform, _ = adapter.load(test_file)
    separator = Separator(configuration, stft_backend=backend, multiprocess=False)
    prediction = separator.separate(waveform, test_file)
    assert len(prediction) == len(instruments)
    for instrument in instruments:
        assert instrument in prediction
    for instrument in instruments:
        track = prediction[instrument]
        assert waveform.shape[:-1] == track.shape[:-1]
        assert not np.allclose(waveform, track)
        for compared in instruments:
            if instrument != compared:
                assert not np.allclose(track, prediction[compared])
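# Hedged sketch of the MODEL_TO_INST mapping the test above assumes (it is not
# defined in this excerpt); the stem lists match the standard Spleeter
# configurations.
MODEL_TO_INST = {
    "spleeter:2stems": ("vocals", "accompaniment"),
    "spleeter:4stems": ("vocals", "drums", "bass", "other"),
    "spleeter:5stems": ("vocals", "drums", "bass", "piano", "other"),
}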
def __init__(self, cpu_separation: bool, bitrate=256, softmask=False, alpha=1.0, iterations=1):
    """Default constructor.
    :param config: Separator config, defaults to None
    """
    self.model_file = 'x-umx.h5'
    self.model_dir = Path('pretrained_models')
    self.model_file_path = self.model_dir / self.model_file
    self.context = 'cpu' if cpu_separation else 'cudnn'
    self.softmask = softmask
    self.alpha = alpha
    self.iterations = iterations
    self.bitrate = bitrate
    self.sample_rate = 44100
    self.residual_model = False
    self.audio_adapter = AudioAdapter.default()
    self.chunk_duration = 30
def __init__(self, cpu_separation: bool, bitrate=256, softmask=False, alpha=1.0, iterations=1):
    """Default constructor.
    :param config: Separator config, defaults to None
    """
    if cpu_separation:
        raise ValueError('X-UMX only works with GPU. Task aborted.')

    self.model_file = 'x-umx.h5'
    self.model_dir = Path('pretrained_models')
    self.model_file_path = self.model_dir / self.model_file
    self.context = 'cudnn'
    self.softmask = softmask
    self.alpha = alpha
    self.iterations = iterations
    self.bitrate = bitrate
    self.sample_rate = 44100
    self.residual_model = False
    self.audio_adapter = AudioAdapter.default()
def adapter():
    """ Target test audio adapter fixture. """
    return AudioAdapter.default()
def main(argv=None):
    args = docopt(__doc__, argv=argv)
    fi = Clip(args['<input>'])
    fo = args['-o']
    ranges = list(
        tuple(ptime(t) for t in range.split('~'))
        for range in args['<range>'])
    loader = AudioAdapter.default()
    sample_rate = 44100
    separator = Separator('spleeter:2stems')
    segments = {}

    for start, end in ranges:
        print(f'Processing range {start}-{end}...')
        options = ['-vn', '-r', str(sample_rate), '-f', 'wav']
        clip = fi.slice(start, end - start, output_options=options)[0]
        for i in range(int(args['--pass'])):
            waveform, _ = loader.load(clip.path, sample_rate=sample_rate)
            prediction = separator.separate(waveform)
            output = tmpfile('wav')
            target = 'accompaniment' if args['--inverse'] else 'vocals'
            loader.save(output, prediction[target], sample_rate)
            clip = Clip(output, tmpfile=output)
        segments[start] = clip

    print('Writing output file...')

    # Mute ranges in the original audio track
    # asetnsamples is required, source: https://superuser.com/a/1230890
    filters = '[0:a]asetnsamples=8192,'
    filters += ','.join(f"volume=0:enable='between(t,{start},{end})'"
                        for start, end in ranges)
    filters += '[main]'

    # Delay processed segments
    for i, (start, end) in enumerate(ranges):
        delay = int(start * 1000)
        filters += f';[{i+1}]'
        filters += 'asetnsamples=8192'
        filters += f',adelay={delay}|{delay},apad[delay{i+1}]'

    # Mix muted original track and all processed segments
    filters += ';[main]'
    for i, (start, end) in enumerate(ranges):
        filters += f'[delay{i+1}]'
    filters += f'amix=inputs={len(ranges) + 1}:duration=first'
    filters += f',volume={len(ranges) + 1}'
    filters += '[audio]'

    command = ['ffmpeg', '-i', fi.path]
    for start, segment in segments.items():
        command += ['-i', segment.path]

    # Copy codecs from the original video
    ainfo = fi.ffprobe('stream=codec_name,bit_rate', 'a')['streams'][0]
    command += [
        '-c:v', 'copy',
        '-c:a', ainfo['codec_name'],
        '-b:a', ainfo['bit_rate'],
        '-strict', '-2'
    ]
    command += [
        '-filter_complex', filters,
        '-map', '0:v',
        '-map', '[audio]',
        fo
    ]

    if run(command).returncode != 0:
        if os.path.exists(fo):
            os.unlink(fo)
        raise Exception('ffmpeg exited with non-zero code')
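# Hedged sketch of the docopt interface main() above reads from __doc__ (the
# usage string is not part of this excerpt); the option names come from the
# args lookups in main(), everything else (script name, defaults) is an assumption.
"""Usage:
    remove_vocals.py [--pass=<n>] [--inverse] -o <output> <input> <range>...

Options:
    --pass=<n>  Number of separation passes [default: 1].
    --inverse   Keep the vocals and mute the accompaniment instead.
"""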
def generate_synthesized_mix(self, filename, separator, pitch_preproc, voicing):
    # Get file id from filename
    file_id = filename.split('/')[-1].replace('_vocal.wav', '')

    # Load audio with Spleeter's AudioAdapter
    audio_loader = AudioAdapter.default()
    waveform, _ = audio_loader.load(
        filename, sample_rate=self.sample_rate
    )

    # Run vocal separation on vocal audio
    prediction = separator.separate(waveform)
    audio = prediction['vocals']

    # To mono, energy filtering, and apply EqualLoudness for better pitch extraction
    audio_mono = audio.sum(axis=1) / 2
    audio_mono_filt = self.filter_audio(audio=audio_mono, coef=0.00125)  # Energy filter to remove background noise
    audio_mono_eqloud = estd.EqualLoudness(sampleRate=self.sample_rate)(audio_mono_filt)

    # Extract pitch using PredominantMelodyMakam algorithm
    est_time, est_freq = self.extract_pitch_pmm(audio=audio_mono_eqloud)
    pitch = [[x, y] for x, y in zip(est_time, est_freq)]

    # Preprocessing analyzed audio and pitch
    preprocessor = PitchProcessor(
        pitch_preproc=pitch_preproc,
        voicing=voicing,
        gap_len=25,
    )
    audio, pitch_processed, time_stamps_processed = preprocessor.pre_processing(
        audio=audio_mono,
        extracted_pitch=pitch,
    )

    # Get freq limits to compute minf0
    tmp_est_freq = [x for x in est_freq if x > 20]
    if len(tmp_est_freq) > 0:
        minf0 = min(tmp_est_freq) - 20
    else:
        minf0 = 0

    # Synthesize vocal track
    synthesizer = Synthesizer(
        model='hpr',
        minf0=minf0,
        maxf0=max(pitch_processed) + 50,
    )
    synthesized_audio, pitch_track = synthesizer.synthesize(
        filtered_audio=audio,
        pitch_track=pitch_processed,
    )

    # Equalize voice
    #fx = (AudioEffectsChain().equalizer(200))
    #synthesized_audio = fx(synthesized_audio)

    # Get synthesized mix
    synthesized_audio_mix = self.mix(
        filename=filename,
        synthesized_voice=synthesized_audio
    )

    # Get vocal activations
    start_times, end_times = self.get_activations(time_stamps_processed, pitch_track)

    if len(start_times) > 2:
        # Write synthesized audio to file
        tmp_wav = 'audio/synth_mix_' + file_id + '.wav'
        self.save_audio_to_dataset(tmp_wav, synthesized_audio_mix)

        # Write csv melody annotation to file
        tmp_txt = 'annotations/melody/synth_mix_' + file_id + '.csv'
        self.save_pitch_track_to_dataset(tmp_txt, time_stamps_processed, pitch_track)

        # Write lab activations to file
        tmp_lab = 'annotations/activations/synth_mix_' + file_id + '.lab'
        self.save_activation_to_dataset(tmp_lab, start_times, end_times)

        return synthesized_audio_mix, pitch_track, time_stamps_processed
    else:
        print('UNVOICED TRACK! Skipping...')
        return [], [], []
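# Hedged sketch of the mono pre-processing step above in isolation, assuming
# estd refers to essentia.standard (an assumption about this excerpt's imports);
# PitchProcessor and Synthesizer are taken to be project-local helpers.
import numpy as np
import essentia.standard as estd

def to_mono_eqloud(audio: np.ndarray, sample_rate: int = 44100) -> np.ndarray:
    # Average the two stereo channels, then apply equal-loudness filtering
    audio_mono = audio.sum(axis=1) / 2
    return estd.EqualLoudness(sampleRate=sample_rate)(audio_mono.astype(np.float32))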
from spleeter.audio.adapter import AudioAdapter
from spleeter.separator import Separator


def separate_audio(file, layers=5, sample_rate=44100):
    audio_loader = AudioAdapter.default()
    if layers not in [2, 4, 5]:
        raise Exception("layers must be 2, 4 or 5!")
    waveform, _ = audio_loader.load(file, sample_rate=sample_rate)
    separator = Separator('spleeter:{}stems'.format(layers))
    prediction = separator.separate(waveform)
    return prediction


if __name__ == "__main__":
    audio_loader = AudioAdapter.default()
    prediction = separate_audio("../../data/chitose.mp3", 4)
    for k, v in prediction.items():
        audio_loader.save("../../data/output/{}.mp3".format(k), v, 44100)