Example #1
def test_separator_backends(test_file):
    adapter = AudioAdapter.default()
    waveform, _ = adapter.load(test_file)

    separator_lib = Separator("spleeter:2stems",
                              stft_backend="librosa",
                              multiprocess=False)
    separator_tf = Separator("spleeter:2stems",
                             stft_backend="tensorflow",
                             multiprocess=False)

    # Test that the STFT followed by the inverse STFT gives a near-exact reconstruction
    stft_matrix = separator_lib._stft(waveform)
    reconstructed = separator_lib._stft(stft_matrix,
                                        inverse=True,
                                        length=waveform.shape[0])
    assert np.allclose(reconstructed, waveform, atol=3e-2)

    # Compare both separations; they should be close
    out_tf = separator_tf._separate_tensorflow(waveform, test_file)
    out_lib = separator_lib._separate_librosa(waveform, test_file)

    for instrument in out_lib.keys():
        # test that both outputs are close everywhere
        assert np.allclose(out_tf[instrument], out_lib[instrument], atol=1e-5)
Example #2
    def separate_audio(
            self,
            filename: str = "../audio/audio_example.mp3",
            bitrate: str = "128k",
            codec: Codec = Codec.WAV,  # output file extension
            duration: float = 600.0,  # max length in seconds
            offset: float = 0.00,  # starting position
            verbose: bool = False  # verbose logging
    ) -> None:
        """
        Separate audio file(s) using pretrained 2-stems model
        """
        from spleeter.audio.adapter import AudioAdapter
        from spleeter.separator import Separator

        configure_logger(verbose)

        audio_adapter: AudioAdapter = AudioAdapter.get(descriptor=self.adapter)
        separator: Separator = Separator(self.params_filename,
                                         MWF=False,
                                         stft_backend=STFTBackend(
                                             STFTBackend.AUTO))
        separator.separate_to_file(
            str(filename),
            str(self.output_path),
            audio_adapter=audio_adapter,
            offset=int(offset),
            duration=duration,
            codec=codec,
            bitrate=bitrate,
            filename_format=self.filename_format,
            synchronous=False,
        )
        separator.join()
Example #3
def generate_fake_training_dataset(
    path,
    instrument_list=['vocals', 'other'],
    n_channels=2,
    n_songs=2,
    fs=44100,
    duration=6,
):
    """
        generates a fake training dataset in path:
        - generates audio files
        - generates a csv file describing the dataset
    """
    aa = AudioAdapter.default()
    rng = np.random.RandomState(seed=0)
    dataset_df = pd.DataFrame(columns=['mix_path'] +
                              [f'{instr}_path'
                               for instr in instrument_list] + ['duration'])
    for song in range(n_songs):
        song_path = join(path, 'train', f'song{song}')
        makedirs(song_path, exist_ok=True)
        dataset_df.loc[song, 'duration'] = duration
        for instr in instrument_list + ['mix']:
            filename = join(song_path, f'{instr}.wav')
            data = rng.rand(duration * fs, n_channels) - 0.5
            aa.save(filename, data, fs)
            dataset_df.loc[song,
                           f'{instr}_path'] = join('train', f'song{song}',
                                                   f'{instr}.wav')
    dataset_df.to_csv(join(path, 'train', 'train.csv'), index=False)
Example #4
def execute(args):
    try:
        logger.info('Audio separation started: {0}',
                    args.audio_file,
                    decoration=MLogger.DECORATION_BOX)

        if not os.path.exists(args.audio_file):
            logger.error("指定された音声ファイルパスが存在しません。\n{0}",
                         args.audio_file,
                         decoration=MLogger.DECORATION_BOX)
            return False, None

        # Parent path (if not specified, the directory containing the video; on Colab it is created locally, so parent_dir is expected to be given)
        base_path = str(pathlib.Path(args.audio_file).parent
                        ) if not args.parent_dir else args.parent_dir

        if len(args.parent_dir) > 0:
            process_audio_dir = base_path
        else:
            process_audio_dir = os.path.join(
                base_path, "{0}_{1:%Y%m%d_%H%M%S}".format(
                    os.path.basename(args.audio_file).replace('.', '_'),
                    datetime.datetime.now()))

        # Delete any existing directory
        if os.path.exists(process_audio_dir):
            shutil.rmtree(process_audio_dir)

        # Create the folder
        os.makedirs(process_audio_dir)

        audio_adapter = AudioAdapter.default()
        waveform, sample_rate = audio_adapter.load(args.audio_file)

        # Separate into vocals and accompaniment
        separator = Separator('spleeter:2stems')

        # Perform the separation :
        prediction = separator.separate(waveform)

        # Vocals data
        vocals = prediction['vocals']

        vocals_wav_path = f"{process_audio_dir}/vocals.wav"

        # Save as WAV for now
        audio_adapter.save(vocals_wav_path, vocals, sample_rate, "wav")

        logger.info('Audio separation finished: {0}',
                    process_audio_dir,
                    decoration=MLogger.DECORATION_BOX)

        return True, process_audio_dir
    except Exception as e:
        logger.critical("音声分離で予期せぬエラーが発生しました。",
                        e,
                        decoration=MLogger.DECORATION_BOX)
        return False, None
Example #5
def split(input_file, sample_rate=44100, model="2stems"):
    if model.lower() not in ["2stems", "4stems", "5stems"]:
        print(f"Invalid model: '{model}'. Using '2stems' model instead.")
        model = "2stems"
    separator = Separator(f"spleeter:{model}")
    audio_loader = AudioAdapter.default()
    waveform, _ = audio_loader.load(input_file.name, sample_rate=sample_rate)
    # The prediction is a dictionary mapping instrument/stem names to the
    # associated separated waveforms
    prediction = separator.separate(waveform, "_")
    return prediction
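As the comment above notes, separate() returns a dictionary mapping stem names to waveforms. Below is a minimal sketch of consuming the dictionary returned by split() and writing each stem to disk; the save_stems helper and its output layout are illustrative additions, not part of the original snippet, and AudioAdapter.save is called with the (path, data, sample_rate, codec) arguments seen in the other examples on this page.

import os

from spleeter.audio.adapter import AudioAdapter


def save_stems(prediction, output_dir, sample_rate=44100):
    # Hypothetical helper: write each separated stem to its own WAV file,
    # e.g. output_dir/vocals.wav and output_dir/accompaniment.wav for 2stems.
    audio_adapter = AudioAdapter.default()
    os.makedirs(output_dir, exist_ok=True)
    for stem_name, stem_waveform in prediction.items():
        audio_adapter.save(os.path.join(output_dir, f"{stem_name}.wav"),
                           stem_waveform, sample_rate, "wav")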
Example #6
def separate_audio(file, layers=5, sample_rate=44100):
    audio_loader = AudioAdapter.default()
    if layers not in [2, 4, 5]:
        raise Exception("layers must be 2, 4 or 5!")
    
    waveform, _ = audio_loader.load(file, sample_rate=sample_rate)
    separator = Separator('spleeter:{}stems'.format(layers))
    
    prediction = separator.separate(waveform)
    
    return prediction
Example #7
    def get_spleeter_prediction(separator, track_path, source=None):
        # Get the Spleeter prediction for the requested source, using the given separator
        audio_loader = AudioAdapter.default()
        waveform, _ = audio_loader.load(track_path, sample_rate=44100)
        prediction = separator.separate(waveform)
        return prediction[source]
Example #8
 def __init__(self, cpu_separation: bool, bitrate=256):
     """Default constructor.
     :param config: Separator config, defaults to None
     """
     self.audio_bitrate = f'{bitrate}k'
     self.audio_format = 'mp3'
     self.sample_rate = 44100
     self.spleeter_stem = 'config/4stems-16kHz.json'
     self.separator = Separator(self.spleeter_stem,
                                stft_backend=STFTBackend.LIBROSA if cpu_separation else STFTBackend.TENSORFLOW,
                                multiprocess=False)
     self.audio_adapter = AudioAdapter.default()
Example #9
    def music_processing(music_pth, ret_value):
        separator = Separator(params_descriptor='spleeter:2stems')

        audio_adapter = AudioAdapter.get(
            'spleeter.audio.ffmpeg.FFMPEGProcessAudioAdapter')
        waveform, _ = audio_adapter.load(music_pth,
                                         dtype=np.float32,
                                         sample_rate=22050)
        sources = separator.separate(waveform=waveform,
                                     audio_descriptor=music_pth)
        vocals = sources['vocals']
        ret_value['vocals'] = vocals
        return vocals
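The ret_value argument suggests music_processing is meant to run in a worker process and report its result through a shared mapping. Below is a minimal launcher sketch under that assumption; the run_in_subprocess wrapper and the use of multiprocessing.Manager are hypothetical, not taken from the original code.

import multiprocessing as mp


def run_in_subprocess(music_pth):
    # Hypothetical launcher: run music_processing in a child process and
    # collect the extracted vocals through a Manager-backed dict.
    manager = mp.Manager()
    ret_value = manager.dict()
    worker = mp.Process(target=music_processing, args=(music_pth, ret_value))
    worker.start()
    worker.join()
    return ret_value.get('vocals')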
Example #10
 def __init__(
     self,
     cpu_separation: bool,
     bitrate=256
 ):
     """Default constructor.
     :param config: Separator config, defaults to None
     """
     self.model_file = 'd3net-mss.h5'
     self.model_dir = Path('pretrained_models')
     self.model_file_path = self.model_dir / self.model_file
     self.context = 'cpu' if cpu_separation else 'cudnn'
     self.bitrate = f'{bitrate}k'
     self.sample_rate = 44100
     self.audio_adapter = AudioAdapter.default()
Example #11
def musidi():
    link = [x for x in request.form.values()]
    #name = ''.join(random.choice('qwertyuiopasdfghjklzxcvbnm') for i in range(10))
    name = 'song_file'

    local_file_path = name + ".mp3"
    ydl_args = {
        'format': 'bestaudio/best',
        'outtmpl': local_file_path
    }

    ydl = youtube_dl.YoutubeDL(ydl_args)
    ydl.download([link[0]]) 

    separator = Separator('4stems.json', multiprocess=False)

    audio_loader = AudioAdapter.default()
    sample_rate = 44100
    waveform, _ = audio_loader.load(local_file_path, sample_rate=sample_rate, duration=60)
    separator.separate_to_file(local_file_path, 'output/', audio_adapter=audio_loader)

    os.replace(f'./output/{name}/vocals.wav', f'Audio-to-midi-master/input/{name}_vocals.wav')
    for x in os.walk('Audio-to-midi-master/input/'):
        print(x)
    print('running model')
    _output = subprocess.run([sys.executable, 'Audio-to-midi-master/audio2midi.py', '-in', 'Audio-to-midi-master/input/', '-out', 'Audio-to-midi-master/output/'])

    os.replace(f'Audio-to-midi-master/output/{name}_vocals.mid', f'./{name}_vocals.mid')
    print('moved output. can you see it?')

    '''
    f = open(f'{name}_lyrics.txt', 'w')
    # https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
    r = sr.Recognizer()
    with sr.AudioFile(f'./Audio-to-midi-master/input/{name}_vocals.wav') as source:
        # listen for data (load audio to memory)
        audio_data = r.record(source)
        # recognize (convert from speech/song to text)
        text = r.recognize_google(audio_data, language = 'en-IN', show_all=True) # may only work for files less than ~110 seconds long, need to split otherwise
        f.write(str(text))
    f.close()
    '''
    print('printed lyrics to txt. can you see it?')
    from roll import MidiFile
    mid = MidiFile(name+'_vocals.mid')
    mid.draw_roll()
    return render_template('index.html', output='{}/other.wav'.format(name), picture=name+'_vocals.png', youtube=link[0][-11:])
Example #12
def generate_fake_eval_dataset(path):
    """
        generate fake evaluation dataset
    """
    aa = AudioAdapter.default()
    n_songs = 2
    fs = 44100
    duration = 3
    n_channels = 2
    rng = np.random.RandomState(seed=0)
    for song in range(n_songs):
        song_path = join(path, 'test', f'song{song}')
        makedirs(song_path, exist_ok=True)
        for instr in ['mixture', 'vocals', 'bass', 'drums', 'other']:
            filename = join(song_path, f'{instr}.wav')
            data = rng.rand(duration * fs, n_channels) - 0.5
            aa.save(filename, data, fs)
Example #13
 def __init__(self,
              model_name='mdx_extra_q',
              cpu_separation=True,
              bitrate=256,
              shifts=5):
     self.device = 'cpu' if cpu_separation else 'cuda'
     self.sample_rate = 44100
     self.model_name = model_name
     self.repo = None
     self.model_dir = Path('pretrained_models')
     self.shifts = shifts
     self.split = True
     self.overlap = 0.25
     self.workers = 0
     self.verbose = True
     self.bitrate = f'{bitrate}k'
     self.audio_adapter = AudioAdapter.default()
Example #14
def source_seperate_ogg(ogg_list: list):
    separator = Separator('spleeter:4stems')
    audio_loader = AudioAdapter.default()
    sample_rate = 22050
    range_ = 32767

    for ogg in ogg_list:
        waveform, _ = audio_loader.load(ogg, sample_rate=sample_rate)
        prediction = separator.separate(waveform)
        prediction['other'] = prediction['other'] * range_

        save_path = Path(
            str(ogg).replace('Unprocessed', 'source_separated', 1))
        if not os.path.isdir(save_path.parent):
            os.mkdir(save_path.parent)

        print(prediction)

        break
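The scaling by range_ above suggests the separated stem is meant to be written as 16-bit PCM. Here is a minimal sketch of one way to finish the loop under that assumption; the write_int16_wav helper, the int16 cast and the use of scipy.io.wavfile are assumptions, not taken from the original code.

import numpy as np
from scipy.io import wavfile


def write_int16_wav(save_path, scaled_stem, sample_rate=22050):
    # Hypothetical helper: clip the already-scaled stem to the int16 range
    # and write it as 16-bit PCM.
    data = np.clip(scaled_stem, -32768, 32767).astype(np.int16)
    wavfile.write(str(save_path), sample_rate, data)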
Example #15
def test_separate(test_file, configuration, backend):
    """ Test separation from raw data. """
    instruments = MODEL_TO_INST[configuration]
    adapter = AudioAdapter.default()
    waveform, _ = adapter.load(test_file)
    separator = Separator(configuration,
                          stft_backend=backend,
                          multiprocess=False)
    prediction = separator.separate(waveform, test_file)
    assert len(prediction) == len(instruments)
    for instrument in instruments:
        assert instrument in prediction
    for instrument in instruments:
        track = prediction[instrument]
        assert waveform.shape[:-1] == track.shape[:-1]
        assert not np.allclose(waveform, track)
        for compared in instruments:
            if instrument != compared:
                assert not np.allclose(track, prediction[compared])
Example #16
 def __init__(self,
              cpu_separation: bool,
              bitrate=256,
              softmask=False,
              alpha=1.0,
              iterations=1):
     """Default constructor.
     :param config: Separator config, defaults to None
     """
     self.model_file = 'x-umx.h5'
     self.model_dir = Path('pretrained_models')
     self.model_file_path = self.model_dir / self.model_file
     self.context = 'cpu' if cpu_separation else 'cudnn'
     self.softmask = softmask
     self.alpha = alpha
     self.iterations = iterations
     self.bitrate = bitrate
     self.sample_rate = 44100
     self.residual_model = False
     self.audio_adapter = AudioAdapter.default()
     self.chunk_duration = 30
Example #17
    def __init__(self,
                 cpu_separation: bool,
                 bitrate=256,
                 softmask=False,
                 alpha=1.0,
                 iterations=1):
        """Default constructor.
        :param config: Separator config, defaults to None
        """
        if cpu_separation:
            raise ValueError('X-UMX only works with GPU. Task aborted.')

        self.model_file = 'x-umx.h5'
        self.model_dir = Path('pretrained_models')
        self.model_file_path = self.model_dir / self.model_file
        self.context = 'cudnn'
        self.softmask = softmask
        self.alpha = alpha
        self.iterations = iterations
        self.bitrate = bitrate
        self.sample_rate = 44100
        self.residual_model = False
        self.audio_adapter = AudioAdapter.default()
Example #18
def adapter():
    """ Target test audio adapter fixture. """
    return AudioAdapter.default()
Example #19
def main(argv=None):
    args = docopt(__doc__, argv=argv)

    fi = Clip(args['<input>'])
    fo = args['-o']
    ranges = [tuple(ptime(t) for t in r.split('~')) for r in args['<range>']]

    loader = AudioAdapter.default()
    sample_rate = 44100
    separator = Separator('spleeter:2stems')

    segments = {}

    for start, end in ranges:
        print(f'Processing range {start}-{end}...')

        options = ['-vn', '-r', str(sample_rate), '-f', 'wav']
        clip = fi.slice(start, end - start, output_options=options)[0]

        for i in range(int(args['--pass'])):
            waveform, _ = loader.load(clip.path, sample_rate=sample_rate)
            prediction = separator.separate(waveform)

            output = tmpfile('wav')

            target = 'accompaniment' if args['--inverse'] else 'vocals'
            loader.save(output, prediction[target], sample_rate)

            clip = Clip(output, tmpfile=output)

        segments[start] = clip

    print('Writing output file...')

    # Mute ranges in the original audio track
    # asetnsamples is required, source: https://superuser.com/a/1230890
    filters = '[0:a]asetnsamples=8192,'
    filters += ','.join(f"volume=0:enable='between(t,{start},{end})'"
                        for start, end in ranges)
    filters += '[main]'

    # Delay processed segments
    for i, (start, end) in enumerate(ranges):
        delay = int(start * 1000)
        filters += f';[{i+1}]'
        filters += 'asetnsamples=8192'
        filters += f',adelay={delay}|{delay},apad[delay{i+1}]'

    # Mix muted original track and all processed segments
    filters += ';[main]'
    for i, (start, end) in enumerate(ranges):
        filters += f'[delay{i+1}]'
    filters += f'amix=inputs={len(ranges) + 1}:duration=first'
    filters += f',volume={len(ranges) + 1}'

    filters += '[audio]'

    command = ['ffmpeg', '-i', fi.path]

    for start, segment in segments.items():
        command += ['-i', segment.path]

    # Copy codecs from the original video
    ainfo = fi.ffprobe('stream=codec_name,bit_rate', 'a')['streams'][0]
    command += [
        '-c:v', 'copy', '-c:a', ainfo['codec_name'], '-b:a', ainfo['bit_rate'],
        '-strict', '-2'
    ]

    command += [
        '-filter_complex', filters, '-map', '0:v', '-map', '[audio]', fo
    ]

    if run(command).returncode != 0:
        if os.path.exists(fo):
            os.unlink(fo)
        raise Exception('ffmpeg exited with non-zero code')
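To make the filter-graph construction above concrete: for two hypothetical ranges (10, 20) and (40, 50), the code builds the following -filter_complex string (shown split at the ';' separators for readability; the actual value is a single line):

[0:a]asetnsamples=8192,volume=0:enable='between(t,10,20)',volume=0:enable='between(t,40,50)'[main];
[1]asetnsamples=8192,adelay=10000|10000,apad[delay1];
[2]asetnsamples=8192,adelay=40000|40000,apad[delay2];
[main][delay1][delay2]amix=inputs=3:duration=first,volume=3[audio]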
Example #20
    def generate_synthesized_mix(self, filename, separator, pitch_preproc, voicing):
        # Get file id from filename
        file_id = filename.split('/')[-1].replace('_vocal.wav', '')
        
        # Load audio with Spleeter's AudioAdapter
        audio_loader = AudioAdapter.default()
        waveform, _ = audio_loader.load(
            filename,
            sample_rate=self.sample_rate
        )

        # Run vocal separation on vocal audio
        prediction = separator.separate(waveform)
        audio = prediction['vocals']
        
        # Convert to mono, apply energy filtering and EqualLoudness for better pitch extraction
        audio_mono = audio.sum(axis=1) / 2
        audio_mono_filt = self.filter_audio(audio=audio_mono, coef=0.00125)  # Energy filter to remove background noise
        audio_mono_eqloud = estd.EqualLoudness(sampleRate=self.sample_rate)(audio_mono_filt)
        
        # Extract pitch using PredominantMelodyMakam algorithm
        est_time, est_freq = self.extract_pitch_pmm(audio=audio_mono_eqloud)
        pitch = [[x, y] for x, y in zip(est_time, est_freq)]

        # Preprocessing analyzed audio and pitch
        preprocessor = PitchProcessor(
            pitch_preproc=pitch_preproc,
            voicing=voicing,
            gap_len=25,
        )
        audio, pitch_processed, time_stamps_processed = preprocessor.pre_processing(
            audio=audio_mono,
            extracted_pitch=pitch,
        )
        
        # Get freq limits to compute minf0
        tmp_est_freq = [x for x in est_freq if x > 20]
        if len(tmp_est_freq) > 0:
            minf0 = min(tmp_est_freq) - 20
        else:
            minf0 = 0
            
        # Synthesize vocal track
        synthesizer = Synthesizer(
            model='hpr',
            minf0=minf0,
            maxf0=max(pitch_processed) + 50,
        )
        synthesized_audio, pitch_track = synthesizer.synthesize(
            filtered_audio=audio,
            pitch_track=pitch_processed,
        )

        # Equalize voice
        #fx = (AudioEffectsChain().equalizer(200))
        #synthesized_audio = fx(synthesized_audio)

        # Get synthesized mix
        synthesized_audio_mix = self.mix(
            filename=filename,
            synthesized_voice=synthesized_audio
        )
        
        # Get vocal activations
        start_times, end_times = self.get_activations(time_stamps_processed, pitch_track)
        
        if len(start_times) > 2:
            # Write synthesized audio to file
            tmp_wav = 'audio/synth_mix_' + file_id + '.wav'
            self.save_audio_to_dataset(tmp_wav, synthesized_audio_mix)
    
            # Write csv melody annotation to file
            tmp_txt = 'annotations/melody/synth_mix_' + file_id + '.csv'
            self.save_pitch_track_to_dataset(tmp_txt, time_stamps_processed, pitch_track)
            
            # Write lab activations to file
            tmp_lab = 'annotations/activations/synth_mix_' + file_id + '.lab'
            self.save_activation_to_dataset(tmp_lab, start_times, end_times)
    
            return synthesized_audio_mix, pitch_track, time_stamps_processed
        else:
            print('UNVOICED TRACK! Skipping...')
            return [], [], []
Example #21
from spleeter.audio.adapter import AudioAdapter
from spleeter.separator import Separator

def separate_audio(file, layers=5, sample_rate=44100):
    audio_loader = AudioAdapter.default()
    if layers not in [2, 4, 5]:
        raise Exception("layers must be 2, 4 or 5!")
    
    waveform, _ = audio_loader.load(file, sample_rate=sample_rate)
    separator = Separator('spleeter:{}stems'.format(layers))
    
    prediction = separator.separate(waveform)
    
    return prediction
    

if __name__ == "__main__":
    audio_loader = AudioAdapter.default()
    prediction = separate_audio("../../data/chitose.mp3", 4)
    for k, v in prediction.items():
        audio_loader.save("../../data/output/{}.mp3".format(k), v, 44100)