Example #1
    def music_processing(music_pth, ret_value):
        separator = Separator(params_descriptor='spleeter:2stems')

        audio_adapter = AudioAdapter.get(
            'spleeter.audio.ffmpeg.FFMPEGProcessAudioAdapter')
        waveform, _ = audio_adapter.load(music_pth,
                                         dtype=np.float32,
                                         sample_rate=22050)
        sources = separator.separate(waveform=waveform,
                                     audio_descriptor=music_pth)
        vocals = sources['vocals']
        ret_value['vocals'] = vocals
        return vocals
Example #2
def separate_one_audio_on_accompaniment_and_vocals_by_spleeter(path_to_audio, sample_rate, output_directory):
    audio_loader = get_default_audio_adapter()
    separator = Separator('spleeter:2stems')
    filename = path_to_audio.split('/')[-1].split('\\')[-1]
    waveform, _ = audio_loader.load(path_to_audio, sample_rate=sample_rate)
    # Perform the separation :
    prediction = separator.separate(waveform)
    accompaniment = prediction['accompaniment']
    vocals = prediction['vocals']
    base_name = '.'.join(filename.split('.')[:-1])
    wavfile.write(output_directory + base_name + '_accompaniment.wav', sample_rate, accompaniment)
    wavfile.write(output_directory + base_name + '_vocals.wav', sample_rate, vocals)
    del audio_loader, separator, waveform, prediction, accompaniment, vocals
    gc.collect()
Example #3
def test_separate(configuration, instruments):
    """ Test separation from raw data. """
    adapter = get_default_audio_adapter()
    waveform, _ = adapter.load(TEST_AUDIO_DESCRIPTOR)
    separator = Separator(configuration)
    prediction = separator.separate(waveform)
    assert len(prediction) == len(instruments)
    for instrument in instruments:
        assert instrument in prediction
    for instrument in instruments:
        track = prediction[instrument]
        assert not (waveform == track).all()
        for compared in instruments:
            if instrument != compared:
                assert not (track == prediction[compared]).all()
Example #4
def do_svs_spleeter(y, sr):
    from spleeter.separator import Separator
    import warnings
    separator = Separator('spleeter:2stems')
    warnings.filterwarnings('ignore')

    if sr != 44100:
        y = librosa.core.resample(y=y, orig_sr=sr, target_sr=44100)

    waveform = np.expand_dims(y, axis=1)

    prediction = separator.separate(waveform)
    ret = librosa.core.to_mono(prediction["vocals"].T)
    # soundfile.write("/HDD2/b06902046/ADLfinal/vocal.wav", prediction["vocals"], 44100, subtype='PCM_16')
    # print (prediction["vocals"].shape)
    return ret, 44100
Example #5
    def initialize_components(self):
        spleeter_sr = 44100

        waveform = self._original_mix
        separator = Separator(self.model_name, multiprocess=False)
        waveform = librosa.resample(waveform, self.target_sr, spleeter_sr)
        waveform = np.expand_dims(waveform, axis=1)
        prediction = separator.separate(waveform)

        original_components = [
            librosa.resample(np.mean(prediction[key], axis=1), spleeter_sr,
                             self.target_sr) for key in prediction
        ]

        components_names = list(prediction.keys())
        return original_components, components_names
Example #6
def test_separate(configuration, instruments, backend):
    """ Test separation from raw data. """
    adapter = get_default_audio_adapter()
    waveform, _ = adapter.load(TEST_AUDIO_DESCRIPTOR)
    separator = Separator(configuration, stft_backend=backend)
    prediction = separator.separate(waveform, TEST_AUDIO_DESCRIPTOR)
    assert len(prediction) == len(instruments)
    for instrument in instruments:
        assert instrument in prediction
    for instrument in instruments:
        track = prediction[instrument]
        assert waveform.shape == track.shape
        assert not np.allclose(waveform, track)
        for compared in instruments:
            if instrument != compared:
                assert not np.allclose(track, prediction[compared])
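
The `configuration`, `instruments`, and `backend` parameters of the test above are normally supplied by a pytest parametrization. A minimal sketch of what that setup could look like (the instrument lists are those of Spleeter's embedded 2/4/5-stem models; the backend names and the cross-product are assumptions):

import itertools

import pytest

MODEL_TO_INST = {
    'spleeter:2stems': ('vocals', 'accompaniment'),
    'spleeter:4stems': ('vocals', 'drums', 'bass', 'other'),
    'spleeter:5stems': ('vocals', 'drums', 'bass', 'piano', 'other'),
}
BACKENDS = ('tensorflow', 'librosa')  # assumed STFT backend identifiers

TEST_CONFIGURATIONS = [
    (descriptor, instruments, backend)
    for (descriptor, instruments), backend in itertools.product(
        MODEL_TO_INST.items(), BACKENDS)
]


@pytest.mark.parametrize('configuration, instruments, backend',
                         TEST_CONFIGURATIONS)
def test_separate(configuration, instruments, backend):
    """ Body as in the example above. """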
Example #7
def do_svs_spleeter(y, sr):
    from spleeter.separator import Separator
    import warnings
    separator = Separator('spleeter:2stems')
    warnings.filterwarnings('ignore')

    if sr != 44100:
        y = librosa.core.resample(y=y, orig_sr=sr, target_sr=44100)

    waveform = np.expand_dims(y, axis=1)

    prediction = separator.separate(waveform)
    ret = librosa.core.to_mono(prediction["vocals"].T)
    ret = np.clip(ret, -1.0, 1.0)
    del separator
    return ret, 44100
Example #8
def test_separate(test_file, configuration, backend):
    """ Test separation from raw data. """
    instruments = MODEL_TO_INST[configuration]
    adapter = get_default_audio_adapter()
    waveform, _ = adapter.load(test_file)
    separator = Separator(configuration, stft_backend=backend, multiprocess=False)
    prediction = separator.separate(waveform, test_file)
    assert len(prediction) == len(instruments)
    for instrument in instruments:
        assert instrument in prediction
    for instrument in instruments:
        track = prediction[instrument]
        assert waveform.shape[:-1] == track.shape[:-1]
        assert not np.allclose(waveform, track)
        for compared in instruments:
            if instrument != compared:
                assert not np.allclose(track, prediction[compared])
Example #9
class SpleeterSeparator:
    """Performs source separation using Spleeter API."""
    def __init__(self, config=None):
        """Default constructor.

        :param config: Separator config, defaults to None
        """
        if config is None:
            self.audio_bitrate = '256k'
            self.audio_format = 'mp3'
            self.sample_rate = 44100
            self.spleeter_stem = 'config/4stems-16kHz.json'
        else:
            self.audio_bitrate = config['audio_bitrate']
            self.audio_format = config['audio_format']
            self.sample_rate = config['sample_rate']
            self.spleeter_stem = config['spleeter_stem']
        # Use librosa backend as it is less memory intensive
        self.separator = Separator(self.spleeter_stem,
                                   stft_backend='librosa',
                                   multiprocess=False)
        self.audio_adapter = get_default_audio_adapter()

    def separate(self, parts, input_path, output_path):
        """Performs source separation by adding together the parts to be kept.

        :param parts: List of parts to keep ('vocals', 'drums', 'bass', 'other')
        :param input_path: Path to source file
        :param output_path: Path to output file
        :raises e: FFMPEG error
        """
        waveform, _ = self.audio_adapter.load(input_path,
                                              sample_rate=self.sample_rate)
        prediction = self.separator.separate(waveform)
        out = np.zeros_like(prediction['vocals'])
        part_count = 0

        # Add up parts that were requested
        for key in prediction:
            if parts[key]:
                out += prediction[key]
                part_count += 1
        out /= part_count
        self.audio_adapter.save(output_path, out, self.separator._sample_rate,
                                self.audio_format, self.audio_bitrate)
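
Since `separate` averages the requested stems into one track, the `parts` argument must map every stem produced by the configured model to a boolean. A minimal usage sketch (file paths and the mapping are hypothetical; the stem names match the default 4-stem configuration above):

separator = SpleeterSeparator()
parts = {'vocals': True, 'drums': False, 'bass': False, 'other': True}
# Writes a single track containing the average of the vocals and other stems.
separator.separate(parts, 'input/song.mp3', 'output/vocals_plus_other.mp3')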
Example #10
def source_seperate_ogg(ogg_list: list):
    separator = Separator('spleeter:4stems')
    audio_loader = AudioAdapter.default()
    sample_rate = 22050
    range_ = 32767

    for ogg in ogg_list:
        waveform, _ = audio_loader.load(ogg, sample_rate=sample_rate)
        prediction = separator.separate(waveform)
        prediction['other'] = prediction['other'] * range_

        save_path = Path(
            str(ogg).replace('Unprocessed', 'source_separated', 1))
        if not os.path.isdir(save_path.parent):
            os.mkdir(save_path.parent)

        print(prediction)

        break
Example #11
def execute(args):
    try:
        logger.info('Speech recognition started: {0}',
                    args.audio_file,
                    decoration=MLogger.DECORATION_BOX)

        if not os.path.exists(args.audio_file):
            logger.error("The specified audio file path does not exist.\n{0}",
                         args.audio_file,
                         decoration=MLogger.DECORATION_BOX)
            return False, None

        # Parent path (the media file's directory unless specified; Colab creates
        # the files locally, so an explicit parent_dir is expected there)
        base_path = str(pathlib.Path(args.audio_file).parent
                        ) if not args.parent_dir else args.parent_dir

        audio_adapter = get_default_audio_adapter()
        sample_rate = 44100
        waveform, _ = audio_adapter.load(args.audio_file,
                                         sample_rate=sample_rate)

        # Separate into vocals and accompaniment
        separator = Separator('spleeter:2stems')

        # Perform the separation :
        prediction = separator.separate(waveform)

        # Vocal data
        vocals = prediction['vocals']

        audio_adapter.save(f"{base_path}/vocals.wav", vocals,
                           separator._sample_rate, "wav", "16k")

        logger.info('Speech recognition finished: {0}',
                    base_path,
                    decoration=MLogger.DECORATION_BOX)

        return True
    except Exception as e:
        logger.critical("An unexpected error occurred during speech recognition.",
                        e,
                        decoration=MLogger.DECORATION_BOX)
        return False
Example #12
    def initialize_components(self):
        spleeter_sr = 44100
        precomputed_name = os.path.basename(self._audio_path) + ".pt"
        precomputed_path = os.path.join(self.spleeter_sources_path,
                                        precomputed_name)
        if self.recompute:
            waveform = self._original_mix
            separator = Separator(self.model_name, multiprocess=False)
            waveform = librosa.resample(waveform, self.target_sr, spleeter_sr)
            waveform = np.expand_dims(waveform, axis=1)
            prediction = separator.separate(waveform)
            pickle_dump(prediction, precomputed_path)
        else:
            prediction = pickle_load(precomputed_path)

        original_components = [
            librosa.resample(np.mean(prediction[key], axis=1), spleeter_sr,
                             self.target_sr) for key in prediction
        ]

        components_names = list(prediction.keys())
        return original_components, components_names
Example #13
def spleet_wav(songpath,outfolder,num_stems):

    rate, audio = wavfile.read(songpath)
    songname = os.path.basename(os.path.normpath(songpath))

    warnings.filterwarnings('ignore')

    stem_param = str(num_stems) + 'stems'

    # Using an embedded configuration; stems can be 2, 4, or 5 (number of instruments in the model)
    separator = Separator('spleeter:'+stem_param)

    # Perform the separation
    prediction = separator.separate(audio)

    rate = 44100

    for instrument in prediction:

        name = outfolder + "/" + songname + "_" + instrument + "_16-bit.wav"

        print("Saving", instrument, "as: ", name)

        wavio.write(name, prediction[instrument].astype(np.int16), rate, sampwidth=2)

    print("Overwriting other.wav with merged version")

    sound1 = AudioSegment.from_wav(outfolder + "/" + songname + "_" + "piano" + "_16-bit.wav")
    sound2 = AudioSegment.from_wav(outfolder + "/" + songname + "_" + "other" + "_16-bit.wav")

    merged_piano_other = sound1.overlay(sound2)

    merged_piano_other.export(outfolder + "/" + songname + "_" + "other" + "_16-bit.wav",format="wav")

    os.remove(outfolder + "/" + songname + "_" + "piano" + "_16-bit.wav")

    print("done merging other and piano")
Example #14
    dataset_dir = sys.argv[1]

    from spleeter.separator import Separator
    import warnings
    separator = Separator('spleeter:2stems')

    for the_dir in os.listdir(dataset_dir):
        mix_path = os.path.join(dataset_dir, the_dir, "Mixture.mp3")

        y, sr = librosa.core.load(mix_path, sr=None, mono=True)
        if sr != 44100:
            y = librosa.core.resample(y=y, orig_sr=sr, target_sr=44100)

        waveform = np.expand_dims(y, axis=1)

        prediction = separator.separate(waveform)
        voc = librosa.core.to_mono(prediction["vocals"].T)
        voc = np.clip(voc, -1.0, 1.0)

        acc = librosa.core.to_mono(prediction["accompaniment"].T)
        acc = np.clip(acc, -1.0, 1.0)

        import soundfile
        soundfile.write(os.path.join(dataset_dir, the_dir, "Vocal.wav"),
                        voc,
                        44100,
                        subtype='PCM_16')
        soundfile.write(os.path.join(dataset_dir, the_dir, "Inst.wav"),
                        acc,
                        44100,
                        subtype='PCM_16')
Example #15
    def transcribe(self, input_audio, model_path=None, output="./"):
        """Transcribe vocal notes in the audio.

        This function transcribes onset, offset, and pitch of the vocal in the audio.
        This module is responsible for predicting the onset and offset time of each note,
        and pitches are estimated by the `vocal-contour` submodule.

        Parameters
        ----------
        input_audio: Path
            Path to the raw audio file (.wav).
        model_path: Path
            Path to the trained model or the supported transcription mode.
        output: Path (optional)
            Path for writing out the transcribed MIDI file. Defaults to the current path.

        Returns
        -------
        midi: pretty_midi.PrettyMIDI
            The transcribed vocal notes.

        Outputs
        -------
        This function outputs three files, as listed below:

        - <song>.mid: the MIDI file with the complete transcription results, rendered with a piano soundfont.
        - <song>_f0.csv: pitch contour information of the vocal.
        - <song>_trans.wav: the rendered pitch contour audio.

        See Also
        --------
        omnizart.cli.vocal.transcribe: CLI entry point of this function.
        omnizart.vocal_contour.transcribe: Pitch estimation function.
        """
        logger.info("Separating vocal track from the audio...")
        separator = Separator('spleeter:2stems')

        # Tricky way to avoid the annoying tensorflow graph being finalized issue.
        separator._params["stft_backend"] = "librosa"  # pylint: disable=protected-access

        wav, fs = load_audio(input_audio, mono=False)
        pred = separator.separate(wav)

        logger.info("Loading model...")
        model, model_settings = self._load_model(model_path)

        logger.info("Extracting feature...")
        wav = librosa.to_mono(pred["vocals"].squeeze().T)
        feature = _extract_vocal_cfp(
            wav,
            fs,
            down_fs=model_settings.feature.sampling_rate,
            hop=model_settings.feature.hop_size,
            fr=model_settings.feature.frequency_resolution,
            fc=model_settings.feature.frequency_center,
            tc=model_settings.feature.time_center,
            g=model_settings.feature.gamma,
            bin_per_octave=model_settings.feature.bins_per_octave)

        logger.info("Predicting...")
        pred = predict(feature, model)

        logger.info("Infering notes...")
        interval = infer_interval(
            pred,
            ctx_len=model_settings.inference.context_length,
            threshold=model_settings.inference.threshold,
            min_dura=model_settings.inference.min_duration,
            t_unit=model_settings.feature.hop_size)

        logger.info("Extracting pitch contour")
        agg_f0 = vcapp.app.transcribe(
            input_audio,
            model_path=model_settings.inference.pitch_model,
            output=output)

        logger.info("Inferencing MIDI...")
        midi = infer_midi(interval,
                          agg_f0,
                          t_unit=model_settings.feature.hop_size)

        self._output_midi(output=output, input_audio=input_audio, midi=midi)
        logger.info("Transcription finished")
        return midi
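
The vocal-isolation step at the top of `transcribe` can be reproduced on its own. A minimal sketch under the same 2-stem assumption (the input path is hypothetical, and the project's `load_audio` helper is replaced here with a plain librosa load):

import librosa
import numpy as np
from spleeter.separator import Separator

def isolate_vocals(path, sr=44100):
    """Return (mono vocal waveform, sample rate) separated from a mix."""
    y, fs = librosa.load(path, sr=sr, mono=False)
    # Spleeter expects a (samples, channels) array.
    waveform = y.T if y.ndim == 2 else np.expand_dims(y, axis=1)
    separator = Separator('spleeter:2stems')
    prediction = separator.separate(waveform)
    return librosa.to_mono(prediction['vocals'].T), fs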
Example #16
class SpleeterSeparator(ABCSeparator):
    """Spleeter separator uses the spleeter library
    to separate music sources.

    """
    def __init__(self, stems: int, chunk_size=2):
        """
            Args:
                stems (int): total files to generate (2/4/5).
                chunk_size (int): chunk size (in minutes) indicating the
                    duration of each individual chunk before separation.
                NOTE: Longer audio files take more memory; splitting
                    the audio is a workaround.
        """

        # The stem count selects which pretrained model to load,
        # so it must be specified explicitly.
        self.stems = stems
        # convert minutes to seconds
        self.chunk_size = int(chunk_size * 60)

        self._separator = Separator(f"spleeter:{self.stems}stems")

        # spleeter specific config
        self._audio_adapter = get_default_audio_adapter()

    def _chunk(self, waveform, sr):
        chunks = []
        length = len(waveform) // sr
        remainder = len(waveform) % sr
        print(len(waveform), len(waveform) / sr)
        for c in range(0, length, self.chunk_size):
            print(c)
            chunk = waveform[c * sr:(c + self.chunk_size) * sr]
            print(len(chunk))
            yield chunk
        """
        if remainder:
            chunk = waveform[(c + 1)*sr + remainder:]
            print(len(chunk), "remainder")
            yield chunk
        """

    def separate(self, audio: Union[str, np.ndarray], sample_rate=44_100):
        """Separate audio into specified stems.

            Note: Spleeter uses tensorflow backend. Hence, corresponding
            installed device will automatically be used (CPU/GPU).
            Minimum VRAM/RAM requirement: 4GB (for small audio, <6 minutes).

            Args:
                audio (str, np.ndarray): path to the audio file or the signal itself.
                sample_rate (int): sampling rate of the file.

            Returns:
                signal (Signal): separated signals.

            Raises:
                tf.errors.ResourceExhaustedError: When memory gets exhausted.

        """
        if isinstance(audio, np.ndarray):
            waveform = audio
        else:
            waveform, _ = self._audio_adapter.load(audio,
                                                   sample_rate=sample_rate)

        print(waveform.shape)
        #predict in chunks
        prediction = {}
        for chunk in self._chunk(waveform, sample_rate):
            chunk_prediction = self._separator.separate(chunk)

            for chunk_key, chunk_value in chunk_prediction.items():
                if chunk_key not in prediction:
                    prediction[chunk_key] = []
                prediction.get(chunk_key).append(chunk_value)

        #merge chunk prediction
        prediction = {k: np.vstack(v) for k, v in prediction.items()}
        print(list(v.shape for v in prediction.values()))
        signal = Signal(prediction.keys(), prediction.values())

        return signal
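
A minimal usage sketch of the chunked separator above (the input path is hypothetical); separating in 2-minute chunks keeps Spleeter's memory footprint bounded at the cost of possible artifacts at chunk boundaries:

# Hypothetical usage: 2-stem separation of a long file in 2-minute chunks.
separator = SpleeterSeparator(stems=2, chunk_size=2)
signal = separator.separate('long_mix.wav', sample_rate=44_100)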
Example #17
        print("converting")
    else:
        print("not converting")


sep = Separator('./2stem-finetune-realtime.json',
                MWF=False,
                stft_backend='tensorflow',
                multiprocess=False)


class Spleeter_Server(LADSPA_TCPServer):
    def process(self, channel, sample_rate, data):
        if np.max(data) == np.min(data) == 0:
            return data
        if should_process:
            processed = sep.separate(data.astype('float64').reshape((-1, 1)))
            return processed['vocals'].astype('float32')[:, 0]
        else:
            return data


if __name__ == "__main__":
    signal.signal(signal.SIGUSR1, handler)

    print("warming up")
    sep.separate(np.zeros((1024, 2)))
    print("run kill -SIGUSR1 %d to toggle the service on/off" % os.getpid())
    print("serving on :18083")
    Spleeter_Server.serve_forever(18083)
Example #18
    # audio_loader = get_default_audio_adapter()
    # sample_rate = 44100
    # waveform, _ = audio_loader.load('/path/to/audio/file', sample_rate=sample_rate)

    # Perform the separation :

    # for f in tqdm.tqdm(glob.glob("./seperate/*.wav")):
    f = "audio_example.mp3"
    y, sr = librosa.load(librosa.util.example_audio_file(), sr=None, mono=False)



    yx = np.hstack([y])
    duration = yx.shape[1]/sr
    print(duration)
    time.sleep(4)
    start_time = time.time()

    silce_t = duration/10
    prediction = separator.separate(yx.T)
    for i in tqdm.tqdm(range(10)):
        start = int(sr*silce_t*i)
        end = int(sr*silce_t*(i+1))
        
        silce = yx[:,start:end]
        prediction = separator.separate(silce.T)
        time.sleep(4)

    print(duration,time.time()-start_time)
Example #19
def main(argv=None):
    args = docopt(__doc__, argv=argv)

    fi = Clip(args['<input>'])
    fo = args['-o']
    ranges = list(
        tuple(ptime(t) for t in r.split('~')) for r in args['<range>'])

    loader = AudioAdapter.default()
    sample_rate = 44100
    separator = Separator('spleeter:2stems')

    segments = {}

    for start, end in ranges:
        print(f'Processing range {start}-{end}...')

        options = ['-vn', '-r', str(sample_rate), '-f', 'wav']
        clip = fi.slice(start, end - start, output_options=options)[0]

        for i in range(int(args['--pass'])):
            waveform, _ = loader.load(clip.path, sample_rate=sample_rate)
            prediction = separator.separate(waveform)

            output = tmpfile('wav')

            target = 'accompaniment' if args['--inverse'] else 'vocals'
            loader.save(output, prediction[target], sample_rate)

            clip = Clip(output, tmpfile=output)

        segments[start] = clip

    print('Writing output file...')

    # Mute ranges in the original audio track
    # asetnsamples is required, source: https://superuser.com/a/1230890
    filters = '[0:a]asetnsamples=8192,'
    filters += ','.join(f"volume=0:enable='between(t,{start},{end})'"
                        for start, end in ranges)
    filters += '[main]'

    # Delay processed segments
    for i, (start, end) in enumerate(ranges):
        delay = int(start * 1000)
        filters += f';[{i+1}]'
        filters += 'asetnsamples=8192'
        filters += f',adelay={delay}|{delay},apad[delay{i+1}]'

    # Mix muted original track and all processed segments
    filters += ';[main]'
    for i, (start, end) in enumerate(ranges):
        filters += f'[delay{i+1}]'
    filters += f'amix=inputs={len(ranges) + 1}:duration=first'
    filters += f',volume={len(ranges) + 1}'

    filters += '[audio]'

    command = ['ffmpeg', '-i', fi.path]

    for start, segment in segments.items():
        command += ['-i', segment.path]

    # Copy codecs from the original video
    ainfo = fi.ffprobe('stream=codec_name,bit_rate', 'a')['streams'][0]
    command += [
        '-c:v', 'copy', '-c:a', ainfo['codec_name'], '-b:a', ainfo['bit_rate'],
        '-strict', '-2'
    ]

    command += [
        '-filter_complex', filters, '-map', '0:v', '-map', '[audio]', fo
    ]

    if run(command).returncode != 0:
        if os.path.exists(fo):
            os.unlink(fo)
        raise Exception('ffmpeg exited with non-zero code')
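
To make the filter construction above concrete, here is a hand-traced filtergraph for a single 10-20 s range (the real string is built without line breaks, and the exact number formatting depends on what `ptime` returns):

# [0:a]asetnsamples=8192,volume=0:enable='between(t,10,20)'[main];
# [1]asetnsamples=8192,adelay=10000|10000,apad[delay1];
# [main][delay1]amix=inputs=2:duration=first,volume=2[audio]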
Example #20
class separateQThread(QThread):
    position = Signal(int)
    percent = Signal(float)
    voiceList = Signal(list)
    avgList = Signal(list)
    finish = Signal(bool)

    def __init__(self,
                 videoPath,
                 duration,
                 before,
                 after,
                 multiThread,
                 parent=None):
        super(separateQThread, self).__init__(parent)
        self.videoPath = videoPath
        self.duration = duration
        self.beforeCnt = int(before) // 20
        self.afterCnt = int(after) // 20
        self.separate = Separator('spleeter:2stems',
                                  stft_backend='tensorflow',
                                  multiprocess=multiThread)
        self.audioLoader = get_default_audio_adapter()

    def run(self):
        cuts = self.duration // 60000 + 1
        for cut in range(cuts):
            cmd = [
                'utils/ffmpeg.exe', '-y', '-i', self.videoPath, '-vn', '-ss',
                str(cut * 60), '-t', '60', 'temp_audio.m4a'
            ]
            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            p.wait()
            for line in p.stdout.readlines():
                try:
                    line = line.decode('gb18030', 'ignore')
                    if 'Audio:' in line:
                        break
                except:
                    pass
            for hz in line.split(','):
                if 'Hz' in hz:
                    hz = int(hz.split('Hz')[0])
                    break
            hz20 = hz // 50  # 20ms
            waveform, _ = self.audioLoader.load('temp_audio.m4a')
            prediction = self.separate.separate(waveform)
            msList = []
            varList = []
            voiceList = []
            avgList = []
            for cnt, l in enumerate(prediction['vocals']):  # only use the vocals stem
                for i in l:
                    msList.append(i)
                if not cnt % hz20:  # every 20 ms, take the variance
                    varList.append(np.var(msList))  # variance within each 20 ms window
                    avgList.append(np.mean(msList))  # mean within each 20 ms window
                    msList = []
            med = np.median(varList)  # median of all variances within the minute
            cnt = self.beforeCnt  # user-configured leading padding, in 20 ms steps
            start = 0  # vocal start time
            end = 0  # vocal end time
            avgVarList = []  # smoothed variance values
            for varCnt in range(len(varList) - 5):
                # average each variance together with the next four values
                avgVarList.append(np.mean(varList[varCnt:varCnt + 5]))
            avgVarList += varList[-4:]  # append the last four variances that were not smoothed
            while cnt < len(avgVarList) - self.afterCnt:  # detect vocal regions
                if avgVarList[cnt] >= med:  # smoothed variance exceeds the one-minute median
                    # start time = current time minus the user's leading padding
                    start = cut * 60000 + (cnt - self.beforeCnt) * 20
                    cnt += self.afterCnt  # extend forward by the user's trailing padding
                    if cnt < len(avgVarList):  # still within the minute, keep scanning forward
                        finishToken = False
                        while not finishToken:
                            try:  # any scan past the end breaks out of the loop
                                while avgVarList[cnt] >= med:  # scan until smoothed variance < median
                                    cnt += 1
                                # add the trailing padding once more and re-check that
                                # the smoothed variance stays below the median
                                cnt += self.afterCnt
                                if avgVarList[cnt] < med:
                                    finishToken = True
                            except:
                                break
                    end = cut * 60000 + cnt * 20  # end time is where the forward scan stopped
                    voiceList.append([start, end])  # record the start/end pair to emit
                else:
                    cnt += 1  # no vocal detected, advance one step
            self.position.emit(cut + 1)
            self.percent.emit((cut + 1) / cuts * 100)
            self.voiceList.emit(voiceList)
            self.avgList.emit(avgList)


#             plt.subplot(311)
#             plt.plot([x for x in range(len(avgList))], avgList)
#             plt.subplot(312)
#             plt.plot([x for x in range(len(avgVarList))], avgVarList)
#             plt.axhline(med, label='median')
#             plt.subplot(313)
#             x = []
#             y = []
#             modifyVoice = []
#             for l in voiceList:
#                 modifyVoice += l
#             trig = False
#             for i in range(self.duration):
#                 for l in modifyVoice:
#                     if i > l:
#                         trig = not trig
#                 x.append(i)
#                 if not trig:
#                     y.append(0)
#                 else:
#                     y.append(1)
#             plt.plot(x, y)
#             plt.legend()
#             plt.show()
        self.finish.emit(True)
Example #21
    'add padding zeros to each waveform'
    zero_padded_waveform = np.zeros(
        (clipped_waveform.shape[0] + padding_length * 2,
         clipped_waveform.shape[1]))
    zero_padded_waveform[padding_length:-padding_length] = clipped_waveform
    'we may also use the extra data'
    if split_waveform_index - padding_length >= 0 and split_waveform_index + window_size + padding_length < len(
            waveform):
        zero_padded_waveform[:padding_length] = waveform[
            split_waveform_index - padding_length:split_waveform_index]
        zero_padded_waveform[-padding_length:] = waveform[
            split_waveform_index + window_size:split_waveform_index +
            window_size + padding_length]

    'separate using spleeter'
    prediction = separator.separate(zero_padded_waveform)
    'clip padded part, throw them away'
    prediction['vocals'] = prediction['vocals'][padding_length:-padding_length]
    prediction['accompaniment'] = prediction['accompaniment'][
        padding_length:-padding_length]
    'merge results together'
    vocal_res.extend(prediction['vocals'])
    accompan_res.extend(prediction['accompaniment'])


def interval_to_info(interval_seq):
    res = []
    start = 0
    in_interval = False
    for (index, label) in zip(range(len(interval_seq)), interval_seq):
        if label >= 1 and not in_interval:
Example #22
def separate(waveform):
    separator = Separator('spleeter:2stems')
    return separator.separate(waveform)
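
The wrapper above expects a (samples, channels) NumPy array, as returned by Spleeter's audio adapters. A minimal usage sketch (the file path is hypothetical):

from spleeter.audio.adapter import AudioAdapter

audio_loader = AudioAdapter.default()
waveform, _ = audio_loader.load('song.mp3', sample_rate=44100)
stems = separate(waveform)  # {'vocals': ..., 'accompaniment': ...}
vocals = stems['vocals']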
Example #23
class AudioDetect:
    def __init__(self, model_path_1, model_path_2):
        self.spleeter = Separator('spleeter:2stems', model_path_1)
        # Frequency-domain source separation; extracting vocals generally needs only 2 stems:
        # accompaniment.wav is the backing/accompaniment track, vocals.wav is the extracted vocal
        self.spleeter._get_predictor()

        self.ina_speech_segmenter = Segmenter(detect_gender=False,
                                              model_dir=model_path_2)  ######
        logging.info("init done")

    def file_base_name(self, file_path):
        return Path(file_path).resolve().stem

    def spleeter_volcals_file_name(self, input_file, output_dir):
        input_base_name = self.file_base_name(input_file)
        return output_dir + "/" + input_base_name + "/vocals.wav"  # get

    def do_spleeter_from_buffer(self, input_buffer):
        waveform = buffer_utils.buffer_to_wave_for_spleeter(
            input_buffer, 44100)
        sources = self.spleeter.separate(waveform)
        return sources['vocals']

    def do_spleeter(self, input_file, out_dir):  # out_dir: directory for the separated stem files
        self.spleeter.separate_to_file(
            input_file,
            out_dir,
            filename_format='{filename}/{instrument}.{codec}')
        return True

    def do_segment_from_buffer(self, input_buffer):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            mspec, loge, difflen = buffer_utils.feat_from_spleeter_vocals_for_segment_two_transcode(
                input_buffer)
            segmention = self.ina_speech_segmenter.segment_feats(
                mspec, loge, difflen, 0)
        return (True, segmention)

    def do_segment(self, input, output_dir):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            segmention = self.ina_speech_segmenter(
                self.spleeter_volcals_file_name(input, output_dir))

        return (True, segmention)

    def process_segmention(self, result_dic, segmention):
        last_lable = ""
        last_start = -1
        last_end = -1
        segments = []
        for segment in segmention:
            label = segment[0]
            label = self.map_label(label)
            start = round(float(segment[1]), 2)
            end = round(float(segment[2]), 2)
            if last_lable == "":
                last_lable = label
                last_start = start
                last_end = end
                continue
            if last_lable == label:
                last_end = end
                continue
            else:
                if last_lable == "speech":
                    segments.append({
                        "type": "speech",
                        "startSec": last_start,
                        "endSec": last_end
                    })
                last_lable = label
                last_start = start
                last_end = end

        if last_lable == "speech":
            segments.append({
                "type": "speech",
                "startSec": last_start,
                "endSec": last_end
            })
        result_dic["segments"] = segments

    def map_label(self, label):
        speech_labels = ["music", "speech"]
        if label in speech_labels:
            return "speech"
        return "noEnergy"

    def process_from_buffer(self, input_buffer, input_file):
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input_file)
        result_dic["fileName"] = input_base_name

        vocals_data = self.do_spleeter_from_buffer(input_buffer)
        if vocals_data is None:
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)

        result, segmention = self.do_segment_from_buffer(
            vocals_data)  # make sure vocals_data is 16kHz
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)

        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)

    def process(self, input, output):
        result_dic = {}
        result_dic.clear()
        input_base_name = os.path.basename(input)
        result_dic["fileName"] = input_base_name

        if not self.do_spleeter(input, output):  ### step 1
            logging.error("separate failed")
            return json.dumps(result_dic, ensure_ascii=False)

        result, segmention = self.do_segment(input, output)  ### step 2
        if not result:
            logging.error("segment failed")
            return json.dumps(result_dic, ensure_ascii=False)

        self.process_segmention(result_dic, segmention)
        return json.dumps(result_dic, ensure_ascii=False)
Example #24
class ApplicationWindow(QtWidgets.QMainWindow):
    def __init__(self):
        super(ApplicationWindow, self).__init__()
        pg.setConfigOption('background', 'k')
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.separator = Separator('spleeter:2stems')

        self.Input_X = []
        self.Input_Y = []
        self.music = []
        self.vocals = []

        self.Ecg = []
        self.Sample_Rate = 1000  #sample rate , taken from the file source
        #----------------------------------------------------------------------------------------------------------------
        self.graphic_View_Array = [
            self.ui.Original_GV, self.ui.Vocals_GV, self.ui.Music_GV,
            self.ui.Original_ECG, self.ui.ECG, self.ui.Arrhythmia
        ]
        for x in self.graphic_View_Array:
            x.getPlotItem().hideAxis('bottom')
            x.getPlotItem().hideAxis('left')
            x.setMouseEnabled(x=False, y=False)

        self.playArray = [
            self.ui.Play_Music, self.ui.Play_original, self.ui.Play_vocals
        ]
        #----------------------------------------------------------------------------------------------------------------
        self.ui.Import.clicked.connect(self.Import)
        self.ui.Import_ECG.clicked.connect(self.Import_ECG)
        #----------------------------------------------------------------------------------------------------------------'
        self.stopArray = [self.ui.Stop, self.ui.Stop2, self.ui.Stop3]
        for x in self.stopArray:
            x.clicked.connect(self.Stop)
    #----------------------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------------------'
        self.ui.Save_music.clicked.connect(lambda: self.Save_music(self.music))
        self.ui.Save_vocals.clicked.connect(
            lambda: self.Save_vocals(self.vocals))

    #----------------------------------------------------------------------------------------------------------------

    def Import(self):
        filePaths = QtWidgets.QFileDialog.getOpenFileNames(
            self, 'Multiple File', "~/Desktop", '*')
        for filePath in filePaths:
            for f in filePath:
                if f == "*" or f == None:
                    break
                ext = os.path.splitext(f)[-1].lower()  # Check file extension
                if ext == ".wav":
                    self.Input_Y, frame_rate = self.ReadFromWav(f)
                    self.Input_X = np.arange(0, len(self.Input_Y))

                    self.plot(self.Input_X, self.Input_Y[:, 0],
                              self.ui.Original_GV, 'r')
                    self.ui.Play_original.clicked.connect(
                        lambda: self.Play_Wav(self.Input_Y))

                    self.vocals, self.music = self.split(self.Input_Y)

                    self.plot(self.Input_X[200:len(self.Input_X) - 2000],
                              self.music[:, 0][200:len(self.Input_X) - 2000],
                              self.ui.Music_GV, 'w')
                    self.plot(self.Input_X[200:len(self.Input_X) - 2000],
                              self.vocals[:, 0][200:len(self.Input_X) - 2000],
                              self.ui.Vocals_GV, 'w')

                    self.ui.Play_Music.clicked.connect(
                        lambda: self.Play_Wav(self.music))
                    self.ui.Play_vocals.clicked.connect(
                        lambda: self.Play_Wav(self.vocals))

                if ext == ".csv":

                    ECG_data = pd.read_csv(f)
                    self.Ecg = [data for data in ECG_data.ECG]
                    self.Ecg = np.array(self.Ecg)
                    x = np.arange(0, len(self.Ecg))
                    self.plot(x, self.Ecg, self.ui.Original_GV, 'r')

                    self.split_ECG(self.Ecg)

    def Import_ECG(self):
        filePaths = QtWidgets.QFileDialog.getOpenFileNames(
            self, 'Multiple File', "~/Desktop", '*')
        for filePath in filePaths:
            for f in filePath:
                if f == "*" or f == None:
                    break
                ext = os.path.splitext(f)[-1].lower()  # Check file extension
                if ext == ".csv":
                    ECG_data = pd.read_csv(f)
                    self.Ecg = [data for data in ECG_data.ECG]
                    self.Ecg = np.array(self.Ecg)
                    x = np.arange(0, len(self.Ecg))
                    self.plot(x, self.Ecg, self.ui.Original_ECG, 'r')

                    self.split_ECG(self.Ecg)

#----------------------------------------------------------------------------------------------------------------

    def ReadFromWav(self, file):
        (freq, sig) = wav.read(file)
        sig = (sig.astype(np.float32)) / 100000
        return (sig, freq)

    #------------------------------------------------------------------------------------------------------------------------

    def split_ECG(self, ecg):
        Data, phase = librosa.magphase(librosa.stft(ecg))
        Filter = librosa.decompose.nn_filter(
            Data,
            aggregate=np.median,
            metric='cosine',
            width=int(librosa.time_to_frames(2, sr=self.Sample_Rate)))
        Filter = np.minimum(Data, Filter)
        margin_i, margin_v = 2, 10
        power = 2
        mask_i = librosa.util.softmask(Filter,
                                       margin_i * (Data - Filter),
                                       power=power)
        mask_v = librosa.util.softmask(Data - Filter,
                                       margin_v * Filter,
                                       power=power)

        pure_arrhythmia = (mask_v * Data) * phase
        pure_ECG = (mask_i * Data) * phase

        arrhythmia = librosa.istft(pure_arrhythmia)
        ECG = librosa.istft(pure_ECG) * 1.5

        x_A = np.arange(0, len(arrhythmia))
        x_E = np.arange(0, len(ECG))

        # pure ECG
        self.plot(x_E, ECG, self.ui.ECG, 'w')
        # Pure arrhythmia
        self.plot(x_A, arrhythmia, self.ui.Arrhythmia, 'w')

    def split(self, WavData):
        splitted = self.separator.separate(WavData)
        Music = (splitted.get('accompaniment'))
        Vocals = (splitted.get('vocals'))
        return Vocals, Music

    #------------------------------------------------------------------------------------------------------------------------
    def Play_Wav(self, array):
        if len(self.Input_Y) != 0:
            sd.play(array)
        else:
            pass

    def Stop(self):
        sd.stop()

    #------------------------------------------------------------------------------------------------------------------------

    def plot(self, x, y, gv, color):
        gv.clear()
        gv.plotItem.getViewBox().setRange(xRange=x, yRange=y)
        gv.plot(x, y, pen=color)

    #------------------------------------------------------------------------------------------------------------------------
    def Save_music(self, arr):
        if len(self.music) > 0:
            write("outputs/music.wav", 44100, arr)

    def Save_vocals(self, arr):
        if len(self.vocals) > 0:
            write("outputs/vocals.wav", 44100, arr)
Example #25
class SpleeterSeparator:
    """Performs source separation using Spleeter API."""
    def __init__(self, config=None):
        """Default constructor.

        :param config: Separator config, defaults to None
        """
        if config is None:
            self.audio_bitrate = '256k'
            self.audio_format = 'mp3'
            self.sample_rate = 44100
            self.spleeter_stem = 'config/4stems-16kHz.json'
        else:
            self.audio_bitrate = config['audio_bitrate']
            self.audio_format = config['audio_format']
            self.sample_rate = config['sample_rate']
            self.spleeter_stem = config['spleeter_stem']
        # Use librosa backend as it is less memory intensive
        self.separator = Separator(self.spleeter_stem,
                                   stft_backend='librosa',
                                   multiprocess=False)
        self.audio_adapter = get_default_audio_adapter()

    def create_static_mix(self, parts, input_path, output_path):
        """Creates a static mix by performing source separation and adding the
           parts to be kept into a single track.

        :param parts: List of parts to keep ('vocals', 'drums', 'bass', 'other')
        :param input_path: Path to source file
        :param output_path: Path to output file
        :raises e: FFMPEG error
        """
        waveform, _ = self.audio_adapter.load(input_path,
                                              sample_rate=self.sample_rate)
        prediction = self.separator.separate(waveform)
        out = np.zeros_like(prediction['vocals'])
        part_count = 0

        # Add up parts that were requested
        for key in prediction:
            if parts[key]:
                out += prediction[key]
                part_count += 1

        self.audio_adapter.save(output_path, out, self.separator._sample_rate,
                                self.audio_format, self.audio_bitrate)

    def separate_into_parts(self, input_path, output_path):
        """Creates a dynamic mix by exporting each separated stem to its own file.

        :param input_path: Path to source file
        :param output_path: Path to output directory
        """
        self.separator.separate_to_file(input_path,
                                        output_path,
                                        self.audio_adapter,
                                        codec='mp3',
                                        bitrate=self.audio_bitrate,
                                        filename_format='{instrument}.{codec}',
                                        synchronous=False)
        self.separator.join(600)
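
`separate_into_parts` starts the per-stem export asynchronously and then blocks in `join` for up to 600 seconds until the background workers finish. A minimal usage sketch (paths are hypothetical):

separator = SpleeterSeparator()
# Writes vocals.mp3, drums.mp3, bass.mp3 and other.mp3 into the output directory.
separator.separate_into_parts('uploads/song.mp3', 'exports/song/')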
Example #26
# import zipfile
# import uuid

import os

from flask import Flask, flash, request, redirect, url_for, send_from_directory

from spleeter.separator import Separator
from spleeter.audio.adapter import AudioAdapter

app = Flask(__name__)
if __name__ == '__main__':

    separator = Separator('spleeter:5stems')

    audio_loader = AudioAdapter.default()
    sample_rate = 44100
    waveform, _ = audio_loader.load("audio/Never Catch Me.mp3",
                                    sample_rate=sample_rate)

    prediction = separator.separate(waveform, audio_descriptor='')
    print(prediction)

    for instrument, data in prediction.items():
        audio_loader.save(os.path.join("output", f'{instrument}.mp3'), data,
                          sample_rate, 'mp3', '128k')

# ALLOWED_EXTENSIONS = ['mp3', 'wav']

# virtualenv E:\Code\spleeter-back-end\venv -p C:/Users/aidan/AppData/Local/Programs/Python/Python38/python.exe
# virtualenv --python=E:\Code\spleeter-back-end\venv C:/Users/aidan/AppData/Local/Programs/Python/Python38/python.exe

# .\venv\Scripts\activate