Example 1
    def stoi(self, filepath, clean_filepath=None):
        # filepath = path to mashup
        # Needs Octave and the octave-signal package installed.
        # Use "pip install oct2py" to install the Python-Octave bridge.
        # STOI assumes:
        # * a sampling rate of 10 kHz (resamples otherwise)
        # * a window length of 384 ms
        # * 15 one-third octave bands over the full frequency range
        # * overlapping segments with a Hann window
        # * silent frames are removed
        import librosa
        from oct2py import octave
        if clean_filepath is None:
            # No clean file given:
            # derive the processed and clean audio from the mashup.
            vocal_isolation = VocalIsolation(config)
            vocal_isolation.loadWeights(config.weights)
            audio, sampleRate = conversion.load_audio_file(filepath)
            spectrogram = conversion.audio_file_to_spectrogram(
                audio, fftWindowSize=config.fft,
                learn_phase=self.config.learn_phase)

            normalizer = Normalizer()
            normalize = normalizer.get(both=False)
            denormalize = normalizer.get_reverse()

            # normalize
            spectrogram, norm = normalize(spectrogram)

            info = vocal_isolation.process_spectrogram(spectrogram,
                                                       config.get_channels())
            spectrogram, new_spectrogram = info
            # de-normalize
            new_spectrogram = denormalize(new_spectrogram, norm)

            processed = conversion.spectrogram_to_audio_file(new_spectrogram,
                                                             config.fft,
                                                             config.phase_iterations)

            clean_filepath = filepath.replace("_all.wav", "_vocal.wav")
            clean, sampling_rate = librosa.load(clean_filepath)
        else:
            # A clean file is given.
            # Compare it with the processed audio.
            processed, sampling_rate = librosa.load(filepath)
            clean, sampling_rate = librosa.load(clean_filepath)

        # Make sure the clean and processed audio have the same length
        clean = clean[:processed.shape[0]]

        octave.eval("pkg load signal")
        d = octave.stoi(clean, processed, sampling_rate)
        self._write("stoi: %f" % d)
    def process_spectrogram(self, spectrogram, channels=1):
        chopper = Chopper()
        chopper.name = "infer"
        chopper.params = "{'scale': %d}" % self.config.inference_slice
        chop = chopper.get(both=False)

        slices = chop(spectrogram)

        normalizer = Normalizer()
        normalize = normalizer.get(both=False)
        denormalize = normalizer.get_reverse()

        new_spectrogram = np.zeros((spectrogram.shape[0], 0, channels))
        for current_slice in slices:
            # normalize
            current_slice, norm = normalize(current_slice)

            expanded_spectrogram = conversion.expand_to_grid(
                current_slice, self.peakDownscaleFactor, channels)
            expanded_spectrogram_with_batch_and_channels = \
                expanded_spectrogram[np.newaxis, :, :]

            predicted_spectrogram_with_batch_and_channels = self.model.predict(
                expanded_spectrogram_with_batch_and_channels)
            # drop the batch dimension from the prediction
            predicted_spectrogram = \
                predicted_spectrogram_with_batch_and_channels[0, :, :, :]
            # crop away the padding introduced by expand_to_grid
            local_spectrogram = predicted_spectrogram[:current_slice.shape[0],
                                                      :current_slice.shape[1], :]

            # de-normalize
            local_spectrogram = denormalize(local_spectrogram, norm)

            new_spectrogram = np.concatenate(
                (new_spectrogram, local_spectrogram), axis=1)
        console.log("Processed spectrogram")
        return spectrogram, new_spectrogram
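
The conversion.expand_to_grid helper is not shown in these examples; below is a plausible sketch that zero-pads both spectrogram axes up to the next multiple of the network's downscale factor so pooling layers divide the input evenly. This is an assumption about its behavior, not the project's actual implementation:

import numpy as np

def expand_to_grid(spectrogram, grid_size, channels):
    # Round each axis up to the next multiple of grid_size (hypothetical
    # reimplementation; the cropping step in the loop above removes
    # exactly this padding).
    rows = int(np.ceil(spectrogram.shape[0] / grid_size)) * grid_size
    cols = int(np.ceil(spectrogram.shape[1] / grid_size)) * grid_size
    expanded = np.zeros((rows, cols, channels), dtype=spectrogram.dtype)
    expanded[:spectrogram.shape[0], :spectrogram.shape[1], :] = spectrogram
    return expanded
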
Example 3
    def volume(self, filepath):
        normalizer = Normalizer()
        normalize = normalizer.get(both=False)
        denormalize = normalizer.get_reverse()

        vocal_file = filepath.replace("_all.wav", "_vocal.wav")
        instrumental_file = filepath.replace("_all.wav", "_instrumental.wav")

        vocal_isolation = VocalIsolation(config)
        vocal_isolation.loadWeights(config.weights)

        instrumental_audio, _ = conversion.load_audio_file(instrumental_file)
        vocal_audio, _ = conversion.load_audio_file(vocal_file)

        instrumental = conversion.audio_file_to_spectrogram(
            instrumental_audio, fftWindowSize=config.fft,
            learn_phase=self.config.learn_phase)
        vocal = conversion.audio_file_to_spectrogram(
            vocal_audio, fftWindowSize=config.fft,
            learn_phase=self.config.learn_phase)

        if not os.path.exists(self.analysisPath):
            os.mkdir(self.analysisPath)
        h5f_path = os.path.join(self.analysisPath,
                                "volume.hdf5")
        h5file = h5py.File(h5f_path, "w")

        # Sweep the vocal/instrumental ratio from 1/100 up to 100,
        # symmetric on a log scale around 1.
        ratio = 100
        x = [i/ratio for i in range(1, ratio)] + \
            [1] + \
            [ratio/i for i in range(ratio-1, 0, -1)]
        h5file.create_dataset(name="x", data=x)

        print("Unscaled original mix")
        mashup, norm = normalize(instrumental + vocal)
        info = vocal_isolation.process_spectrogram(mashup,
                                                   config.get_channels())
        new_spectrogram = denormalize(info[1], norm)
        mse = ((new_spectrogram - vocal)**2).mean()
        y = [mse for _ in x]
        plt.loglog(x, y, label="baseline")
        h5file.create_dataset(name="baseline", data=y)

        # Rescale the vocal so its peak amplitude matches the instrumental's.
        original_ratio = np.max(vocal)/np.max(instrumental)
        print("Original ratio: %s" % original_ratio)
        vocal /= original_ratio

        print("Change vocal volume")
        y = []
        for i in x:
            mashup, norm = normalize(instrumental + i * vocal)
            info = vocal_isolation.process_spectrogram(mashup,
                                                       config.get_channels())
            new_spectrogram = denormalize(info[1], norm)
            if i != 0:
                new_spectrogram = new_spectrogram / i

            mse = ((new_spectrogram - vocal)**2).mean()
            y.append(mse)
            print(mse)
        plt.loglog(x, y, label="scaled")

        plt.xlabel("vocal/instrumental")
        plt.ylabel("mean squared error")
        plt.legend()

        h5file.create_dataset(name="scale", data=y)
        h5file.close()
        plt.savefig(os.path.join(self.analysisPath, "volume.png"))
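
Because the sweep results are persisted to volume.hdf5, the curves can be re-plotted without rerunning the model. A short sketch, assuming the dataset names created above and a hypothetical analysis/ output path:

import h5py
import matplotlib.pyplot as plt

with h5py.File("analysis/volume.hdf5", "r") as h5file:  # path is an assumption
    x = h5file["x"][:]
    plt.loglog(x, h5file["baseline"][:], label="baseline")
    plt.loglog(x, h5file["scale"][:], label="scaled")
plt.xlabel("vocal/instrumental")
plt.ylabel("mean squared error")
plt.legend()
plt.savefig("volume_replot.png")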