def __getitem__(self, idx):
    if torch.is_tensor(idx):
        # Convert tensor indices to plain Python scalars.
        idx = idx.tolist()
    audio_file_name = self.nsynth_meta_df.iloc[idx].note_str + '.wav'
    audio_pitch = self.nsynth_meta_df.iloc[idx].pitch
    audio_data, _ = librosa.load(os.path.join(self.audio_dir,
                                              audio_file_name),
                                 sr=self.sr)

    # Each "part" selects a window starting 0.25 s in, spaced 0.5 s apart
    # (at 16000 samples per second).
    mult = 0.25 + ((self.nsynth_meta_df.iloc[idx].part - 1) * 0.5)
    start_location = int(16000 * mult)

    audio_data = audio_data[start_location:start_location + self.sample_length]
    # n_fft is chosen so the STFT has exactly sample_length frequency bins
    # (n_fft // 2 + 1 == sample_length), matching the waveform along axis 0.
    audio_data_stft = librosa.stft(audio_data,
                                   n_fft=(self.sample_length - 1) * 2)
    audio_data = librosa.mu_compress(audio_data, quantize=False)
    audio_data = librosa.util.normalize(audio_data)
    # Stack the (scaled) waveform beside the STFT magnitude and phase.
    new_sample = np.concatenate([
        np.reshape(audio_data, (self.sample_length, 1)) * 2,
        np.abs(audio_data_stft),
        np.angle(audio_data_stft)
    ], axis=1)
    return new_sample, self.classes.index(audio_pitch)
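# The axis=1 concatenation above only works because n_fft makes the STFT's
# frequency axis equal to sample_length. A standalone shape check (the
# numbers here are illustrative, not from the source):
import numpy as np
import librosa

L = 1024                                  # stand-in for self.sample_length
x = np.random.randn(L).astype(np.float32)
S = librosa.stft(x, n_fft=(L - 1) * 2)    # n_fft // 2 + 1 == L frequency bins
features = np.concatenate([x.reshape(L, 1), np.abs(S), np.angle(S)], axis=1)
print(features.shape)                     # (L, 1 + 2 * n_frames)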
Example #2
def _mulaw_compression(wav):
    """Compress the waveform using mu-law compression."""
    # Reflect-pad by half a window, then trim so the length is a whole
    # number of hops.
    wav = np.pad(wav, (cfg.win_length // 2, ), mode="reflect")
    wav = wav[:((wav.shape[0] - cfg.win_length) // cfg.hop_length + 1) *
              cfg.hop_length]

    # librosa.mu_compress quantizes to [-2**(num_bits - 1), 2**(num_bits - 1) - 1];
    # the offset shifts the codes into [0, 2**num_bits - 1].
    wav = 2**(cfg.num_bits - 1) + librosa.mu_compress(wav,
                                                      mu=2**cfg.num_bits - 1)

    return wav
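# librosa ships the matching inverse, mu_expand. A minimal round-trip
# sketch (assuming cfg.num_bits == 8 and `wav` is a float waveform in
# [-1, 1]); the recovered signal is the padded/trimmed version of `wav`:
import librosa

num_bits = 8                                    # stand-in for cfg.num_bits
codes = _mulaw_compression(wav)                 # codes in [0, 2**num_bits - 1]
recovered = librosa.mu_expand(codes - 2**(num_bits - 1),
                              mu=2**num_bits - 1, quantize=True)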
Example #3
def getAllSamples(wavsDir, generatedWavsDir):
    realAudioList, generatedAudioList = [], []
    for wav, generatedWav in zip(os.listdir(wavsDir),
                                 os.listdir(generatedWavsDir)):
        # Load once to find the duration, then reload a random 2-second crop.
        audio, _ = librosa.load(os.path.join(wavsDir, wav), sr=24000)
        duration = librosa.get_duration(y=audio, sr=24000)
        offset = np.random.randint(0, int(duration) - 2)  # assumes clips > 3 s
        audio, _ = librosa.load(os.path.join(wavsDir, wav),
                                sr=24000,
                                offset=int(offset),
                                duration=2)
        # Default mu-law settings: mu=255, quantized to integers in [-128, 127].
        quantizedAudio = librosa.mu_compress(audio)
        realAudioList.append(quantizedAudio)
        fakeAudio, _ = librosa.load(os.path.join(generatedWavsDir, generatedWav),
                                    sr=24000)
        generatedAudioList.append(fakeAudio)
    return realAudioList, generatedAudioList
Example #4
def test_mu_law_correctness(quantization_channels):
    # test reconstruction (np.int is removed in recent NumPy; use int)
    mu_src = np.arange(0, quantization_channels).astype(int)
    src = KPB.mu_law_decoding(mu_src, quantization_channels=quantization_channels)
    mu_src_recon = KPB.mu_law_encoding(src, quantization_channels=quantization_channels)

    np.testing.assert_equal(mu_src, mu_src_recon)

    # test against librosa
    resol = 1 / (2 ** 16)
    src = np.arange(-1.0, 1.0, resol).astype(np.float32)
    mu = quantization_channels - 1
    # Rescale librosa's [-1, 1] mu-law output to integer codes in [0, mu].
    mu_src_ref = librosa.mu_compress(src, mu=mu, quantize=False)
    mu_src_ref = (mu_src_ref + 1.0) / 2.0 * mu + 0.5
    mu_src_ref = mu_src_ref.astype(int)

    mu_src_kapre = KPB.mu_law_encoding(
        tf.convert_to_tensor(src), quantization_channels=quantization_channels
    )
    np.testing.assert_equal(mu_src_ref, mu_src_kapre.numpy())
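# Standalone restatement of the mapping the test exercises: librosa's
# mu-law output in [-1, 1] rescaled to integer codes in [0, mu]
# (quantization_channels assumed to be 256 for illustration):
import numpy as np
import librosa

q = 256
mu = q - 1
x = np.linspace(-1.0, 1.0, 1000, dtype=np.float32)
compressed = librosa.mu_compress(x, mu=mu, quantize=False)  # values in [-1, 1]
codes = ((compressed + 1.0) / 2.0 * mu + 0.5).astype(int)   # codes in [0, mu]
assert 0 <= codes.min() and codes.max() <= mu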
Example #5
def getDataset(wavsDir, textDir):
    audioList, textList = [], []
    for wav in os.listdir(wavsDir):
        # Load once to find the duration, then reload a random 2-second crop.
        audio, _ = librosa.load(os.path.join(wavsDir, wav), sr=24000)
        duration = librosa.get_duration(y=audio, sr=24000)
        offset = np.random.randint(0, int(duration) - 2)  # assumes clips > 3 s
        audio, _ = librosa.load(os.path.join(wavsDir, wav),
                                sr=24000,
                                offset=int(offset),
                                duration=2)
        quantizedAudio = librosa.mu_compress(audio)
        # 2 s at 24 kHz is 48000 samples; shape as (batch, time, channels).
        quantizedAudio = tf.reshape(quantizedAudio[:48000], (1, 48000, 1))
        audioList.append(quantizedAudio)
        # Pair each wav with the transcript that shares its base name.
        textFile = wav.split('.')[0] + ".txt"
        with open(os.path.join(textDir, textFile), 'r',
                  encoding='utf-8') as f:
            content = f.read()
        textList.append(content)
    audioDataset = tf.concat(audioList, axis=0)
    textDataset = BERT_MODEL(textList)
    return audioDataset, textDataset
Example #6
def mulaw_encode(samples):
    # Rescale to -1.0..1.0, quantize to -128..127, then shift to 0..255.
    normalized = samples / peak_amplitude(samples)
    return (librosa.mu_compress(normalized, quantize=True) + 128).astype('uint8')
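# peak_amplitude is not defined in this snippet; a plausible stand-in
# (hypothetical, not from the source):
import numpy as np

def peak_amplitude(samples):
    # Largest absolute sample value, floored to avoid division by zero.
    return max(np.max(np.abs(samples)), 1e-9)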
Example #7
def mu_compress(wav, hop_length=200, frame_length=800, bits=10):
    # Reflect-pad by half a frame, then trim to a whole number of hops.
    wav = np.pad(wav, (frame_length // 2, ), mode="reflect")
    wav = wav[:((wav.shape[0] - frame_length) // hop_length + 1) * hop_length]
    # Quantize with mu = 2**bits - 1 and shift codes into [0, 2**bits - 1].
    wav = 2**(bits - 1) + librosa.mu_compress(wav, mu=2**bits - 1)
    return wav
Example #8
def mu_compress(x: np.ndarray, p):
    "Mu-law compress from C, W in [-1., 1.] to C, W in [-1., 1.]"
    return librosa.mu_compress(x, mu=p.n_classes - 1, quantize=False)
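# The inverse is symmetric. A minimal companion sketch (not from the
# source), using librosa.mu_expand with the same p.n_classes convention:
def mu_expand(x: np.ndarray, p):
    "Mu-law expand from C, W in [-1., 1.] back to C, W in [-1., 1.]"
    return librosa.mu_expand(x, mu=p.n_classes - 1, quantize=False)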
Example #9
    def __init__(self,
                 track_list,
                 x_len,
                 y_len=1,
                 bitrate=16,
                 twos_comp=True,
                 num_classes=256,
                 store_tracks=False,
                 encoder=None,
                 is_scalar=False,
                 sample_step=2000,
                 gc=None,
                 class_label=None):
        self.data = []
        self.tracks = []
        self.receptive_field = x_len
        self.y_len = y_len
        self.num_channels = 1
        self.num_classes = num_classes
        self.bitrate = bitrate
        self.datarange = (-2**(bitrate - 1), 2**(bitrate - 1) - 1)
        self.twos_comp = twos_comp
        self.is_scalar = is_scalar
        self.data_len = 0
        self.sample_step = sample_step
        self.tracks_buckets = {}
        self.class_label = class_label
        self.gc = gc

        if encoder is None:
            encoder = MuEncoder(self.datarange)
        self.encoder = encoder  # keep a supplied encoder instead of dropping it
        if not track_list:
            raise FileNotFoundError(
                'The data directory contains no file with postfix .wav')
        self.data_root = os.path.dirname(track_list[0])

        self.track_list = track_list
        store_tracks = True  # NOTE: forces storage, overriding the store_tracks argument
        for idx, track in enumerate(track_list):
            audio, dtype, sample_rate = self._load_audio_from_wav(track)
            audio = self.trim_silence(audio)
            audio = audio.reshape(-1, 1)
            # Mu-law compress (default mu=255) and shift codes into [0, 255].
            audio = librosa.mu_compress(audio) + 128

            if store_tracks:
                self.tracks.append({
                    'name': track,
                    'audio': audio,
                    'sample_rate': sample_rate
                })
            # NOTE: segments are sampled without overlap, so a larger
            # receptive field (x_len) yields fewer training examples.
            for i in range(0,
                           len(audio) - self.receptive_field - y_len,
                           self.receptive_field):
                # Store only (file_idx, start_idx) pairs; segments are cut later.
                self.data.append({
                    'file_idx': idx,
                    'start_idx': i,
                })

        self.dtype = np.dtype('int16')
        self.sample_rate = 16000
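    # A minimal companion sketch (not from the source): how the (file_idx,
    # start_idx) entries indexed above could be cut into (x, y) pairs,
    # assuming tracks were stored.
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        audio = self.tracks[entry['file_idx']]['audio']
        start = entry['start_idx']
        # Input window followed immediately by the y_len target samples.
        x = audio[start:start + self.receptive_field]
        y = audio[start + self.receptive_field:
                  start + self.receptive_field + self.y_len]
        return x, y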