import librosa
import numpy as np
import torch

# load_audio, load_randomly_augmented_audio and spec_augment are helpers
# defined in / imported by the surrounding data-loader module.


def parse_audio(self, audio_path):
    # Load the waveform, optionally with random speed/volume perturbation.
    if self.aug_conf and self.aug_conf.speed_volume_perturb:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        y = load_audio(audio_path)

    # Inject noise into the waveform with probability noise_prob.
    if self.noise_injector:
        add_noise = np.random.binomial(1, self.aug_conf.noise_prob)
        if add_noise:
            y = self.noise_injector.inject_noise(y)

    # STFT parameters: e.g. n_fft = 320 and hop_length = 160 samples for a
    # 20 ms window and 10 ms stride at a 16 kHz sample rate.
    n_fft = int(self.sample_rate * self.window_size)
    win_length = n_fft
    hop_length = int(self.sample_rate * self.window_stride)

    # STFT, keeping only the magnitude spectrum.
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=self.window)
    spect, phase = librosa.magphase(D)

    # Compress the dynamic range: S = log(S + 1).
    spect = np.log1p(spect)
    spect = torch.FloatTensor(spect)

    # Normalize each utterance to zero arithmetic mean and unit standard
    # deviation.
    if self.normalize:
        mean = spect.mean()
        std = spect.std()
        if mean == 0 or std == 0:
            # std == 0 would make the division below produce NaN/Inf.
            print("nan nan")
        spect.add_(-mean)
        spect.div_(std)

    # Optionally apply SpecAugment masking to the spectrogram.
    if self.aug_conf and self.aug_conf.spec_augment:
        spect = spec_augment(spect)

    return spect
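# --- Illustrative only (not part of the loader) -----------------------------
# A minimal sketch of how the output shape of parse_audio follows from the
# STFT parameters, assuming a 16 kHz sample rate and the 20 ms window /
# 10 ms stride implied by the n_fft = 320 and hop_length = 160 values noted
# above. The helper name expected_spect_shape is hypothetical.
def expected_spect_shape(num_samples, sample_rate=16000,
                         window_size=0.02, window_stride=0.01):
    # librosa.stft pads with center=True by default, so the frame count is
    # 1 + num_samples // hop_length; the magnitude spectrum keeps
    # n_fft // 2 + 1 frequency bins.
    n_fft = int(sample_rate * window_size)         # 320
    hop_length = int(sample_rate * window_stride)  # 160
    freq_bins = n_fft // 2 + 1                     # 161
    frames = 1 + num_samples // hop_length
    return freq_bins, frames


# Example: a 1-second clip at 16 kHz -> (161, 101).
print(expected_spect_shape(16000))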
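# --- Illustrative only (not part of the loader) -----------------------------
# A self-contained sketch of the log1p + zero-mean/unit-std normalization
# step from parse_audio in isolation, showing why the std == 0 guard above
# matters: a constant (e.g. all-zero) spectrogram would divide by zero. The
# random input here is a stand-in for a real |STFT| magnitude.
mag = np.abs(np.random.randn(161, 101)).astype(np.float32)  # fake magnitudes
example = torch.FloatTensor(np.log1p(mag))
example_mean, example_std = example.mean(), example.std()
if example_std == 0:
    print("degenerate input: skipping normalization")  # mirrors the guard
else:
    example.add_(-example_mean)
    example.div_(example_std)
    # After normalization the utterance has ~zero mean and unit variance.
    assert abs(example.mean().item()) < 1e-4
    assert abs(example.std().item() - 1) < 1e-3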