def parse_audio_w2l2(self, audio_path):
    """Load an audio file and return its globally normalised MFCC features.

    The waveform is loaded with random augmentation when ``self.augment``
    is set, otherwise it is loaded as-is. If a noise injector is
    configured, noise is added with probability ``self.noise_prob``.
    A 13-coefficient MFCC transform (26 filters, 512-point FFT) is then
    applied on the GPU, and the result is shifted and scaled by the mean
    and standard deviation taken over the whole feature tensor.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The ``.data`` of the normalised MFCC tensor, i.e. without
        autograd history.
    """
    if not self.augment:
        waveform = load_audio(audio_path)
    else:
        augmented = load_randomly_augmented_audio(
            audio_path, self.sample_rate, w2l2=True)
        waveform = augmented.astype(np.float32)

    # The Bernoulli draw only happens when an injector is configured
    # (short-circuit), so random-state consumption is unchanged.
    if self.noiseInjector and np.random.binomial(1, self.noise_prob):
        waveform = self.noiseInjector.inject_noise(waveform)

    fft_size = 512
    extractor = pytorch_mfcc.MFCC(
        samplerate=self.sample_rate,
        winlen=self.window_size,
        winstep=self.window_stride,
        numcep=13,
        nfilt=26,
        nfft=fft_size,
        lowfreq=0,
        highfreq=None,
        preemph=0,
        ceplifter=22,
        appendEnergy=False,
    ).cuda()

    features = extractor(Variable(waveform, requires_grad=True))

    # Both statistics are taken from the raw features before centring.
    feature_mean = features.mean()
    feature_std = features.std()
    normalised = (features - feature_mean) / feature_std
    return normalised.data
def attack(self, iterations, target_path, lr, bandwidth):
    """Run the iterative proportional-clipping attack on ``self.audio_path``.

    Each iteration computes MFCC features of the current signal, runs the
    model, decodes a hypothesis and compares it (case-insensitively) with
    the target transcript. On a match after at least one update step the
    signal and a figure are saved and the loop stops. Otherwise the CTC
    loss against the target is back-propagated to the signal, a gradient
    step of size ``lr`` is taken, and every sample is clipped so that its
    ratio to the original sample stays in
    ``[1 - bandwidth, 1 + bandwidth]``.

    Args:
        iterations: Maximum number of attack iterations.
        target_path: Path of a text file whose first line is the target
            transcript. Only read when ``args.target`` is empty; when
            ``args.target`` is set, that string is used as the target and
            is also passed to ``self.save`` in place of this path.
        lr: Step size multiplied with the signal gradient.
        bandwidth: Maximum relative deviation of each sample from the
            original signal.

    Returns:
        None. Results are written through ``self.save`` and
        ``self.save_figure``; progress is printed to stdout.

    Raises:
        OSError: If a transcript file cannot be opened.
        IndexError: If a transcript file is empty, or if
            ``self.orig_path`` does not contain ``'wav'``.
    """
    model = WaveToLetter.load_model(args.model_path)
    model = model.to(device)
    model.eval()  # inference mode; the original author notes stage 1 differs

    signal = self.get_signal(self.audio_path)
    orig = self.get_signal(self.orig_path)
    # The returned indices are not used below; the call is kept in case
    # attention() has side effects -- TODO confirm and drop if it is pure.
    index_max, index_min = self.attention(signal)
    start_attack_time = time.time()

    # ---- loop-invariant setup -------------------------------------------
    # The feature extractor has fixed parameters, so build it once.
    mfcc = pytorch_mfcc.MFCC(samplerate=self.sample_rate,
                             winlen=self.window_size,
                             winstep=self.window_stride,
                             numcep=13,
                             nfilt=26,
                             nfft=512,
                             lowfreq=0,
                             highfreq=None,
                             preemph=0,
                             ceplifter=22,
                             appendEnergy=False).cuda()

    # Reference transcript: the original path with the first 'wav'
    # occurrence replaced by 'txt' (text after a second occurrence is
    # dropped).
    path_parts = self.orig_path.split('wav')
    path = path_parts[0] + 'txt' + path_parts[1]
    with open(path) as fp:
        transcriptReal = fp.readlines()[0]

    # Target transcript: the command-line string wins over the file.
    if args.target:
        transcriptTarget = args.target
    else:
        with open(target_path) as fp:
            transcriptTarget = fp.readlines()[0]

    # Encode the target as label indices. Characters missing from
    # labels_map are dropped; note the truthiness test also drops a label
    # whose index is 0 (presumably the CTC blank -- verify against
    # labels_map).
    target = [
        label
        for label in (self.labels_map.get(ch)
                      for ch in transcriptTarget.upper())
        if label
    ]
    targets = torch.IntTensor(target)
    target_sizes = torch.IntTensor([len(target)])

    # ---- attack loop ----------------------------------------------------
    for i in range(iterations):
        print('Iteration:', str(i))

        mfccs = self.normalize(mfcc(signal))
        if args.printSilence:
            print("mfccs", mfccs)
        inputsMags = self.mfccs_to_inputs(mfccs)
        out = model(inputsMags)

        print("Ref:", transcriptReal.lower())

        seq_length = out.size(1)
        sizes = Variable(torch.Tensor([1.0]).mul_(int(seq_length)).int(),
                         requires_grad=False)
        if args.printSilence:
            print("out", out)
            print("softmax", F.softmax(out, dim=-1).data)

        decoded_output, _ = decoder.decode(
            F.softmax(out, dim=-1).data, sizes)
        transcript = decoded_output[0][0]
        print("Hyp:", transcript.lower())

        # The criterion is called with the first two dimensions swapped
        # relative to the model output.
        out = out.transpose(0, 1)

        print("Tar:", transcriptTarget.lower())

        # Success requires at least one update step (i > 0), so an input
        # that already decodes to the target is still perturbed once.
        if transcript.lower() == transcriptTarget.lower() and i > 0:
            if args.target:
                target_path = args.target
            save_path = self.save(signal, target_path, self.orig_path, lr,
                                  iterations, i, bandwidth)
            generate_time = time.time() - start_attack_time
            print('Time taken (s): {generate_time:.4f}\t'.format(
                generate_time=generate_time))
            self.save_figure(signal, save_path)
            break

        ctcloss = self.criterion(out, targets, sizes, target_sizes)
        loss = ctcloss
        print("loss:", loss)
        loss.backward()

        # Copy the gradient to NumPy and replace NaN entries with the
        # constant 10 so a single bad sample does not poison the update.
        grad = np.array(signal.grad)
        grad[np.isnan(grad)] = 10

        # Iterative proportional clipping: take the gradient step, then
        # clamp the ratio to the original signal and rescale.
        # NOTE(review): samples where orig == 0 divide by zero here and
        # can yield NaN (0/0) in the updated signal -- confirm whether
        # exact-zero samples occur in practice.
        perturbation = lr * torch.from_numpy(grad)
        signal_next_relative = torch.clamp(
            (signal.data - perturbation) / orig,
            min=1 - bandwidth,
            max=1 + bandwidth)
        signal.data = signal_next_relative.mul(orig)

        print("\n")
        signal.grad.data.zero_()

    print("Come to the end")