Code Example #1
    def parse_audio_w2l2(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(
                audio_path, self.sample_rate, w2l2=True).astype(np.float32)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = 512
        mfcc = pytorch_mfcc.MFCC(samplerate=self.sample_rate,
                                 winlen=self.window_size,
                                 winstep=self.window_stride,
                                 numcep=13,
                                 nfilt=26,
                                 nfft=n_fft,
                                 lowfreq=0,
                                 highfreq=None,
                                 preemph=0,
                                 ceplifter=22,
                                 appendEnergy=False).cuda()

        # Wrap the waveform as a tensor so gradients can flow back to the raw audio.
        y = Variable(torch.from_numpy(y), requires_grad=True)
        mfccs = mfcc(y)

        # Per-utterance cepstral mean/variance normalization.
        mean = mfccs.mean()
        std = mfccs.std()
        mfccs = (mfccs - mean) / std
        return mfccs.data
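The tail of parse_audio_w2l2 performs per-utterance cepstral mean/variance normalization on the MFCC matrix. A minimal standalone sketch of just that step (the helper name normalize_mfcc and the dummy tensor below are illustrative assumptions, not part of the original class):

import torch

def normalize_mfcc(mfccs: torch.Tensor) -> torch.Tensor:
    # Per-utterance cepstral mean/variance normalization, mirroring the
    # last lines of parse_audio_w2l2 above.
    mean = mfccs.mean()
    std = mfccs.std()
    return (mfccs - mean) / std

# Dummy (num_frames x 13) MFCC matrix, just to show the effect.
dummy = torch.randn(200, 13) * 5.0 + 3.0
normalized = normalize_mfcc(dummy)
print(normalized.mean().item(), normalized.std().item())  # roughly 0.0 and 1.0

After this step every utterance has zero-mean, unit-variance features, which keeps the acoustic model's input scale consistent across recordings.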
Code Example #2
    def attack(self, iterations, target_path, lr, bandwidth):
        flag = 0
        model = WaveToLetter.load_model(args.model_path)
        model = model.to(device)
        model.eval()  # switch to evaluation mode; this differs from stage 1
        signal = self.get_signal(self.audio_path)
        orig = self.get_signal(self.orig_path)

        index_max, index_min = self.attention(signal)
        start_attack_time = time.time()
        for i in range(iterations):
            print('Iteration:', str(i))
            # print(signal[index_max:index_max+20])
            # print(signal[index_max])

            # if args.printSilence:
            #     print()
            #     print(signal.shape)
            #     print("20:", signal[index_max:index_max+20])

            mfcc = pytorch_mfcc.MFCC(samplerate=self.sample_rate,
                                     winlen=self.window_size,
                                     winstep=self.window_stride,
                                     numcep=13,
                                     nfilt=26,
                                     nfft=512,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=0,
                                     ceplifter=22,
                                     appendEnergy=False).cuda()
            mfccs = mfcc(signal)
            mfccs = self.normalize(mfccs)

            if args.printSilence:
                print("mfccs", mfccs)
            inputsMags = self.mfccs_to_inputs(mfccs)
            out = model(inputsMags)

            # Derive the reference transcript path from the wav path.
            path = self.orig_path.split('wav')[0] + 'txt' + \
                self.orig_path.split('wav')[1]
            with open(path) as fp:
                transcriptReal = fp.readlines()[0]
            print("Ref:", transcriptReal.lower())

            seq_length = out.size(1)
            sizes = Variable(torch.IntTensor([seq_length]), requires_grad=False)

            if args.printSilence:
                print("out", out)
                print("softmax", F.softmax(out, dim=-1).data)

            decoded_output, _ = decoder.decode(
                F.softmax(out, dim=-1).data, sizes)
            transcript = decoded_output[0][0]
            print("Hyp:", transcript.lower())

            out = out.transpose(0, 1)
            if args.target:
                transcriptTarget = args.target
            else:
                with open(target_path) as fp:
                    transcriptTarget = fp.readlines()[0]
            print("Tar:", transcriptTarget.lower())

            if transcript.lower() == transcriptTarget.lower() and i > 0:
                if args.target:
                    target_path = args.target
                save_path = self.save(signal, target_path, self.orig_path, lr,
                                      iterations, i, bandwidth)
                generate_time = time.time() - start_attack_time
                print('Time taken (s): {generate_time:.4f}\t'.format(
                    generate_time=generate_time))
                self.save_figure(signal, save_path)
                break

            target = list(
                filter(None, [
                    self.labels_map.get(x)
                    for x in list(transcriptTarget.upper())
                ]))
            targets = torch.IntTensor(target)
            target_sizes = torch.IntTensor([len(target)])
            ctcloss = self.criterion(out, targets, sizes, target_sizes)
            # print("ctcloss:", ctcloss)
            # print("delta_2:", 100*torch.sum((signal - orig)**2))
            # loss = ctcloss + 100*torch.sum((signal - orig)**2)
            loss = ctcloss
            print("loss:", loss)

            loss.backward()

            # Replace NaN gradient entries with a large constant so a bad
            # sample does not break the update.
            grad = np.array(signal.grad)
            grad[np.isnan(grad)] = 10

            wer = decoder.wer(transcript.lower(),
                              transcriptTarget.lower()) / float(
                                  len(transcriptTarget.lower().split()))

            # the iterative proportional clipping method

            # print('grad:{}'.format(grad[index_max:index_max+20]))
            perturbation = lr * torch.from_numpy(grad)
            # print('perturbation', perturbation[index_max])
            signal_next_relative = torch.clamp(
                (signal.data - perturbation) / orig,
                min=1 - bandwidth,
                max=1 + bandwidth)
            # print("signal_next_relative1:", signal_next_relative[index_max])
            signal.data = signal_next_relative.mul(orig)
            # print(signal_next_relative[index_max]*orig[index_max])
            # print("signal.data:", signal.data[index_max])

            # if (i + 1) % 15000 == 0 and flag < 1:
            #     # anneal lr
            #     # lr *= 0.5
            #     lr = lr / args.learning_anneal
            #     flag += 1
            # print("wer", wer)
            # print("lr", lr)
            print("\n")
            signal.grad.data.zero_()
        print("Come to the end")