def forward(self, p_vects, q_vects, p_frames_mask, q_frames_mask):
    '''
    p/q_vects = [num_speakers X num_feats X max_num_mfcc_frames x mfcc_dim]
    p/q_frames_mask = [num_speakers X num_feats X max_num_mfcc_frames x mfcc_dim]
                      -> The associated 0s and 1s mask of p/q_lengths
    n.b. mfcc_dim = 13 usually (using c0 for energy instead of log-energy)
         num_feats = 47*48*0.5 = 1128 usually
         max_num_mfcc_frames = the maximum number of frames associated with a
                               particular phone for any speaker -> often set to 4000
    '''
    # Apply the attack
    noise = torch.exp(self.noise_root)

    # Need to add spectral noise
    # Pad to spectral dimension
    padding = torch.zeros(p_vects.size(0), p_vects.size(1), p_vects.size(2),
                          self.spectral_dim - self.mfcc_dim)
    padded_p_vects = torch.cat((p_vects, padding), 3)
    padded_q_vects = torch.cat((q_vects, padding), 3)

    # Apply inverse dct
    log_spectral_p = dct.idct(padded_p_vects)
    log_spectral_q = dct.idct(padded_q_vects)

    # Apply inverse log
    spectral_p = torch.exp(log_spectral_p)
    spectral_q = torch.exp(log_spectral_q)

    # Restructure noise
    noise_struct = noise.unsqueeze(1).unsqueeze(1).repeat(
        1, p_vects.size(1), p_vects.size(2), 1)

    # Add the adversarial attack noise
    attacked_spectral_p = spectral_p + noise_struct
    attacked_spectral_q = spectral_q + noise_struct

    # Apply the log
    attacked_log_spectral_p = torch.log(attacked_spectral_p)
    attacked_log_spectral_q = torch.log(attacked_spectral_q)

    # Apply the dct
    attacked_padded_p = dct.dct(attacked_log_spectral_p)
    attacked_padded_q = dct.dct(attacked_log_spectral_q)

    # Truncate to mfcc dimension
    p_vects_attacked = torch.narrow(attacked_padded_p, 3, 0, self.mfcc_dim)
    q_vects_attacked = torch.narrow(attacked_padded_q, 3, 0, self.mfcc_dim)

    # Apply mask of zeros/ones, to ensure spectral noise is only applied up to p/q lengths
    p_vects_masked = p_vects_attacked * p_frames_mask
    q_vects_masked = q_vects_attacked * q_frames_mask

    # Pass through the trained model
    trained_model = torch.load(self.trained_model_path)
    trained_model.eval()
    y = trained_model(p_vects_masked, q_vects_masked, p_frames_mask, q_frames_mask)
    return y
def attack_cov(self, means, covs, means_atck, noise):
    '''
    This update is derived from a log normal approximation of a shifted
    log normal distribution
    '''
    step1 = dct.idct(covs)
    step2 = torch.transpose(dct.idct(torch.transpose(step1, -1, -2)), -1, -2)
    step3 = torch.diagonal(step2, offset=0, dim1=-2, dim2=-1)
    step4 = dct.idct(means) + (step3 * 0.5)
    padding = torch.zeros(means.size(0), means.size(1),
                          self.spectral_dim - self.mfcc_dim).to(self.device)
    padded_step4 = torch.cat((step4, padding), 2)
    step5 = torch.exp(padded_step4) + noise
    step6 = torch.log(step5) * 2
    step6_trunc = torch.narrow(step6, 2, 0, self.mfcc_dim)
    step7 = step6_trunc - (2 * dct.idct(means_atck))
    step8 = torch.diag_embed(step7)
    step9 = dct.dct(step8)
    step10 = torch.transpose(dct.dct(torch.transpose(step9, -1, -2)), -1, -2)

    # Make sure no negative diagonal values
    step11 = torch.diag_embed(
        torch.clamp(torch.diagonal(step10, offset=0, dim1=-2, dim2=-1), min=1.0))

    stepa = torch.diagonal(covs, offset=0, dim1=-2, dim2=-1)
    stepb = torch.diag_embed(stepa)

    # attacked_covs = covs - stepb + step11
    # Have to neglect off-diagonal terms to ensure covariance matrices are positive definite
    attacked_covs = step11

    noised = attacked_covs + (1e-2 * torch.eye(13).to(self.device))
    return noised
def forward(self, p_means, p_covariances, q_means, q_covariances, num_phones_mask):
    '''
    p/q_means = [num_speakers X num_feats X mfcc_dim]
    p/q_covariances = [num_speakers X num_feats X mfcc_dim X mfcc_dim]
    num_phones_mask = [num_speakers X num_feats], with a 0 corresponding to a position
                      that should be -1 (no phones observed) and a 1 everywhere else.
    n.b. num_feats = 47*48*0.5 = 1128 usually, where 47 = num_phones
    '''
    noise = torch.exp(self.noise_root)

    # Need to add spectral noise with a first-order Taylor approximation
    # Pad to spectral dimension
    padding = torch.zeros(p_means.size(0), p_means.size(1),
                          self.spectral_dim - self.mfcc_dim)
    padded_p_means = torch.cat((p_means, padding), 2)
    padded_q_means = torch.cat((q_means, padding), 2)

    # Apply inverse dct
    log_spectral_p = dct.idct(padded_p_means)
    log_spectral_q = dct.idct(padded_q_means)

    # Apply inverse log
    spectral_p = torch.exp(log_spectral_p)
    spectral_q = torch.exp(log_spectral_q)

    # Hadamard division with the spectral noise
    attacked_spectral_p = noise / spectral_p
    attacked_spectral_q = noise / spectral_q

    # Apply the dct
    attacked_padded_p = dct.dct(attacked_spectral_p)
    attacked_padded_q = dct.dct(attacked_spectral_q)

    # Truncate to mfcc dimension
    p_means_attacked_second_term = torch.narrow(attacked_padded_p, 2, 0, self.mfcc_dim)
    q_means_attacked_second_term = torch.narrow(attacked_padded_q, 2, 0, self.mfcc_dim)

    # Combine the Taylor expansion terms
    p_means_attacked = p_means + p_means_attacked_second_term
    q_means_attacked = q_means + q_means_attacked_second_term

    # Pass through the trained model
    trained_model = torch.load(self.trained_model_path)
    trained_model.eval()
    y = trained_model(p_means_attacked, p_covariances, q_means_attacked,
                      q_covariances, num_phones_mask)
    return y
def forward(self, p_means, p_covariances, q_means, q_covariances, num_phones_mask):
    '''
    p/q_means = [num_speakers X num_feats X mfcc_dim]
    p/q_covariances = [num_speakers X num_feats X mfcc_dim X mfcc_dim]
    num_phones_mask = [num_speakers X num_feats], with a 0 corresponding to a position
                      that should be -1 (no phones observed) and a 1 everywhere else.
    n.b. num_feats = 47*48*0.5 = 1128 usually, where 47 = num_phones
    '''
    noise = torch.exp(self.noise_root)

    # Need to add spectral noise
    # Pad to spectral dimension
    padding = torch.zeros(p_means.size(0), p_means.size(1),
                          self.spectral_dim - self.mfcc_dim)
    padded_p_means = torch.cat((p_means, padding), 2)
    padded_q_means = torch.cat((q_means, padding), 2)

    # Apply inverse dct
    log_spectral_p = dct.idct(padded_p_means)
    log_spectral_q = dct.idct(padded_q_means)

    # Apply inverse log
    spectral_p = torch.exp(log_spectral_p)
    spectral_q = torch.exp(log_spectral_q)

    # Add the adversarial attack noise
    attacked_spectral_p = spectral_p + noise
    attacked_spectral_q = spectral_q + noise

    # Apply the log
    attacked_log_spectral_p = torch.log(attacked_spectral_p)
    attacked_log_spectral_q = torch.log(attacked_spectral_q)

    # Apply the dct
    attacked_padded_p = dct.dct(attacked_log_spectral_p)
    attacked_padded_q = dct.dct(attacked_log_spectral_q)

    # Truncate to mfcc dimension
    p_means_attacked = torch.narrow(attacked_padded_p, 2, 0, self.mfcc_dim)
    q_means_attacked = torch.narrow(attacked_padded_q, 2, 0, self.mfcc_dim)

    # Pass through the trained model
    trained_model = torch.load(self.trained_model_path)
    trained_model.eval()
    y = trained_model(p_means_attacked, p_covariances, q_means_attacked,
                      q_covariances, num_phones_mask)
    return y
def attack_mean(self, means, noise):
    # Need to add spectral noise
    # Pad to spectral dimension
    padding = torch.zeros(means.size(0), means.size(1),
                          self.spectral_dim - self.mfcc_dim).to(self.device)
    padded_means = torch.cat((means, padding), 2)

    # Apply inverse dct
    log_spectral = dct.idct(padded_means)

    # Apply inverse log
    spectral = torch.exp(log_spectral)

    # Add the adversarial attack noise
    attacked_spectral = spectral + noise

    # Apply the log
    attacked_log_spectral = torch.log(attacked_spectral)

    # Apply the dct
    attacked_padded = dct.dct(attacked_log_spectral)

    # Truncate to mfcc dimension
    means_attacked = torch.narrow(attacked_padded, 2, 0, self.mfcc_dim)

    return means_attacked
def spectral_attack(X, attack):
    X = torch.from_numpy(X).float()
    X_sq = X.squeeze()
    attack = torch.from_numpy(attack).float()

    # Add the attack in the spectral space
    # Pad to spectral dimension
    padding = torch.zeros(attack.size(0) - X_sq.size(0))
    padded_X = torch.cat((X_sq, padding))

    # Apply inverse dct
    log_spectral_X = dct.idct(padded_X)

    # Apply inverse log
    spectral_X = torch.exp(log_spectral_X)

    # Add the adversarial attack
    attacked_spectral_X = spectral_X + attack

    # Get back to the mfcc domain
    attacked_log_spectral_X = torch.log(attacked_spectral_X)
    attacked_padded_X = dct.dct(attacked_log_spectral_X)
    X_attacked = torch.narrow(attacked_padded_X, 0, 0, X_sq.size(0))

    X_attacked = X_attacked.detach().numpy()
    return X_attacked
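# A hedged usage sketch for spectral_attack() above; the shapes are illustrative
# assumptions: a single 13-dimensional MFCC vector and an attack vector defined in a
# 24-bin (log-)spectral space. A zero attack should return the original features up
# to numerical error, since the idct/exp and log/dct steps then cancel.
import numpy as np

mfcc = np.random.randn(13).astype(np.float32)    # hypothetical MFCC frame
attack = np.zeros(24, dtype=np.float32)          # hypothetical spectral attack (no-op)
mfcc_attacked = spectral_attack(mfcc, attack)
assert mfcc_attacked.shape == (13,)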
def test_idct():
    for norm in [None, 'ortho']:
        for N in [5, 2, 32, 111]:
            x = np.random.normal(size=(1, N))
            X = dct.dct(torch.tensor(x), norm=norm)
            y = dct.idct(X, norm=norm).numpy()
            assert np.abs(x - y).max() < EPS, x
def attack(self, samples, noise):
    '''
    Perform attack in the spectral space
    '''
    # Pad to spectral dimension
    padding = torch.zeros(samples.size(0), samples.size(1), samples.size(2),
                          self.spectral_dim - self.mfcc_dim).to(self.device)
    padded_samples = torch.cat((samples, padding), 3)

    # Apply inverse dct
    log_spectral = dct.idct(padded_samples)

    # Apply inverse log
    spectral = torch.exp(log_spectral)

    # Add the adversarial attack noise
    attacked_spectral = spectral + noise

    # Apply the log
    attacked_log_spectral = torch.log(attacked_spectral)

    # Apply the dct
    attacked_padded = dct.dct(attacked_log_spectral)

    # Truncate to mfcc dimension
    samples_attacked = torch.narrow(attacked_padded, 3, 0, self.mfcc_dim)

    return samples_attacked
def forward(self, x):
    # Reconstruct the dense weight matrix from its (truncated) DCT-domain parameters
    filt = dct.idct(
        F.pad(self.weight, (0, self.index.size(0) - self.weight.size(1))),
        norm='ortho')
    filt = filt[:, self.index.long()]
    filt = torch.reshape(filt, (self.no, self.ni))
    x = F.linear(x, filt, bias=self.bias)
    return x
def h_func_dct(lateral_slice):
    l, m, n = lateral_slice.shape
    # Transform the lateral slice to the DCT domain and apply a softmax to each tube
    dct_slice = dct.dct(lateral_slice)
    tubes = [dct_slice[i, :, 0] for i in range(l)]
    h_tubes = []
    for tube in tubes:
        h_tubes.append(torch.exp(tube) / torch.sum(torch.exp(tube)))
    res_slice = torch.stack(h_tubes, dim=0).reshape(l, m, n)
    # Back to the original domain, then sum over the first dimension
    idct_a = dct.idct(res_slice)
    return torch.sum(idct_a, dim=0)
def forward(self, x: torch.Tensor) -> torch.Tensor:
    # Note: use the module's `training` flag; `self.train` is the train() method and
    # is always truthy, so the original check could never take the eval branch.
    if self.training:
        x_flat = x.view([-1, np.prod(x.size()[1:])])
        x_dct = dct.dct(x_flat)
        r = self.greater_mask(x_dct)
        b = self.bernoulli_mask(x_flat.shape)
        # Keep the coefficients selected by r; randomly keep the rest according to b
        y_dct = x_dct * r + x_dct * ~r * b
        y = dct.idct(y_dct)
        y = y.view(x.size())
        return y
    else:
        return x
def forward(self, input):
    """
    This is the fully manual implementation of the forward and backward passes
    via the torch.autograd.Function.

    :param input: the input map (e.g., an image)
    :return: the result of 2D convolution
    """
    # ctx, input, filter, bias, padding = (0, 0), stride = (1, 1),
    # args = None, out_size = None, is_manual = tensor([0]),
    # conv_index = None
    filter = self.weight
    # N - number of input maps (or images in the batch).
    # C - number of input channels.
    # H - height of the input map (e.g., height of an image).
    # W - width of the input map (e.g., width of an image).
    N, C, H, W = input.size()
    # F - number of filters.
    # C - number of channels in each filter.
    # HH - the height of the filter.
    # WW - the width of the filter (its length).
    F, C, HH, WW = filter.size()

    pad_filter_H = H - HH
    pad_filter_W = W - WW
    filter = torch_pad(filter, (0, pad_filter_W, 0, pad_filter_H), 'constant', 0)

    input = dct(input)
    filter = dct(filter)

    # permute from N, C, H, W to H, W, N, C
    input = input.permute(2, 3, 0, 1)
    # permute from F, C, H, W to H, W, C, F
    filter = filter.permute(2, 3, 1, 0)
    result = torch.matmul(input, filter)
    # permute from H, W, N, F to N, F, H, W
    result = result.permute(2, 3, 0, 1)
    result = idct(result)

    out_H, out_W = self.out_HW(H, W, HH, WW)
    result = result[..., :out_H, :out_W]

    if self.bias is not None:
        # Add the bias term for each filter (it has to be unsqueezed to
        # the dimension of the out to properly sum up the values).
        unsqueezed_bias = self.bias.unsqueeze(-1).unsqueeze(-1)
        result += unsqueezed_bias

    if (self.stride_H != 1 or self.stride_W != 1) and (
            self.stride_type is StrideType.STANDARD):
        result = result[:, :, ::self.stride_H, ::self.stride_W]

    return result
def reset_parameters(self):
    # initialise using the dct function
    I = torch.eye(self.N)
    if self.cuda:
        I = I.cuda()
    if self.type == 'dct1':
        self.weight.data = dct.dct1(I).data.t()
    elif self.type == 'idct1':
        self.weight.data = dct.idct1(I).data.t()
    elif self.type == 'dct':
        self.weight.data = dct.dct(I, norm=self.norm).data.t()
    elif self.type == 'idct':
        self.weight.data = dct.idct(I, norm=self.norm).data.t()
    self.weight.requires_grad = False  # don't learn this!
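# A minimal check of the initialisation above (assuming torch_dct is importable as
# dct): a linear layer whose weight is dct.dct(I, norm='ortho').t() reproduces the
# functional transform, since F.linear(x, W) = x @ W.t().
import torch
import torch.nn.functional as F
import torch_dct as dct

N = 16
x = torch.randn(3, N)
W = dct.dct(torch.eye(N), norm='ortho').t()
assert torch.allclose(F.linear(x, W), dct.dct(x, norm='ortho'), atol=1e-5)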
def forward(self, x):
    n, d = x.size()
    x = self.A * x   # first diagonal matrix
    x = self.pack(x)
    x = dct.dct(x)   # forward DCT
    x = self.unpack(x)
    x = self.D * x   # second diagonal matrix
    x = self.pack(x)
    x = self.riffle(x)
    x = dct.idct(x)  # inverse DCT
    x = self.unpack(x)
    if self.bias is not None:
        return x + self.bias
    else:
        return x
def reset_parameters(self):
    super(LinearACDC, self).reset_parameters()
    # this is probably not a good way to do this
    if 'A' not in self.__dict__.keys():
        self.A = nn.Parameter(torch.Tensor(self.out_features, 1))
        self.D = nn.Parameter(torch.Tensor(self.out_features, 1))
    self.A.data.normal_(1., 1e-2)
    self.D.data.normal_(1., 1e-2)
    # need to have DCT matrices stored for speed
    # they have to be Parameters so they'll be
    N = self.out_features
    self.dct = dct.dct(torch.eye(N))
    self.idct = dct.idct(torch.eye(N))
    # remove weight Parameter
    del self.weight
def forward(self, x):
    # Reconstruct the convolution filters from their (truncated) DCT-domain parameters
    filt = dct.idct(
        F.pad(self.weight, (0, self.index.size(0) - self.weight.size(1))),
        norm='ortho')
    filt = filt[:, self.index.long()]
    filt = torch.reshape(
        filt, (self.no, self.ni, self.kernel_size, self.kernel_size))
    x = F.conv2d(x, filt, bias=self.bias, stride=self.stride,
                 padding=self.padding, groups=self.groups)
    return x
def reset_parameters(self):
    super(ConvACDC, self).reset_parameters()
    # this is probably not a good way to do this
    assert self.kernel_size[0] == self.kernel_size[1], "%s" % self.kernel_size
    N = self.out_channels * self.kernel_size[0]
    if 'A' not in self.__dict__.keys():
        self.A = nn.Parameter(torch.Tensor(N, 1))
        self.D = nn.Parameter(torch.Tensor(N, 1))
    self.A.data.normal_(1., 1e-2)
    self.D.data.normal_(1., 1e-2)
    # initialise DCT matrices
    self.dct = dct.dct(torch.eye(N))
    self.idct = dct.idct(torch.eye(N))
    # remove weight Parameter
    del self.weight
def spectral_convert(X, num_channels):
    X = torch.from_numpy(X).float()
    X_sq = X.squeeze()

    # Pad to spectral dimension
    padding = torch.zeros(num_channels - X_sq.size(0))
    padded_X = torch.cat((X_sq, padding))

    # Apply inverse dct
    log_spectral_X = dct.idct(padded_X)

    # Apply inverse log
    spectral_X = torch.exp(log_spectral_X)

    # Convert back to numpy
    spectral_X = spectral_X.detach().numpy()
    return spectral_X
def isdct_torch(dcts, *, frame_step, frame_length=None, window=torch.hamming_window):
    """Compute the Inverse Short-Time Discrete Cosine Transform of `dcts`.

    Parameters other than `dcts` are keyword-only.

    Parameters
    ----------
    dcts : DCT matrix/matrices from `sdct_torch`
    frame_step : Number of samples between adjacent DCT columns (should be the
        same value that was passed to `sdct_torch`).
    frame_length : Ignored. Window length and DCT frame length in samples.
        Can be None (default) or the same value as passed to `sdct_torch`.
    window : Window to use for DCT. Either a window tensor (see documentation for
        `torch.stft`), or a window tensor constructor,
        `window(frame_length) -> Tensor`. Default: hamming window.

    Returns
    -------
    signals : Time-domain signal(s) reconstructed from `dcts`, a
        `[..., n_samples]` tensor. Note that `n_samples` may be different from the
        original signals' lengths as passed to `sdct_torch`, because no padding is
        applied.
    """
    *_, frame_length2, n_frames = dcts.shape
    assert frame_length in {None, frame_length2}
    signals = torch_overlap_add(
        torch_dct.idct(dcts.transpose(-1, -2), norm="ortho").transpose(-1, -2),
        frame_step=frame_step,
    )
    if callable(window):
        window = window(frame_length2).to(signals)
    if window is not None:
        window_frames = window[:, None].expand(-1, n_frames)
        window_signal = torch_overlap_add(window_frames, frame_step=frame_step)
        signals = signals / window_signal
    return signals
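# A hedged round-trip sketch: `sdct_torch` is referenced by the docstring above but
# is not shown here, so the exact call below (argument names and values) is an
# assumption; the frame settings are illustrative only.
import torch

signals = torch.randn(2, 16000)                                  # e.g. 1 s at 16 kHz
dcts = sdct_torch(signals, frame_length=1024, frame_step=256)    # assumed signature
recon = isdct_torch(dcts, frame_step=256, frame_length=1024)
# recon may differ in length from signals, since no padding is applied.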
def test_cuda():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')

        for N in [2, 5, 32, 111]:
            x = np.random.normal(size=(1, N))
            ref = fftpack.dct(x, type=1)
            act = dct.dct1(torch.tensor(x, device=device)).cpu().numpy()
            assert np.abs(ref - act).max() < EPS, ref

        for d in [2, 3, 4]:
            x = np.random.normal(size=(2,) * d)
            ref = fftpack.dct(x, type=1)
            act = dct.dct1(torch.tensor(x, device=device)).cpu().numpy()
            assert np.abs(ref - act).max() < EPS, ref

        for norm in [None, 'ortho']:
            for N in [2, 3, 5, 32, 111]:
                x = np.random.normal(size=(1, N))
                ref = fftpack.dct(x, type=2, norm=norm)
                act = dct.dct(torch.tensor(x, device=device), norm=norm).cpu().numpy()
                assert np.abs(ref - act).max() < EPS, (norm, N)

            for d in [2, 3, 4, 11]:
                x = np.random.normal(size=(2,) * d)
                ref = fftpack.dct(x, type=2, norm=norm)
                act = dct.dct(torch.tensor(x, device=device), norm=norm).cpu().numpy()
                assert np.abs(ref - act).max() < EPS, (norm, d)

            for N in [5, 2, 32, 111]:
                x = np.random.normal(size=(1, N))
                X = dct.dct(torch.tensor(x, device=device), norm=norm)
                y = dct.idct(X, norm=norm).cpu().numpy()
                assert np.abs(x - y).max() < EPS, x
def t_product_multiprocess(A, B):
    tmp = torch_mp.get_context('spawn')
    assert (A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1])

    # DCT along the first (tube) dimension of both tensors
    dct_A = torch.transpose(dct.dct(torch.transpose(A, 0, 2)), 0, 2)
    dct_B = torch.transpose(dct.dct(torch.transpose(B, 0, 2)), 0, 2)
    dct_C = torch.zeros(A.shape[0], A.shape[1], B.shape[2])
    # dct_A.share_memory_()
    # dct_B.share_memory_()
    # dct_C.share_memory_()

    # One process per frontal slice
    processes = []
    # num_cores = torch_mp.cpu_count()
    for i in range(dct_C.shape[0]):
        p = tmp.Process(target=t_product_slice, args=(dct_A, dct_B, dct_C, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Inverse DCT along the first dimension to recover the product
    C = torch.transpose(dct.idct(torch.transpose(dct_C, 0, 2)), 0, 2)
    return C
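# The worker t_product_slice is not shown in this snippet; a hedged sketch of what it
# presumably does is below: multiply one pair of frontal slices in the DCT domain and
# write the result into the output tensor (which is why the commented share_memory_()
# calls above would be needed for spawned processes to see the writes).
import torch

def t_product_slice(dct_A, dct_B, dct_C, i):
    # (A.shape[1] x A.shape[2]) @ (B.shape[1] x B.shape[2]) -> (A.shape[1] x B.shape[2])
    dct_C[i] = torch.mm(dct_A[i], dct_B[i])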
def image_idct(dct_x):
    """Inverts image_dct(), by performing a type-III DCT."""
    dct_x = torch.as_tensor(dct_x)
    dct_y = torch_dct.idct(torch.transpose(dct_x, 1, 2), norm='ortho')
    image = torch_dct.idct(torch.transpose(dct_y, 1, 2), norm='ortho')
    return image
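# A hedged sketch of the forward transform image_dct() that the function above says
# it inverts; it is not shown in the source, so this is an assumption: a separable 2D
# type-II DCT applied as two 1D orthonormal DCTs over the last two axes of a
# [batch, height, width] tensor.
import torch
import torch_dct

def image_dct(image):
    image = torch.as_tensor(image)
    dct_y = torch_dct.dct(torch.transpose(image, 1, 2), norm='ortho')
    dct_x = torch_dct.dct(torch.transpose(dct_y, 1, 2), norm='ortho')
    return dct_x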
def forward(self, p_vects, q_vects, p_frames_mask, q_frames_mask, num_phones_mask):
    '''
    p/q_vects = [num_speakers X num_feats X max_num_mfcc_frames x mfcc_dim]
    p/q_lengths = [num_speakers X num_feats]
                  -> stores the number of observed frames associated with the
                     corresponding phone
    p/q_frames_mask = [num_speakers X num_feats X max_num_mfcc_frames x mfcc_dim]
                      -> The associated 0s and 1s mask of p/q_lengths
    num_phones_mask = [num_speakers X num_feats], with a 0 corresponding to a position
                      that should be -1 (no phones observed) and a 1 everywhere else.
    n.b. mfcc_dim = 13 usually (using c0 for energy instead of log-energy)
         num_feats = 47*48*0.5 = 1128 usually
         max_num_mfcc_frames = the maximum number of frames associated with a
                               particular phone for any speaker -> often set to 4000
    '''
    # Apply the attack
    noise = torch.exp(self.noise_root)

    # Need to add spectral noise
    # Pad to spectral dimension
    padding = torch.zeros(p_vects.size(0), p_vects.size(1), p_vects.size(2),
                          self.spectral_dim - self.mfcc_dim).to(self.device)
    padded_p_vects = torch.cat((p_vects, padding), 3)
    padded_q_vects = torch.cat((q_vects, padding), 3)

    # Apply inverse dct
    log_spectral_p = dct.idct(padded_p_vects)
    log_spectral_q = dct.idct(padded_q_vects)

    # Apply inverse log
    spectral_p = torch.exp(log_spectral_p)
    spectral_q = torch.exp(log_spectral_q)

    # Add the adversarial attack noise
    attacked_spectral_p = spectral_p + noise
    attacked_spectral_q = spectral_q + noise

    # Apply the log
    attacked_log_spectral_p = torch.log(attacked_spectral_p)
    attacked_log_spectral_q = torch.log(attacked_spectral_q)

    # Apply the dct
    attacked_padded_p = dct.dct(attacked_log_spectral_p)
    attacked_padded_q = dct.dct(attacked_log_spectral_q)

    # Truncate to mfcc dimension
    p_vects_attacked = torch.narrow(attacked_padded_p, 3, 0, self.mfcc_dim)
    q_vects_attacked = torch.narrow(attacked_padded_q, 3, 0, self.mfcc_dim)

    # Apply mask of zeros/ones, to ensure spectral noise is only applied up to p/q lengths
    p_vects_masked = p_vects_attacked * p_frames_mask
    q_vects_masked = q_vects_attacked * q_frames_mask

    # Compute the p/q means and covariance tensors
    p_means, p_covariances, q_means, q_covariances = self.get_pq_means_covs(
        p_vects_masked, q_vects_masked, p_frames_mask, q_frames_mask, num_phones_mask)

    # Add small noise to all covariance matrices to ensure they are non-singular
    p_covariances_noised = p_covariances + (1e-2 * torch.eye(13).to(self.device))
    q_covariances_noised = q_covariances + (1e-2 * torch.eye(13).to(self.device))

    # print(p_covariances_noised[0, 3, :, :])
    # print(q_covariances_noised[1, 4, :, :])

    # Pass through the trained model
    trained_model = torch.load(self.trained_model_path)
    trained_model.to(self.device)
    trained_model.eval()
    y = trained_model(p_means, p_covariances_noised, q_means, q_covariances_noised,
                      num_phones_mask)
    return y
import torch
import torch_dct as dct

x = torch.randn(200)
X = dct.dct(x)   # DCT-II done through the last dimension
y = dct.idct(X)  # scaled DCT-III done through the last dimension
assert (torch.abs(x - y)).sum() < 1e-10  # x == y within numerical tolerance
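# A small extension of the round-trip above to 2D data, assuming the dct_2d/idct_2d
# helpers exposed by torch_dct; the tensor shape and tolerance below are illustrative.
import torch
import torch_dct as dct

img = torch.randn(8, 32, 32)           # a batch of 2D maps
IMG = dct.dct_2d(img, norm='ortho')    # DCT-II over the last two dimensions
rec = dct.idct_2d(IMG, norm='ortho')   # inverse transform
assert torch.abs(img - rec).max() < 1e-5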