def get_coefficients(self):
    ar, cr, a, b, c, d = self.term.get_coefficients()

    # Real components
    crd = cr * self.delta
    coeffs = [2 * ar * (torch.cosh(crd) - 1) / crd**2, cr]

    # Imaginary coefficients
    cd = c * self.delta
    dd = d * self.delta
    c2 = c**2
    d2 = d**2
    factor = 2.0 / (self.delta * (c2 + d2))**2
    cos_term = torch.cosh(cd) * torch.cos(dd) - 1
    sin_term = torch.sinh(cd) * torch.sin(dd)

    C1 = a * (c2 - d2) + 2 * b * c * d
    C2 = b * (c2 - d2) - 2 * a * c * d

    coeffs += [
        factor * (C1 * cos_term - C2 * sin_term),
        factor * (C2 * cos_term + C1 * sin_term),
        c,
        d,
    ]
    return coeffs
def double_soliton(x: torch.Tensor, t: torch.Tensor, c, x0):
    """Two-soliton solution of the KdV equation (u_t + u_{xxx} - 6 u u_x = 0)
    source: http://lie.math.brocku.ca/~sanco/solitons/kdv_solitons.php

    Args:
        x (Tensor): Input vector of spatial coordinates.
        t (Tensor): Input vector of temporal coordinates.
        c (Array): Array containing the velocities of the two solitons; note that c[0] > c[1].
        x0 (Array): Array containing the offsets of the two solitons.

    Returns:
        (Tensor, Tensor): Coordinates (t, x) and the solution u.
    """
    assert c[0] > c[1], "c[0] has to be bigger than c[1]"

    # Switch to the moving coordinate frame of each soliton
    xi0 = np.sqrt(c[0]) / 2 * (x - c[0] * t - x0[0])
    xi1 = np.sqrt(c[1]) / 2 * (x - c[1] * t - x0[1])

    part_1 = 2 * (c[0] - c[1])
    numerator = c[0] * torch.cosh(xi1)**2 + c[1] * torch.sinh(xi0)**2
    denominator_1 = (np.sqrt(c[0]) - np.sqrt(c[1])) * torch.cosh(xi0 + xi1)
    denominator_2 = (np.sqrt(c[0]) + np.sqrt(c[1])) * torch.cosh(xi0 - xi1)
    u = part_1 * numerator / (denominator_1 + denominator_2)**2

    coords = torch.cat((t.reshape(-1, 1), x.reshape(-1, 1)), dim=1)
    return coords, u.view(-1, 1)
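# Usage sketch (illustrative, not from the original source): sample the two-soliton
# field on a 1D grid at t = 0. The grid size, velocities (5.0, 1.0) and offsets
# (-5.0, 5.0) below are hypothetical choices, picked only to satisfy c[0] > c[1].
x_grid = torch.linspace(-10.0, 10.0, 256)
t_grid = torch.zeros_like(x_grid)
coords, u = double_soliton(x_grid, t_grid, c=(5.0, 1.0), x0=(-5.0, 5.0))
# coords has shape (256, 2) with columns (t, x); u has shape (256, 1)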
def forward(self):
    M = torch.eye(6)
    if self.K1 < 0:
        K1 = -self.K1
        flip = True
    else:
        K1 = self.K1
        flip = False

    k = torch.sqrt(K1)
    kl = self.L * k

    # Focusing plane: cos/sin block
    M[0, 0] = torch.cos(kl)
    M[0, 1] = torch.sin(kl) / k
    M[1, 0] = -k * torch.sin(kl)
    M[1, 1] = torch.cos(kl)

    # Defocusing plane: cosh/sinh block
    M[2, 2] = torch.cosh(kl)
    M[2, 3] = torch.sinh(kl) / k
    M[3, 2] = k * torch.sinh(kl)
    M[3, 3] = torch.cosh(kl)

    if flip:
        M = rot(-np.pi / 2) @ M @ rot(np.pi / 2)
    return M
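# Standalone sanity check (not part of the original class): the 2x2 focusing
# (cos/sin) and defocusing (cosh/sinh) blocks used above are symplectic, i.e.
# have unit determinant. K1_example and L_example are hypothetical values.
K1_example, L_example = torch.tensor(1.2), torch.tensor(0.5)
k_ex = torch.sqrt(K1_example)
kl_ex = L_example * k_ex
focus = torch.stack((torch.stack((torch.cos(kl_ex), torch.sin(kl_ex) / k_ex)),
                     torch.stack((-k_ex * torch.sin(kl_ex), torch.cos(kl_ex)))))
defocus = torch.stack((torch.stack((torch.cosh(kl_ex), torch.sinh(kl_ex) / k_ex)),
                       torch.stack((k_ex * torch.sinh(kl_ex), torch.cosh(kl_ex)))))
assert torch.allclose(torch.det(focus), torch.tensor(1.0))
assert torch.allclose(torch.det(defocus), torch.tensor(1.0))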
def generate_data_labels(ka_period_list, x0, y0, z0, b, dt, kd, kdmin):
    a = torch.from_numpy(ka_period_list[:, 0]).double().cpu()
    k = torch.from_numpy(ka_period_list[:, 1]).double().cpu()

    xp = torch.from_numpy(np.ones(360000) * x0).double().cpu()
    yp = torch.from_numpy(np.ones(360000) * y0).double().cpu()
    zp = torch.from_numpy(np.ones(360000) * z0).double().cpu()

    yokr = torch.from_numpy(np.zeros((360000, 1000))).double().cpu()
    l_bif = torch.from_numpy(np.zeros(360000)).long().cpu()
    l_okr = torch.from_numpy(np.zeros(360000)).long().cpu()

    # xml[0] = x0
    for i in range(1, kd + 1):
        xx = xp
        yy = yp
        zz = zp

        # Classic fourth-order Runge-Kutta step
        kx1 = a * torch.log(b * torch.cosh(yy)) - a * torch.log(xx + b)
        ky1 = k * yy - zz - (xx + b) * torch.tanh(yy)
        kz1 = (2 * k + 1) * yy - 2 * zz
        x1 = xx + (dt * kx1) / 2
        y1 = yy + (dt * ky1) / 2
        z1 = zz + (dt * kz1) / 2

        kx2 = a * torch.log(b * torch.cosh(y1)) - a * torch.log(x1 + b)
        ky2 = k * y1 - z1 - (x1 + b) * torch.tanh(y1)
        kz2 = (2 * k + 1) * y1 - 2 * z1
        x2 = xx + (dt * kx2) / 2
        y2 = yy + (dt * ky2) / 2
        z2 = zz + (dt * kz2) / 2

        kx3 = a * torch.log(b * torch.cosh(y2)) - a * torch.log(x2 + b)
        ky3 = k * y2 - z2 - (x2 + b) * torch.tanh(y2)
        kz3 = (2 * k + 1) * y2 - 2 * z2
        x3 = xx + (dt * kx3)
        y3 = yy + (dt * ky3)
        z3 = zz + (dt * kz3)

        kx4 = a * torch.log(b * torch.cosh(y3)) - a * torch.log(x3 + b)
        ky4 = k * y3 - z3 - (x3 + b) * torch.tanh(y3)
        kz4 = (2 * k + 1) * y3 - 2 * z3

        xp = xx + ((kx1 + 2 * kx2 + 2 * kx3 + kx4) / 6) * dt
        yp = yy + ((ky1 + 2 * ky2 + 2 * ky3 + ky4) / 6) * dt
        zp = zz + ((kz1 + 2 * kz2 + 2 * kz3 + kz4) / 6) * dt
        # xml[k] = xp

        if i >= kdmin:
            # When y crosses 1 between steps, record the linearly interpolated
            # (x - 1) value of the crossing, if it is positive
            r00 = (yy - 1.0) * (yp - 1.0)
            y00 = (xx - 1.0) - (yy - 1.0) * (xp - xx) / (yp - yy)
            condition = (r00 <= 0) & (y00 > 0)
            yokr[condition, l_bif[condition]] = y00[condition]
            l_bif[condition] += 1

    max_l_bif = max(l_bif).item()
    for i in range(0, max_l_bif):
        cond_i = l_bif == i + 1
        if yokr[cond_i, 0:i + 1].nelement() != 0:
            cond_i_idx = torch.where(cond_i)[0]
            periods, indices = yokr[cond_i, 0:i + 1].sort(descending=True)
            l_okr[cond_i] = 1
            tmp = periods[:, 0]
            for j in range(1, i + 1):
                cond_abs = torch.abs(periods[:, j] - tmp) >= 0.01
                cond_abs_idx = cond_i_idx[cond_abs]
                tmp[cond_abs] = periods[cond_abs, j]
                l_okr[cond_abs_idx] += 1

    return a, k, l_okr
def train_epoch(self, t, x, y, thres_cosh=50, thres_emb=6):
    self.model.train()

    r = np.arange(x.size(0))
    np.random.shuffle(r)
    r = torch.LongTensor(r).cuda()

    # Loop batches
    for i in range(0, len(r), self.sbatch):
        if i + self.sbatch <= len(r):
            b = r[i:i + self.sbatch]
        else:
            b = r[i:]
        # images=torch.autograd.Variable(x[b],volatile=False)
        # targets=torch.autograd.Variable(y[b],volatile=False)
        # task=torch.autograd.Variable(torch.LongTensor([t]).cuda(),volatile=False)
        images = torch.autograd.Variable(x[b])
        targets = torch.autograd.Variable(y[b])
        task = torch.autograd.Variable(torch.LongTensor([t]).cuda())
        s = (self.smax - 1 / self.smax) * i / len(r) + 1 / self.smax

        # Forward
        output, masks = self.model.forward(task, images, s=s)
        if self.split:
            output = output[t]
        loss, _ = self.criterion(output, targets, masks)

        # Backward
        self.optimizer.zero_grad()
        loss.backward()

        # Restrict layer gradients in backprop
        if t > 0:
            for n, p in self.model.named_parameters():
                if n in self.mask_back:
                    p.grad.data *= self.mask_back[n]

        # Compensate embedding gradients
        for n, p in self.model.named_parameters():
            if n.startswith('e'):
                num = torch.cosh(
                    torch.clamp(s * p.data, -thres_cosh, thres_cosh)) + 1
                den = torch.cosh(p.data) + 1
                p.grad.data *= self.smax / s * num / den

        # Apply step
        if args.optimizer == 'SGD' or args.optimizer == 'SGD_momentum_decay':
            torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clipgrad)
        self.optimizer.step()

        # Constrain embeddings
        for n, p in self.model.named_parameters():
            if n.startswith('e'):
                p.data = torch.clamp(p.data, -thres_emb, thres_emb)

        #print(masks[-1].data.view(1,-1))
        #if i>=5*self.sbatch: sys.exit()
        #if i==0: print(masks[-2].data.view(1,-1),masks[-2].data.max(),masks[-2].data.min())
        #print(masks[-2].data.view(1,-1))

    return
def effective_energy(self, sample, frozen_nodes, tree_order, tree_hierarchy):
    h = sample.matmul(self.J[frozen_nodes, :]) + self.h
    tree_energy = torch.zeros(sample.shape[0], device=self.device, dtype=sample.dtype)
    tree = torch.from_numpy(np.array(tree_order)).to(self.device)

    for layer in tree_hierarchy:
        index_matrix = torch.zeros(len(layer), 2, dtype=torch.int64, device=self.device)
        index_matrix[:, 0] = torch.arange(len(layer))
        if len(self.J[layer][:, tree].nonzero()) != 0:
            index_matrix.index_copy_(0,
                                     self.J[layer][:, tree].nonzero()[:, 0],
                                     self.J[layer][:, tree].nonzero())
        index = index_matrix[:, 1]
        root = tree[index]
        hpj = self.J[layer, root] + h[:, layer]
        hmj = -self.J[layer, root] + h[:, layer]
        tree_energy += -torch.log(
            2 * (torch.cosh(self.beta * hpj) *
                 torch.cosh(self.beta * hmj)).sqrt()).sum(dim=1) / self.beta
        for k in range(len(root)):
            h[:, root[k]] += torch.log(
                torch.cosh(self.beta * hpj) /
                torch.cosh(self.beta * hmj))[:, k] / (2 * self.beta)
        tree = tree[len(layer):]

    batch = sample.shape[0]
    assert sample.shape[1] == len(frozen_nodes)
    J = self.J[frozen_nodes][:, frozen_nodes].to_sparse()
    fvs_energy = -torch.bmm(
        sample.view(batch, 1, len(frozen_nodes)),
        torch.sparse.mm(J, sample.t()).t().view(batch, len(frozen_nodes), 1)
    ).reshape(batch) / 2
    fvs_energy -= sample @ self.h[frozen_nodes]
    energy = fvs_energy + tree_energy
    return self.beta * energy
def calc_loss(y_hat, y_cuda, mag_hat, batch_size=20, scale_by_freq=None,
              l1_lambda=2e-5, reg_logcosh=False):
    # Reconstruction term plus regularization -> slightly less wiggly waveform
    #loss = logcosh(y_hat, y_cuda) + 1e-5*torch.abs(mag_hat).mean()
    # loss = logcosh(y_hat, y_cuda) + 2e-5*torch.abs(mag_hat).mean()
    #print("y_hat.dtype, y_cuda.dtype, mag_hat.dtype, scale_by_freq.dtype =",y_hat.dtype, y_cuda.dtype, mag_hat.dtype, scale_by_freq.dtype)
    if not reg_logcosh:
        # Second term is an L1 regularization to help 'damp' high-freq noise
        if scale_by_freq is None:
            loss = logcosh(y_hat, y_cuda) + l1_lambda * torch.abs(mag_hat).mean()
        else:
            loss = logcosh(y_hat, y_cuda) + l1_lambda / 10 * torch.abs(
                mag_hat * scale_by_freq).mean()
    else:
        # Second term is a log-cosh regularization to help 'damp' high-freq noise
        if scale_by_freq is None:
            loss = logcosh(y_hat, y_cuda) + l1_lambda * torch.mean(
                torch.log(torch.cosh(mag_hat)))
        else:
            loss = logcosh(y_hat, y_cuda) + l1_lambda / 10 * torch.mean(
                scale_by_freq * torch.log(torch.cosh(mag_hat)))
    return loss
def expm(self, p, d_p, lr=None, out=None, normalize=False):
    """Exponential map for hyperboloid"""
    if out is None:
        out = p

    if d_p.is_sparse:
        ix, d_val = d_p._indices().squeeze(), d_p._values()
        p_val = self.normalize(p.index_select(0, ix))
        ldv = self.ldot(d_val, d_val, keepdim=True)
        if self.debug:
            assert all(ldv > 0), "Tangent norm must be greater 0"
            assert all(ldv == ldv), "Tangent norm includes NaNs"
        nd_p = ldv.clamp_(min=0).sqrt_()
        t = th.clamp(nd_p, max=self.norm_clip)
        nd_p.clamp_(min=self.eps)
        newp = (th.cosh(t) * p_val).addcdiv_(th.sinh(t) * d_val, nd_p)
        if normalize:
            newp = self.normalize(newp)
        p.index_copy_(0, ix, newp)
    else:
        if lr is not None:
            d_p.narrow(-1, 0, 1).mul_(-1)
            d_p.addcmul_((self.ldot(p, d_p, keepdim=True)).expand_as(p), p)
            d_p.mul_(-lr)
        ldv = self.ldot(d_p, d_p, keepdim=True)
        if self.debug:
            assert all(ldv > 0), "Tangent norm must be greater 0"
            assert all(ldv == ldv), "Tangent norm includes NaNs"
        nd_p = ldv.clamp_(min=0).sqrt_()
        t = th.clamp(nd_p, max=self.norm_clip)
        nd_p.clamp_(min=self.eps)
        newp = (th.cosh(t) * p).addcdiv_(th.sinh(t) * d_p, nd_p)
        if normalize:
            newp = self.normalize(newp)
        p.copy_(newp)
def log_cosh(pred, truth, sample_weight=None):
    ey_t = truth - pred
    if sample_weight is not None:
        return torch.mean(torch.log(torch.cosh(ey_t + 1e-12)) * sample_weight)
    else:
        return torch.mean(torch.log(torch.cosh(ey_t + 1e-12)))
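# Usage sketch (illustrative): log-cosh behaves like 0.5 * err**2 for small
# residuals and like |err| - log(2) for large ones. The tensors below are
# hypothetical example inputs.
pred = torch.tensor([0.1, 2.0, -3.0])
truth = torch.tensor([0.0, 0.0, 0.0])
loss_unweighted = log_cosh(pred, truth)
loss_weighted = log_cosh(pred, truth, sample_weight=torch.tensor([1.0, 0.5, 0.5]))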
def compute_hmds(data=None, distance_matrix=None, model='poincare', dimensions=2):
    """
    Compute h-MDS following the paper "Representation Tradeoffs for Hyperbolic
    Embeddings" (Algorithm 2). It is the closest analogue to PCA in hyperbolic space.

    model: hyperbolic model the data lives in, used to compute distances.
    If a distance matrix is provided, data is not used; the distances are the same
    independently of the hyperbolic model used.
    """
    assert data is not None or distance_matrix is not None, 'We have to obtain the data somehow'

    # Compute distance matrix, and then Y = cosh(d)
    if distance_matrix is None:
        n = data.shape[0]
        x = data.unsqueeze(1).expand(n, n, data.shape[1]).contiguous().view(-1, data.shape[1])
        y = data.unsqueeze(0).expand(n, n, data.shape[1]).contiguous().view(-1, data.shape[1])
        if model == 'poincare':
            distance_matrix = gmath.dist(x=x, y=y, k=torch.tensor(-1.0))
            Y = torch.cosh(distance_matrix)
        else:  # model == 'hyperboloid'
            Y = hyperboloid_distance(x, y)
    else:
        n = distance_matrix.shape[0]
        Y = torch.cosh(distance_matrix)
    Y = Y.view(n, n).detach()

    pca = PCA(n_components=dimensions, svd_solver='full')
    data_hyperboloid_reduced = pca.fit_transform(-Y.cpu().numpy())
    x0 = np.sqrt((data_hyperboloid_reduced**2).sum(axis=-1, keepdims=True) + 1)
    data_hyperboloid_reduced = np.concatenate([data_hyperboloid_reduced, x0], axis=-1)
    return data_hyperboloid_reduced
def backward(self, grad_output, grad_output_mean):
    # STE part
    input, = self.saved_tensors
    grad_input = grad_output.clone()
    grad_input = (2 / torch.cosh(input)) * (2 / torch.cosh(input)) * (grad_input)
    #grad_input[input.ge(1)] = 0   # greater or equal
    #grad_input[input.le(-1)] = 0  # less or equal
    return grad_input
def get_value(self, tau0):
    dt = self.delta
    ar, cr, a, b, c, d = self.term.get_coefficients()

    # Format the lags correctly
    tau0 = torch.abs(as_tensor(tau0))
    tau = tau0[..., None]

    # Precompute some factors
    dpt = dt + tau
    dmt = dt - tau

    # Real parts:
    # tau > Delta
    crd = cr * dt
    cosh = torch.cosh(crd)
    norm = 2 * ar / crd**2
    K_large = torch.sum(norm * (cosh - 1) * torch.exp(-cr * tau), axis=-1)

    # tau < Delta
    crdmt = cr * dmt
    K_small = K_large + torch.sum(norm * (crdmt - torch.sinh(crdmt)), axis=-1)

    # Complex part
    cd = c * dt
    dd = d * dt
    c2 = c**2
    d2 = d**2
    c2pd2 = c2 + d2
    C1 = a * (c2 - d2) + 2 * b * c * d
    C2 = b * (c2 - d2) - 2 * a * c * d
    norm = 1.0 / (dt * c2pd2)**2
    k0 = torch.exp(-c * tau)
    cdt = torch.cos(d * tau)
    sdt = torch.sin(d * tau)

    # For tau > Delta
    cos_term = 2 * (torch.cosh(cd) * torch.cos(dd) - 1)
    sin_term = 2 * (torch.sinh(cd) * torch.sin(dd))
    factor = k0 * norm
    K_large += torch.sum((C1 * cos_term - C2 * sin_term) * factor * cdt, axis=-1)
    K_large += torch.sum((C2 * cos_term + C1 * sin_term) * factor * sdt, axis=-1)

    # tau < Delta
    edmt = torch.exp(-c * dmt)
    edpt = torch.exp(-c * dpt)
    cos_term = (edmt * torch.cos(d * dmt) + edpt * torch.cos(d * dpt) - 2 * k0 * cdt)
    sin_term = (edmt * torch.sin(d * dmt) + edpt * torch.sin(d * dpt) - 2 * k0 * sdt)
    K_small += torch.sum(2 * (a * c + b * d) * c2pd2 * dmt * norm, axis=-1)
    K_small += torch.sum((C1 * cos_term + C2 * sin_term) * norm, axis=-1)

    mask = tau0 >= dt
    return K_large * mask + K_small * (~mask)
def forward(self, gaze_gt, hp_gt, gaze_pred, hp_pred):
    # weight_hp = torch.sum(1 - torch.cos(hp_gt - hp_pred), dim=1)
    # cos_distant = torch.sum(1 - torch.cos(gaze_gt - gaze_pred), dim=1)
    hp_loss = torch.sum(torch.log(torch.cosh(hp_gt - hp_pred)), dim=1)
    # L1_gaze = torch.sum(torch.abs(gaze_gt - gaze_pred), dim=1)
    gaze_loss = torch.sum(torch.log(torch.cosh(gaze_gt - gaze_pred)), dim=1)
    # return torch.mean(0.25* L2_hp_weight + (L2_gaze))
    return torch.mean(0.2 * hp_loss + gaze_loss)
def step(self, model, mask_back, t, s=None, thres_cosh=None, smax=None,
         clipgrad=None, finetune=False, closure=None):
    """Performs a single optimization step.

    Constrains both the joint-objective gradient and the weight-decay gradient.
    Momentum is disregarded, as cancelled-out neurons don't build up momentum anyway.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        dampening = group['dampening']
        nesterov = group['nesterov']

        for p, (modp_name, modp) in zip(group['params'], model.named_parameters()):
            assert modp is p
            if p.grad is None:
                continue
            d_p = p.grad.data
            if weight_decay != 0:
                if 'embs' not in modp_name:  # Don't decay embedding params, doesn't train
                    d_p.add_(weight_decay, p.data)

            # Constrain grad
            if t > 0:
                # Restrict layer gradients in backprop: a^{<t}
                if modp_name in mask_back:
                    # See before: stored as (1 - x) with prev task masks
                    p.grad.data *= mask_back[modp_name]

            # Compensate embedding gradients
            if not finetune:
                if 'embs' in modp_name:
                    num = torch.cosh(torch.clamp(s * p.data, -thres_cosh, thres_cosh)) + 1
                    den = torch.cosh(p.data) + 1
                    p.grad.data *= smax / s * num / den

            # Clip
            torch.nn.utils.clip_grad_norm(p, clipgrad)

            # Leave momentum as is
            if momentum != 0:
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                else:
                    buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(1 - dampening, d_p)
                if nesterov:
                    d_p = d_p.add(momentum, buf)
                else:
                    d_p = buf

            p.data.add_(-group['lr'], d_p)

    return loss
def train_epoch(self, t, data, iter_bar, which_type):
    self.model.train()
    # Loop batches
    for step, batch in enumerate(iter_bar):
        batch = [
            bat.to(self.device) if bat is not None else None for bat in batch
        ]
        input_ids, segment_ids, input_mask, targets, _ = batch
        s = (self.smax - 1 / self.smax) * step / len(data) + 1 / self.smax
        task = torch.autograd.Variable(torch.LongTensor([t]).cuda(), volatile=True)

        # Forward
        outputs = self.model.forward(task, input_ids, segment_ids, input_mask,
                                     which_type, s)
        output = outputs[t]
        loss = self.criterion(output, targets)
        iter_bar.set_description('Train Iter (loss=%5.3f)' % loss.item())

        # Backward
        self.optimizer.zero_grad()
        loss.backward()

        if t > 0 and which_type == 'mcl':
            task = torch.autograd.Variable(torch.LongTensor([t]).cuda(), volatile=False)
            mask = self.model.ac.mask(task, s=self.smax)
            mask = torch.autograd.Variable(mask.data.clone(), requires_grad=False)
            for n, p in self.model.named_parameters():
                if n in rnn_weights:
                    # print('n: ', n)
                    # print('p: ', p.grad.size())
                    p.grad.data *= self.model.get_view_for(n, mask)

        # Compensate embedding gradients
        for n, p in self.model.ac.named_parameters():
            if 'ac.e' in n:
                num = torch.cosh(
                    torch.clamp(s * p.data, -self.thres_cosh, self.thres_cosh)) + 1
                den = torch.cosh(p.data) + 1
                p.grad.data *= self.smax / s * num / den

        torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clipgrad)
        self.optimizer.step()

        # Constrain embeddings
        for n, p in self.model.ac.named_parameters():
            if 'ac.e' in n:
                p.data = torch.clamp(p.data, -self.thres_emb, self.thres_emb)

    return
def backward(ctx, grad_output):
    grad_input = None
    shape = ctx.shape
    lambd = ctx.fitter.lambd
    max_slope = ctx.fitter.max_slope
    monotonic = ctx.fitter.monotonic
    eps = ctx.fitter.eps
    power = ctx.fitter.power
    order = ctx.fitter.order
    dm = ctx.fitter.dm
    m = ctx.fitter.m
    a0 = ctx.fitter.a0.view(shape)

    if ctx.needs_input_grad[0]:
        dF = ctx.residual[torch.eye(
            shape[0], dtype=bool).repeat_interleave(shape[1], axis=0)].view(shape)
        dF0 = ctx.residual.sum(axis=-1).view(shape) * -.5 * dm.view(-1, 1)
        summation = dF + dF0

        if order > 0:
            a1 = ctx.fitter.a1.view(shape)
            if max_slope is None:
                dF1 = (ctx.residual * m).sum(axis=-1).view(shape) * -1.5 * (dm * m).view(-1, 1)
                summation += dF1
            else:
                a0 = max_slope * a0
                dF1 = (ctx.residual * m).sum(axis=-1).view(shape) * (
                    -1.5 * (dm * m).view(-1, 1) * (1 / torch.cosh(a1 / (a0 + eps)))**2
                    + -a1 / (a0 + eps) * (1 / torch.cosh(a1 / (a0 + eps)))**2 * -.5 * dm.view(-1, 1) * max_slope
                    + torch.tanh(a1 / (a0 + eps)) * -.5 * dm.view(-1, 1) * max_slope)
                summation += dF1

            if order > 1:
                a2 = ctx.fitter.a2.view(shape)
                if not monotonic:
                    # dc_2/ds is this term
                    dF2 = (ctx.residual * .5 * (3 * m**2 - 1)).sum(axis=-1).view(shape) * (
                        -2.5 * (dm * 0.5 * (3 * m**2 - 1)).view(-1, 1))
                    summation += dF2
                else:
                    a1 = a1 / 3.
                    dF2 = (ctx.residual * .5 * (3 * m**2 - 1)).sum(axis=-1).view(shape) * (
                        1 / torch.cosh(a2 / (a1 + eps))**2 * (-2.5 * dm * 0.5 * (3 * m**2 - 1)).view(-1, 1)
                        + (1 / torch.cosh(a2 / (a1 + eps))**2 * -a2 / (a1 + eps) * -1.5 * (dm * m).view(-1, 1) / 3.
                           + -1.5 * (dm * m).view(-1, 1) / 3. * (torch.tanh(a2 / (a1 + eps)))))
                    summation += dF2

        summation *= (-power) / np.prod(shape)
        if lambd is not None:
            summation += (-lambd * 2 / np.prod(shape)
                          * 3 / 2 * ctx.fitter.a1.view(shape) * (m * dm).view(-1, 1))
        grad_input = grad_output * summation * ctx.weights

    return grad_input, None, None, None, None
def expm(self, p, d_p, lr=None, out=None, normalize=False):
    """Exponential map for hyperboloid"""
    if out is None:
        out = p
    # print("LORENTZIAN EXPONENTIAL MAP")
    if d_p.is_sparse:
        # t0 = time.time()
        ix, d_val = d_p._indices().squeeze(), d_p._values()
        # This pulls `ix` out of the original embedding table, which could
        # be in a corrupted state. normalize it to fix it back to the
        # surface of the hyperboloid...
        # TODO: we should only do the normalize if we know that we are
        # training with multiple threads, otherwise this is a bit wasteful
        p_val = self.normalize(p.index_select(0, ix))
        ldv = self.ldot(d_val, d_val, keepdim=True)
        if self.debug:
            assert all(ldv > 0), "Tangent norm must be greater 0"
            assert all(ldv == ldv), "Tangent norm includes NaNs"
        nd_p = ldv.clamp_(min=0).sqrt_()
        t = th.clamp(nd_p, max=self.norm_clip)
        nd_p.clamp_(min=self.eps)
        newp = (th.cosh(t) * p_val).addcdiv_(th.sinh(t) * d_val, nd_p)
        # print("is p_val sparse: {}".format(p_val.is_sparse))
        # print("is nd_p sparse: {}".format(nd_p.is_sparse))
        # print("is p sparse: {}".format(p.is_sparse))
        # print("is newp sparse: {}".format(newp.is_sparse))
        print(f"p = {p}")
        print(f"d_p = {d_p}")
        # print(newp)
        if normalize:
            newp = self.normalize(newp)
        print(newp.shape)
        p.index_copy_(0, ix, newp)
        # t1 = time.time()
        # print("iteration time = {}".format(t1-t0))
    else:
        if lr is not None:
            d_p.narrow(-1, 0, 1).mul_(-1)
            d_p.addcmul_((self.ldot(p, d_p, keepdim=True)).expand_as(p), p)
            d_p.mul_(-lr)
        ldv = self.ldot(d_p, d_p, keepdim=True)
        if self.debug:
            assert all(ldv > 0), "Tangent norm must be greater 0"
            assert all(ldv == ldv), "Tangent norm includes NaNs"
        nd_p = ldv.clamp_(min=0).sqrt_()
        t = th.clamp(nd_p, max=self.norm_clip)
        nd_p.clamp_(min=self.eps)
        newp = (th.cosh(t) * p).addcdiv_(th.sinh(t) * d_p, nd_p)
        if normalize:
            newp = self.normalize(newp)
        p.copy_(newp)
def train_epoch(self, t, x, y, thres_cosh=50, thres_emb=6):
    self.model.train()

    r = np.arange(x.size(0))
    np.random.shuffle(r)
    r = torch.LongTensor(r).cuda()

    # Loop batches
    for i in range(0, len(r), self.sbatch):
        if i + self.sbatch <= len(r):
            b = r[i:i + self.sbatch]
        else:
            b = r[i:]
        images = torch.autograd.Variable(x[b])
        targets = torch.autograd.Variable(y[b])
        task = torch.autograd.Variable(torch.LongTensor([t]).cuda())
        s = (self.smax - 1 / self.smax) * i / len(r) + 1 / self.smax

        # Forward
        output, masks = self.model.forward(task, images, s=s)
        output = output[t]
        loss, _ = self.criterion(output, targets, masks)

        # Backward
        self.optimizer.zero_grad()
        loss.backward()

        # Restrict layer gradients in backprop
        if t > 0:
            for n, p in self.model.named_parameters():
                if n in self.mask_back:
                    p.grad.data *= self.mask_back[n]

        # Compensate embedding gradients
        for n, p in self.model.named_parameters():
            if n.startswith('e'):
                num = torch.cosh(
                    torch.clamp(s * p.data, -thres_cosh, thres_cosh)) + 1
                den = torch.cosh(p.data) + 1
                p.grad.data *= self.smax / s * num / den

        # Apply step
        self.optimizer.step()

        # Constrain embeddings
        for n, p in self.model.named_parameters():
            if n.startswith('e'):
                p.data = torch.clamp(p.data, -thres_emb, thres_emb)

    return
def exp(self, X, G):
    # Check for multiple dimensions
    G_lnorm = self.norm(X, G)
    if self._k == 1:
        ex = torch.cosh(G_lnorm) * X + torch.sinh(G_lnorm) * (G / G_lnorm)
        if G_lnorm == 0:
            ex = X
        return ex
    else:
        G_lnorm = G_lnorm.view(-1, 1)
        ex = torch.cosh(G_lnorm) * X + torch.sinh(G_lnorm) * (G / G_lnorm)
        exclude = G_lnorm == 0
        exclude = exclude.view(-1)
        ex[exclude, :] = X[exclude, :]
        return ex
def log_cosh_loss(y_pred, y_true):
    loss = torch.cosh(y_pred - y_true)
    loss = torch.log(loss)
    loss = loss.mean(dim=0)
    loss = torch.sum(loss)
    return loss
def grad_log_prob(self, r):
    c = __to_tensor__(self.c)
    dim = self.dim
    res = (-r / self.sigma.pow(2)
           + (dim - 1) * c.sqrt() * torch.cosh(c.sqrt() * r) / torch.sinh(c.sqrt() * r))
    res[r < 0] = 0.0
    return res
def backward(ctx, grad_output):
    input, weight, bias, output = ctx.saved_variables
    grad_input = grad_weight = grad_bias = None
    grad_stride = grad_padding = grad_dilation = grad_groups = grad_nonlinearity_g = None

    # An easy way to get the gradient wrt the non-gaussianity is to calculate the
    # nongaussianity, then just call backwards on this.
    # From https://github.com/pytorch/pytorch/issues/1776
    # (but perhaps isn't maximally efficient; one could calculate the derivative manually)
    nongaussianity = torch.mean(torch.log(torch.cosh(output)))

    if ctx.needs_input_grad[0]:
        grad_input = torch.autograd.grad(nongaussianity, input, grad_output)
    if ctx.needs_input_grad[1]:
        if ctx.super_or_sub == "super":
            grad_weight = ctx.ica_strength * torch.autograd.grad(
                nongaussianity, weight, grad_output)
        elif ctx.super_or_sub == "sub":
            grad_weight = -ctx.ica_strength * torch.autograd.grad(
                nongaussianity, weight, grad_output)

    # No change in bias gradient
    return grad_input, grad_weight, grad_bias, grad_stride, grad_padding, grad_dilation, grad_groups, grad_nonlinearity_g
def pseudo_hyperbolic_gaussian(z, mu_h, cov, version, vt=None, u=None):
    batch_size, n_h = mu_h.shape
    n = n_h - 1

    mu0 = to_cuda_var(torch.zeros(batch_size, n))
    v0 = torch.cat((to_cuda_var(torch.ones(batch_size, 1)), mu0), 1)  # origin of the hyperbolic space

    # Try not using inverse exp. mapping if vt is already known
    if vt is None and u is None:
        u = inv_exp_map(z, mu_h)
        v = parallel_transport(u, mu_h, v0)
        vt = v[:, 1:]
        logp_vt = (MultivariateNormal(mu0, cov).log_prob(vt)).view(-1, 1)
    else:
        logp_vt = (MultivariateNormal(mu0, cov).log_prob(vt)).view(-1, 1)

    r = lorentz_tangent_norm(u)

    if version == 1:
        alpha = -lorentz_product(v0, mu_h)
        log_det_proj_mu = (n * (torch.log(torch.sinh(r)) - torch.log(r))
                           + torch.log(torch.cosh(r)) + torch.log(alpha))
    elif version == 2:
        log_det_proj_mu = (n - 1) * (torch.log(torch.sinh(r)) - torch.log(r))

    logp_z = logp_vt - log_det_proj_mu
    return logp_vt, logp_z
def exp(self, lr):
    """ Exponential map """
    x = self.data.detach()
    # print("norm", HyperboloidParameter.norm_h(x))
    v = -lr * self.grad

    retract = False
    if retract:
        # Retraction
        # print("retract")
        self.data = x + v
    else:
        # print("tangent", HyperboloidParameter.dot_h(x, v))
        assert torch.all(~torch.isnan(v))
        n = self.__class__.norm_h(v).unsqueeze(-1)
        assert torch.all(~torch.isnan(n))
        n.clamp_(max=1.0)

        # e = torch.cosh(n)*x + torch.sinh(n)*v/n
        mask = torch.abs(n) < 1e-7
        cosh = torch.cosh(n)
        cosh[mask] = 1.0
        sinh = torch.sinh(n)
        sinh[mask] = 0.0
        n[mask] = 1.0
        e = cosh * x + sinh / n * v
        # assert torch.all(-HyperboloidParameter.dot_h(e,e) >= 0), torch.min(-HyperboloidParameter.dot_h(e,e))
        self.data = e

    self.proj()
def forward(self, y_t, y_prime_t):
    ey_t = y_t - y_prime_t
    value = torch.log(torch.cosh(ey_t + 1e-12))
    if self.reduction == 'mean':
        return torch.mean(value)
    elif self.reduction == 'sum':
        return torch.sum(value)
def train(args, epoch, net, trainLoader, optimizer):
    net.train()
    for batch_idx, (data, _) in enumerate(trainLoader):
        if args.cuda:
            data = data.cuda()
        data = Variable(data)
        data = torch.mean(data, 1).view(-1, 32**2)
        optimizer.zero_grad()
        output = torch.squeeze(net(data))

        # Now reconstruct the input
        data_r = output.mm(net.linear.weight)

        # The loss
        mse_loss = F.mse_loss(data, data_r)
        nongaussianity = args.ica * torch.mean(torch.log(torch.cosh(output)))
        loss = mse_loss + nongaussianity
        loss.backward()
        optimizer.step()

    print('Train Epoch {}: Loss: {:.6f},\t Nongaussianity: {:.6f}\t'.format(
        epoch, mse_loss.item(), nongaussianity.item()))
def forward(self, x, y, predict=False):
    """Computes the weighted logcosh loss.

    Arguments:
        x {torch.Tensor} -- Predictions of shape (B, F), where F is the number of targets
        y {tuple} -- (targets, weights), where targets is of shape (B, F) and weights of shape (F)

    Returns:
        [torch.Tensor] -- Averaged, weighted loss over the batch.
    """
    # Unpack into targets and weights
    targets, weights = y

    # Calculate actual loss
    logcosh = torch.log(torch.cosh(x - targets))

    # Now weigh it, to control each target's contribution
    loss_weighted = torch.sum(self._weights * logcosh, dim=-1)

    # Mean over the batch
    if not predict:
        loss = torch.mean(loss_weighted)
    else:
        loss = loss_weighted
    return loss
def get_celerite_matrices(self, x, diag):
    dt = self.delta
    ar, cr, a, b, c, d = self.term.get_coefficients()

    # Real part
    cd = cr * dt
    delta_diag = 2 * torch.sum(ar * (cd - torch.sinh(cd)) / cd**2)

    # Complex part
    cd = c * dt
    dd = d * dt
    c2 = c**2
    d2 = d**2
    c2pd2 = c2 + d2
    C1 = a * (c2 - d2) + 2 * b * c * d
    C2 = b * (c2 - d2) - 2 * a * c * d
    norm = (dt * c2pd2)**2
    sinh = torch.sinh(cd)
    cosh = torch.cosh(cd)
    delta_diag += 2 * torch.sum(
        (C2 * cosh * torch.sin(dd) - C1 * sinh * torch.cos(dd) +
         (a * c + b * d) * dt * c2pd2) / norm)

    new_diag = as_tensor(diag) + delta_diag
    return super().get_celerite_matrices(x, new_diag)
def chart(self, inp):
    """ Map inp onto the hyperboloid using the global chart """
    k = inp.shape[-1]
    # inp_norm = torch.norm(inp, p=2, dim=-1, keepdim=True)
    # d = F.normalize(inp, p=2, dim=-1)
    cv = torch.cosh(inp.narrow(-1, 1, 1)).squeeze()
    cu = torch.cosh(inp.narrow(-1, 0, 1)).squeeze()
    sv = torch.sinh(inp.narrow(-1, 1, 1)).squeeze()
    su = torch.sinh(inp.narrow(-1, 0, 1)).squeeze()
    # h_ = inp
    return torch.stack((cu * cv, cv * su, sv), dim=-1)
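# Quick check (not from the original source): points built this way satisfy the
# unit-hyperboloid constraint -x0**2 + x1**2 + x2**2 = -1, because
# -(cosh(u)*cosh(v))**2 + (cosh(v)*sinh(u))**2 + sinh(v)**2 = -cosh(v)**2 + sinh(v)**2 = -1.
# The chart coordinates (u, v) below are hypothetical random inputs.
u_ex, v_ex = 0.5 * torch.randn(8), 0.5 * torch.randn(8)
pts = torch.stack((torch.cosh(u_ex) * torch.cosh(v_ex),
                   torch.cosh(v_ex) * torch.sinh(u_ex),
                   torch.sinh(v_ex)), dim=-1)
minkowski = -pts[:, 0]**2 + pts[:, 1]**2 + pts[:, 2]**2
assert torch.allclose(minkowski, torch.full_like(minkowski, -1.0), atol=1e-4)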
def __init__(self):
    super().__init__()
    self.activation_func_derivs = [
        lambda ds, x: torch.tanh(x),
        lambda ds, x: 1 / torch.cosh(x)**2,
        lambda ds, x: -2 * ds[0] * ds[1],
        lambda ds, x: ds[2]**2 / ds[1] - 2 * ds[1]**2
    ]  # ordered list of activation function and its derivatives
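# Sanity sketch (not from the original source): the entries of
# activation_func_derivs reproduce successive derivatives of tanh. Here the
# analytic first derivative 1/cosh(x)**2 is checked against torch.autograd
# at a few hypothetical points.
x_check = torch.linspace(-2.0, 2.0, 5, requires_grad=True)
ds_check = [torch.tanh(x_check)]
ds_check.append(1 / torch.cosh(x_check)**2)  # analytic f'(x) = sech^2(x)
autograd_first = torch.autograd.grad(ds_check[0].sum(), x_check)[0]
assert torch.allclose(ds_check[1], autograd_first, atol=1e-6)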