def grad_like(self, r, eps):
    """
    Gradient of likelihood w.r.t variational parameters

    Args:
        r (): Transformed random sample
        eps (): Random sample

    Returns:
        gradient w.r.t variances, gradient w.r.t mean
    """
    if self.obs_idx is not None:
        r_obs = r[self.obs_idx]
    else:
        r_obs = r
    dr = self.likelihood_grad(r_obs, self.y)
    dr[np.isnan(dr)] = 0.
    if self.obs_idx is not None:
        grad_mu = np.zeros(self.m)
        grad_mu[self.obs_idx] = dr
    else:
        grad_mu = dr
    grad_S = np.multiply(
        grad_mu,
        np.multiply(eps,
                    np.multiply(0.5 / np.sqrt(np.exp(self.q_S)),
                                np.exp(self.q_S))))
    return grad_S, grad_mu
def doPDE(values, movablePts, xPoints, yPoints, xIntPoints, yIntPoints):
    # Update the values based on diffusion of the proteins to nearby cells
    D = 0.1  # diffusion parameter
    valuesT = np.transpose(values)
    adjustmentPDEX = D * nonLinearAdjustment(xPoints)
    adjustmentPDEY = D * nonLinearAdjustment(yPoints)
    # simple diffusion is just a convolution
    convolveLinear = np.array([1 * D, -2 * D, 1 * D])
    # accumulate the changes due to diffusion
    for rep in range(50):
        # print(rep)
        newValuesX = list([])
        newValuesY = list([])
        for i in range(HowManyCells):
            row = values[i] + sig.convolve(
                values[i], convolveLinear)[1:-1]   # take off first and last
            rowY = valuesT[i] + sig.convolve(
                valuesT[i], convolveLinear)[1:-1]  # take off first and last
            # non-linear diffusion, add the adjustment
            if i in xIntPoints:
                row = row + np.multiply(row, adjustmentPDEX)
            if i in yIntPoints:
                rowY = rowY + np.multiply(rowY, adjustmentPDEY)
            newValuesX.append(row)
            newValuesY.append(rowY)
        # Merge rows and transposed columns
        values = np.array(newValuesX) + np.array(newValuesY).T
        # add source at each iteration
        values = values + addSources3(xPoints, yPoints)
        # Update transposed values
        valuesT = values.T
    # the total update returned is the difference between the original values
    # and the values after diffusion
    return values
def forward(self, X1, X2, **kwargs):
    alpha, mean_lam, gamma, delta = self._get_params(X1, **kwargs)
    cfg1, res1, kappa1, kr_pref1, _ = self._compute_terms(
        X1, alpha, mean_lam, gamma, delta)
    if X2 is not X1:
        cfg2, res2, kappa2, kr_pref2, _ = self._compute_terms(
            X2, alpha, mean_lam, gamma, delta)
    else:
        cfg2, res2, kappa2, kr_pref2 = cfg1, res1, kappa1, kr_pref1
    res2 = anp.reshape(res2, (1, -1))
    kappa2 = anp.reshape(kappa2, (1, -1))
    kr_pref2 = anp.reshape(kr_pref2, (1, -1))
    kappa12 = self._compute_kappa(anp.add(res1, res2), alpha, mean_lam)
    kmat_res = anp.subtract(kappa12, anp.multiply(kappa1, kappa2))
    kmat_res = anp.multiply(kr_pref1, anp.multiply(kr_pref2, kmat_res))
    kmat_x = self.kernel_x(cfg1, cfg2)
    if self.encoding_delta is None:
        if delta > 0.0:
            tmpmat = anp.add(kappa1, anp.subtract(kappa2, kappa12 * delta))
            tmpmat = tmpmat * (-delta) + 1.0
        else:
            tmpmat = 1.0
    else:
        tmpmat = anp.add(kappa1, anp.subtract(
            kappa2, anp.multiply(kappa12, delta)))
        tmpmat = anp.multiply(tmpmat, -delta) + 1.0
    return kmat_x * tmpmat + kmat_res
def step(self, *inputs):
    grad = self.flattened_grad(self.theta, *inputs)
    # optionally resample momentum
    if self.resample_momentum > 0 and self.count % self.resample_momentum == 0:
        np.copyto(self.p, self._srng.normal(size=self.theta.shape))
    # Constant mass just defined here so that we can easily change it should we want to
    Minv = 1.
    Minvh = 1.
    # pre-generate a sample
    sample = self._srng.normal(size=self.theta.shape) * np.sqrt(self.epsilon * 2 * self.A)
    # the SG-HMC update equations
    # update p
    self.p += - self.epsilon * Minvh * grad \
              - self.epsilon * (self.xi - self.A) * self.p \
              - self.epsilon * Minv * self.A * self.p \
              + Minvh * sample
    # in-place multiplication with epsilon to make sure
    # we have the values available in updates
    np.multiply(Minvh * self.p, self.epsilon, self.updates)
    # update theta
    self.theta += self.updates
    # update xi
    self.xi += self.epsilon * (self.p**2 - 1)
    self.xi_acc += self.xi
    # callbacks
    self.count += 1
    if self.count % self.callback_every == 0:
        # print(self.theta, (self.epsilon * r_t - self.epsilon * r_t**2))
        for callback in self.callbacks:
            callback(self.count, self)
    return self.unflatten(self.theta)
def _w_cross_hessian(self, sigma, Y, basis, beta, K_X):
    if beta is None:
        return 0
    else:
        K = self._weighted_kernel(sigma, Y, basis, K_X)
        if basis is None:
            basis_Y = Y
        else:
            basis_Y = basis
        n_y, d_y = Y.shape
        n_basis, _ = basis_Y.shape
        K_b = np.matmul(K, beta)
        b_d = np.matmul(Y, beta.T) - np.outer(
            np.ones([n_y, 1]),
            np.sum(np.multiply(beta, basis_Y), axis=1))
        K_b_mat = np.multiply(K, b_d)
        K_b_d = np.sum(K_b_mat, axis=1)
        K_b_y = np.matmul(K_b_mat, basis_Y)
        h = (2. * sigma) * K_b + (2. * sigma)**2 * (
            K_b_y - np.multiply(np.reshape(K_b_d, [-1, 1]), Y))
        return h
def forward(self, X1, X2):
    """
    Actual computation of the matrix of squared distances (see details above)

    :param X1: input data of size (n1, d)
    :param X2: input data of size (n2, d)
    """
    # In case inverse_bandwidths is of size (1, dimension), dimension > 1,
    # ARD is handled by broadcasting
    inverse_bandwidths = anp.reshape(self._inverse_bandwidths(), (1, -1))
    if X2 is X1:
        X1_scaled = anp.multiply(X1, inverse_bandwidths)
        D = -2.0 * anp.dot(X1_scaled, anp.transpose(X1_scaled))
        X1_squared_norm = anp.sum(anp.square(X1_scaled), axis=1)
        D = D + anp.reshape(X1_squared_norm, (1, -1))
        D = D + anp.reshape(X1_squared_norm, (-1, 1))
    else:
        X1_scaled = anp.multiply(X1, inverse_bandwidths)
        X2_scaled = anp.multiply(X2, inverse_bandwidths)
        X1_squared_norm = anp.sum(anp.square(X1_scaled), axis=1)
        X2_squared_norm = anp.sum(anp.square(X2_scaled), axis=1)
        D = -2.0 * anp.matmul(X1_scaled, anp.transpose(X2_scaled))
        D = D + anp.reshape(X1_squared_norm, (-1, 1))
        D = D + anp.reshape(X2_squared_norm, (1, -1))
    return anp.abs(D)
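# A minimal numerical check (not part of the original source) of the identity
# the kernel above relies on: with unit inverse bandwidths,
# D[i, j] = ||X1[i]||^2 + ||X2[j]||^2 - 2 * X1[i] . X2[j] = ||X1[i] - X2[j]||^2.
# Plain numpy is used for the check; the kernel itself operates on
# autograd.numpy arrays.
import numpy as np

X1 = np.array([[0.0, 0.0], [1.0, 2.0]])
X2 = np.array([[1.0, 1.0], [3.0, 0.0], [0.0, 2.0]])
D_expanded = (np.sum(X1 ** 2, axis=1)[:, None]
              + np.sum(X2 ** 2, axis=1)[None, :]
              - 2.0 * X1 @ X2.T)
D_direct = np.array([[np.sum((a - b) ** 2) for b in X2] for a in X1])
assert np.allclose(D_expanded, D_direct)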
def event_baseline_transform(params, X, n_particles_per_event=10):
    features = []
    for e in X:
        features.append(e[:n_particles_per_event])
    h_jets = np.vstack(features)
    h_jets = h_jets.reshape(len(X), n_particles_per_event, -1)

    # GRU layer
    h = np.zeros((len(X), params["rnn_b_h"].shape[0]))
    for t in range(n_particles_per_event):
        xt = h_jets[:, n_particles_per_event - 1 - t, :]
        zt = sigmoid(np.dot(params["rnn_W_zh"], h.T).T +
                     np.dot(params["rnn_W_zx"], xt.T).T + params["rnn_b_z"])
        rt = sigmoid(np.dot(params["rnn_W_rh"], h.T).T +
                     np.dot(params["rnn_W_rx"], xt.T).T + params["rnn_b_r"])
        ht = relu(np.dot(params["rnn_W_hh"], np.multiply(rt, h).T).T +
                  np.dot(params["rnn_W_hx"], xt.T).T + params["rnn_b_h"])
        h = np.multiply(1. - zt, h) + np.multiply(zt, ht)

    return h
def forward_step(params, X=None, cell_state_0=None, hid_state_0=None):
    hid_state = np.repeat(hid_state_0,
                          X.shape[0] - hid_state_0.shape[0] + 1, axis=0)
    cell_state_1 = np.add(
        np.multiply(  # <-- forget old info
            cell_state_0,
            sigmoid(c([X, hid_state]) @ params['forget']['w']
                    + params['forget']['b']),  # <-- forget gate
        ),
        np.multiply(  # <-- write new info
            sigmoid(c([X, hid_state]) @ params['ingate']['w']
                    + params['ingate']['b']),  # <-- input gate
            np.tanh(c([X, hid_state]) @ params['change']['w']
                    + params['change']['b']),  # <-- change gate
        ))
    hid_state_1 = np.multiply(
        sigmoid(c([X, hid_state]) @ params['outgate']['w']),  # 1,
        np.tanh(cell_state_1))
    return cell_state_1, hid_state_1
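# Hypothetical usage sketch for forward_step above, not taken from the
# original source. It supplies stand-in versions of the helpers the function
# relies on (`sigmoid` and `c`, assumed to concatenate along the feature
# axis) and gate weights of shape (d_in + d_hid, d_hid).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def c(arrays):
    # assumed row-wise concatenation of [X, hid_state]
    return np.concatenate(arrays, axis=1)

d_in, d_hid, batch = 3, 4, 2
rng = np.random.RandomState(0)
params = {gate: {'w': rng.randn(d_in + d_hid, d_hid) * 0.1,
                 'b': np.zeros(d_hid)}
          for gate in ('forget', 'ingate', 'change', 'outgate')}
X = rng.randn(batch, d_in)
cell0 = np.zeros((batch, d_hid))
hid0 = np.zeros((1, d_hid))  # broadcast to the batch inside forward_step
cell1, hid1 = forward_step(params, X=X, cell_state_0=cell0, hid_state_0=hid0)
print(cell1.shape, hid1.shape)  # (2, 4) (2, 4)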
def variance(self, n_s):
    """
    Stochastic approximator of predictive variance.
    Follows "Massively Scalable GPs"

    Args:
        n_s (int): Number of iterations to run stochastic approximation

    Returns:
        Approximate predictive variance at grid points
    """
    if self.root_eigdecomp is None:
        self.sqrt_eig()
    if self.obs_idx is not None:
        root_K = self.root_eigdecomp[self.obs_idx, :]
    else:
        root_K = self.root_eigdecomp

    diag = kron_list_diag(self.Ks)
    samples = []
    for i in range(n_s):
        g_m = np.random.normal(size=self.m)
        g_n = np.random.normal(size=self.n)
        right_side = np.sqrt(self.W).dot(np.dot(root_K, g_m)) + \
            np.sqrt(self.noise) * g_n
        r = self.opt.cg(self.Ks, right_side)
        if self.obs_idx is not None:
            Wr = np.zeros(self.m)
            Wr[self.obs_idx] = np.multiply(np.sqrt(self.W), r)
        else:
            Wr = np.multiply(np.sqrt(self.W), r)
        samples.append(kron_mvp(self.Ks, Wr))
    var = np.var(samples, axis=0)
    return np.clip(diag - var, 0, 1e12).flatten(), var
def partial_derivatives(x, y, W, V, b, c):
    # Filling in some dummy values
    # THIS IS WHERE YOU WILL WRITE YOUR PARTIAL DERIVATIVES
    # Below is for dLdc
    dLdc = np.ones(c.shape)
    e = [0, 0, 1, 0]
    l = c + V @ sig(b + W @ x)
    for i in range(4):
        gf = np.exp(l[i]) / np.sum(np.exp(l))
        dLdc[i] = gf - e[i]
    # Below is for dLdV
    h = sig(b + W @ x)
    dLdV = dLdc @ h.T
    # Below is for dLdb
    dLdh = V.T @ dLdc
    s = b + W @ x
    dLdb = np.multiply(sigp(s), dLdh)
    # Below is for dLdW
    tmp = dLdh @ x.T
    dLdW = np.multiply(sigp(s), tmp)
    return dLdW, dLdV, dLdb, dLdc
def _grad_laplacian(self, sigma, Y, basis, K_X):
    dist = self._square_dist(Y, basis=basis)
    K = self._update_kernel(sigma, Y, basis, K_X)
    if basis is None:
        basis_Y = Y
    else:
        basis_Y = basis
    _, d = Y.shape
    # if K_d_mat is None:
    K_d_mat = np.multiply(K, dist)
    G = 4. * (sigma**2) * ((2 + d) * K - 2. * sigma * K_d_mat)
    # if K_d is None:
    K_d = np.sum(K_d_mat, axis=1)
    # if self_KK is None:
    KK = np.sum(K, axis=1)
    tmp = 4. * (sigma**2) * ((2 + d) * KK - 2. * sigma * K_d)
    tmp = tmp.reshape([-1, 1])
    h = np.multiply(tmp, Y) - np.matmul(G, basis_Y)
    return h
def cost(coef):
    X_coef = -1 * np.matmul(X_, coef)
    z = 1 / (1 + np.exp(X_coef))
    epsilon = 1e-5
    class1 = np.multiply(y_, np.log(z + epsilon))
    class2 = np.multiply(1 - y_, np.log(1 - z + epsilon))
    ans = -(1 / y_.size) * (np.sum(class1 + class2))
    return ans
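# Illustrative check of the cost above (a sketch, not from the original
# source). In the original, X_ and y_ come from the enclosing scope; toy
# values are defined here so the closure can be evaluated standalone.
import numpy as np

X_ = np.array([[1.0, 0.5], [1.0, -1.0], [1.0, 2.0]])  # bias column + feature
y_ = np.array([1.0, 0.0, 1.0])
# With zero coefficients every prediction is 0.5, so the mean binary
# cross-entropy is about log(2) ~ 0.6931 (up to the epsilon smoothing).
print(cost(np.zeros(2)))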
def loss(pred, targ):
    # pred = pred / np.sum(pred)
    likelihood = np.multiply(targ, pred) + np.multiply(1.0 - targ, 1.0 - pred)
    likelihood_norm = likelihood + EPS
    log_likelihood = np.sum(np.log(likelihood_norm))
    lost = -log_likelihood
    # dist = pred - targ
    # lost = np.sum(np.linalg.norm(dist))
    return lost
def gradphi(phi, x, z):
    dphidz1 = np.array([
        np.multiply((2 * (x - 1)**2) / (z[0]**3), phi[:, 0]),
        np.zeros(len(x))
    ]).T
    dphidz2 = np.array([
        np.zeros(len(x)),
        np.multiply((2 * (x - 5)**2) / (z[1]**3), phi[:, 1])
    ]).T
    return (np.array([dphidz1, dphidz2]))
def cost(coef):
    X_coef = -1 * np.matmul(X_, coef)
    z = 1 / (1 + np.exp(X_coef))
    epsilon = 1e-5
    class1 = np.multiply(y_, np.log(z + epsilon))
    class2 = np.multiply(1 - y_, np.log(1 - z + epsilon))
    ans = -(1 / y_.size) * (np.sum(class1 + class2))
    if self.penalty == "l1":
        return ans + self.val * np.sum(np.absolute(coef))
    else:
        return ans + self.val * np.sum(np.square(coef))
def cost(params, batch_from, batch_to):
    X_batch = X[batch_from:batch_to, :]
    Y_batch = Y[batch_from:batch_to, :]
    Z = self._forward(params, X_batch)
    A = self.layers[-1].activation_fn(Z)
    # compute cost
    logprobs = np.multiply(np.log(A), Y_batch) + np.multiply(
        np.log((1 - A)), (1 - Y_batch))
    cost = -1 * np.sum(logprobs)
    return cost
def forward(self, current, h_prev):
    z_in = np.matmul(current, self.params['Wiz']) + np.matmul(
        h_prev, self.params['Whz']) + self.params['bz']
    z = sigmoid(z_in)
    r_in = np.matmul(current, self.params['Wir']) + np.matmul(
        h_prev, self.params['Whr']) + self.params['br']
    r = sigmoid(r_in)
    g_in = np.matmul(current, self.params['Win']) + np.multiply(
        np.matmul(h_prev, self.params['Whn']), r) + self.params['bg']
    g = np.tanh(g_in)
    h_current = np.multiply((1 - z), g) + np.multiply(z, h_prev)
    return h_current
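# Hypothetical usage sketch for the GRU cell above (not from the original
# source). It assumes a module-level sigmoid, input-to-hidden weights of
# shape (d_in, d_hid) and hidden-to-hidden weights of shape (d_hid, d_hid),
# and uses a SimpleNamespace to stand in for the owning object.
import numpy as np
from types import SimpleNamespace

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

d_in, d_hid = 3, 5
rng = np.random.RandomState(0)
cell = SimpleNamespace(params={
    'Wiz': rng.randn(d_in, d_hid), 'Whz': rng.randn(d_hid, d_hid), 'bz': np.zeros(d_hid),
    'Wir': rng.randn(d_in, d_hid), 'Whr': rng.randn(d_hid, d_hid), 'br': np.zeros(d_hid),
    'Win': rng.randn(d_in, d_hid), 'Whn': rng.randn(d_hid, d_hid), 'bg': np.zeros(d_hid)})
h1 = forward(cell, np.zeros((1, d_in)), np.zeros((1, d_hid)))
print(h1.shape)  # (1, 5)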
def loss(w):
    lossVal = 0
    for wi, aH in zip(w, globalAlphaHats):
        den = 1 / np.sum(np.multiply(n, wi))
        wiXA = np.multiply(wi, localAlphaHats)
        dot = np.sum(np.multiply(wiXA, n))
        tilde = den * dot
        lossVal = lossVal + .5 * np.square(aH - tilde)
        # The weights across all local estimates for each global estimate
        # should sum to 1
        lossVal = lossVal + wOneLambda * .5 * np.sum(np.square(wi - 1))
    lossVal = lossVal + regLambda * np.linalg.norm(w)
    return lossVal
def _w_grad(self, sigma, Y, basis, beta, K_X):
    n_y, d_y = Y.shape
    n_basis, _ = basis.shape
    K = self._weighted_kernel(sigma, Y, basis, K_X)
    b_d = np.matmul(Y, beta.T) - np.outer(
        np.ones([n_y, 1]),
        np.sum(np.multiply(beta, basis), axis=1))
    K_b_mat = np.multiply(K, b_d)
    K_b_d = np.sum(K_b_mat, axis=1)
    return (2. * sigma) * K_b_d
def loss(localAlphaHats):
    lossVal = 0
    # localAlphaHats = 1 / (1 + np.exp(-1 * localAlphaHats))
    for wi, aH in zip(w, globalAlphaHats):
        tilde = 1 / np.sum(np.multiply(n, wi))
        wiXA = np.multiply(wi, localAlphaHats)
        tilde = tilde * np.sum(np.multiply(wiXA, n))
        lossVal = lossVal + .5 * np.square(aH - tilde)
    lossVal = lossVal + varLambda * np.sum(np.var(localAlphaHats, axis=1))
    lossVal = lossVal + anchorLambda * np.sum(
        np.square(localAlphaHats - a0))
    return lossVal
def ll(x, num_peds, ess, robot_mu_x, robot_mu_y, ped_mu_x, ped_mu_y,
       cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y,
       cov_ped_x, cov_ped_y, inv_cov_ped_x, inv_cov_ped_y,
       one_over_cov_sum_x, one_over_cov_sum_y, normalize):
    T = np.size(robot_mu_x)
    quad_robot_mu_x = np.dot((x[:T] - robot_mu_x).T,
                             np.dot(inv_cov_robot_x, x[:T] - robot_mu_x))
    quad_robot_mu_y = np.dot((x[T:2 * T] - robot_mu_y).T,
                             np.dot(inv_cov_robot_y, x[T:2 * T] - robot_mu_y))
    llambda = -0.5 * quad_robot_mu_x - 0.5 * quad_robot_mu_y

    n = 2
    for ped in range(ess):
        quad_ped_mu_x = np.dot(
            (x[n * T:(n + 1) * T] - ped_mu_x[ped]).T,
            np.dot(inv_cov_ped_x[ped], x[n * T:(n + 1) * T] - ped_mu_x[ped]))
        quad_ped_mu_y = np.dot(
            (x[(n + 1) * T:(n + 2) * T] - ped_mu_y[ped]).T,
            np.dot(inv_cov_ped_y[ped], x[(n + 1) * T:(n + 2) * T] - ped_mu_y[ped]))
        llambda = llambda - 0.5 * quad_ped_mu_x - 0.5 * quad_ped_mu_y
        n = n + 2

    n = 2
    for ped in range(ess):
        # if normalize == True:
        #     normalize_x = np.multiply(np.power(2*np.pi, -0.5),
        #                               one_over_std_sum_x[ped])
        #     normalize_y = np.multiply(np.power(2*np.pi, -0.5),
        #                               one_over_std_sum_y[ped])
        # else:
        normalize_x = 1.
        normalize_y = 1.

        vel_x = np.tile(x[:T], (T, 1)).T - np.tile(x[n * T:(n + 1) * T], (T, 1))
        vel_y = np.tile(x[T:2 * T], (T, 1)).T - np.tile(x[(n + 1) * T:(n + 2) * T], (T, 1))
        n = n + 2

        vel_x_2 = np.power(vel_x, 2)
        vel_y_2 = np.power(vel_y, 2)
        quad_robot_ped_x = np.multiply(vel_x_2, one_over_cov_sum_x[ped])
        quad_robot_ped_y = np.multiply(vel_y_2, one_over_cov_sum_y[ped])
        Z_x = np.multiply(normalize_x, np.exp(-0.5 * quad_robot_ped_x))
        Z_y = np.multiply(normalize_y, np.exp(-0.5 * quad_robot_ped_y))
        Z = np.multiply(Z_x, Z_y)

        log_znot_norm = np.sum(np.log1p(-Z))
        llambda = llambda + log_znot_norm
    return -1. * llambda
def d_ll(x, T,
         robot_mu_x, robot_mu_y,
         ped_mu_x, ped_mu_y,
         cov_robot_x, cov_robot_y,
         inv_cov_robot_x, inv_cov_robot_y,
         cov_ped_x, cov_ped_y,
         inv_cov_ped_x, inv_cov_ped_y,
         one_over_cov_sum_x, one_over_cov_sum_y, normalize):
    d_alpha = [0. for _ in range(4 * T)]
    d_beta = [0. for _ in range(4 * T)]
    d_llambda = np.asarray([0. for _ in range(4 * T)])

    n = 2
    vel_x = x[:T] - x[n * T:(n + 1) * T]
    vel_y = x[T:2 * T] - x[(n + 1) * T:(n + 2) * T]

    one_over_var_sum_x = np.diag(one_over_cov_sum_x)
    one_over_var_sum_y = np.diag(one_over_cov_sum_y)

    # if normalize == True:
    #     normalize_x = np.multiply(np.power(2*np.pi, -0.5),
    #                               np.diag(one_over_std_sum_x))
    #     normalize_y = np.multiply(np.power(2*np.pi, -0.5),
    #                               np.diag(one_over_std_sum_y))
    # else:
    normalize_x = 1.
    normalize_y = 1.

    quad_x = np.multiply(one_over_var_sum_x, np.power(vel_x, 2))
    quad_y = np.multiply(one_over_var_sum_y, np.power(vel_y, 2))
    Z_x = np.multiply(normalize_x, np.exp(-0.5 * quad_x))
    Z_y = np.multiply(normalize_y, np.exp(-0.5 * quad_y))
    Z = np.multiply(Z_x, Z_y)
    X = np.divide(Z, 1. - Z)

    alpha_x = np.multiply(X, np.multiply(vel_x, one_over_var_sum_x))
    alpha_y = np.multiply(X, np.multiply(vel_y, one_over_var_sum_y))

    # X and Y COMPONENT OF R DERIVATIVE
    d_alpha[:T] = np.add(d_alpha[:T], alpha_x)
    d_alpha[T:2 * T] = np.add(d_alpha[T:2 * T], alpha_y)
    d_alpha[n * T:(n + 1) * T] = -alpha_x
    d_alpha[(n + 1) * T:(n + 2) * T] = -alpha_y

    d_beta[n * T:(n + 1) * T] = -np.dot(x[n * T:(n + 1) * T] - ped_mu_x,
                                        inv_cov_ped_x)
    d_beta[(n + 1) * T:(n + 2) * T] = -np.dot(x[(n + 1) * T:(n + 2) * T] - ped_mu_y,
                                              inv_cov_ped_y)
    d_beta[:T] = -np.dot(x[:T] - robot_mu_x, inv_cov_robot_x)
    d_beta[T:2 * T] = -np.dot(x[T:2 * T] - robot_mu_y, inv_cov_robot_y)

    d_llambda[0:2 * T] = np.add(d_alpha[0:2 * T], d_beta[0:2 * T])
    d_llambda[2 * T:] = np.add(d_alpha[2 * T:], d_beta[2 * T:])
    return -1. * d_llambda
def objective(params, iter):
    fake_weights, bias = params
    weights = np.multiply((fake_weights + fake_weights.T) / 2, diag_mask)
    pll = 0
    for i in range(len(imgs)):
        img = np.reshape(imgs[i], -1)
        activations = np.matmul(weights, img) + bias
        output = sigmoid(activations)
        eps = 1e-10
        img[img < 0] = 0
        pll += np.sum(np.multiply(img, np.log(output + eps)) +
                      np.multiply(1 - img, np.log(1 - output + eps)))
    if iter % 100 == 0:
        print(-pll)
    return -pll
def diff_test_feature(test_feature_array):
    norm_mean, norm_variance = self.predict(test_feature_array)
    # De-normalize, and variance -> stddev
    pred_mean = norm_mean * std_data + mean_data
    pred_std = anp.sqrt(norm_variance) * std_data
    head_gradients_mean = anp.reshape(head_gradients['mean'], pred_mean.shape)
    head_gradients_std = anp.reshape(head_gradients['std'], pred_std.shape)
    # Added to mimic mxnet.autograd.backward
    pred_mean_sum = anp.sum(anp.multiply(pred_mean, head_gradients_mean))
    pred_std_sum = anp.sum(anp.multiply(pred_std, head_gradients_std))
    return pred_mean_sum + pred_std_sum
def neg_ll(self, x, c, n, *params):
    f = np.zeros_like(self.p)
    params = np.reshape(params, (self.m, self.dist.k + 1))
    f = np.zeros_like(x)
    for i in range(self.m):
        like = self.dist.like(x, c, n, *params[i, 1::])
        like = np.multiply(params[i, 0], like)
        f = f + like
    f = np.where(f <= 0, surpyval.TINIEST, f)
    f = np.where(f < 1, f, 1)
    f = np.log(f)
    f = np.multiply(n, f)
    f = -np.sum(f)
    return f
def Q(self, params):
    params = params.reshape(self.m, self.dist.k)
    f = np.zeros_like(self.p)
    for i in range(self.m):
        like = self.dist.like(self.x, self.c, self.n, *params[i])
        like += surpyval.TINIEST
        like = np.where(like < 1, like, 1)
        like = np.log(like)
        like = np.multiply(self.n, like)
        f[i] = np.multiply(self.p[i], like)
    f = -np.sum(f)
    self.loglike = f
    return f
def _w_grad(self, sigma, Y, basis, beta, K_X):
    n_y, d_y = Y.shape
    n_basis, _ = basis.shape
    K = self._update_kernel(sigma, Y, basis, K_X)
    # if b_d is None:
    b_d = np.matmul(Y, beta.T) - np.outer(
        np.ones([n_y, 1]),
        np.sum(np.multiply(beta, basis), axis=1))
    # if K_b_mat is None:
    K_b_mat = np.multiply(K, b_d)
    # if K_b_d is None:
    K_b_d = np.sum(K_b_mat, axis=1)
    return (2. * sigma) * K_b_d
def _hessian_bloc_dim(self, sigma, Y_i, Y_j, K, i, j):
    n = Y_i.shape[0]
    Y_ii = np.reshape(Y_i, [1, -1])
    Y_jj = np.reshape(Y_j, [1, -1])
    diff_i = np.tile(Y_ii, [n, 1])
    diff_i = diff_i.T - diff_i
    diff_j = np.tile(Y_jj, [n, 1])
    diff_j = diff_j.T - diff_j
    if i == j:
        return (np.multiply(
            K, (2. * (sigma) - 4. * (sigma**2) * np.multiply(diff_i, diff_j))))
    else:
        return -4. * (sigma**2) * (np.multiply(
            K, np.multiply(diff_i, diff_j)))
def KLqp(self, S, q_mu):
    """
    Calculates KL divergence between q and p

    Args:
        S (): Variational variances
        q_mu (): Variational mean

    Returns:
        KL divergence between q and p
    """
    k_inv_mu = kron_mvp(self.K_invs, self.mu - q_mu)
    mu_penalty = np.sum(np.multiply(self.mu - q_mu, k_inv_mu))
    det_S = np.sum(S)
    trace_term = np.sum(np.multiply(self.k_inv_diag, np.exp(S)))
    kl = 0.5 * (self.det_K - self.m - det_S + trace_term + mu_penalty)
    return kl
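# A dense (non-Kronecker) reference for the same KL term, included as a
# hedged sketch rather than the repository's implementation. It assumes
# q = N(q_mu, diag(exp(S))), p = N(mu, K), and that det_K above stores
# log|K| while k_inv_diag holds the diagonal of K^{-1}:
#   KL(q || p) = 0.5 * (log|K| - m - sum(S) + tr(K^{-1} diag(exp(S)))
#                       + (mu - q_mu)^T K^{-1} (mu - q_mu))
import numpy as np

def kl_dense(S, q_mu, mu, K):
    m = mu.size
    K_inv = np.linalg.inv(K)
    mahal = (mu - q_mu) @ K_inv @ (mu - q_mu)
    trace_term = np.sum(np.diag(K_inv) * np.exp(S))
    logdet_K = np.linalg.slogdet(K)[1]
    return 0.5 * (logdet_K - m - np.sum(S) + trace_term + mahal)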
def prep_opt(y_train, N, coeffs):
    summedy_mat = np.sum(y_train, axis=0)
    summedy = np.reshape(summedy_mat, [np.size(summedy_mat), -1])
    a1 = np.reshape([np.repeat(coeffs.T[1], N)], [np.size(summedy), -1])
    a0 = np.reshape([np.repeat(coeffs.T[0], N)], [np.size(summedy), -1])
    a1y = np.multiply(a1, summedy)
    a0y = np.multiply(a0, summedy)
    consts = (np.sum(gammaln(y_train + scale))
              - D * n_neurons * N * gammaln(scale)
              - np.sum(coeffs.T[0] * (D * scale * N))
              - np.sum(a0y)
              - np.sum(summedy * np.log(scale)))
    return summedy, a1y, a0y, a1, consts
def marginal(self, kernel):
    """
    Calculates marginal likelihood

    Args:
        kernel: kernel defining the covariance; the Kronecker factors are
            rebuilt if its parameters have been updated

    Returns:
        np.array for marginal likelihood
    """
    if kernel.params is not None:
        self.Ks = self.construct_Ks()
        self.alpha = np.zeros([self.X.shape[0]])
        self.W = np.zeros([self.X.shape[0]])
        self.grads = np.zeros([self.X.shape[0]])
        self.f = self.mu
        self.f_pred = self.f
        self.run(10)
    Ks = self.Ks
    eigs = [np.expand_dims(np.linalg.eig(K)[0], 1) for K in Ks]
    eig_K = np.squeeze(kron_list(eigs))
    self.eig_K = eig_K

    if self.obs_idx is not None:
        f_lim = self.f[self.obs_idx]
        alpha_lim = self.alpha[self.obs_idx]
        mu_lim = self.mu[self.obs_idx]
        W_lim = self.W[self.obs_idx]
        eig_k_lim = eig_K[self.obs_idx]

        pen = -0.5 * np.sum(np.multiply(alpha_lim, f_lim - mu_lim))
        pen = np.where(np.isnan(pen), np.zeros_like(pen), pen)
        eigs = 0.5 * np.sum(np.log(1 + np.multiply(eig_k_lim, W_lim)))
        eigs = np.where(np.isnan(eigs), np.zeros_like(eigs), eigs)
        like = np.sum(self.likelihood.log_like(f_lim, self.y))
        like = np.where(np.isnan(like), np.zeros_like(like), like)
        return -(pen + eigs + like)

    pen = -0.5 * np.sum(np.multiply(self.alpha, self.f - self.mu))
    eigs = -0.5 * np.sum(np.log(1 + np.multiply(eig_K, self.W)))
    like = np.sum(self.likelihood.log_like(self.f, self.y))
    return -(pen + eigs + like)
def beta_grads(Ks, beta, i):
    Karr = np.array(Ks)
    anum = Ks[i] * np.exp(Ks[i] * beta)
    aden = np.sum(np.exp(beta * Karr))
    a = anum / aden
    bnum = np.exp(Ks[i] * beta) * (np.sum(np.multiply(Karr, np.exp(Karr * beta))))
    bden = aden * aden
    b = bnum / bden
    return a - b
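# Quick finite-difference sanity check for beta_grads (illustrative only,
# not part of the original source): the returned value should match the
# numerical derivative of softmax(beta * K)[i] with respect to beta.
import numpy as np

def softmax_at(Ks, beta, i):
    e = np.exp(beta * np.array(Ks))
    return e[i] / np.sum(e)

Ks, beta, i, h = [0.2, -0.4, 0.9], 1.5, 2, 1e-6
numeric = (softmax_at(Ks, beta + h, i) - softmax_at(Ks, beta - h, i)) / (2 * h)
print(np.isclose(beta_grads(Ks, beta, i), numeric))  # True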
def monomial(x, y, x_test):
    n = len(x)
    A = np.vander(x, increasing=True)
    c = np.linalg.solve(A, y)
    y_test = np.zeros_like(x_test)
    # Evaluate the interpolating polynomial with Horner's rule
    for j in range(n - 1, -1, -1):
        y_test = np.multiply(y_test, x_test) + c[j]
    return y_test
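# Example use of monomial above (illustrative sketch): interpolate
# y = x^2 + 1 through three nodes and evaluate the resulting polynomial
# at new points; the interpolant reproduces the quadratic exactly.
import numpy as np

x = np.array([0.0, 1.0, 2.0])
y = x ** 2 + 1.0
print(monomial(x, y, np.array([3.0, 0.5])))  # ~[10.0, 1.25]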
def write(mem, w_t, e_t, a_t):
    """
    The writing procedure as described in 3.2.
    w_t is a length N weighting over the rows as above.
    e_t (the erase vector) is length M with elements all in (0, 1).
    a_t (the add vector) is length M with no such restrictions.
    We first multiply the memory matrix pointwise by [1 - w_t(i)e_t],
    then we do M_t(i) <- w_t(i)a_t.
    According to the paper, the erase/add decomposition was inspired by the
    forget/input gates in LSTM.
    """
    # Perform erasure on the existing memory, parametrized by e_t and w_t
    W = np.reshape(w_t, (w_t.shape[0], 1))
    E = np.reshape(e_t, (e_t.shape[0], 1))
    # Transpose W so we can create WTE, a matrix whose i,j-th element
    # represents the extent to which we will erase M_t[i,j]
    WTE = np.dot(W, E.T)
    # KEEP is such that KEEP[i,j] represents the extent to which we
    # will keep M_t[i,j]
    KEEP = np.ones(mem.shape) - WTE
    # To complete erasure, multiply memory pointwise by KEEP
    newmem = np.multiply(mem, KEEP)
    # Perform addition on the newly erased memory
    # Convert add vector to a matrix
    A = np.reshape(a_t, (a_t.shape[0], 1))
    # ADD is the add vector weighted by w_t, which is added pointwise to
    # the existing memory, finishing the write sequence.
    ADD = np.dot(W, A.T)
    newmem = newmem + ADD
    return newmem
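# Small illustration of write() above (a usage sketch, not from the original
# source): a one-hot weighting with e_t = 1 erases row 0 completely and then
# adds a_t there, leaving the other rows untouched.
import numpy as np

mem = np.ones((3, 4))                  # N = 3 rows, M = 4 columns
w_t = np.array([1.0, 0.0, 0.0])        # attend fully to row 0
e_t = np.ones(4)                       # erase everything at the attended row
a_t = np.array([0.1, 0.2, 0.3, 0.4])   # new content to add
print(write(mem, w_t, e_t, a_t))
# row 0 -> [0.1, 0.2, 0.3, 0.4]; rows 1 and 2 stay all ones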
def test_multiply_arg1():
    fun = lambda x, y: np.multiply(x, y)
    d_fun = grad(fun, 1)
    check_grads(fun, npr.rand(), npr.rand())
    check_grads(d_fun, npr.rand(), npr.rand())
def manual_grads(params):
    """
    Compute the gradient of the loss WRT the parameters
    Ordering of the operations is reverse of that in fprop()
    """
    deltas = {}
    for key, val in params.iteritems():
        deltas[key] = np.zeros_like(val)
    [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases,
     k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws, zbeta_rs, zbeta_ws, zs_rs, zs_ws,
     wg_rs, wg_ws] = self.stats
    dd = {}
    drs = {}
    dzh = {}
    dmem = {}  # might not need this, since we have dmemtilde
    dmemtilde = {}
    du_r = {}
    du_w = {}
    dwg_r = {}
    dwg_w = {}
    for t in reversed(xrange(len(targets))):
        dy = np.copy(ps[t])
        dy -= targets[t].T  # backprop into y
        deltas['oy'] += np.dot(dy, os[t].T)
        deltas['by'] += dy
        if t < len(targets) - 1:
            # r[t] affects cost through zh[t+1] via Wrh
            drs[t] = np.dot(self.W['rh'].T, dzh[t + 1])
            # right now, mems[t] influences cost through rs[t+1], via w_rs[t+1]
            dmem[t] = np.dot(w_rs[t + 1], drs[t + 1].reshape((self.M, 1)).T)
            # and also through mems at next step
            W = np.reshape(w_ws[t + 1], (w_ws[t + 1].shape[0], 1))
            E = np.reshape(erases[t + 1], (erases[t + 1].shape[0], 1))
            WTE = np.dot(W, E.T)
            KEEP = np.ones(mems[0].shape) - WTE
            dmem[t] += np.multiply(dmemtilde[t + 1], KEEP)
            # and also through its influence on the content weighting next step
            dmem[t] += du_r[t + 1] + du_w[t + 1]
            dmemtilde[t] = dmem[t]
            # erases[t] affects cost through mems[t], via w_ws[t]
            derase = np.dot(np.multiply(dmemtilde[t], -mems[t - 1]).T, w_ws[t])
            # zerase affects just erases through a sigmoid
            dzerase = derase * (erases[t] * (1 - erases[t]))
            # adds[t] affects costs through mems[t], via w_ws
            dadd = np.dot(dmem[t].T, w_ws[t])
            # zadds affects just adds through a tanh
            dzadd = dadd * (1 - adds[t] * adds[t])
            # dbadds is just dzadds
            deltas['badds'] += dzadd
            deltas['oadds'] += np.dot(dzadd, os[t].T)
            deltas['berases'] += dzerase
            deltas['oerases'] += np.dot(dzerase, os[t].T)
            # # read weights affect what is read, via what's in mems[t-1]
            # dwc_r = np.dot(mems[t-1], drs[t])
            # # write weights affect mem[t] through adding
            # dwc_w = np.dot(dmem[t], adds[t])
            # # they also affect memtilde[t] through erasing
            # dwc_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t])
            dw_r = np.dot(mems[t - 1], drs[t])
            dw_r += dwg_r[t + 1] * (1 - g_rs[t + 1])
            # write weights affect mem[t] through adding
            dw_w = np.dot(dmem[t], adds[t])
            # they also affect memtilde[t] through erasing
            dw_w += np.dot(np.multiply(dmemtilde[t], -mems[t - 1]), erases[t])
            dw_w += dwg_w[t + 1] * (1 - g_ws[t + 1])
            sgwr = np.zeros((self.N, self.N))
            sgww = np.zeros((self.N, self.N))
            for i in range(self.N):
                sgwr[i, i] = softmax(zs_rs[t])[0]
                sgwr[i, (i + 1) % self.N] = softmax(zs_rs[t])[2]
                sgwr[i, (i - 1) % self.N] = softmax(zs_rs[t])[1]
                sgww[i, i] = softmax(zs_ws[t])[0]
                sgww[i, (i + 1) % self.N] = softmax(zs_ws[t])[2]
                sgww[i, (i - 1) % self.N] = softmax(zs_ws[t])[1]
            # right now, shifted weights are final weight
            dws_r = dw_r
            dws_w = dw_w
            dwg_r[t] = np.dot(sgwr.T, dws_r)
            dwg_w[t] = np.dot(sgww.T, dws_w)
            dwc_r = dwg_r[t] * g_rs[t]
            dwc_w = dwg_w[t] * g_ws[t]
            """
            We need dw/dK now.
            w has N elts and K has N elts, and we want, for every elt of w,
            the grad of that elt w.r.t. each of the N elts of K.
            That gives us N * N things.
            """
            # first, we must build up the K values (should be taken from fprop)
            K_rs = []
            K_ws = []
            for i in range(self.N):
                K_rs.append(cosine_sim(mems[t - 1][i, :], k_rs[t]))
                K_ws.append(cosine_sim(mems[t - 1][i, :], k_ws[t]))
            # then, we populate the grads
            dwdK_r = np.zeros((self.N, self.N))
            dwdK_w = np.zeros((self.N, self.N))
            # for every row in the memory
            for i in range(self.N):
                # for every element in the weighting
                for j in range(self.N):
                    dwdK_r[i, j] += softmax_grads(K_rs, softplus(zbeta_rs[t]), i, j)
                    dwdK_w[i, j] += softmax_grads(K_ws, softplus(zbeta_ws[t]), i, j)
            # compute dK for all i in N
            # K is the evaluated cosine similarity for the i-th row of mem matrix
            dK_r = np.zeros_like(w_rs[0])
            dK_w = np.zeros_like(w_ws[0])
            # for all i in N (for every row that we've simmed)
            for i in range(self.N):
                # for every j in N (for every elt of the weighting)
                for j in range(self.N):
                    # specifically, dwdK_r will change, and for write as well
                    dK_r[i] += dwc_r[j] * dwdK_r[i, j]
                    dK_w[i] += dwc_w[j] * dwdK_w[i, j]
            """
            dK_r_dk_rs is a list of N things.
            Each elt of the list corresponds to grads of K_idx w.r.t. the key
            k_t, so it should be a length N list of M by 1 vectors.
            """
            dK_r_dk_rs = []
            dK_r_dmem = []
            for i in range(self.N):
                # let k_rs be u, Mem[i] be v
                u = np.reshape(k_rs[t], (self.M,))
                v = mems[t - 1][i, :]
                dK_r_dk_rs.append(dKdu(u, v))
                dK_r_dmem.append(dKdu(v, u))
            dK_w_dk_ws = []
            dK_w_dmem = []
            for i in range(self.N):
                # let k_ws be u, Mem[i] be v
                u = np.reshape(k_ws[t], (self.M,))
                v = mems[t - 1][i, :]
                dK_w_dk_ws.append(dKdu(u, v))
                dK_w_dmem.append(dKdu(v, u))
            # compute delta for keys
            dk_r = np.zeros_like(k_rs[0])
            dk_w = np.zeros_like(k_ws[0])
            # for every one of M elt of dk_r
            for i in range(self.M):
                # for every one of the N Ks
                for j in range(self.N):
                    # add delta K_r[j] * dK_r[j] / dk_r[i]
                    # add influence on through K_r[j]
                    dk_r[i] += dK_r[j] * dK_r_dk_rs[j][i]
                    dk_w[i] += dK_w[j] * dK_w_dk_ws[j][i]
            # these represent influence of mem on next K
            """
            Let's let du_r[t] represent the influence of mems[t-1] on the cost
            through the K values. This is analogous to dk_w, but k only ever
            affects that, whereas mems[t-1] will also affect what is read at
            time t+1 and through memtilde at time t+1.
            """
            du_r[t] = np.zeros_like(mems[0])
            du_w[t] = np.zeros_like(mems[0])
            # for every row in mems[t-1]
            for i in range(self.N):
                # for every elt of this row (one of M)
                for j in range(self.M):
                    du_r[t][i, j] = dK_r[i] * dK_r_dmem[i][j]
                    du_w[t][i, j] = dK_w[i] * dK_w_dmem[i][j]
            # key values are activated as tanh
            dzk_r = dk_r * (1 - k_rs[t] * k_rs[t])
            dzk_w = dk_w * (1 - k_ws[t] * k_ws[t])
            deltas['ok_r'] += np.dot(dzk_r, os[t].T)
            deltas['ok_w'] += np.dot(dzk_w, os[t].T)
            deltas['bk_r'] += dzk_r
            deltas['bk_w'] += dzk_w
            dg_r = np.dot(dwg_r[t].T, (wc_rs[t] - w_rs[t - 1]))
            dg_w = np.dot(dwg_w[t].T, (wc_ws[t] - w_ws[t - 1]))
            # compute dzg_r, dzg_w
            dzg_r = dg_r * (g_rs[t] * (1 - g_rs[t]))
            dzg_w = dg_w * (g_ws[t] * (1 - g_ws[t]))
            deltas['og_r'] += np.dot(dzg_r, os[t].T)
            deltas['og_w'] += np.dot(dzg_w, os[t].T)
            deltas['bg_r'] += dzg_r
            deltas['bg_w'] += dzg_w
            # compute dbeta, which affects w_content through interaction with Ks
            dwcdbeta_r = np.zeros_like(w_rs[0])
            dwcdbeta_w = np.zeros_like(w_ws[0])
            for i in range(self.N):
                dwcdbeta_r[i] = beta_grads(K_rs, softplus(zbeta_rs[t]), i)
                dwcdbeta_w[i] = beta_grads(K_ws, softplus(zbeta_ws[t]), i)
            dbeta_r = np.zeros_like(zbeta_rs[0])
            dbeta_w = np.zeros_like(zbeta_ws[0])
            for i in range(self.N):
                dbeta_r[0] += dwc_r[i] * dwcdbeta_r[i]
                dbeta_w[0] += dwc_w[i] * dwcdbeta_w[i]
            # beta is activated from zbeta by softplus, grad of which is sigmoid
            dzbeta_r = dbeta_r * sigmoid(zbeta_rs[t])
            dzbeta_w = dbeta_w * sigmoid(zbeta_ws[t])
            deltas['obeta_r'] += np.dot(dzbeta_r, os[t].T)
            deltas['obeta_w'] += np.dot(dzbeta_w, os[t].T)
            deltas['bbeta_r'] += dzbeta_r
            deltas['bbeta_w'] += dzbeta_w
            sgsr = np.zeros((self.N, 3))
            sgsw = np.zeros((self.N, 3))
            for i in range(self.N):
                sgsr[i, 1] = wg_rs[t][(i - 1) % self.N]
                sgsr[i, 0] = wg_rs[t][i]
                sgsr[i, 2] = wg_rs[t][(i + 1) % self.N]
                sgsw[i, 1] = wg_ws[t][(i - 1) % self.N]
                sgsw[i, 0] = wg_ws[t][i]
                sgsw[i, 2] = wg_ws[t][(i + 1) % self.N]
            ds_r = np.dot(sgsr.T, dws_r)
            ds_w = np.dot(sgsw.T, dws_w)
            shift_act_jac_r = np.zeros((3, 3))
            shift_act_jac_w = np.zeros((3, 3))
            bf = np.array([[1.0]])
            for i in range(3):
                for j in range(3):
                    shift_act_jac_r[i, j] = softmax_grads(zs_rs[t], bf, i, j)
                    shift_act_jac_w[i, j] = softmax_grads(zs_ws[t], bf, i, j)
            dzs_r = np.dot(shift_act_jac_r.T, ds_r)
            dzs_w = np.dot(shift_act_jac_w.T, ds_w)
            deltas['os_r'] += np.dot(dzs_r, os[t].T)
            deltas['os_w'] += np.dot(dzs_w, os[t].T)
            deltas['bs_r'] += dzs_r
            deltas['bs_w'] += dzs_w
        else:
            drs[t] = np.zeros_like(rs[0])
            dmemtilde[t] = np.zeros_like(mems[0])
            du_r[t] = np.zeros_like(mems[0])
            du_w[t] = np.zeros_like(mems[0])
            dwg_r[t] = np.zeros_like(w_rs[0])
            dwg_w[t] = np.zeros_like(w_ws[0])
        # o affects y through Woy
        do = np.dot(params['oy'].T, dy)
        if t < len(targets) - 1:
            # and also zadd through Woadds
            do += np.dot(params['oadds'].T, dzadd)
            do += np.dot(params['oerases'].T, dzerase)
            # and also through the keys
            do += np.dot(params['ok_r'].T, dzk_r)
            do += np.dot(params['ok_w'].T, dzk_w)
            # and also through the interpolators
            do += np.dot(params['og_r'].T, dzg_r)
            do += np.dot(params['og_w'].T, dzg_w)
            # and also through beta
            do += np.dot(params['obeta_r'].T, dzbeta_r)
            do += np.dot(params['obeta_w'].T, dzbeta_w)
            # and also through the shift values
            do += np.dot(params['os_r'].T, dzs_r)
            do += np.dot(params['os_w'].T, dzs_w)
        # compute deriv w.r.t. pre-activation of o
        dzo = do * (1 - os[t] * os[t])
        deltas['ho'] += np.dot(dzo, hs[t].T)
        deltas['bo'] += dzo
        # compute hidden dh
        dh = np.dot(params['ho'].T, dzo)
        # compute deriv w.r.t. pre-activation of h
        dzh[t] = dh * (1 - hs[t] * hs[t])
        deltas['xh'] += np.dot(dzh[t], xs[t].T)
        deltas['bh'] += dzh[t]
        # Wrh affects zh via rs[t-1]
        deltas['rh'] += np.dot(dzh[t], rs[t - 1].reshape((self.M, 1)).T)
    return deltas
def mul(first_tree_rep, second_tree_rep):
    return auto_grad_np.multiply(first_tree_rep, second_tree_rep)
def l2(x):
    """
    Hacky l2-norm computation to be used for tracking update magnitude.
    """
    return np.sqrt(np.sum(np.multiply(x, x)))
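# Quick illustration (not from the original source): the "hacky" norm agrees
# with np.linalg.norm.
import numpy as np

v = np.array([3.0, 4.0])
print(l2(v), np.linalg.norm(v))  # 5.0 5.0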
def fprop(params):
    """
    Forward pass of the NTM.
    """
    W = params  # aliasing for brevity
    xs, zhs, hs, ys, ps, ts, zos, os = {}, {}, {}, {}, {}, {}, {}, {}

    def l():
        """
        Silly utility function that should be called in init.
        """
        return [{} for _ in xrange(self.heads)]

    rs = l()
    zk_rs = l()
    k_rs, beta_rs, g_rs, s_rs, gamma_rs = l(), l(), l(), l(), l()
    k_ws, beta_ws, g_ws, s_ws, gamma_ws = l(), l(), l(), l(), l()
    adds, erases = l(), l()
    w_ws, w_rs = l(), l()  # read weights and write weights
    for idx in range(self.heads):
        rs[idx][-1] = self.W['rsInit' + str(idx)]  # stores values read from memory
        w_ws[idx][-1] = softmax(self.W['w_wsInit' + str(idx)])
        w_rs[idx][-1] = softmax(self.W['w_rsInit' + str(idx)])
    mems = {}  # the state of the memory at every timestep
    mems[-1] = self.W['memsInit']
    loss = 0

    for t in xrange(len(inputs)):
        xs[t] = np.reshape(np.array(inputs[t]), inputs[t].shape[::-1])

        rsum = 0
        for idx in range(self.heads):
            rsum = rsum + np.dot(W['rh' + str(idx)],
                                 np.reshape(rs[idx][t - 1], (self.M, 1)))
        zhs[t] = np.dot(W['xh'], xs[t]) + rsum + W['bh']
        hs[t] = np.tanh(zhs[t])

        zos[t] = np.dot(W['ho'], hs[t]) + W['bo']
        os[t] = np.tanh(zos[t])

        for idx in range(self.heads):
            # parameters to the read head
            zk_rs[idx][t] = np.dot(W['ok_r' + str(idx)], os[t]) + W['bk_r' + str(idx)]
            k_rs[idx][t] = np.tanh(zk_rs[idx][t])
            beta_rs[idx][t] = softplus(np.dot(W['obeta_r' + str(idx)], os[t])
                                       + W['bbeta_r' + str(idx)])
            g_rs[idx][t] = sigmoid(np.dot(W['og_r' + str(idx)], os[t]) + W['bg_r' + str(idx)])
            s_rs[idx][t] = softmax(np.dot(W['os_r' + str(idx)], os[t]) + W['bs_r' + str(idx)])
            gamma_rs[idx][t] = 1 + sigmoid(np.dot(W['ogamma_r' + str(idx)], os[t])
                                           + W['bgamma_r' + str(idx)])

            # parameters to the write head
            k_ws[idx][t] = np.tanh(np.dot(W['ok_w' + str(idx)], os[t]) + W['bk_w' + str(idx)])
            beta_ws[idx][t] = softplus(np.dot(W['obeta_w' + str(idx)], os[t])
                                       + W['bbeta_w' + str(idx)])
            g_ws[idx][t] = sigmoid(np.dot(W['og_w' + str(idx)], os[t]) + W['bg_w' + str(idx)])
            s_ws[idx][t] = softmax(np.dot(W['os_w' + str(idx)], os[t]) + W['bs_w' + str(idx)])
            gamma_ws[idx][t] = 1 + sigmoid(np.dot(W['ogamma_w' + str(idx)], os[t])
                                           + W['bgamma_w' + str(idx)])

            # the erase and add vectors
            # these are also parameters to the write head
            # but they describe "what" is to be written rather than "where"
            adds[idx][t] = np.tanh(np.dot(W['oadds' + str(idx)], os[t]) + W['badds' + str(idx)])
            erases[idx][t] = sigmoid(np.dot(W['oerases' + str(idx)], os[t]) + W['erases' + str(idx)])

            w_ws[idx][t] = addressing.create_weights(k_ws[idx][t],
                                                     beta_ws[idx][t],
                                                     g_ws[idx][t],
                                                     s_ws[idx][t],
                                                     gamma_ws[idx][t],
                                                     w_ws[idx][t - 1],
                                                     mems[t - 1])
            w_rs[idx][t] = addressing.create_weights(k_rs[idx][t],
                                                     beta_rs[idx][t],
                                                     g_rs[idx][t],
                                                     s_rs[idx][t],
                                                     gamma_rs[idx][t],
                                                     w_rs[idx][t - 1],
                                                     mems[t - 1])

        ys[t] = np.dot(W['oy'], os[t]) + W['by']
        ps[t] = sigmoid(ys[t])

        one = np.ones(ps[t].shape)
        ts[t] = np.reshape(np.array(targets[t]), (self.out_size, 1))

        epsilon = 2**-23  # to prevent log(0)
        a = np.multiply(ts[t], np.log2(ps[t] + epsilon))
        b = np.multiply(one - ts[t], np.log2(one - ps[t] + epsilon))
        loss = loss - (a + b)

        for idx in range(self.heads):
            # read from the memory
            rs[idx][t] = memory.read(mems[t - 1], w_rs[idx][t])
            # write into the memory
            mems[t] = memory.write(mems[t - 1], w_ws[idx][t], erases[idx][t], adds[idx][t])

    self.stats = [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases]
    return np.sum(loss)
def test_multiply_arg1():
    fun = lambda x, y: np.multiply(x, y)
    check_grads(fun)(npr.rand(), npr.rand())