def wake(self, wake_data, it):
    ddc = self.reg
    self.wake_data = wake_data.copy()
    gs = []
    nl_obs = self.nlayer - 1
    if self.layer_plastic[-1]:
        grad_name = "x%d->dlogp%d" % (nl_obs, nl_obs)
        g = self.reg.predict(self.wake_data, grad_name)
        self.wake_data["dlogp%d" % (nl_obs)] = g
        gs.insert(0, g.mean(0))
        # NOT NECESSARY
        grad_name = "x%d->dnorm%d" % (nl_obs, nl_obs)
        g = self.reg.predict(self.wake_data, grad_name)
        self.wake_data["dnorm%d" % (nl_obs)] = g
        grad_name = "x%d->dnatsuff%d" % (nl_obs, nl_obs)
        g = self.reg.predict(self.wake_data, grad_name)
        self.wake_data["dnatsuff%d" % (nl_obs)] = g
    else:
        gs.insert(0, np.zeros_like(self.model.dists[-1].ps))

    if self.nlayer > 1:
        for i in range(self.nlayer - 2, -1, -1):
            mean_name = "mx%d_x%d" % (i, nl_obs)
            if i == self.nlayer - 2:
                fun_name = "x%d->x%d" % (nl_obs, self.nlayer - 2)
            else:
                fun_name = "mx%d_x%d->x%d" % (i + 1, nl_obs, i)
            self.wake_data[mean_name] = ddc.predict(self.wake_data, fun_name)
            if self.layer_plastic[i]:
                grad_name = "x%d->dlogp%d" % (i, i)
                g = self.approx_E(mean_name, grad_name)
                self.wake_data["dlogp%d" % i] = g
                gs.insert(0, g.mean(0))
                # NOT NECESSARY
                grad_name = "x%d->dnorm%d" % (i, i)
                g = self.approx_E(mean_name, grad_name)
                self.wake_data["dnorm%d" % (i)] = g
                grad_name = "x%d->dnatsuff%d" % (i, i)
                g = self.approx_E(mean_name, grad_name)
                self.wake_data["dnatsuff%d" % i] = g
            else:
                gs.insert(0, np.zeros_like(self.model.dists[i].ps))

    gs = np.concatenate(gs)
    self.gradient_step(gs, it)
def _setup(self, network):
    self.accu = defaultdict(dict)
    self.d_accu = defaultdict(dict)
    for i, layer in enumerate(network.parametric_layers):
        for n in layer.parameters.keys():
            self.accu[i][n] = np.zeros_like(layer.parameters[n])
            self.d_accu[i][n] = np.zeros_like(layer.parameters[n])
def adam(data, paramvec, loss, batch_size, rate,
         epochs=1, b1=0.9, b2=0.999, epsilon=1e-8, callback=None):
    m = np.zeros_like(paramvec)
    v = np.zeros_like(paramvec)
    vals = []
    i = 0
    for epoch in range(epochs):
        for minibatch in make_batches(batch_size, data):
            val, g = vgrad(loss)(paramvec, *minibatch)
            m = (1. - b1) * g + b1 * m
            v = (1. - b2) * g**2 + b2 * v
            mhat = m / (1 - b1**(i + 1))
            vhat = v / (1 - b2**(i + 1))
            paramvec -= rate * mhat / (np.sqrt(vhat) + epsilon)
            vals.append(val)
            i += 1
        if callback:
            callback(epoch, paramvec, vals)
    return paramvec
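# A minimal, self-contained sketch of the same bias-corrected Adam update on a
# toy quadratic, using plain NumPy only. The helpers `vgrad` and `make_batches`
# used above belong to the surrounding project; this example avoids them so it
# runs on its own.
import numpy as np

def _adam_demo(steps=200, rate=0.1, b1=0.9, b2=0.999, epsilon=1e-8):
    target = np.array([1.0, -2.0, 3.0])
    x = np.zeros_like(target)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    for i in range(steps):
        g = 2.0 * (x - target)           # gradient of ||x - target||^2
        m = (1. - b1) * g + b1 * m       # first-moment estimate
        v = (1. - b2) * g**2 + b2 * v    # second-moment estimate
        mhat = m / (1 - b1**(i + 1))     # bias correction
        vhat = v / (1 - b2**(i + 1))
        x -= rate * mhat / (np.sqrt(vhat) + epsilon)
    return x                             # should end up close to `target`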
def viterbi(log_pi0, log_Ps, ll):
    """
    Find the most likely state sequence

    This is modified from pyhsmm.internals.hmm_states by Matthew Johnson.
    """
    T, K = ll.shape

    # Check if the transition matrices are stationary or
    # time-varying (hetero)
    hetero = (log_Ps.shape[0] == T - 1)
    if not hetero:
        assert log_Ps.shape[0] == 1

    # Pass max-sum messages backward
    scores = np.zeros_like(ll)
    args = np.zeros_like(ll, dtype=int)
    for t in range(T - 2, -1, -1):
        vals = log_Ps[t * hetero] + scores[t + 1] + ll[t + 1]
        args[t + 1] = vals.argmax(axis=1)
        scores[t] = vals.max(axis=1)

    # Now maximize forwards
    z = np.zeros(T, dtype=int)
    z[0] = (scores[0] + log_pi0 + ll[0]).argmax()
    for t in range(1, T):
        z[t] = args[t, z[t - 1]]

    return z
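# A small usage sketch for `viterbi` above, assuming only NumPy: a stationary
# 2-state chain, so log_Ps has shape (1, K, K), and toy per-timestep
# log-likelihoods of shape (T, K).
import numpy as np

K = 2
log_pi0 = np.log(np.array([0.5, 0.5]))
P = np.array([[0.9, 0.1],
              [0.2, 0.8]])
log_Ps = np.log(P)[None, :, :]            # (1, K, K): stationary transitions
ll = np.log(np.array([[0.9, 0.1],         # (T, K): state 0 likely early,
                      [0.8, 0.2],
                      [0.2, 0.8],         # state 1 likely late
                      [0.1, 0.9]]))
z_star = viterbi(log_pi0, log_Ps, ll)     # expected: [0, 0, 1, 1]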
def _make_grad_hmm_normalizer(argnum, ans, log_pi0, log_Ps, ll):
    # Unbox the inputs if necessary
    unbox = lambda x: x if isinstance(x, np.ndarray) else x._value
    log_pi0 = unbox(log_pi0)
    log_Ps = unbox(log_Ps)
    ll = unbox(ll)

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'] else arr
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    dlog_pi0 = np.zeros_like(log_pi0)
    dlog_Ps = np.zeros_like(log_Ps)
    dll = np.zeros_like(ll)
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    if argnum == 0:
        return lambda g: g * dlog_pi0
    if argnum == 1:
        return lambda g: g * dlog_Ps
    if argnum == 2:
        return lambda g: g * dll
def fit_adam(self, X, Y, disp=False, n_epochs=10, batch_size=8, drop_first=None,
             beta_1=0.9, beta_2=0.999, learning_rate=1e-3):
    m = int(X.shape[0] / batch_size)
    ps = self.get_params()
    mt = np.zeros_like(ps)
    vt = np.zeros_like(ps)
    args = [X, Y, drop_first]
    loss = 1e3
    t = 0
    for i in range(0, n_epochs):
        for j in range(0, m):
            bx = X[j * batch_size:(j + 1) * batch_size]
            by = Y[j * batch_size:(j + 1) * batch_size]
            args = [bx, by, drop_first]
            t = t + 1
            ps = self.get_params()
            g = self.errf_grad(ps, args)
            mt = beta_1 * mt + (1.0 - beta_1) * g
            vt = beta_2 * vt + (1.0 - beta_2) * g**2
            amt = mt / (1.0 - beta_1**t)
            avt = vt / (1.0 - beta_2**t)
            ps_new = ps - learning_rate * amt / (np.sqrt(avt) + 1e-8)
            self.set_params(ps_new)
        args = [X, Y, drop_first]
        loss = self.errf(ps_new, args)
        print("epoch", i, "loss", loss)
def compute_f_fprime_t_avg_(W, perturbation, burn_in=0.5, max_dist=1):
    (W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa,
     T0, T1, T2, T3, XX, XXp, Eta, Xi, h) = parse_W(W)
    fval = compute_f_(Eta, Xi, s02)
    fprimeval = compute_fprime_(Eta, Xi, s02)
    u0, u1, u2, u3 = compute_us(W, fval, fprimeval)
    resEta = Eta - u0 - u2
    resXi = Xi - u1 - u3
    YY = fval + perturbation
    YYp = fprimeval + 0
    YYmean = np.zeros_like(Eta)
    YYprimemean = np.zeros_like(Eta)

    def dYYdt(YY, Eta1, Xi1):
        return -YY + compute_f_(Eta1, Xi1, s02)

    def dYYpdt(YYp, Eta1, Xi1):
        return -YYp + compute_fprime_(Eta1, Xi1, s02)

    for t in range(niter):
        if np.mean(np.abs(YY - fval)) < max_dist:
            u0, u1, u2, u3 = compute_us(W, YY, YYp)
            Eta1 = resEta + u0 + u2
            Xi1 = resXi + u1 + u3
            YY = YY + dt * dYYdt(YY, Eta1, Xi1)
            YYp = YYp + dt * dYYpdt(YYp, Eta1, Xi1)
        elif np.remainder(t, 500) == 0:
            print('unstable fixed point?')
        if t > niter * burn_in:
            YYmean = YYmean + 1 / niter / burn_in * YY
            YYprimemean = YYprimemean + 1 / niter / burn_in * YYp

    return YYmean, YYprimemean
def __init__(self, W, learning_rate=10e-5, decay=0.95, blend=0.95):
    """
    This is the Alex Graves RMSProp variant from
    Generating Sequences with Recurrent Neural Networks.

    It scales parameter updates by a running estimate of the variance of the
    gradient rather than just a running estimate of its magnitude.

    decay governs how fast the momentum falls off.
    blend governs the extent to which we take the current gradient into
    account when updating our estimate of variance.
    """
    self.lr = learning_rate
    self.d = decay
    self.b = blend
    self.ns = {}  # store the mean of the square
    self.gs = {}  # store the mean, which will later be squared
    self.ms = {}  # momentum
    self.qs = {}  # update norm over param norm - ideally this stays around 10e-3
    for k, v in W.items():
        self.ns[k] = np.zeros_like(v)
        self.gs[k] = np.zeros_like(v)
        self.ms[k] = np.zeros_like(v)
        self.qs[k] = self.lr
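# The update step that consumes these accumulators is not shown above; the
# following is a hedged sketch of the Graves (2013) rule the docstring
# describes, reusing the same attribute names. The method name `update` and its
# signature are assumptions for illustration, not the project's actual API.
def update(self, W, dW):
    for k in W:
        g = dW[k]
        self.gs[k] = self.b * self.gs[k] + (1. - self.b) * g        # running mean of gradient
        self.ns[k] = self.b * self.ns[k] + (1. - self.b) * g * g    # running mean of squared gradient
        var = self.ns[k] - self.gs[k] ** 2                          # running variance estimate
        self.ms[k] = self.d * self.ms[k] - self.lr * g / np.sqrt(var + 1e-8)
        W[k] += self.ms[k]
        # qs tracks |update| / |param|, which the comment above suggests
        # should hover around 1e-3
        self.qs[k] = np.linalg.norm(self.ms[k]) / (np.linalg.norm(W[k]) + 1e-8)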
def apply_gradient_adam(x, g, i_batch, m=None, v=None, step_size=0.001,
                        b1=0.9, b2=0.999, eps=1e-7, verbose=True):
    g = np.array(g)
    if m is None or v is None:
        m = np.zeros_like(x)
        v = np.zeros_like(x)
    m = (1 - b1) * g + b1 * m  # First moment estimate.
    v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
    mhat = m / (1 - b1**(i_batch + 1))  # Bias correction.
    vhat = v / (1 - b2**(i_batch + 1))
    d = step_size * mhat / (np.sqrt(vhat) + eps)
    x = x - d
    if verbose:
        try:
            print_flush('  Step size modifier is {}.'.format(
                np.mean(mhat / (np.sqrt(vhat) + eps))), 0, comm.Get_rank())
        except:
            print('  Step size modifier is {}.'.format(
                np.mean(mhat / (np.sqrt(vhat) + eps))))
    return x, m, v
def compute_f_fprime_t_avg_(W, perturbation, burn_in=0.5, max_dist=1):
    Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h = parse_W(W)
    fval = compute_f_(Eta, Xi, s02)
    fprimeval = compute_fprime_(Eta, Xi, s02)
    resEta = Eta - u_fn(XX, fval, Wmx, Wmy, K, kappa, T)
    resXi = Xi - u_fn(XX, fval, Wsx, Wsy, K, kappa, T)
    YY = fval + perturbation
    YYp = fprimeval
    YYmean = np.zeros_like(Eta)
    YYprimemean = np.zeros_like(Eta)

    def dYYdt(YY, Eta1, Xi1):
        return -YY + compute_f_(Eta1, Xi1, s02)

    def dYYpdt(YYp, Eta1, Xi1):
        return -YYp + compute_fprime_(Eta1, Xi1, s02)

    for t in range(niter):
        if np.mean(np.abs(YY - fval)) < max_dist:
            Eta1 = resEta + u_fn(XX, YY, Wmx, Wmy, K, kappa, T)
            Xi1 = resXi + u_fn(XX, YY, Wsx, Wsy, K, kappa, T)
            YY = YY + dt * dYYdt(YY, Eta1, Xi1)
            YYp = YYp + dt * dYYpdt(YYp, Eta1, Xi1)
        else:
            print('unstable fixed point?')
            #Eta1 = resEta + u_fn(XX,YY,Wmx,Wmy,K,kappa,T)
            #Xi1 = resXi + u_fn(XX,YY,Wsx,Wsy,K,kappa,T)
            #YY = YY + dt*dYYdt(YY,Eta1,Xi1)
        if t > niter * burn_in:
            #YYp = compute_fprime_(Eta1,Xi1,s02)
            YYmean = YYmean + 1 / niter / burn_in * YY
            YYprimemean = YYprimemean + 1 / niter / burn_in * YYp

    return YYmean, YYprimemean
def _make_grad_hmm_normalizer(argnum, ans, pi0, Ps, ll):
    # Make sure everything is C contiguous and unboxed
    pi0 = to_c(pi0)
    Ps = to_c(Ps)
    ll = to_c(ll)

    dlog_pi0 = np.zeros_like(pi0)
    dlog_Ps = np.zeros_like(Ps)
    dll = np.zeros_like(ll)
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(pi0, Ps, ll, alphas)
    log_Ps = np.log(Ps + LOG_EPS) - logsumexp(Ps, axis=1, keepdims=True)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    # Compute necessary gradient
    # Account for the log transformation
    # df/dP = df/dlogP * dlogP/dP = df/dlogP * 1 / P
    if argnum == 0:
        return lambda g: g * dlog_pi0 / (pi0 + DIV_EPS)
    if argnum == 1:
        return lambda g: g * dlog_Ps / (Ps + DIV_EPS)
    if argnum == 2:
        return lambda g: g * dll
def compute_eig_penalty(W):
    # still need to finish! Hopefully won't need
    (W0mx, W0my, W0sx, W0sy, s020, K0, kappa0, T0, XX0, XXp0, Eta0, Xi0,
     h10, h20, Eta10, Eta20) = parse_W(W)
    eig_penalty_dir_w, eig_penalty_dir_k, eig_penalty_dir_kappa = \
        compute_eig_penalty_(W0my, k0, kappa0)
    eig_penalty_W = unparse_W(np.zeros_like(W0mx), eig_penalty_dir_w,
                              np.zeros_like(W0sx), np.zeros_like(W0sy),
                              np.zeros_like(s020), eig_penalty_dir_k,
                              eig_penalty_dir_kappa, np.zeros_like(XX0),
                              np.zeros_like(XXp0), np.zeros_like(Eta0),
                              np.zeros_like(Xi0))
    # assert(True==False)
    return eig_penalty_W
def stf_4dim_time_day(tensor, r, random_seed=0, num_iter=100, eps=1e-8, lr=1):
    np.random.seed(random_seed)
    args_num = [1, 2, 3, 4]

    def cost(tensor, home, appliance, day, hour):
        pred = np.einsum('Hr, Ar, ADr, ATr ->HADT', home, appliance, day, hour)
        mask = ~np.isnan(tensor)
        error = (pred - tensor)[mask].flatten()
        return np.sqrt((error**2).mean())

    mg = multigrad(cost, argnums=args_num)

    sizes = [(x, r) for x in tensor.shape]
    # ADr
    sizes[-2] = (tensor.shape[1], tensor.shape[-2], r)
    # ATr
    sizes[-1] = (tensor.shape[1], tensor.shape[-1], r)
    home = np.random.rand(*sizes[0])
    appliance = np.random.rand(*sizes[1])
    day = np.random.rand(*sizes[2])
    hour = np.random.rand(*sizes[3])

    sum_home = np.zeros_like(home)
    sum_appliance = np.zeros_like(appliance)
    sum_day = np.zeros_like(day)
    sum_hour = np.zeros_like(hour)

    # GD procedure
    for i in range(num_iter):
        del_home, del_appliance, del_day, del_hour = mg(tensor, home, appliance, day, hour)

        sum_home += eps + np.square(del_home)
        lr_home = np.divide(lr, np.sqrt(sum_home))
        home -= lr_home * del_home

        sum_appliance += eps + np.square(del_appliance)
        lr_appliance = np.divide(lr, np.sqrt(sum_appliance))
        appliance -= lr_appliance * del_appliance

        sum_day += eps + np.square(del_day)
        lr_day = np.divide(lr, np.sqrt(sum_day))
        day -= lr_day * del_day

        sum_hour += eps + np.square(del_hour)
        lr_hour = np.divide(lr, np.sqrt(sum_hour))
        hour -= lr_hour * del_hour

        # Projection to non-negative space
        home[home < 0] = 1e-8
        appliance[appliance < 0] = 1e-8
        day[day < 0] = 1e-8
        hour[hour < 0] = 1e-8

        if i % 50 == 0:
            print(cost(tensor, home, appliance, day, hour), i)
            sys.stdout.flush()

    return home, appliance, day, hour
def _control(self, xs, us, k, K, alpha=1):
    xs_new = np.zeros_like(xs)
    us_new = np.zeros_like(us)
    xs_new[0] = xs[0].copy()

    for i in range(self.horizon):
        us_new[i] = us[i] + alpha * k[i] + K[i].dot(xs_new[i] - xs[i])
        xs_new[i + 1] = self.dynamics(xs_new[i], us_new[i])

    return xs_new, us_new
def job_per_round(f, x0, obsv, avg=True, decay=True, callback=None, **kwargs):
    '''ASGD, requiring index of observation be passed to loss func'''
    x_avg = x0.copy()  # running avg optimal
    x_hat = x0.copy()  # per-round optimal
    reg = kwargs.get('reg', 1e-2)
    n_rep = kwargs.get('n_rep', 10)  # gradient steps per observation
    γ0 = kwargs.get('learning_rate', 0.1)
    epochs = kwargs.get('epochs', 2)
    # for ADAM
    b1 = kwargs.get('b1', 0.9)
    b2 = kwargs.get('b2', 0.999)
    eps = kwargs.get('eps', 10**-8)

    η = γ0  # init
    m = np.zeros_like(x0)
    v = np.zeros_like(x0)
    μ = 1
    if callback is None:
        callback = lambda *args, **kws: None

    for epoch in range(epochs):
        samp = random.sample(obsv, k=len(obsv))
        for n, a in tqdm(enumerate(samp, 1), total=len(samp) - 1):
            n_iter = (epoch * len(obsv) + n) * n_rep
            for i in range(n_iter, n_iter + n_rep):
                f_inst = partial(f, obsv, n - 1)
                g = grad(f_inst)(x_hat, reg=reg)
                m = (1 - b1) * g + b1 * m  # First moment estimate.
                v = (1 - b2) * (g**2) + b2 * v  # Second moment estimate.
                mhat = m / (1 - b1**(i + 1))  # Bias correction.
                vhat = v / (1 - b2**(i + 1))
                # exp. learning rate decay
                if decay:
                    η = γ0 * (1 + γ0 * reg * i / len(samp))**(-.75)
                # step w/ momentum
                x_hat = x_hat - η * mhat / (np.sqrt(vhat) + eps)
                # x_hat = x_hat - η * g  # no momentum

                # Averaging
                if avg:
                    μ = 1. / np.max([1., i - x_hat.size, i - len(obsv)])
                    x_avg = x_avg + μ * (x_hat - x_avg)
                else:
                    x_avg = x_hat
            P = loss._softmax(loss._symmetrize(x_avg), axis=1)
            callback(n_iter, P, text=f'η={η:.2e}\nμ={μ:.2e}')

    return x_avg
def factorization(tensor, num_latent, num_iter=2000, lr=1, dis=False,
                  random_seed=0, eps=1e-8, T_known=None):
    np.random.seed(random_seed)
    cost = cost_abs
    args_num = [0, 1, 2]
    mg = autograd.multigrad(cost, argnums=args_num)

    M, N, K = tensor.shape
    H = np.random.rand(M, num_latent)
    A = np.random.rand(N, num_latent)
    T = np.random.rand(K, num_latent)
    sum_square_gradients_A = np.zeros_like(A)
    sum_square_gradients_H = np.zeros_like(H)
    sum_square_gradients_T = np.zeros_like(T)
    if T_known is not None:
        T = set_known(T, T_known)

    # GD procedure
    for i in range(num_iter):
        del_h, del_a, del_t = mg(H, A, T, tensor)

        sum_square_gradients_A += eps + np.square(del_a)
        lr_a = np.divide(lr, np.sqrt(sum_square_gradients_A))
        A -= lr_a * del_a

        sum_square_gradients_H += eps + np.square(del_h)
        sum_square_gradients_T += eps + np.square(del_t)
        lr_h = np.divide(lr, np.sqrt(sum_square_gradients_H))
        lr_t = np.divide(lr, np.sqrt(sum_square_gradients_T))
        H -= lr_h * del_h
        T -= lr_t * del_t
        if T_known is not None:
            T = set_known(T, T_known)

        # Projection to non-negative space
        H[H < 0] = 1e-8
        A[A < 0] = 1e-8
        T[T < 0] = 1e-8

        if i % 500 == 0:
            if dis:
                print(cost(H, A, T, tensor))

    return H, A, T
def _get_grad_log_post(W1D, Wprior, H, y, X, testing=False):
    """Returns multinomial gradient of the negative log posterior probability with C classes.

    Parameters
    ----------
    W1D : array-like, shape (C*p, )
        Flattened vector of parameters at which the negative log posterior is to be evaluated
    Wprior : array-like, shape (C, p)
        vector of prior means on the parameters to be fit
    H : array-like, shape (C*p, C*p) or independent between classes (C, p, p)
        Array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        vector of integer class labels ({0, 1, ..., C-1} possible responses)
    X : array-like, shape (N, p)
        array of features

    Returns
    -------
    grad_log_post1D : array-like, shape (C*p, )
        Flattened gradient of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)
    """
    # calculate gradient of the log posterior
    C, p = Wprior.shape
    W = W1D.reshape(C, p)
    mu = _get_softmax_probs(X, W)  # shape (N, C)
    grad_log_likelihood = np.zeros_like(W)
    grad_log_prior = np.zeros_like(W)
    for c in range(C):
        if H.shape == (C, p, p):
            grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))
            K = (W[c] - Wprior[c]).reshape(-1)
            grad_log_prior[c] = H[c] @ K
        elif H.shape == (C * p, C * p):
            grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))
    if H.shape == (C * p, C * p):
        K = (W - Wprior).reshape(-1)  # change to shape (C*p, )
        grad_log_prior = H @ K
        grad_log_prior = grad_log_prior.reshape(C, p)  # change to shape (C, p)
    grad_log_posterior = grad_log_likelihood + grad_log_prior
    grad_log_post1D = grad_log_posterior.reshape(-1)
    if testing:
        return [grad_log_post1D, grad_log_likelihood.reshape(-1), grad_log_prior.reshape(-1)]
    else:
        return grad_log_post1D
def test_grad_hmm_normalizer(T=10, K=3):
    pi0, Ps, ll = make_parameters(T, K)
    dlogpi0, dlogPs, dll = np.zeros_like(pi0), np.zeros_like(Ps), np.zeros_like(ll)

    alphas = np.zeros((T, K))
    forward_pass(pi0, Ps, ll, alphas)
    grad_hmm_normalizer(np.log(Ps), alphas, dlogpi0, dlogPs, dll)

    assert np.allclose(dlogpi0 / pi0, grad(hmm_normalizer_np, argnum=0)(pi0, Ps, ll))
    assert np.allclose(dlogPs / Ps, grad(hmm_normalizer_np, argnum=1)(pi0, Ps, ll))
    assert np.allclose(dll, grad(hmm_normalizer_np, argnum=2)(pi0, Ps, ll))
def test_grad_hmm_normalizer(T=1000, K=3):
    log_pi0, log_Ps, ll = make_parameters(T, K)
    dlog_pi0, dlog_Ps, dll = np.zeros_like(log_pi0), np.zeros_like(log_Ps), np.zeros_like(ll)

    alphas = np.zeros((T, K))
    forward_pass(-np.log(K) * np.ones(K), log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    assert np.allclose(dlog_pi0, grad(hmm_normalizer_np, argnum=0)(log_pi0, log_Ps, ll))
    assert np.allclose(dlog_Ps, grad(hmm_normalizer_np, argnum=1)(log_pi0, log_Ps, ll))
    assert np.allclose(dll, grad(hmm_normalizer_np, argnum=2)(log_pi0, log_Ps, ll))
def compute_f_fprime_t_avg_12_(W1, W2, perturbation, max_dist=1, burn_in=0.5):
    # max dist added 10/14/20
    #Wmx,Wmy,Wsx,Wsy,s02,Kin,Kout,kappa,Tin,Tout,XX,XXp,Eta,Xi,h1,h2 = parse_W(W)
    (W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, W0xrun, W0yrun, s02, Kin0, Kin1,
     Kxout0, Kyout0, Kxout1, Kyout1, kappa, Tin0, Tin1, Txout0, Tyout0,
     Txout1, Tyout1, h1, h2, bl, amp) = parse_W1(W1)
    XX, XXp, Eta, Xi = parse_W2(W2)
    fval = compute_f_(Eta, Xi, s02)
    fprimeval = compute_fprime_(Eta, Xi, s02)
    u0, u1, u2, u3 = compute_us(W1, W2, fval, fprimeval)
    if share_residuals:
        resEta = Eta - u0 - u2
        resXi = Xi - u1 - u3
        resEta12 = np.concatenate((resEta, resEta), axis=0)
        resXi12 = np.concatenate((resXi, resXi), axis=0)
    else:
        resEta12 = 0
        resXi12 = 0
    dHH = np.zeros((nN, nQ * nS * nT))
    dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
    dHH = np.concatenate((dHH * h1, dHH * h2), axis=0)
    YY = fval + perturbation
    YYp = fprimeval
    XX12 = np.concatenate((XX, XX), axis=0)
    YY12 = np.concatenate((YY, YY), axis=0)
    YYp12 = np.concatenate((YYp, YYp), axis=0)
    YYmean = np.zeros_like(YY12)
    YYprimemean = np.zeros_like(YY12)

    def dYYdt(YY, Eta1, Xi1):
        return -YY + compute_f_(Eta1, Xi1, s02)

    def dYYpdt(YYp, Eta1, Xi1):
        return -YYp + compute_fprime_(Eta1, Xi1, s02)

    for t in range(niter):
        if np.mean(np.abs(YY - fval)) < max_dist:
            u0, u1, u2, u3 = compute_us(W1, W2, YY12, YYp12)
            Eta121 = resEta12 + u0 + u2 + dHH
            Xi121 = resXi12 + u1 + u3
            YY12 = YY12 + dt * dYYdt(YY12, Eta121, Xi121)
            YYp12 = YYp12 + dt * dYYpdt(YYp12, Eta121, Xi121)
        elif np.remainder(t, 500) == 0:
            print('unstable fixed point?')
        if t > niter * burn_in:
            YYmean = YYmean + 1 / niter / burn_in * YY12
            YYprimemean = YYprimemean + 1 / niter / burn_in * YYp12
    #YYmean = YYmean + np.tile(bl,nS*nT)[np.newaxis,:]

    return YYmean, YYprimemean
def apply_gradient_adam(x, g, i_batch, m=None, v=None, step_size=0.001,
                        b1=0.9, b2=0.999, eps=1e-8):
    g = np.array(g)
    if m is None or v is None:
        m = np.zeros_like(x)
        v = np.zeros_like(x)
    m = (1 - b1) * g + b1 * m  # First moment estimate.
    v = (1 - b2) * (g ** 2) + b2 * v  # Second moment estimate.
    mhat = m / (1 - b1 ** (i_batch + 1))  # Bias correction.
    vhat = v / (1 - b2 ** (i_batch + 1))
    x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x, m, v
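# A usage sketch for `apply_gradient_adam` above: the first- and second-moment
# buffers are returned by each call and must be threaded into the next one. The
# toy quadratic loss and target are stand-ins, not part of the original code.
import numpy as np

x = np.zeros(3)
target = np.array([1.0, -2.0, 0.5])
m = v = None
for i_batch in range(100):
    g = 2.0 * (x - target)  # gradient of a toy quadratic loss
    x, m, v = apply_gradient_adam(x, g, i_batch, m=m, v=v, step_size=0.05)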
def StepDescent(self, parameters):
    self.para_log = np.log(parameters)
    gradient = self.grad_exp(self.para_log)
    if self.m_para is None:
        self.m_para = np.zeros_like(self.para_log)
        self.s_para = np.zeros_like(self.para_log)
    self.m_para = self.beta1 * self.m_para - (1 - self.beta1) * gradient
    self.s_para = self.beta2 * self.s_para + (1 - self.beta2) * gradient * gradient
    para_temp = self.para_log - self.step_size * self.m_para / np.sqrt(self.s_para + 1e-10)
    return np.exp(para_temp), gradient
def neg_ll(self, x, c, n, *params):
    params = np.reshape(params, (self.m, self.dist.k + 1))
    f = np.zeros_like(x)
    for i in range(self.m):
        like = self.dist.like(x, c, n, *params[i, 1::])
        like = np.multiply(params[i, 0], like)
        f = f + like
    f = np.where(f <= 0, surpyval.TINIEST, f)
    f = np.where(f < 1, f, 1)
    f = np.log(f)
    f = np.multiply(n, f)
    f = -np.sum(f)
    return f
def wake(self, wake_data, it):
    reg = self.reg
    self.wake_data = wake_data.copy()
    gs = []
    nl_obs = (self.nlayer - 1)
    mean_name = "mx%dx%d_x%d" % (nl_obs - 1, nl_obs, nl_obs)
    fun_name = "x%d->x%dx%d" % (nl_obs, nl_obs - 1, nl_obs)
    self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)

    if self.layer_plastic[-1]:
        grad_name = "x%d->dlogp%d" % (nl_obs, nl_obs)
        g = reg.predict(self.wake_data, grad_name)
        #g = self.approx_E(mean_name, grad_name)
        self.wake_data["dlogp%d" % (nl_obs)] = g
        gs.insert(0, g.mean(0))
    else:
        gs.insert(0, np.zeros_like(self.model.dists[-1].ps))

    for i in range(nl_obs - 1, 0, -1):
        mean_name = "mx%dx%d_x%d" % (i - 1, i, nl_obs)
        fun_name = "mx%dx%d_x%d->x%dx%d" % (i, i + 1, nl_obs, i - 1, i)
        self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)
        if self.layer_plastic[i]:
            grad_name = "x%dx%d->dlogp%d" % (i - 1, i, i)
            g = self.approx_E(mean_name, grad_name)
            self.wake_data["dlogp%d" % i] = g
            gs.insert(0, g.mean(0))
        else:
            gs.insert(0, np.zeros_like(self.model.dists[i].ps))

    if self.layer_plastic[0]:
        mean_name = "mx0_x%d" % (nl_obs)
        fun_name = "mx0x1_x%d->x0" % (nl_obs)
        self.wake_data[mean_name] = reg.predict(self.wake_data, fun_name)
        g = self.approx_E(mean_name, "x0->dlogp0")
        self.wake_data["dlogp0"] = g
        gs.insert(0, g.mean(0))
    else:
        gs.insert(0, np.zeros_like(self.model.dists[0].ps))

    gs = np.concatenate(gs)
    self.gradient_step(gs, it)
def gen_traces(datafiles, blcutoff=blcutoff, blspan=blspan):  # nbefore=nbefore, nafter=nafter
    trialwise = np.array(())
    ctrialwise = np.array(())
    strialwise = np.array(())
    dfofall = np.array(())
    baselineall = np.array(())
    for datafile in datafiles:
        frm = sio.loadmat(datafile.replace('.rois', '.mat'), squeeze_me=True)['info']['frame'][()][1:]
        with h5py.File(datafile, mode='r') as f:
            to_add = f['corrected'][:].T
            to_add[np.isnan(to_add)] = np.nanmin(to_add)
            # baseline = np.percentile(to_add,blcutoff,axis=1)
            baseline = sfi.percentile_filter(to_add[:, ::ds], blcutoff, (1, int(blspan / ds)))
            baseline = np.repeat(baseline, ds, axis=1)
            for i in range(baseline.shape[0]):
                baseline[i] = sfi.gaussian_filter1d(baseline[i], blspan / 2)
            # if baseline.shape[1]<to_add.shape[1]:
            #     baseline = np.hstack((baseline,np.repeat(baseline[:,-1],to_add.shape[1]-baseline.shape[1])))
            if baseline.shape[1] > to_add.shape[1]:
                baseline = baseline[:, :to_add.shape[1]]
            c = np.zeros_like(to_add)
            s = np.zeros_like(to_add)
            dfof = np.zeros_like(to_add)
            for i in range(c.shape[0]):
                # dfof = (to_add[i]-baseline[i,np.newaxis])/baseline[i,np.newaxis]
                dfof[i] = (to_add[i] - baseline[i, :]) / (baseline[i, :])
                #try:
                c[i], s[i], _, _, _ = deconvolve(dfof[i].astype(np.float64), penalty=1, sn=5e-3)
                #except:
                #    print("in "+datafile+" couldn't do "+str(i))
            try:
                trialwise = np.concatenate((trialwise, to_add), axis=0)
                ctrialwise = np.concatenate((ctrialwise, c), axis=0)
                strialwise = np.concatenate((strialwise, s), axis=0)
                dfofall = np.concatenate((dfofall, dfof), axis=0)
                baselineall = np.concatenate((baselineall, baseline), axis=0)
            except:
                trialwise = to_add.copy()
                ctrialwise = c.copy()
                strialwise = s.copy()
                dfofall = dfof.copy()
                baselineall = baseline.copy()
    return trialwise, ctrialwise, strialwise, dfofall, baselineall
def compute_f_fprime_t_avg_12_(W1, W2, perturbation, max_dist=1, burn_in=0.5):
    # max dist added 10/14/20
    #Wmx,Wmy,Wsx,Wsy,s02,K,kappa,T,XX,XXp,Eta,Xi,h1,h2 = parse_W(W)
    Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, h1, h2 = parse_W1(W1)
    XX, XXp, Eta, Xi = parse_W2(W2)
    fval = compute_f_(Eta, Xi, s02)
    fprimeval = compute_fprime_(Eta, Xi, s02)
    if share_residuals:
        resEta = Eta - u_fn(XX, fval, Wmx, Wmy, K, kappa, T)
        resXi = Xi - u_fn(XX, fval, Wsx, Wsy, K, kappa, T)
        resEta12 = np.concatenate((resEta, resEta), axis=0)
        resXi12 = np.concatenate((resXi, resXi), axis=0)
    else:
        resEta12 = 0
        resXi12 = 0
    dHH = np.zeros((nN, nQ * nS * nT))
    dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
    dHH = np.concatenate((dHH * h1, dHH * h2), axis=0)
    YY = fval + perturbation
    YYp = fprimeval
    XX12 = np.concatenate((XX, XX), axis=0)
    YY12 = np.concatenate((YY, YY), axis=0)
    YYp12 = np.concatenate((YYp, YYp), axis=0)
    YYmean = np.zeros_like(YY12)
    YYprimemean = np.zeros_like(YY12)

    def dYYdt(YY, Eta1, Xi1):
        return -YY + compute_f_(Eta1, Xi1, s02)

    def dYYpdt(YYp, Eta1, Xi1):
        return -YYp + compute_fprime_(Eta1, Xi1, s02)

    for t in range(niter):
        if np.mean(np.abs(YY - fval)) < max_dist:
            Eta121 = resEta12 + u_fn(XX12, YY12, Wmx, Wmy, K, kappa, T) + dHH
            Xi121 = resXi12 + u_fn(XX12, YY12, Wmx, Wmy, K, kappa, T)
            YY12 = YY12 + dt * dYYdt(YY12, Eta121, Xi121)
            YYp12 = YYp12 + dt * dYYpdt(YYp12, Eta121, Xi121)
        elif np.remainder(t, 500) == 0:
            print('unstable fixed point?')
        if t > niter * burn_in:
            YYmean = YYmean + 1 / niter / burn_in * YY12
            YYprimemean = YYprimemean + 1 / niter / burn_in * YYp12

    return YYmean, YYprimemean
def updateEmissionDist(self, msg, node_smoothed):
    emission_dist_numerator = np.zeros_like(msg.emission_dist)
    emission_dist_denominator = np.zeros_like(msg.pi0)

    # Update the emission distribution
    for node, ys in zip(msg.nodes, msg.ys):
        measurements = ys.shape[0]
        for y in ys:
            emission_dist_numerator[:, y] += node_smoothed[node]
        emission_dist_denominator += node_smoothed[node] * measurements

    self.emission_dist.params = (emission_dist_numerator / emission_dist_denominator[:, None],)
    assert np.allclose(self.emission_dist.params[0].sum(axis=-1), 1.0)
def marginal(self, kernel):
    """
    calculates marginal likelihood

    Args:
        Ks_new: new covariance if needed
    Returns:
        np.array for marginal likelihood
    """
    if kernel.params is not None:
        self.Ks = self.construct_Ks()
        self.alpha = np.zeros([self.X.shape[0]])
        self.W = np.zeros([self.X.shape[0]])
        self.grads = np.zeros([self.X.shape[0]])
        self.f = self.mu
        self.f_pred = self.f
        self.run(10)

    Ks = self.Ks
    eigs = [np.expand_dims(np.linalg.eig(K)[0], 1) for K in Ks]
    eig_K = np.squeeze(kron_list(eigs))
    self.eig_K = eig_K

    if self.obs_idx is not None:
        f_lim = self.f[self.obs_idx]
        alpha_lim = self.alpha[self.obs_idx]
        mu_lim = self.mu[self.obs_idx]
        W_lim = self.W[self.obs_idx]
        eig_k_lim = eig_K[self.obs_idx]

        pen = -0.5 * np.sum(np.multiply(alpha_lim, f_lim - mu_lim))
        pen = np.where(np.isnan(pen), np.zeros_like(pen), pen)
        eigs = 0.5 * np.sum(np.log(1 + np.multiply(eig_k_lim, W_lim)))
        eigs = np.where(np.isnan(eigs), np.zeros_like(eigs), eigs)
        like = np.sum(self.likelihood.log_like(f_lim, self.y))
        like = np.where(np.isnan(like), np.zeros_like(like), like)
        return -(pen + eigs + like)

    pen = -0.5 * np.sum(np.multiply(self.alpha, self.f - self.mu))
    eigs = -0.5 * np.sum(np.log(1 + np.multiply(eig_K, self.W)))
    like = np.sum(self.likelihood.log_like(self.f, self.y))
    return -(pen + eigs + like)
def adadelta(paramvec, loss, batches, epochs=1, rho=0.95, epsilon=1e-6, callback=None):
    sum_gsq = np.zeros_like(paramvec)
    sum_usq = np.zeros_like(paramvec)
    vals = []

    for epoch in range(epochs):
        permuted_batches = [batches[i] for i in npr.permutation(len(batches))]
        for im, angle in permuted_batches:
            val, g = vgrad(loss)(paramvec, im, angle)
            sum_gsq = rho*sum_gsq + (1.-rho)*g**2
            ud = -np.sqrt(sum_usq + epsilon) / np.sqrt(sum_gsq + epsilon) * g
            sum_usq = rho*sum_usq + (1.-rho)*ud**2
            paramvec = paramvec + ud
            vals.append(val)
        if callback:
            callback(epoch, paramvec, vals, permuted_batches)

    return paramvec
def backward_pass(self, delta):
    if len(delta.shape) == 2:
        delta = delta[:, np.newaxis, :]
    n_samples, n_timesteps, input_shape = delta.shape
    p = self._params

    # Temporal gradient arrays
    grad = {k: np.zeros_like(p[k]) for k in p.keys()}

    dh_next = np.zeros((n_samples, input_shape))
    output = np.zeros((n_samples, n_timesteps, self.input_dim))

    # Backpropagation through time
    for i in reversed(range(n_timesteps)):
        dhi = self.activation_d(self.states[:, i, :]) * (delta[:, i, :] + dh_next)

        grad['W'] += np.dot(self.last_input[:, i, :].T, dhi)
        grad['b'] += delta[:, i, :].sum(axis=0)
        grad['U'] += np.dot(self.states[:, i - 1, :].T, dhi)

        dh_next = np.dot(dhi, p['U'].T)

        d = np.dot(delta[:, i, :], p['U'].T)
        output[:, i, :] = np.dot(d, p['W'].T)

    # Change actual gradient arrays
    for k in grad.keys():
        self._params.update_grad(k, grad[k])
    return output
def to_unconstrained_arr(p):
    """ Numerically stable transform from positive reals to real line

    Implements ag_np.log(ag_np.exp(p) - 1.0)

    Autograd friendly and fully vectorized

    Args
    ----
    p : array of values in (0, +\infty)

    Returns
    -------
    ans : array of values in (-\infty, +\infty), same size as p
    """
    ## Handle numpy array case
    if not isinstance(p, float):
        mask1 = p > 10.0
        mask0 = ag_np.logical_not(mask1)
        out = ag_np.zeros_like(p)
        out[mask0] = ag_np.log(ag_np.expm1(p[mask0]))
        out[mask1] = p[mask1] + ag_np.log1p(-ag_np.exp(-p[mask1]))
        return out
    ## Handle scalar float case
    else:
        if p > 10:
            return p + ag_np.log1p(-ag_np.exp(-p))
        else:
            return ag_np.log(ag_np.expm1(p))
def to_common_arr(x):
    """ Numerically stable transform from real line to positive reals

    Returns ag_np.log(1.0 + ag_np.exp(x))

    Autograd friendly and fully vectorized

    Args
    ----
    x : array of values in (-\infty, +\infty)

    Returns
    -------
    ans : array of values in (0, +\infty), same size as x
    """
    ## Handle numpy array case
    if not isinstance(x, float):
        mask1 = x > 0
        mask0 = ag_np.logical_not(mask1)
        out = ag_np.zeros_like(x)
        out[mask0] = ag_np.log1p(ag_np.exp(x[mask0]))
        out[mask1] = x[mask1] + ag_np.log1p(ag_np.exp(-x[mask1]))
        return out
    ## Handle scalar float case
    if x > 0:
        return x + ag_np.log1p(ag_np.exp(-x))
    else:
        return ag_np.log1p(ag_np.exp(x))
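# A quick numerical check of the two transforms above: `to_common_arr`
# (softplus) and `to_unconstrained_arr` (inverse softplus) should round-trip to
# floating-point precision, including the large-argument branches selected by
# the masks. Only NumPy is needed to drive the check.
import numpy as np

x = np.array([-20.0, -1.0, 0.0, 1.0, 5.0, 20.0, 50.0])
p = to_common_arr(x)                # softplus: log(1 + exp(x)), all positive
x_back = to_unconstrained_arr(p)    # inverse softplus: log(exp(p) - 1)
assert np.all(p > 0)
assert np.allclose(x, x_back)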
def lorenz96_ode(t, x, params):
    '''
    This is the ODE function in eq.(19) of the paper. This function will be
    sent as a parameter to the scipy.integrate.ode(). This function is called
    in the simulate.py file.

    Input:
        t: time. This should always be the first parameter. This is required
           by the scipy.integrate.ode().
        x: d-dimensional state at time t.
        params: 1-dimensional parameter.

    Output:
        d-dimensional derivative dx/dt=[x0_dot, x1_dot,...].
    '''
    F = params[0]
    T = x.shape[0]
    xdot = np.zeros_like(x)
    xdot[0] = ((x[1] - x[T - 2]) * x[T - 1]) - x[0]
    xdot[1] = ((x[2] - x[T - 1]) * x[0]) - x[1]
    xdot[T - 1] = ((x[0] - x[T - 3]) * x[T - 2]) - x[T - 1]
    for i in range(2, T - 1):
        xdot[i] = ((x[i + 1] - x[i - 2]) * x[i - 1]) - x[i]
    xdot = xdot + F
    return xdot
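# A small integration sketch for `lorenz96_ode` above, following the
# scipy.integrate.ode usage its docstring mentions. The dimension (40), forcing
# (F = 8.0), and time grid are illustrative choices, not values taken from the
# paper.
import numpy as np
from scipy.integrate import ode

d = 40
F = 8.0
x0 = F * np.ones(d)
x0[0] += 0.01                                  # small perturbation off the fixed point

solver = ode(lorenz96_ode).set_integrator('dopri5')
solver.set_initial_value(x0, 0.0).set_f_params(np.array([F]))

trajectory = [x0]
while solver.successful() and solver.t < 5.0:
    trajectory.append(solver.integrate(solver.t + 0.01))
trajectory = np.array(trajectory)              # shape (n_steps + 1, d)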
def monomial(x, y, x_test):
    n = len(x)
    A = np.vander(x, increasing=True)
    c = np.linalg.solve(A, y)
    # Evaluate the interpolating polynomial at x_test with Horner's rule
    y_test = np.zeros_like(x_test)
    for j in range(n - 1, -1, -1):
        y_test = np.multiply(y_test, x_test) + c[j]
    return y_test
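# A usage sketch for `monomial` above: fit the degree-2 interpolating polynomial
# through three points of y = x**2 and evaluate it elsewhere. Assumes only NumPy.
import numpy as np

x = np.array([0.0, 1.0, 2.0])
y = x**2
x_test = np.array([0.5, 1.5, 3.0])
y_test = monomial(x, y, x_test)    # expect approximately [0.25, 2.25, 9.0]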
def manual_grads(params): """ Compute the gradient of the loss WRT the parameters Ordering of the operations is reverse of that in fprop() """ deltas = {} for key, val in params.iteritems(): deltas[key] = np.zeros_like(val) loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases = self.stats dd = [{} for _ in range(self.heads)] for t in reversed(xrange(len(targets))): if t < len(inputs) - 1: for idx in range(self.heads): # grab gradient from the future dnext = dd[idx][t+1] # propagate the gradients to the first input of read(). dread1 = np.dot(mems[t-1], dnext) # propagate the gradients to the second input of read(). dread2 = np.dot(w_rs[idx][t], dnext.T) # TODO: propagate the gradients through write() pass ts = np.reshape(np.array(targets[t]),(self.out_size,1)) # gradient of cross entropy loss function. dt = (ps[t] - ts) / (math.log(2) * ps[t] * (1 - ps[t])) # propagate the gradient backwards through the flow graph, # updating parameters as we go dt *= sigmoid_prime(ys[t]) deltas['oy'] = np.dot(dt, os[t].T) deltas['by'] = dt if t < len(inputs) - 1: for idx in range(self.heads): # TODO: Update parameters oadds, oerases, ok_r, bbeta_r, og_r, os_r, ok_w ... # use dread1 and dread2 computed above as the starting gradients pass dt = np.dot(params['oy'].T, dt) dt *= tanh_prime(zos[t]) deltas['ho'] = np.dot(dt, hs[t].T) deltas['bo'] = dt dt = np.dot(params['ho'].T, dt) dt *= tanh_prime(zhs[t]) deltas['xh'] = np.dot(dt, xs[t].T) deltas['bh'] = dt for idx in range(self.heads): deltas['rh' + str(idx)] += np.dot(dt, rs[idx][t-1].reshape((self.M, 1)).T) # save the gradient for propagating backwards through time dd[idx][t] = np.dot(params['rh' + str(idx)].T, dt) return deltas
def rmsprop(data, paramvec, loss, batch_size, rate, epochs=1, rho=0.9, epsilon=1e-6, callback=None):
    sumsq = np.zeros_like(paramvec)
    vals = []
    for epoch in range(epochs):
        for minibatch in make_batches(batch_size, data):
            val, g = vgrad(loss)(paramvec, *minibatch)
            sumsq = rho*sumsq + (1.-rho)*g**2
            paramvec = paramvec - rate * g / np.sqrt(sumsq + epsilon)
            vals.append(val)
        if callback:
            callback(epoch, paramvec, vals)
    return paramvec
def compute_rotated_map(self, rotation):
    """
    Compute stellar maps projected on the plane of the sky for a given rotation of the star

    Args:
        rotation (float) : rotation around the star in degrees given as [longitude, latitude] in degrees

    Returns:
        pixel_unique (int) : vector with the "active" healpix pixels
        pixel_map (int) : map showing the healpix pixel projected on the plane of the sky
        mu_pixel (float): map of the astrocentric angle for each pixel on the plane of the sky
            (zero for pixels not in the star)
        T_pixel (float): map of temperatures for each pixel on the plane of the sky
    """
    mu_pixel = np.zeros_like(self.mu_angle)
    T_pixel = np.zeros_like(self.mu_angle)

    # Get the projection of the healpix pixel indices on the plane of the sky
    pixel_map = self.projector.projmap(self.indices, self.f_vec2pix, rot=rotation)[:, 0:int(self.npix / 2)]

    # Get the unique elements in the vector
    pixel_unique = np.unique(pixel_map)

    # Now loop over all unique pixels, filling up the array of the projected map
    # with the mu and temperature values
    for j in range(len(pixel_unique)):
        ind = np.where(pixel_map == pixel_unique[j])
        if (np.all(np.isfinite(self.mu_angle[ind[0], ind[1]]))):
            if (self.mu_angle[ind[0], ind[1]].size == 0):
                value = 0.0
            else:
                value = np.nanmean(self.mu_angle[ind[0], ind[1]])
            mu_pixel[ind[0], ind[1]] = value
            T_pixel[ind[0], ind[1]] = self.temperature_map[int(pixel_unique[j])]
        else:
            mu_pixel[ind[0], ind[1]] = 0.0
            T_pixel[ind[0], ind[1]] = 0.0

    return pixel_unique, pixel_map, mu_pixel, T_pixel
def backward_pass(self, delta):
    if len(delta.shape) == 2:
        delta = delta[:, np.newaxis, :]
    n_samples, n_timesteps, input_shape = delta.shape

    # Temporal gradient arrays
    grad = {k: np.zeros_like(self._params[k]) for k in self._params.keys()}

    dh_next = np.zeros((n_samples, input_shape))
    output = np.zeros((n_samples, n_timesteps, self.input_dim))

    # Backpropagation through time
    for i in reversed(range(n_timesteps)):
        dhi = delta[:, i, :] * self.gates['o'][:, i, :] * self.activation_d(self.states[:, i, :]) + dh_next

        og = delta[:, i, :] * self.activation(self.states[:, i, :])
        de_o = og * self.sigmoid_d(self.gates['o'][:, i, :])

        grad['W_o'] += np.dot(self.last_input[:, i, :].T, de_o)
        grad['U_o'] += np.dot(self.outputs[:, i - 1, :].T, de_o)
        grad['b_o'] += de_o.sum(axis=0)

        de_f = (dhi * self.states[:, i - 1, :]) * self.sigmoid_d(self.gates['f'][:, i, :])
        grad['W_f'] += np.dot(self.last_input[:, i, :].T, de_f)
        grad['U_f'] += np.dot(self.outputs[:, i - 1, :].T, de_f)
        grad['b_f'] += de_f.sum(axis=0)

        de_i = (dhi * self.gates['c'][:, i, :]) * self.sigmoid_d(self.gates['i'][:, i, :])
        grad['W_i'] += np.dot(self.last_input[:, i, :].T, de_i)
        grad['U_i'] += np.dot(self.outputs[:, i - 1, :].T, de_i)
        grad['b_i'] += de_i.sum(axis=0)

        de_c = (dhi * self.gates['i'][:, i, :]) * self.activation_d(self.gates['c'][:, i, :])
        grad['W_c'] += np.dot(self.last_input[:, i, :].T, de_c)
        grad['U_c'] += np.dot(self.outputs[:, i - 1, :].T, de_c)
        grad['b_c'] += de_c.sum(axis=0)

        dh_next = dhi * self.gates['f'][:, i, :]

    # TODO: propagate error to the next layer

    # Change actual gradient arrays
    for k in grad.keys():
        self._params.update_grad(k, grad[k])
    return output
def getDiffs(model, deltas, inputs, targets, epsilon):
    """
    For every (weight, delta) combo in zip(weights, deltas):
        Add epsilon to that weight and compute the loss (first_loss)
        Remove epsilon from that weight and compute the loss (second_loss)
        Check how close (first_loss - second_loss) / 2h is to the delta from bprop
    """
    diff_tensors = []
    for D in deltas:
        diff_tensors.append(np.zeros_like(D))

    for W, D, diffs in zip(model.weights, deltas, diff_tensors):
        # for each weight tensor in our model
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                # for each weight in that tensor
                # compute f(x+h) for that weight
                W[i, j] += epsilon
                loss, _, _, _, _, _, _ = model.lossFun(inputs, targets, False)
                loss_plus = np.sum(loss)

                # compute f(x - h) for that weight
                W[i, j] -= epsilon * 2
                loss, _, _, _, _, _, _ = model.lossFun(inputs, targets, False)
                loss_minus = np.sum(loss)

                # grad check must leave weights unchanged
                # so reset the weight that we changed
                W[i, j] += epsilon

                # compute the numerical grad w.r.t. this param
                grad = (loss_plus - loss_minus) / (2 * epsilon)
                diffs[i, j] = grad - D[i, j]

    return diff_tensors
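# A self-contained illustration of the same central-difference gradient check
# on a tiny least-squares problem, without the model/lossFun machinery assumed
# above. The loss, data, and tolerances are illustrative choices.
import numpy as np

def loss_fn(W, X, y):
    return 0.5 * np.sum((X @ W - y) ** 2)

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))
y = rng.normal(size=(5, 2))
W = rng.normal(size=(3, 2))
analytic = X.T @ (X @ W - y)        # closed-form gradient of the loss

epsilon = 1e-6
numeric = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        W[i, j] += epsilon
        loss_plus = loss_fn(W, X, y)
        W[i, j] -= 2 * epsilon
        loss_minus = loss_fn(W, X, y)
        W[i, j] += epsilon          # restore the weight
        numeric[i, j] = (loss_plus - loss_minus) / (2 * epsilon)

assert np.allclose(analytic, numeric, atol=1e-5)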
def manual_grads(params): """ Compute the gradient of the loss WRT the parameters Ordering of the operations is reverse of that in fprop() """ deltas = {} for key, val in params.iteritems(): deltas[key] = np.zeros_like(val) [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws, zbeta_rs, zbeta_ws, zs_rs, zs_ws, wg_rs, wg_ws] = self.stats dd = {} drs = {} dzh = {} dmem = {} # might not need this, since we have dmemtilde dmemtilde = {} du_r = {} du_w = {} dwg_r = {} dwg_w = {} for t in reversed(xrange(len(targets))): dy = np.copy(ps[t]) dy -= targets[t].T # backprop into y deltas['oy'] += np.dot(dy, os[t].T) deltas['by'] += dy if t < len(targets) - 1: # r[t] affects cost through zh[t+1] via Wrh drs[t] = np.dot(self.W['rh'].T, dzh[t + 1]) # right now, mems[t] influences cost through rs[t+1], via w_rs[t+1] dmem[t] = np.dot( w_rs[t + 1], drs[t + 1].reshape((self.M,1)).T ) # and also through mems at next step W = np.reshape(w_ws[t+1], (w_ws[t+1].shape[0], 1)) E = np.reshape(erases[t+1], (erases[t+1].shape[0], 1)) WTE = np.dot(W, E.T) KEEP = np.ones(mems[0].shape) - WTE dmem[t] += np.multiply(dmemtilde[t+1], KEEP) # and also through its influence on the content weighting next step dmem[t] += du_r[t+1] + du_w[t+1] dmemtilde[t] = dmem[t] # erases[t] affects cost through mems[t], via w_ws[t] derase = np.dot(np.multiply(dmemtilde[t], -mems[t-1]).T, w_ws[t]) # zerase affects just erases through a sigmoid dzerase = derase * (erases[t] * (1 - erases[t])) # adds[t] affects costs through mems[t], via w_ws dadd = np.dot(dmem[t].T, w_ws[t]) # zadds affects just adds through a tanh dzadd = dadd * (1 - adds[t] * adds[t]) # dbadds is just dzadds deltas['badds'] += dzadd deltas['oadds'] += np.dot(dzadd, os[t].T) deltas['berases'] += dzerase deltas['oerases'] += np.dot(dzerase, os[t].T) # # read weights affect what is read, via what's in mems[t-1] # dwc_r = np.dot(mems[t-1], drs[t]) # # write weights affect mem[t] through adding # dwc_w = np.dot(dmem[t], adds[t]) # # they also affect memtilde[t] through erasing # dwc_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t]) dw_r = np.dot(mems[t-1], drs[t]) dw_r += dwg_r[t+1] * (1 - g_rs[t+1]) # write weights affect mem[t] through adding dw_w = np.dot(dmem[t], adds[t]) # they also affect memtilde[t] through erasing dw_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t]) dw_w += dwg_w[t+1] * (1 - g_ws[t+1]) sgwr = np.zeros((self.N, self.N)) sgww = np.zeros((self.N, self.N)) for i in range(self.N): sgwr[i,i] = softmax(zs_rs[t])[0] sgwr[i,(i+1) % self.N] = softmax(zs_rs[t])[2] sgwr[i,(i-1) % self.N] = softmax(zs_rs[t])[1] sgww[i,i] = softmax(zs_ws[t])[0] sgww[i,(i+1) % self.N] = softmax(zs_ws[t])[2] sgww[i,(i-1) % self.N] = softmax(zs_ws[t])[1] # right now, shifted weights are final weight dws_r = dw_r dws_w = dw_w dwg_r[t] = np.dot(sgwr.T, dws_r) dwg_w[t] = np.dot(sgww.T, dws_w) dwc_r = dwg_r[t] * g_rs[t] dwc_w = dwg_w[t] * g_ws[t] """ We need dw/dK now w has N elts and K has N elts and we want, for every elt of W, the grad of that elt w.r.t. each of the N elts of K. 
that gives us N * N things """ # first, we must build up the K values (should be taken from fprop) K_rs = [] K_ws = [] for i in range(self.N): K_rs.append(cosine_sim(mems[t-1][i, :], k_rs[t])) K_ws.append(cosine_sim(mems[t-1][i, :], k_ws[t])) # then, we populate the grads dwdK_r = np.zeros((self.N, self.N)) dwdK_w = np.zeros((self.N, self.N)) # for every row in the memory for i in range(self.N): # for every element in the weighting for j in range(self.N): dwdK_r[i,j] += softmax_grads(K_rs, softplus(zbeta_rs[t]), i, j) dwdK_w[i,j] += softmax_grads(K_ws, softplus(zbeta_ws[t]), i, j) # compute dK for all i in N # K is the evaluated cosine similarity for the i-th row of mem matrix dK_r = np.zeros_like(w_rs[0]) dK_w = np.zeros_like(w_ws[0]) # for all i in N (for every row that we've simmed) for i in range(self.N): # for every j in N (for every elt of the weighting) for j in range(self.N): # specifically, dwdK_r will change, and for write as well dK_r[i] += dwc_r[j] * dwdK_r[i,j] dK_w[i] += dwc_w[j] * dwdK_w[i,j] """ dK_r_dk_rs is a list of N things each elt of the list corresponds to grads of K_idx w.r.t. the key k_t so it should be a length N list of M by 1 vectors """ dK_r_dk_rs = [] dK_r_dmem = [] for i in range(self.N): # let k_rs be u, Mem[i] be v u = np.reshape(k_rs[t], (self.M,)) v = mems[t-1][i, :] dK_r_dk_rs.append( dKdu(u,v) ) dK_r_dmem.append( dKdu(v,u)) dK_w_dk_ws = [] dK_w_dmem = [] for i in range(self.N): # let k_ws be u, Mem[i] be v u = np.reshape(k_ws[t], (self.M,)) v = mems[t-1][i, :] dK_w_dk_ws.append( dKdu(u,v) ) dK_w_dmem.append( dKdu(v,u)) # compute delta for keys dk_r = np.zeros_like(k_rs[0]) dk_w = np.zeros_like(k_ws[0]) # for every one of M elt of dk_r for i in range(self.M): # for every one of the N Ks for j in range(self.N): # add delta K_r[j] * dK_r[j] / dk_r[i] # add influence on through K_r[j] dk_r[i] += dK_r[j] * dK_r_dk_rs[j][i] dk_w[i] += dK_w[j] * dK_w_dk_ws[j][i] # these represent influence of mem on next K """ Let's let du_r[t] represent the influence of mems[t-1] on the cost through the K values this is analogous to dk_w, but, k only every affects that whereas mems[t-1] will also affect what is read at time t+1 and through memtilde at time t+1 """ du_r[t] = np.zeros_like(mems[0]) du_w[t] = np.zeros_like(mems[0]) # for every row in mems[t-1] for i in range(self.N): # for every elt of this row (one of M) for j in range(self.M): du_r[t][i,j] = dK_r[i] * dK_r_dmem[i][j] du_w[t][i,j] = dK_w[i] * dK_w_dmem[i][j] # key values are activated as tanh dzk_r = dk_r * (1 - k_rs[t] * k_rs[t]) dzk_w = dk_w * (1 - k_ws[t] * k_ws[t]) deltas['ok_r'] += np.dot(dzk_r, os[t].T) deltas['ok_w'] += np.dot(dzk_w, os[t].T) deltas['bk_r'] += dzk_r deltas['bk_w'] += dzk_w dg_r = np.dot(dwg_r[t].T, (wc_rs[t] - w_rs[t-1]) ) dg_w = np.dot(dwg_w[t].T, (wc_ws[t] - w_ws[t-1]) ) # compute dzg_r, dzg_w dzg_r = dg_r * (g_rs[t] * (1 - g_rs[t])) dzg_w = dg_w * (g_ws[t] * (1 - g_ws[t])) deltas['og_r'] += np.dot(dzg_r, os[t].T) deltas['og_w'] += np.dot(dzg_w, os[t].T) deltas['bg_r'] += dzg_r deltas['bg_w'] += dzg_w # compute dbeta, which affects w_content through interaction with Ks dwcdbeta_r = np.zeros_like(w_rs[0]) dwcdbeta_w = np.zeros_like(w_ws[0]) for i in range(self.N): dwcdbeta_r[i] = beta_grads(K_rs, softplus(zbeta_rs[t]), i) dwcdbeta_w[i] = beta_grads(K_ws, softplus(zbeta_ws[t]), i) dbeta_r = np.zeros_like(zbeta_rs[0]) dbeta_w = np.zeros_like(zbeta_ws[0]) for i in range(self.N): dbeta_r[0] += dwc_r[i] * dwcdbeta_r[i] dbeta_w[0] += dwc_w[i] * dwcdbeta_w[i] # beta is activated from zbeta by 
softplus, grad of which is sigmoid dzbeta_r = dbeta_r * sigmoid(zbeta_rs[t]) dzbeta_w = dbeta_w * sigmoid(zbeta_ws[t]) deltas['obeta_r'] += np.dot(dzbeta_r, os[t].T) deltas['obeta_w'] += np.dot(dzbeta_w, os[t].T) deltas['bbeta_r'] += dzbeta_r deltas['bbeta_w'] += dzbeta_w sgsr = np.zeros((self.N, 3)) sgsw = np.zeros((self.N, 3)) for i in range(self.N): sgsr[i,1] = wg_rs[t][(i - 1) % self.N] sgsr[i,0] = wg_rs[t][i] sgsr[i,2] = wg_rs[t][(i + 1) % self.N] sgsw[i,1] = wg_ws[t][(i - 1) % self.N] sgsw[i,0] = wg_ws[t][i] sgsw[i,2] = wg_ws[t][(i + 1) % self.N] ds_r = np.dot(sgsr.T, dws_r) ds_w = np.dot(sgsw.T, dws_w) shift_act_jac_r = np.zeros((3,3)) shift_act_jac_w = np.zeros((3,3)) bf = np.array([[1.0]]) for i in range(3): for j in range(3): shift_act_jac_r[i,j] = softmax_grads(zs_rs[t], bf, i, j) shift_act_jac_w[i,j] = softmax_grads(zs_ws[t], bf, i, j) dzs_r = np.dot(shift_act_jac_r.T, ds_r) dzs_w = np.dot(shift_act_jac_w.T, ds_w) deltas['os_r'] += np.dot(dzs_r, os[t].T) deltas['os_w'] += np.dot(dzs_w, os[t].T) deltas['bs_r'] += dzs_r deltas['bs_w'] += dzs_w else: drs[t] = np.zeros_like(rs[0]) dmemtilde[t] = np.zeros_like(mems[0]) du_r[t] = np.zeros_like(mems[0]) du_w[t] = np.zeros_like(mems[0]) dwg_r[t] = np.zeros_like(w_rs[0]) dwg_w[t] = np.zeros_like(w_ws[0]) # o affects y through Woy do = np.dot(params['oy'].T, dy) if t < len(targets) - 1: # and also zadd through Woadds do += np.dot(params['oadds'].T, dzadd) do += np.dot(params['oerases'].T, dzerase) # and also through the keys do += np.dot(params['ok_r'].T, dzk_r) do += np.dot(params['ok_w'].T, dzk_w) # and also through the interpolators do += np.dot(params['og_r'].T, dzg_r) do += np.dot(params['og_w'].T, dzg_w) # and also through beta do += np.dot(params['obeta_r'].T, dzbeta_r) do += np.dot(params['obeta_w'].T, dzbeta_w) # and also through the shift values do += np.dot(params['os_r'].T, dzs_r) do += np.dot(params['os_w'].T, dzs_w) # compute deriv w.r.t. pre-activation of o dzo = do * (1 - os[t] * os[t]) deltas['ho'] += np.dot(dzo, hs[t].T) deltas['bo'] += dzo # compute hidden dh dh = np.dot(params['ho'].T, dzo) # compute deriv w.r.t. pre-activation of h dzh[t] = dh * (1 - hs[t] * hs[t]) deltas['xh'] += np.dot(dzh[t], xs[t].T) deltas['bh'] += dzh[t] # Wrh affects zh via rs[t-1] deltas['rh'] += np.dot(dzh[t], rs[t-1].reshape((self.M, 1)).T) return deltas
def precompute_rotation_maps(self, rotations=None):
    """
    Compute the averaged spectrum on the star for a given temperature map and for a given rotation

    Args:
        rotations (float) : [N_phases x 2] giving [longitude, latitude] in degrees for each phase

    Returns:
        None
    """
    if (rotations is None):
        print("Use some angles for the rotations")
        return

    self.n_phases = rotations.shape[0]
    self.avg_mu = [None] * self.n_phases
    self.avg_v = [None] * self.n_phases
    self.velocity = [None] * self.n_phases
    self.n_pixel_unique = [None] * self.n_phases
    self.n_pixels = [None] * self.n_phases
    self.pixel_unique = [None] * self.n_phases

    for loop in range(self.n_phases):
        mu_pixel = np.zeros_like(self.mu_angle)
        v_pixel = np.zeros_like(self.vel_projection)

        pixel_map = self.projector.projmap(self.indices, self.f_vec2pix, rot=rotations[loop, :])[:, 0:int(self.npix / 2)]
        pixel_unique = np.unique(pixel_map[np.isfinite(pixel_map)])

        for j in range(len(pixel_unique)):
            ind = np.where(pixel_map == pixel_unique[j])
            if (np.all(np.isfinite(self.mu_angle[ind[0], ind[1]]))):
                if (self.mu_angle[ind[0], ind[1]].size == 0):
                    mu_pixel[ind[0], ind[1]] = 0.0
                    v_pixel[ind[0], ind[1]] = 0.0
                else:
                    if (self.clv):
                        value = np.nanmean(self.mu_angle[ind[0], ind[1]])
                    else:
                        value = 1.0
                    mu_pixel[ind[0], ind[1]] = value

                    value = np.nanmean(self.vel_projection[ind[0], ind[1]])
                    v_pixel[ind[0], ind[1]] = value
            else:
                mu_pixel[ind[0], ind[1]] = 0.0
                v_pixel[ind[0], ind[1]] = 0.0

        self.n_pixel_unique[loop] = len(pixel_unique)
        self.avg_mu[loop] = np.zeros(self.n_pixel_unique[loop])
        self.avg_v[loop] = np.zeros(self.n_pixel_unique[loop])
        self.velocity[loop] = np.zeros(self.n_pixel_unique[loop])
        self.n_pixels[loop] = np.zeros(self.n_pixel_unique[loop], dtype='int')
        self.pixel_unique[loop] = pixel_unique.astype('int')

        for i in range(len(pixel_unique)):
            ind = np.where(pixel_map == pixel_unique[i])
            self.n_pixels[loop][i] = len(ind[0])
            self.avg_mu[loop][i] = np.unique(mu_pixel[ind[0], ind[1]])
            self.avg_v[loop][i] = np.unique(v_pixel[ind[0], ind[1]])
            self.velocity[loop][i] = self.avg_mu[loop][i] * self.avg_v[loop][i]
def cosine_sim(u, v):
    # numerator and denominator as used in the manual gradient check below
    num = np.dot(u, v)
    den2 = np.sqrt(np.sum(u * u)) * np.sqrt(np.sum(v * v)) + 1e-5
    return num / den2


if __name__ == "__main__":
    M = 5
    u = np.random.uniform(high=1, low=-1, size=(M,))
    v = np.random.uniform(high=1, low=-1, size=(M,))
    print(cosine_sim(u, v))

    # compute deltas automatically
    # just with respect to u
    cs_grad = grad(cosine_sim, argnum=0)
    auto_deltas = cs_grad(u, v)

    # compute deltas manually
    manual_deltas = np.zeros_like(auto_deltas)
    # compute the denominator
    anorm = np.sqrt(np.sum(u * u))
    bnorm = np.sqrt(np.sum(v * v))
    den2 = (anorm * bnorm) + 1e-5
    a = v / den2
    b = u / np.sum(np.square(u))
    c = cosine_sim(u, v)
    manual_deltas = a - b * c

    print("auto deltas")
    print(auto_deltas)
    print("manual deltas")
def init_grad(self):
    for key in self._params.keys():
        if key not in self._grads:
            self._grads[key] = np.zeros_like(self._params[key])