Example #1
 def grad_like(self, r, eps):
     """
     Gradient of likelihood w.r.t variational parameters
     Args:
         r (): Transformed random sample
         eps (): Random sample
     Returns: gradient w.r.t variances, gradient w.r.t mean
     """
     if self.obs_idx is not None:
         r_obs = r[self.obs_idx]
     else:
         r_obs = r
     dr = self.likelihood_grad(r_obs, self.y)
     dr[np.isnan(dr)] = 0.
     if self.obs_idx is not None:
         grad_mu = np.zeros(self.m)
         grad_mu[self.obs_idx] = dr
     else:
         grad_mu = dr
     grad_S = np.multiply(
         grad_mu,
         np.multiply(
             eps,
             np.multiply(0.5 / np.sqrt(np.exp(self.q_S)),
                         np.exp(self.q_S))))
     return grad_S, grad_mu
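A standalone sketch (an assumption, not code from the class above) of the reparameterization this gradient appears to rely on: r = mu + sqrt(exp(q_S)) * eps, so dr/dq_S = eps * 0.5 / sqrt(exp(q_S)) * exp(q_S), which is exactly the factor chained onto grad_mu above.

import numpy as np

mu = np.zeros(3)
q_S = np.log(np.full(3, 0.5))          # variational log-variances
eps = np.random.normal(size=3)
r = mu + np.sqrt(np.exp(q_S)) * eps    # reparameterized sample
dr = 2.0 * r                           # stand-in for a likelihood gradient w.r.t. r
grad_mu = dr
grad_S = grad_mu * eps * 0.5 / np.sqrt(np.exp(q_S)) * np.exp(q_S)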
Example #2
def doPDE(values, movablePts, xPoints, yPoints, xIntPoints, yIntPoints):
    # Update the values based on diffusion of the proteins to nearby cells
    D = 0.1  # diffusion parameter
    valuesT = np.transpose(values)
    adjustmentPDEX = D * nonLinearAdjustment(xPoints)
    adjustmentPDEY = D * nonLinearAdjustment(yPoints)

    #simple diffusion is just a convolution
    convolveLinear = np.array([1 * D, -2 * D, 1 * D])
    # accumulate the changes due to diffusion
    for rep in range(50):
        # print(rep)
        newValuesX = list([])
        newValuesY = list([])
        for i in range(HowManyCells):
            row = values[i] + sig.convolve(
                values[i], convolveLinear)[1:-1]  #take off first and last
            rowY = valuesT[i] + sig.convolve(
                valuesT[i], convolveLinear)[1:-1]  #take off first and last
            # non-linear diffusion, add the adjustment
            if i in xIntPoints:
                row = row + np.multiply(row, adjustmentPDEX)
            if i in yIntPoints:
                rowY = rowY + np.multiply(rowY, adjustmentPDEY)
            newValuesX.append(row)
            newValuesY.append(rowY)

        #Merge rows and transposed columns
        values = np.array(newValuesX) + np.array(newValuesY).T
        # add source at each iteration
        values = values + addSources3(xPoints, yPoints)
        #Update transposed values
        valuesT = values.T
    # return the values after all diffusion iterations (sources added each step)
    return values
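A minimal, self-contained sketch of the diffusion step doPDE is built around; only numpy and scipy.signal are assumed here, while HowManyCells, nonLinearAdjustment and addSources3 come from the surrounding project.

import numpy as np
from scipy import signal as sig

D = 0.1                                            # diffusion parameter
convolveLinear = np.array([1 * D, -2 * D, 1 * D])  # discrete Laplacian kernel
row = np.array([0., 0., 1., 0., 0.])               # one row of protein values
for _ in range(50):
    # explicit diffusion step; drop the padded ends of the convolution
    row = row + sig.convolve(row, convolveLinear)[1:-1]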
Example #3
    def forward(self, X1, X2, **kwargs):
        alpha, mean_lam, gamma, delta = self._get_params(X1, **kwargs)
        cfg1, res1, kappa1, kr_pref1, _ = self._compute_terms(
            X1, alpha, mean_lam, gamma, delta)
        if X2 is not X1:
            cfg2, res2, kappa2, kr_pref2, _ = self._compute_terms(
                X2, alpha, mean_lam, gamma, delta)
        else:
            cfg2, res2, kappa2, kr_pref2 = cfg1, res1, kappa1, kr_pref1
        res2 = anp.reshape(res2, (1, -1))
        kappa2 = anp.reshape(kappa2, (1, -1))
        kr_pref2 = anp.reshape(kr_pref2, (1, -1))
        kappa12 = self._compute_kappa(
            anp.add(res1, res2), alpha, mean_lam)
        kmat_res = anp.subtract(kappa12, anp.multiply(kappa1, kappa2))
        kmat_res = anp.multiply(kr_pref1, anp.multiply(
            kr_pref2, kmat_res))

        kmat_x = self.kernel_x(cfg1, cfg2)
        if self.encoding_delta is None:
            if delta > 0.0:
                tmpmat = anp.add(kappa1, anp.subtract(
                    kappa2, kappa12 * delta))
                tmpmat = tmpmat * (-delta) + 1.0
            else:
                tmpmat = 1.0
        else:
            tmpmat = anp.add(kappa1, anp.subtract(
                kappa2, anp.multiply(kappa12, delta)))
            tmpmat = anp.multiply(tmpmat, -delta) + 1.0

        return kmat_x * tmpmat + kmat_res
Example #4
    def step(self, *inputs):
        grad = self.flattened_grad(self.theta, *inputs)

        # optionally resample momentum 
        if self.resample_momentum > 0 and self.count % self.resample_momentum == 0:
            np.copyto(self.p, self._srng.normal(size=self.theta.shape))

        # Constant mass just defined here so that we can easily change it should we want to
        Minv  = 1.
        Minvh = 1.
        # pre-generate a sample
        sample = self._srng.normal(size=self.theta.shape) * np.sqrt(self.epsilon * 2 * self.A)
        # the SG-HMC update equations
        # update p
        self.p += - self.epsilon * Minvh * grad \
                  - self.epsilon * (self.xi - self.A) * self.p \
                  - self.epsilon * Minv * self.A * self.p  \
                  + Minvh * sample
        # in-place multiplication with epsilon to make sure
        # we have the values available in updates
        np.multiply(Minvh * self.p, self.epsilon, self.updates)
        # update theta
        self.theta += self.updates
        # update xi
        self.xi += self.epsilon * (self.p**2  - 1)
        self.xi_acc += self.xi

        # callbacks
        self.count += 1
        if self.count % self.callback_every == 0:
            #print(self.theta, (self.epsilon * r_t - self.epsilon * r_t**2))
            for callback in self.callbacks:
                callback(self.count, self)
        return self.unflatten(self.theta)
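A standalone, hedged sketch of the same SG-HMC update applied to a 1-D quadratic potential 0.5 * theta**2 (unit mass, as Minv = Minvh = 1. above); an illustration, not the class's actual usage.

import numpy as np

rng = np.random.RandomState(0)
theta, p, xi = 1.0, 0.0, 0.0
epsilon, A = 0.01, 1.0
for _ in range(100):
    grad = theta                                     # gradient of 0.5 * theta**2
    sample = rng.normal() * np.sqrt(epsilon * 2 * A)
    p += (-epsilon * grad
          - epsilon * (xi - A) * p
          - epsilon * A * p
          + sample)
    theta += epsilon * p
    xi += epsilon * (p**2 - 1)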
Example #5
    def _w_cross_hessian(self, sigma, Y, basis, beta, K_X):

        if beta is None:
            return 0
        else:

            K = self._weighted_kernel(sigma, Y, basis, K_X)
            if basis is None:
                basis_Y = Y
            else:
                basis_Y = basis

            n_y, d_y = Y.shape
            n_basis, _ = basis_Y.shape
            K_b = np.matmul(K, beta)
            b_d = np.matmul(Y, beta.T) - np.outer(
                np.ones([n_y, 1]), np.sum(np.multiply(beta, basis_Y), axis=1))

            K_b_mat = np.multiply(K, b_d)
            K_b_d = np.sum(K_b_mat, axis=1)

            K_b_y = np.matmul(K_b_mat, basis_Y)

            h = (2. * sigma) * K_b + (2. * sigma)**2 * (
                K_b_y - np.multiply(np.reshape(K_b_d, [-1, 1]), Y))
            return h
Example #6
    def forward(self, X1, X2):
        """
        Actual computation of the matrix of squared distances (see details above)

        :param X1: input data of size (n1, d)
        :param X2: input data of size (n2, d)

        Inverse bandwidths are taken internally from self._inverse_bandwidths().
        """
        # In case inverse_bandwidths is of size (1, dimension), dimension > 1,
        # ARD is handled by broadcasting
        inverse_bandwidths = anp.reshape(self._inverse_bandwidths(), (1, -1))

        if X2 is X1:
            X1_scaled = anp.multiply(X1, inverse_bandwidths)
            D = -2.0 * anp.dot(X1_scaled, anp.transpose(X1_scaled))
            X1_squared_norm = anp.sum(anp.square(X1_scaled), axis=1)
            D = D + anp.reshape(X1_squared_norm, (1, -1))
            D = D + anp.reshape(X1_squared_norm, (-1, 1))
        else:
            X1_scaled = anp.multiply(X1, inverse_bandwidths)
            X2_scaled = anp.multiply(X2, inverse_bandwidths)
            X1_squared_norm = anp.sum(anp.square(X1_scaled), axis=1)
            X2_squared_norm = anp.sum(anp.square(X2_scaled), axis=1)
            D = -2.0 * anp.matmul(X1_scaled, anp.transpose(X2_scaled))
            D = D + anp.reshape(X1_squared_norm, (-1, 1))
            D = D + anp.reshape(X2_squared_norm, (1, -1))

        return anp.abs(D)
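A self-contained sketch of the identity the branches above exploit, ||x1 - x2||^2 = ||x1||^2 + ||x2||^2 - 2 <x1, x2>, with ARD handled by broadcasting the inverse bandwidths (shapes here are assumptions for illustration).

import autograd.numpy as anp

inverse_bandwidths = anp.array([[1.0, 0.5, 2.0]])   # shape (1, d)
X1 = anp.ones((4, 3))
X2 = anp.zeros((5, 3))
X1_scaled = anp.multiply(X1, inverse_bandwidths)
X2_scaled = anp.multiply(X2, inverse_bandwidths)
D = (-2.0 * anp.matmul(X1_scaled, anp.transpose(X2_scaled))
     + anp.reshape(anp.sum(anp.square(X1_scaled), axis=1), (-1, 1))
     + anp.reshape(anp.sum(anp.square(X2_scaled), axis=1), (1, -1)))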
Example #7
File: recnn.py Project: wz1070/jets
def event_baseline_transform(params, X, n_particles_per_event=10):
    features = []

    for e in X:
        features.append(e[:n_particles_per_event])

    h_jets = np.vstack(features)
    h_jets = h_jets.reshape(len(X), n_particles_per_event, -1)

    # GRU layer
    h = np.zeros((len(X), params["rnn_b_h"].shape[0]))

    for t in range(n_particles_per_event):
        xt = h_jets[:, n_particles_per_event - 1 - t, :]
        zt = sigmoid(
            np.dot(params["rnn_W_zh"], h.T).T +
            np.dot(params["rnn_W_zx"], xt.T).T + params["rnn_b_z"])
        rt = sigmoid(
            np.dot(params["rnn_W_rh"], h.T).T +
            np.dot(params["rnn_W_rx"], xt.T).T + params["rnn_b_r"])
        ht = relu(
            np.dot(params["rnn_W_hh"],
                   np.multiply(rt, h).T).T +
            np.dot(params["rnn_W_hx"], xt.T).T + params["rnn_b_h"])
        h = np.multiply(1. - zt, h) + np.multiply(zt, ht)

    return h
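event_baseline_transform assumes sigmoid and relu helpers from its project; minimal definitions consistent with how they are used (np taken to be numpy or autograd.numpy):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def relu(x):
    return np.maximum(0.0, x)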
Example #8
def forward_step(params, X=None, cell_state_0=None, hid_state_0=None):
    hid_state = np.repeat(hid_state_0,
                          X.shape[0] - hid_state_0.shape[0] + 1,
                          axis=0)
    cell_state_1 = np.add(
        np.multiply(  # <-- forget old info
            cell_state_0,
            sigmoid(
                c([X, hid_state]) @ params['forget']['w'] +
                params['forget']['b']),  # <-- forget gate
        ),
        np.multiply(  # <-- write new info
            sigmoid(
                c([X, hid_state]) @ params['ingate']['w'] +
                params['ingate']['b']),  # <-- input gate
            np.tanh(
                c([X, hid_state]) @ params['change']['w'] +
                params['change']['b']),  # <-- change gate
        ))

    hid_state_1 = np.multiply(
        sigmoid(c([X, hid_state]) @ params['outgate']['w']),
        # 1,
        np.tanh(cell_state_1))

    return cell_state_1, hid_state_1
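A hedged usage sketch of forward_step; c is assumed to concatenate along the feature axis and sigmoid to be the logistic function (neither is defined in the snippet above).

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

c = lambda arrs: np.concatenate(arrs, axis=1)

d_x, d_h, batch = 3, 4, 2
params = {gate: {'w': np.random.randn(d_x + d_h, d_h) * 0.1,
                 'b': np.zeros(d_h)}
          for gate in ('forget', 'ingate', 'change', 'outgate')}
X = np.random.randn(batch, d_x)
cell_0 = np.zeros((batch, d_h))
hid_0 = np.zeros((1, d_h))
cell_1, hid_1 = forward_step(params, X=X, cell_state_0=cell_0, hid_state_0=hid_0)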
Example #9
    def variance(self, n_s):
        """
        Stochastic approximator of the predictive variance.
        Follows "Massively Scalable GPs".
        Args:
            n_s (int): Number of iterations to run the stochastic approximation

        Returns: Approximate predictive variance at grid points (clipped), plus the raw sample variance

        """

        if self.root_eigdecomp is None:
            self.sqrt_eig()
        if self.obs_idx is not None:
            root_K = self.root_eigdecomp[self.obs_idx, :]
        else:
            root_K = self.root_eigdecomp

        diag = kron_list_diag(self.Ks)
        samples = []
        for i in range(n_s):
            g_m = np.random.normal(size=self.m)
            g_n = np.random.normal(size=self.n)
            right_side = np.sqrt(self.W).dot(np.dot(root_K, g_m)) +\
                         np.sqrt(self.noise) * g_n
            r = self.opt.cg(self.Ks, right_side)
            if self.obs_idx is not None:
                Wr = np.zeros(self.m)
                Wr[self.obs_idx] = np.multiply(np.sqrt(self.W), r)
            else:
                Wr = np.multiply(np.sqrt(self.W), r)
            samples.append(kron_mvp(self.Ks, Wr))
        var = np.var(samples, axis=0)
        return np.clip(diag - var, 0, 1e12).flatten(), var
Example #10
def partial_derivatives(x, y, W, V, b, c):
    # Filling in some dummy values
    # THIS IS WHERE YOU WILL WRITE YOUR PARTIAL DERIVATIVES

    #Below is for dLdc
    dLdc = np.ones(c.shape)
    e = [0, 0, 1, 0]
    l = c + V @ sig(b + W @ x)
    for i in range(4):
        gf = np.exp(l[i]) / np.sum(np.exp(l))
        dLdc[i] = gf - e[i]
    
    #Below is for dLdV
    h = sig(b + W @ x)
    dLdV = dLdc @ h.T
    
    #Below is for dLdb
    dLdh = V.T @ dLdc
    s = b + W @ x
    dLdb = np.multiply(sigp(s), dLdh)
    
    #Below is for dLdW
    tmp = dLdh @ x.T
    dLdW = np.multiply(sigp(s), tmp)
    return dLdW, dLdV, dLdb, dLdc
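partial_derivatives assumes sig (elementwise sigmoid) and sigp (its derivative); minimal definitions consistent with that usage:

import numpy as np

def sig(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigp(x):
    s = sig(x)
    return s * (1.0 - s)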
Example #11
    def _grad_laplacian(self, sigma, Y, basis, K_X):

        dist = self._square_dist(Y, basis=basis)

        K = self._update_kernel(sigma, Y, basis, K_X)

        if basis is None:
            basis_Y = Y
        else:
            basis_Y = basis

        _, d = Y.shape
        #		if K_d_mat is None:
        K_d_mat = np.multiply(K, dist)
        G = 4. * (sigma**2) * ((2 + d) * K - 2. * sigma * K_d_mat)

        #		if K_d is None:
        K_d = np.sum(K_d_mat, axis=1)
        #		if self_KK is None:
        KK = np.sum(K, axis=1)

        tmp = 4. * (sigma**2) * ((2 + d) * KK - 2. * sigma * K_d)
        tmp = tmp.reshape([-1, 1])
        h = np.multiply(tmp, Y) - np.matmul(G, basis_Y)
        return h
Example #12
 def cost(coef):
     X_coef = -1 * np.matmul(X_, coef)
     z = 1 / (1 + np.exp(X_coef))
     epsilon = 1e-5
     class1 = np.multiply(y_, np.log(z + epsilon))
     class2 = np.multiply(1 - y_, np.log(1 - z + epsilon))
     ans = -(1 / y_.size) * (np.sum(class1 + class2))
     return ans
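A standalone, hedged variant of the cost above with X_ and y_ made explicit and the gradient taken via autograd (the original closes over both from an enclosing scope):

import autograd.numpy as np
from autograd import grad

X_ = np.array([[1.0, 0.5], [1.0, -1.2], [1.0, 2.0]])   # design matrix with a bias column
y_ = np.array([1.0, 0.0, 1.0])

def cost(coef):
    z = 1 / (1 + np.exp(-np.matmul(X_, coef)))
    epsilon = 1e-5
    class1 = np.multiply(y_, np.log(z + epsilon))
    class2 = np.multiply(1 - y_, np.log(1 - z + epsilon))
    return -(1 / y_.size) * np.sum(class1 + class2)

print(grad(cost)(np.zeros(2)))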
Example #13
def loss(pred, targ):
    # pred=pred/np.sum(pred)
    likelihood = np.multiply(targ, pred) + np.multiply(1.0 - targ, 1.0 - pred)
    likelihood_norm = likelihood + EPS
    log_likelihood = np.sum(np.log(likelihood_norm))
    lost = -log_likelihood
    # dist=pred-targ
    # lost=np.sum(np.linalg.norm(dist))
    return lost
Example #14
def gradphi(phi, x, z):
    dphidz1 = np.array([
        np.multiply((2 * (x - 1)**2) / (z[0]**3), phi[:, 0]),
        np.zeros(len(x))
    ]).T
    dphidz2 = np.array([
        np.zeros(len(x)),
        np.multiply((2 * (x - 5)**2) / (z[1]**3), phi[:, 1])
    ]).T
    return (np.array([dphidz1, dphidz2]))
Example #15
 def cost(coef):
     X_coef = -1 * np.matmul(X_, coef)
     z = 1 / (1 + np.exp(X_coef))
     epsilon = 1e-5
     class1 = np.multiply(y_, np.log(z + epsilon))
     class2 = np.multiply(1 - y_, np.log(1 - z + epsilon))
     ans = -(1 / y_.size) * (np.sum(class1 + class2))
     if self.penalty == "l1":
         return ans + self.val * np.sum(np.absolute(coef))
     else:
         return ans + self.val * np.sum(np.square(coef))
Example #16
        def cost(params, batch_from, batch_to):
            X_batch = X[batch_from:batch_to, :]
            Y_batch = Y[batch_from:batch_to, :]
            Z = self._forward(params, X_batch)

            A = self.layers[-1].activation_fn(Z)
            # compute cost
            logprobs = np.multiply(np.log(A), Y_batch) + np.multiply(
                np.log((1 - A)), (1 - Y_batch))
            cost = -1 * np.sum(logprobs)
            return cost
Example #17
 def forward(self, current, h_prev):
     z_in = np.matmul(current, self.params['Wiz']) + np.matmul(
         h_prev, self.params['Whz']) + self.params['bz']
     z = sigmoid(z_in)
     r_in = np.matmul(current, self.params['Wir']) + np.matmul(
         h_prev, self.params['Whr']) + self.params['br']
     r = sigmoid(r_in)
     g_in = np.matmul(current, self.params['Win']) + np.multiply(
         np.matmul(h_prev, self.params['Whn']), r) + self.params['bg']
     g = np.tanh(g_in)
     h_current = np.multiply((1 - z), g) + np.multiply(z, h_prev)
     return h_current
Example #18
 def loss(w):
     lossVal = 0
     for wi, aH in zip(w, globalAlphaHats):
         den = 1 / np.sum(np.multiply(n, wi))
         wiXA = np.multiply(wi, localAlphaHats)
         dot = np.sum(np.multiply(wiXA, n))
         tilde = den * dot
         lossVal = lossVal + .5 * np.square(aH - tilde)
         # The weights across all local estimates for each global estimate should sum to 1
         lossVal = lossVal + wOneLambda * .5 * np.sum(np.square(wi - 1))
     lossVal = lossVal + regLambda * np.linalg.norm(w)
     return lossVal
Example #19
    def _w_grad(self, sigma, Y, basis, beta, K_X):

        n_y, d_y = Y.shape
        n_basis, _ = basis.shape

        K = self._weighted_kernel(sigma, Y, basis, K_X)

        b_d = np.matmul(Y, beta.T) - np.outer(
            np.ones([n_y, 1]), np.sum(np.multiply(beta, basis), axis=1))
        K_b_mat = np.multiply(K, b_d)
        K_b_d = np.sum(K_b_mat, axis=1)
        return (2. * sigma) * K_b_d
Example #20
 def loss(localAlphaHats):
     lossVal = 0
     #         localAlphaHats = 1 / (1 + np.exp(-1 * localAlphaHats))
     for wi, aH in zip(w, globalAlphaHats):
         tilde = 1 / np.sum(np.multiply(n, wi))
         wiXA = np.multiply(wi, localAlphaHats)
         tilde = tilde * np.sum(np.multiply(wiXA, n))
         lossVal = lossVal + .5 * np.square(aH - tilde)
     lossVal = lossVal + varLambda * np.sum(np.var(localAlphaHats, axis=1))
     lossVal = lossVal + anchorLambda * np.sum(
         np.square(localAlphaHats - a0))
     return lossVal
Example #21
def ll(x, num_peds, ess, robot_mu_x, robot_mu_y, ped_mu_x, ped_mu_y, \
       cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \
       cov_ped_x, cov_ped_y, inv_cov_ped_x, inv_cov_ped_y, \
       one_over_cov_sum_x, one_over_cov_sum_y, normalize):
    T = np.size(robot_mu_x)

    quad_robot_mu_x = np.dot((x[:T]-robot_mu_x).T, np.dot(inv_cov_robot_x, \
                                                                x[:T]-robot_mu_x))
    quad_robot_mu_y = np.dot((x[T:2*T]-robot_mu_y).T, np.dot(inv_cov_robot_y, \
                                                             x[T:2*T]-robot_mu_y))
    llambda = -0.5 * quad_robot_mu_x - 0.5 * quad_robot_mu_y

    n = 2
    for ped in range(ess):
        quad_ped_mu_x = np.dot((x[n*T:(n+1)*T]-ped_mu_x[ped]).T, np.dot(\
                                inv_cov_ped_x[ped], x[n*T:(n+1)*T]-ped_mu_x[ped]))
        quad_ped_mu_y = np.dot((x[(n+1)*T:(n+2)*T]-ped_mu_y[ped]).T, np.dot(\
                            inv_cov_ped_y[ped], x[(n+1)*T:(n+2)*T]-ped_mu_y[ped]))
        llambda = llambda - 0.5 * quad_ped_mu_x - 0.5 * quad_ped_mu_y
        n = n + 2

    n = 2
    for ped in range(ess):
        # if normalize == True:
        #   # normalize_x = np.multiply(np.power(2*np.pi,-0.5), \
        # one_over_std_sum_x[ped])
        #   # normalize_y = np.multiply(np.power(2*np.pi,-0.5), \
        # one_over_std_sum_y[ped])
        # else:
        normalize_x = 1.
        normalize_y = 1.

        vel_x = np.tile(x[:T], (T, 1)).T - np.tile(x[n * T:(n + 1) * T],
                                                   (T, 1))
        vel_y = np.tile(x[T:2 * T],
                        (T, 1)).T - np.tile(x[(n + 1) * T:(n + 2) * T], (T, 1))
        n = n + 2

        vel_x_2 = np.power(vel_x, 2)
        vel_y_2 = np.power(vel_y, 2)

        quad_robot_ped_x = np.multiply(vel_x_2, one_over_cov_sum_x[ped])
        quad_robot_ped_y = np.multiply(vel_y_2, one_over_cov_sum_y[ped])

        Z_x = np.multiply(normalize_x, np.exp(-0.5 * quad_robot_ped_x))
        Z_y = np.multiply(normalize_y, np.exp(-0.5 * quad_robot_ped_y))

        Z = np.multiply(Z_x, Z_y)

        log_znot_norm = np.sum(np.log1p(-Z))

        llambda = llambda + log_znot_norm
    return -1. * llambda
Example #22
def d_ll(x, T, \
                 robot_mu_x, robot_mu_y, \
                 ped_mu_x, ped_mu_y, \
                 cov_robot_x, cov_robot_y, \
                 inv_cov_robot_x, inv_cov_robot_y, \
                 cov_ped_x, cov_ped_y, \
                 inv_cov_ped_x, inv_cov_ped_y, \
                 one_over_cov_sum_x, one_over_cov_sum_y, normalize):
    d_alpha = [0. for _ in range(4 * T)]
    d_beta = [0. for _ in range(4 * T)]
    d_llambda = np.asarray([0. for _ in range(4 * T)])

    n = 2
    vel_x = x[:T] - x[n * T:(n + 1) * T]
    vel_y = x[T:2 * T] - x[(n + 1) * T:(n + 2) * T]

    one_over_var_sum_x = np.diag(one_over_cov_sum_x)
    one_over_var_sum_y = np.diag(one_over_cov_sum_y)

    # if normalize == True:
    #   normalize_x = np.multiply(np.power(2*np.pi, -0.5), \
    #                                                   np.diag(one_over_std_sum_x))
    #   normalize_y = np.multiply(np.power(2*np.pi, -0.5), \
    #                                                   np.diag(one_over_std_sum_y))
    # else:
    normalize_x = 1.
    normalize_y = 1.

    quad_x = np.multiply(one_over_var_sum_x, np.power(vel_x, 2))
    quad_y = np.multiply(one_over_var_sum_y, np.power(vel_y, 2))

    Z_x = np.multiply(normalize_x, np.exp(-0.5 * quad_x))
    Z_y = np.multiply(normalize_y, np.exp(-0.5 * quad_y))

    Z = np.multiply(Z_x, Z_y)
    X = np.divide(Z, 1. - Z)

    alpha_x = np.multiply(X, np.multiply(vel_x, one_over_var_sum_x))
    alpha_y = np.multiply(X, np.multiply(vel_y, one_over_var_sum_y))
    #        X and Y COMPONENT OF R DERIVATIVE
    d_alpha[:T] = np.add(d_alpha[:T], alpha_x)
    d_alpha[T:2 * T] = np.add(d_alpha[T:2 * T], alpha_y)

    d_alpha[n * T:(n + 1) * T] = -alpha_x
    d_alpha[(n + 1) * T:(n + 2) * T] = -alpha_y

    d_beta[n * T:(n + 1) *
           T] = -np.dot(x[n * T:(n + 1) * T] - ped_mu_x, inv_cov_ped_x)
    d_beta[(n + 1) * T:(n + 2) *
           T] = -np.dot(x[(n + 1) * T:(n + 2) * T] - ped_mu_y, inv_cov_ped_y)

    d_beta[:T] = -np.dot(x[:T] - robot_mu_x, inv_cov_robot_x)
    d_beta[T:2 * T] = -np.dot(x[T:2 * T] - robot_mu_y, inv_cov_robot_y)

    d_llambda[0:2 * T] = np.add(d_alpha[0:2 * T], d_beta[0:2 * T])
    d_llambda[2 * T:] = np.add(d_alpha[2 * T:], d_beta[2 * T:])
    return -1. * d_llambda
Example #23
 def objective(params, iter):
     fake_weights, bias = params
     weights = np.multiply((fake_weights + fake_weights.T) / 2, diag_mask)
     pll = 0
     for i in range(len(imgs)):
         img = np.reshape(imgs[i], -1)
         activations = np.matmul(weights, img) + bias
         output = sigmoid(activations)
         eps = 1e-10
         img[img < 0] = 0
         pll += np.sum(np.multiply(img, np.log(output+eps)) + np.multiply(1-img, np.log(1-output+eps)))
     if iter % 100 == 0: print(-pll)
     return -pll
Example #24
 def diff_test_feature(test_feature_array):
     norm_mean, norm_variance = self.predict(test_feature_array)
     # De-normalize, and variance -> stddev
     pred_mean = norm_mean * std_data + mean_data
     pred_std = anp.sqrt(norm_variance) * std_data
     head_gradients_mean = anp.reshape(head_gradients['mean'],
                                       pred_mean.shape)
     head_gradients_std = anp.reshape(head_gradients['std'],
                                      pred_std.shape)
     # Added to mimic mxnet.autograd.backward
     pred_mean_sum = anp.sum(
         anp.multiply(pred_mean, head_gradients_mean))
     pred_std_sum = anp.sum(anp.multiply(pred_std, head_gradients_std))
     return pred_mean_sum + pred_std_sum
Example #25
 def neg_ll(self, x, c, n, *params):
     f = np.zeros_like(self.p)
     params = np.reshape(params, (self.m, self.dist.k + 1))
     f = np.zeros_like(x)
     for i in range(self.m):
         like = self.dist.like(x, c, n, *params[i, 1::])
         like = np.multiply(params[i, 0], like)
         f = f + like
     f = np.where(f <= 0, surpyval.TINIEST, f)
     f = np.where(f < 1, f, 1)
     f = np.log(f)
     f = np.multiply(n, f)
     f = -np.sum(f)
     return f
Example #26
    def Q(self, params):
        params = params.reshape(self.m, self.dist.k)
        f = np.zeros_like(self.p)
        for i in range(self.m):
            like = self.dist.like(self.x, self.c, self.n, *params[i])
            like += surpyval.TINIEST
            like = np.where(like < 1, like, 1)
            like = np.log(like)
            like = np.multiply(self.n, like)

            f[i] = np.multiply(self.p[i], like)
        f = -np.sum(f)
        self.loglike = f
        return f
Example #27
    def _w_grad(self, sigma, Y, basis, beta, K_X):

        n_y, d_y = Y.shape
        n_basis, _ = basis.shape

        K = self._update_kernel(sigma, Y, basis, K_X)

        #		if b_d is None:
        b_d = np.matmul(Y, beta.T) - np.outer(
            np.ones([n_y, 1]), np.sum(np.multiply(beta, basis), axis=1))
        #		if K_b_mat is None:
        K_b_mat = np.multiply(K, b_d)
        #		if K_b_d is None:
        K_b_d = np.sum(K_b_mat, axis=1)
        return (2. * sigma) * K_b_d
Example #28
    def _hessian_bloc_dim(self, sigma, Y_i, Y_j, K, i, j):
        n = Y_i.shape[0]
        Y_ii = np.reshape(Y_i, [1, -1])
        Y_jj = np.reshape(Y_j, [1, -1])
        diff_i = np.tile(Y_ii, [n, 1])
        diff_i = diff_i.T - diff_i
        diff_j = np.tile(Y_jj, [n, 1])
        diff_j = diff_j.T - diff_j

        if i == j:
            return (np.multiply(K, (2. * (sigma) - 4. *
                                    (sigma**2) * np.multiply(diff_i, diff_j))))
        else:
            return -4. * (sigma**2) * (np.multiply(
                K, np.multiply(diff_i, diff_j)))
Example #29
    def KLqp(self, S, q_mu):
        """
        Calculates KL divergence between q and p
        Args:
            S (): Variational log-variances
            q_mu (): Variational mean
        Returns: KL divergence between q and p
        """

        k_inv_mu = kron_mvp(self.K_invs, self.mu - q_mu)
        mu_penalty = np.sum(np.multiply(self.mu - q_mu, k_inv_mu))
        det_S = np.sum(S)
        trace_term = np.sum(np.multiply(self.k_inv_diag, np.exp(S)))
        kl = 0.5 * (self.det_K - self.m - det_S + trace_term + mu_penalty)
        return kl
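A numerical sketch of the same closed-form Gaussian KL with a small dense covariance in place of the Kronecker-structured one (assuming self.det_K stores log|K| and self.k_inv_diag the diagonal of K^-1):

import numpy as np

m_dim = 3
K = 2.0 * np.eye(m_dim)                      # prior covariance, p = N(mu, K)
mu = np.zeros(m_dim)
q_mu = 0.5 * np.ones(m_dim)                  # variational mean
S = np.log(0.3 * np.ones(m_dim))             # variational log-variances

K_inv = np.linalg.inv(K)
mu_penalty = (mu - q_mu) @ K_inv @ (mu - q_mu)
trace_term = np.sum(np.diag(K_inv) * np.exp(S))
kl = 0.5 * (np.linalg.slogdet(K)[1] - m_dim - np.sum(S) + trace_term + mu_penalty)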
Example #30
def prep_opt(y_train, N, coeffs):
    summedy_mat = np.sum(y_train, axis=0)
    summedy = np.reshape(summedy_mat, [np.size(summedy_mat), -1])

    a1 = np.reshape([np.repeat(coeffs.T[1], N)], [np.size(summedy), -1])
    a0 = np.reshape([np.repeat(coeffs.T[0], N)], [np.size(summedy), -1])
    a1y = np.multiply(a1, summedy)
    a0y = np.multiply(a0, summedy)

    consts = np.sum(gammaln(
        y_train + scale)) - D * n_neurons * N * gammaln(scale) - np.sum(
            coeffs.T[0] *
            (D * scale * N)) - np.sum(a0y) - np.sum(summedy * np.log(scale))

    return summedy, a1y, a0y, a1, consts
Example #31
    def marginal(self, kernel):
        """
        Calculates the marginal likelihood (returned negated, for minimization)
        Args:
            kernel: kernel whose parameters determine the covariances Ks
        Returns: negative marginal log-likelihood

        """

        if kernel.params is not None:
            self.Ks = self.construct_Ks()
            self.alpha = np.zeros([self.X.shape[0]])
            self.W = np.zeros([self.X.shape[0]])
            self.grads = np.zeros([self.X.shape[0]])
            self.f = self.mu
            self.f_pred = self.f
            self.run(10)

        Ks = self.Ks
        eigs = [np.expand_dims(np.linalg.eig(K)[0], 1) for K in Ks]
        eig_K = np.squeeze(kron_list(eigs))
        self.eig_K = eig_K

        if self.obs_idx is not None:
            f_lim = self.f[self.obs_idx]
            alpha_lim = self.alpha[self.obs_idx]
            mu_lim = self.mu[self.obs_idx]
            W_lim = self.W[self.obs_idx]
            eig_k_lim = eig_K[self.obs_idx]

            pen = -0.5 * np.sum(np.multiply(alpha_lim,
                                       f_lim - mu_lim))
            pen = np.where(np.isnan(pen), np.zeros_like(pen), pen)
            eigs = 0.5 * np.sum(np.log(1 + np.multiply(eig_k_lim,
                                       W_lim)))
            eigs = np.where(np.isnan(eigs), np.zeros_like(eigs), eigs)
            like = np.sum(self.likelihood.log_like(f_lim, self.y))
            like = np.where(np.isnan(like), np.zeros_like(like), like)

            return -(pen+eigs+like)

        pen = -0.5 * np.sum(np.multiply(self.alpha,
                                   self.f - self.mu))
        eigs = - 0.5*np.sum(np.log(1 +
                                   np.multiply(eig_K, self.W)))
        like = np.sum(self.likelihood.log_like(self.f, self.y))

        return -(pen+eigs+like)
Example #32
def beta_grads(Ks, beta, i):
  Karr = np.array(Ks)
  anum = Ks[i] * np.exp(Ks[i] * beta)
  aden = np.sum(np.exp(beta * Karr))
  a = anum / aden

  bnum = np.exp(Ks[i] * beta) * (np.sum(np.multiply(Karr, np.exp(Karr * beta))))
  bden = aden * aden
  b = bnum / bden
  return a - b
Example #33
def monomial(x, y, x_test):
    n = len(x)

    A = np.vander(x, increasing=True)
    c = np.linalg.solve(A, y)

    y_test = np.zeros_like(x_test)
    for j in xrange(n-1, -1, -1):
        y_test = np.multiply(y_test, x_test) + c[j]

    return y_test
Example #34
def write(mem, w_t, e_t, a_t):
    """
    The writing procedure as described in 3.2.
    w_t is a length N weighting over the rows as above.
    e_t (the erase vector) is length M with elements all in (0,1).
    a_t (the add vector) is length M with no such restrictions.
    We first multiply the memory matrix pointwise by [1-w_t(i)e_t]
    Then we add: M_t(i) <- M~_t(i) + w_t(i)a_t.
    According to the paper, the erase/add decomposition was
        inspired by the forget/input gates in LSTM.
    """
    # Perform erasure on the existing memory, parametrized by e_t and w_t
    W = np.reshape(w_t, (w_t.shape[0], 1))
    E = np.reshape(e_t, (e_t.shape[0], 1))

    # Transpose W so we can create WTE, a matrix whose i,j-th element
        # represents the extent to which we will erase M_t[i,j]
    WTE = np.dot(W, E.T)

    # KEEP is such that KEEP[i,j] represents the extent to which we
        # will keep M_t[i,j]
    KEEP = np.ones(mem.shape) - WTE

    # To complete erasure, multiply memory pointwise by KEEP
    newmem = np.multiply(mem, KEEP)

    # Perform addition on the newly erased memory
    # Convert add vector to a matrix
    A = np.reshape(a_t, (a_t.shape[0], 1))

    # Add is the add vector weighted by w_t, which is added pointwise to
        # the existing memory, finishing the write sequence.
    ADD = np.dot(W, A.T)
    newmem = newmem + ADD

    return newmem
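A minimal usage sketch of write(); the shapes are assumptions matching the docstring (N memory rows of length M):

import numpy as np

N, M = 4, 3
mem = np.ones((N, M))
w_t = np.full(N, 1.0 / N)           # weighting over the rows
e_t = np.full(M, 0.5)               # erase vector, elements in (0, 1)
a_t = np.arange(M, dtype=float)     # add vector
new_mem = write(mem, w_t, e_t, a_t)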
Example #35
def test_multiply_arg1():
    fun = lambda x, y : np.multiply(x, y)
    d_fun = grad(fun, 1)
    check_grads(fun, npr.rand(), npr.rand())
    check_grads(d_fun, npr.rand(), npr.rand())
Example #36
    def manual_grads(params):
      """
      Compute the gradient of the loss WRT the parameters
      Ordering of the operations is reverse of that in fprop()
      """
      deltas = {}
      for key, val in params.iteritems():
        deltas[key] = np.zeros_like(val)

      [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs,
       w_ws, adds, erases, k_rs, k_ws, g_rs, g_ws, wc_rs, wc_ws,
       zbeta_rs, zbeta_ws, zs_rs, zs_ws, wg_rs, wg_ws] = self.stats
      dd = {}
      drs = {}
      dzh = {}
      dmem = {} # might not need this, since we have dmemtilde
      dmemtilde = {}
      du_r = {}
      du_w = {}
      dwg_r = {}
      dwg_w = {}
      for t in reversed(xrange(len(targets))):

        dy = np.copy(ps[t])
        dy -= targets[t].T # backprop into y

        deltas['oy'] += np.dot(dy, os[t].T)
        deltas['by'] += dy

        if t < len(targets) - 1:
          # r[t] affects cost through zh[t+1] via Wrh
          drs[t] = np.dot(self.W['rh'].T, dzh[t + 1])

          # right now, mems[t] influences cost through rs[t+1], via w_rs[t+1]
          dmem[t] = np.dot( w_rs[t + 1], drs[t + 1].reshape((self.M,1)).T )
          # and also through mems at next step
          W = np.reshape(w_ws[t+1], (w_ws[t+1].shape[0], 1))
          E = np.reshape(erases[t+1], (erases[t+1].shape[0], 1))
          WTE = np.dot(W, E.T)
          KEEP = np.ones(mems[0].shape) - WTE
          dmem[t] += np.multiply(dmemtilde[t+1], KEEP)
          # and also through its influence on the content weighting next step
          dmem[t] += du_r[t+1] + du_w[t+1]

          dmemtilde[t] = dmem[t]

          # erases[t] affects cost through mems[t], via w_ws[t]
          derase = np.dot(np.multiply(dmemtilde[t], -mems[t-1]).T, w_ws[t])

          # zerase affects just erases through a sigmoid
          dzerase = derase * (erases[t] * (1 - erases[t]))

          # adds[t] affects costs through mems[t], via w_ws
          dadd = np.dot(dmem[t].T, w_ws[t])

          # zadds affects just adds through a tanh
          dzadd = dadd * (1 - adds[t] * adds[t])

          # dbadds is just dzadds
          deltas['badds'] += dzadd

          deltas['oadds'] += np.dot(dzadd, os[t].T)

          deltas['berases'] += dzerase

          deltas['oerases'] += np.dot(dzerase, os[t].T)

          # # read weights affect what is read, via what's in mems[t-1]
          # dwc_r = np.dot(mems[t-1], drs[t])

          # # write weights affect mem[t] through adding
          # dwc_w = np.dot(dmem[t], adds[t])
          # # they also affect memtilde[t] through erasing
          # dwc_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t])

          dw_r = np.dot(mems[t-1], drs[t])
          dw_r += dwg_r[t+1] * (1 - g_rs[t+1])

          # write weights affect mem[t] through adding
          dw_w = np.dot(dmem[t], adds[t])
          # they also affect memtilde[t] through erasing
          dw_w += np.dot(np.multiply(dmemtilde[t], -mems[t-1]), erases[t])
          dw_w += dwg_w[t+1] * (1 - g_ws[t+1])

          sgwr = np.zeros((self.N, self.N))
          sgww = np.zeros((self.N, self.N))
          for i in range(self.N):
            sgwr[i,i] = softmax(zs_rs[t])[0]
            sgwr[i,(i+1) % self.N] = softmax(zs_rs[t])[2]
            sgwr[i,(i-1) % self.N] = softmax(zs_rs[t])[1]

            sgww[i,i] = softmax(zs_ws[t])[0]
            sgww[i,(i+1) % self.N] = softmax(zs_ws[t])[2]
            sgww[i,(i-1) % self.N] = softmax(zs_ws[t])[1]

          # right now, shifted weights are final weight
          dws_r = dw_r
          dws_w = dw_w

          dwg_r[t] = np.dot(sgwr.T, dws_r)
          dwg_w[t] = np.dot(sgww.T, dws_w)

          dwc_r = dwg_r[t] * g_rs[t]
          dwc_w = dwg_w[t] * g_ws[t]


          """
          We need dw/dK
          now w has N elts and K has N elts
          and we want, for every elt of W, the grad of that elt w.r.t. each
          of the N elts of K. that gives us N * N things
          """
          # first, we must build up the K values (should be taken from fprop)
          K_rs = []
          K_ws = []
          for i in range(self.N):
            K_rs.append(cosine_sim(mems[t-1][i, :], k_rs[t]))
            K_ws.append(cosine_sim(mems[t-1][i, :], k_ws[t]))

          # then, we populate the grads
          dwdK_r = np.zeros((self.N, self.N))
          dwdK_w = np.zeros((self.N, self.N))
          # for every row in the memory
          for i in range(self.N):
            # for every element in the weighting
            for j in range(self.N):
              dwdK_r[i,j] += softmax_grads(K_rs, softplus(zbeta_rs[t]), i, j)
              dwdK_w[i,j] += softmax_grads(K_ws, softplus(zbeta_ws[t]), i, j)

          # compute dK for all i in N
          # K is the evaluated cosine similarity for the i-th row of mem matrix
          dK_r = np.zeros_like(w_rs[0])
          dK_w = np.zeros_like(w_ws[0])

          # for all i in N (for every row that we've simmed)
          for i in range(self.N):
            # for every j in N (for every elt of the weighting)
            for j in range(self.N):
              # specifically, dwdK_r will change, and for write as well
              dK_r[i] += dwc_r[j] * dwdK_r[i,j] 
              dK_w[i] += dwc_w[j] * dwdK_w[i,j]

          """
          dK_r_dk_rs is a list of N things
          each elt of the list corresponds to grads of K_idx
          w.r.t. the key k_t
          so it should be a length N list of M by 1 vectors
          """

          dK_r_dk_rs = []
          dK_r_dmem = []
          for i in range(self.N):
            # let k_rs be u, Mem[i] be v
            u = np.reshape(k_rs[t], (self.M,))
            v = mems[t-1][i, :]
            dK_r_dk_rs.append( dKdu(u,v) )
            dK_r_dmem.append( dKdu(v,u))

          dK_w_dk_ws = []
          dK_w_dmem = []
          for i in range(self.N):
            # let k_ws be u, Mem[i] be v
            u = np.reshape(k_ws[t], (self.M,))
            v = mems[t-1][i, :]
            dK_w_dk_ws.append( dKdu(u,v) )
            dK_w_dmem.append( dKdu(v,u))

          # compute delta for keys
          dk_r = np.zeros_like(k_rs[0])
          dk_w = np.zeros_like(k_ws[0])
          # for every one of M elt of dk_r
          for i in range(self.M):
            # for every one of the N Ks
            for j in range(self.N):
              # add delta K_r[j] * dK_r[j] / dk_r[i]
              # add influence on through K_r[j]
              dk_r[i] += dK_r[j] * dK_r_dk_rs[j][i]
              dk_w[i] += dK_w[j] * dK_w_dk_ws[j][i]

          # these represent influence of mem on next K
          """
          Let's let du_r[t] represent the
          influence of mems[t-1] on the cost through the K values
          this is analogous to dk_w, but, k only every affects that
          whereas mems[t-1] will also affect what is read at time t+1
          and through memtilde at time t+1
          """
          du_r[t] = np.zeros_like(mems[0])
          du_w[t] = np.zeros_like(mems[0])
          # for every row in mems[t-1]
          for i in range(self.N):
            # for every elt of this row (one of M)
            for j in range(self.M):
              du_r[t][i,j] = dK_r[i] * dK_r_dmem[i][j]
              du_w[t][i,j] = dK_w[i] * dK_w_dmem[i][j]

          # key values are activated as tanh
          dzk_r = dk_r * (1 - k_rs[t] * k_rs[t])
          dzk_w = dk_w * (1 - k_ws[t] * k_ws[t])

          deltas['ok_r'] += np.dot(dzk_r, os[t].T)
          deltas['ok_w'] += np.dot(dzk_w, os[t].T)

          deltas['bk_r'] += dzk_r
          deltas['bk_w'] += dzk_w

          dg_r = np.dot(dwg_r[t].T, (wc_rs[t] - w_rs[t-1]) )
          dg_w = np.dot(dwg_w[t].T, (wc_ws[t] - w_ws[t-1]) )

          # compute dzg_r, dzg_w
          dzg_r = dg_r * (g_rs[t] * (1 - g_rs[t]))
          dzg_w = dg_w * (g_ws[t] * (1 - g_ws[t]))

          deltas['og_r'] += np.dot(dzg_r, os[t].T)
          deltas['og_w'] += np.dot(dzg_w, os[t].T)

          deltas['bg_r'] += dzg_r
          deltas['bg_w'] += dzg_w

          # compute dbeta, which affects w_content through interaction with Ks

          dwcdbeta_r = np.zeros_like(w_rs[0])
          dwcdbeta_w = np.zeros_like(w_ws[0])
          for i in range(self.N):
            dwcdbeta_r[i] = beta_grads(K_rs, softplus(zbeta_rs[t]), i)
            dwcdbeta_w[i] = beta_grads(K_ws, softplus(zbeta_ws[t]), i)

          dbeta_r = np.zeros_like(zbeta_rs[0])
          dbeta_w = np.zeros_like(zbeta_ws[0])
          for i in range(self.N):
            dbeta_r[0] += dwc_r[i] * dwcdbeta_r[i]
            dbeta_w[0] += dwc_w[i] * dwcdbeta_w[i]

          # beta is activated from zbeta by softplus, grad of which is sigmoid
          dzbeta_r = dbeta_r * sigmoid(zbeta_rs[t])
          dzbeta_w = dbeta_w * sigmoid(zbeta_ws[t])

          deltas['obeta_r'] += np.dot(dzbeta_r, os[t].T)
          deltas['obeta_w'] += np.dot(dzbeta_w, os[t].T)

          deltas['bbeta_r'] += dzbeta_r
          deltas['bbeta_w'] += dzbeta_w

          sgsr = np.zeros((self.N, 3))
          sgsw = np.zeros((self.N, 3))
          for i in range(self.N):
            sgsr[i,1] = wg_rs[t][(i - 1) % self.N]
            sgsr[i,0] = wg_rs[t][i]
            sgsr[i,2] = wg_rs[t][(i + 1) % self.N]

            sgsw[i,1] = wg_ws[t][(i - 1) % self.N]
            sgsw[i,0] = wg_ws[t][i]
            sgsw[i,2] = wg_ws[t][(i + 1) % self.N]

          ds_r = np.dot(sgsr.T, dws_r)
          ds_w = np.dot(sgsw.T, dws_w)

          shift_act_jac_r = np.zeros((3,3))
          shift_act_jac_w = np.zeros((3,3))
          bf = np.array([[1.0]])
          for i in range(3):
            for j in range(3):
              shift_act_jac_r[i,j] = softmax_grads(zs_rs[t], bf, i, j)
              shift_act_jac_w[i,j] = softmax_grads(zs_ws[t], bf, i, j)

          dzs_r = np.dot(shift_act_jac_r.T, ds_r)
          dzs_w = np.dot(shift_act_jac_w.T, ds_w)

          deltas['os_r'] += np.dot(dzs_r, os[t].T)
          deltas['os_w'] += np.dot(dzs_w, os[t].T)

          deltas['bs_r'] += dzs_r
          deltas['bs_w'] += dzs_w

        else:
          drs[t] = np.zeros_like(rs[0])
          dmemtilde[t] = np.zeros_like(mems[0])
          du_r[t] = np.zeros_like(mems[0])
          du_w[t] = np.zeros_like(mems[0])
          dwg_r[t] = np.zeros_like(w_rs[0])
          dwg_w[t] = np.zeros_like(w_ws[0])

        # o affects y through Woy
        do = np.dot(params['oy'].T, dy)
        if t < len(targets) - 1:
          # and also zadd through Woadds
          do += np.dot(params['oadds'].T, dzadd)
          do += np.dot(params['oerases'].T, dzerase)
          # and also through the keys
          do += np.dot(params['ok_r'].T, dzk_r)
          do += np.dot(params['ok_w'].T, dzk_w)
          # and also through the interpolators
          do += np.dot(params['og_r'].T, dzg_r)
          do += np.dot(params['og_w'].T, dzg_w)
          # and also through beta
          do += np.dot(params['obeta_r'].T, dzbeta_r)
          do += np.dot(params['obeta_w'].T, dzbeta_w)
          # and also through the shift values
          do += np.dot(params['os_r'].T, dzs_r)
          do += np.dot(params['os_w'].T, dzs_w)


        # compute deriv w.r.t. pre-activation of o
        dzo = do * (1 - os[t] * os[t])

        deltas['ho'] += np.dot(dzo, hs[t].T)
        deltas['bo'] += dzo

        # compute hidden dh
        dh = np.dot(params['ho'].T, dzo)

        # compute deriv w.r.t. pre-activation of h
        dzh[t] = dh * (1 - hs[t] * hs[t])

        deltas['xh'] += np.dot(dzh[t], xs[t].T)
        deltas['bh'] += dzh[t]

        # Wrh affects zh via rs[t-1]
        deltas['rh'] += np.dot(dzh[t], rs[t-1].reshape((self.M, 1)).T)

      return deltas
Example #37
def mul(first_tree_rep, second_tree_rep):

    return auto_grad_np.multiply(first_tree_rep, second_tree_rep)
Example #38
def l2(x):
  """
  Hacky l2-norm computation to be used for tracking update magnitude.
  """
  return np.sqrt(np.sum(np.multiply(x, x)))
Example #39
    def fprop(params):
      """
      Forward pass of the NTM.
      """

      W = params # aliasing for brevity

      xs, zhs, hs, ys, ps, ts, zos, os = {}, {}, {}, {}, {}, {}, {}, {}

      def l():
        """
        Silly utility function that should be called in init.
        """
        return [{} for _ in xrange(self.heads)]

      rs = l()
      zk_rs = l()
      k_rs, beta_rs, g_rs, s_rs, gamma_rs = l(),l(),l(),l(),l()
      k_ws, beta_ws, g_ws, s_ws, gamma_ws = l(),l(),l(),l(),l()
      adds, erases = l(),l()
      w_ws, w_rs = l(),l() # read weights and write weights
      for idx in range(self.heads):
        rs[idx][-1] = self.W['rsInit' + str(idx)] # stores values read from memory
        w_ws[idx][-1] = softmax(self.W['w_wsInit' + str(idx)])
        w_rs[idx][-1] = softmax(self.W['w_rsInit' + str(idx)])

      mems = {} # the state of the memory at every timestep
      mems[-1] = self.W['memsInit']
      loss = 0

      for t in xrange(len(inputs)):

        xs[t] = np.reshape(np.array(inputs[t]),inputs[t].shape[::-1])

        rsum = 0
        for idx in range(self.heads):
          rsum = rsum + np.dot(W['rh' + str(idx)], np.reshape(rs[idx][t-1],(self.M,1)))
        zhs[t] = np.dot(W['xh'], xs[t]) + rsum + W['bh']
        hs[t] = np.tanh(zhs[t])

        zos[t] = np.dot(W['ho'], hs[t]) + W['bo']
        os[t] = np.tanh(zos[t])

        for idx in range(self.heads):
          # parameters to the read head
          zk_rs[idx][t] =np.dot(W['ok_r' + str(idx)],os[t]) + W['bk_r' + str(idx)]
          k_rs[idx][t] = np.tanh(zk_rs[idx][t])
          beta_rs[idx][t] = softplus(np.dot(W['obeta_r' + str(idx)],os[t])
                                     + W['bbeta_r' + str(idx)])
          g_rs[idx][t] = sigmoid(np.dot(W['og_r' + str(idx)],os[t]) + W['bg_r' + str(idx)])
          s_rs[idx][t] = softmax(np.dot(W['os_r' + str(idx)],os[t]) + W['bs_r' + str(idx)])
          gamma_rs[idx][t] = 1 + sigmoid(np.dot(W['ogamma_r' + str(idx)], os[t])
                                         + W['bgamma_r' + str(idx)])

          # parameters to the write head
          k_ws[idx][t] = np.tanh(np.dot(W['ok_w' + str(idx)],os[t]) + W['bk_w' + str(idx)])
          beta_ws[idx][t] = softplus(np.dot(W['obeta_w' + str(idx)], os[t])
                                     + W['bbeta_w' + str(idx)])
          g_ws[idx][t] = sigmoid(np.dot(W['og_w' + str(idx)],os[t]) + W['bg_w' + str(idx)])
          s_ws[idx][t] = softmax(np.dot(W['os_w' + str(idx)],os[t]) + W['bs_w' + str(idx)])
          gamma_ws[idx][t] = 1 + sigmoid(np.dot(W['ogamma_w' + str(idx)], os[t])
                                         + W['bgamma_w' + str(idx)])

          # the erase and add vectors
          # these are also parameters to the write head
          # but they describe "what" is to be written rather than "where"
          adds[idx][t] = np.tanh(np.dot(W['oadds' + str(idx)], os[t]) + W['badds' + str(idx)])
          erases[idx][t] = sigmoid(np.dot(W['oerases' + str(idx)], os[t]) + W['berases' + str(idx)])

          w_ws[idx][t] = addressing.create_weights(   k_ws[idx][t]
                                                    , beta_ws[idx][t]
                                                    , g_ws[idx][t]
                                                    , s_ws[idx][t]
                                                    , gamma_ws[idx][t]
                                                    , w_ws[idx][t-1]
                                                    , mems[t-1])

          w_rs[idx][t] = addressing.create_weights(   k_rs[idx][t]
                                                    , beta_rs[idx][t]
                                                    , g_rs[idx][t]
                                                    , s_rs[idx][t]
                                                    , gamma_rs[idx][t]
                                                    , w_rs[idx][t-1]
                                                    , mems[t-1])

        ys[t] = np.dot(W['oy'], os[t]) + W['by']
        ps[t] = sigmoid(ys[t])

        one = np.ones(ps[t].shape)
        ts[t] = np.reshape(np.array(targets[t]),(self.out_size,1))

        epsilon = 2**-23 # to prevent log(0)
        a = np.multiply(ts[t] , np.log2(ps[t] + epsilon))
        b = np.multiply(one - ts[t], np.log2(one-ps[t] + epsilon))
        loss = loss - (a + b)

        for idx in range(self.heads):
          # read from the memory
          rs[idx][t] = memory.read(mems[t-1],w_rs[idx][t])

          # write into the memory
          mems[t] = memory.write(mems[t-1],w_ws[idx][t],erases[idx][t],adds[idx][t])

      self.stats = [loss, mems, ps, ys, os, zos, hs, zhs, xs, rs, w_rs, w_ws, adds, erases]
      return np.sum(loss)
Example #40
def test_multiply_arg1():
    fun = lambda x, y : np.multiply(x, y)
    check_grads(fun)(npr.rand(), npr.rand())
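Both multiply tests (this one and its older-API twin in Example #35) assume autograd's numpy wrapper and test utilities; a self-contained variant under the newer autograd API, with the imports being assumptions about what the snippets rely on:

import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.test_util import check_grads

fun = lambda x, y: np.multiply(x, y)
check_grads(fun)(npr.rand(), npr.rand())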