Example #1
 def density_given_previous_a_and_x(x, w, v, b, activations_factor, p_prev, a_prev, x_prev):
     a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
     h = self.nonlinearity(a * activations_factor)  # BxH
     t = T.dot(h, v) + b
     p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)  # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
     p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
     return (p, a, x)
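All of these snippets assume two module-level helpers that are not shown: a floatX string naming the configured Theano float type and a constantX wrapper that turns a Python scalar into a Theano constant of that type. A minimal sketch of what such helpers could look like (an assumption about the surrounding module, not code taken from the examples):

    import numpy as np
    import theano
    import theano.tensor as T

    floatX = theano.config.floatX  # e.g. "float32" or "float64"

    def constantX(value):
        # Wrap a Python scalar as a Theano constant in the configured float dtype.
        return T.constant(np.asarray(value, dtype=floatX))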
Example #2
    def sym_mask_logdensity_estimator_intermediate(self, x, mask):
        non_linearity_name = self.parameters["nonlinearity"].get_name()
        assert non_linearity_name == "sigmoid" or non_linearity_name == "RLU"
        x = x.T  # BxD
        mask = mask.T  # BxD
        output_mask = constantX(1) - mask  # BxD
        D = constantX(self.n_visible)
        d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
        masked_input = x * mask  # BxD
        h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
        for l in xrange(self.n_layers - 1):
            h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
        z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
        z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
        z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)
        temp = T.exp(z_alpha)  # + 1e-6
        # temp += T.shape_padright(temp.sum(2)/1e-3)
        Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
        Mu = z_mu  # BxDxC
        Sigma = T.exp(z_sigma)  # + 1e-6 #BxDxC

        # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Mu = Mu * T.shape_padright(output_mask)
        # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi)) #BxDxC

        Phi = (
            -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma)
            - T.log(Sigma)
            - constantX(0.5 * np.log(2 * np.pi))
        )  # BxDxC
        logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
        return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
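Example #2 (and several of the snippets below) also calls a log_sum_exp helper over the mixture-component axis, whose definition is not included. One plausible numerically stable implementation, assuming the reduction defaults to the last axis, is:

    import theano.tensor as T

    def log_sum_exp(x, axis=-1):
        # Stable log(sum(exp(x), axis)): subtract the per-slice maximum before
        # exponentiating so the largest term maps to exp(0) = 1.
        x_max = T.max(x, axis=axis, keepdims=True)
        return T.log(T.sum(T.exp(x - x_max), axis=axis)) + T.max(x, axis=axis)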
Example #3
 def density_given_previous_a_and_x(x, w, v, c, p_prev, a_prev, x_prev, bias_prev):
     a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
     bias = bias_prev + constantX(np.log(2)) - T.log(1 + T.exp(w))
     h = self.nonlinearity(a + bias + self.b)  # BxH
     t = T.dot(h, v) + c
     p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)  # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
     p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
     return (p, a, x, bias)
Example #4
        def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu,
                                  b_mu, V_sigma, b_sigma, activation_factor,
                                  a_i, lp_accum, dP_da_ip1):
            B = T.cast(x_i.shape[0], floatX)
            pot = a_i * activation_factor
            h = self.nonlinearity(pot)  # BxH

            z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
            z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
            z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)

            Alpha = T.nnet.softmax(z_alpha)  # BxC
            Mu = z_mu  # BxC
            Sigma = T.exp(z_sigma)  # BxC

            Phi = -constantX(0.5) * T.sqr(
                (Mu - T.shape_padright(x_i, 1)) /
                Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))
            wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))

            lp_current = -log_sum_exp(wPhi)  # negative log likelihood
            # lp_current_sum = T.sum(lp_current)

            Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))  # BxC
            dp_dz_alpha = Pi - Alpha  # BxC
            # dp_dz_alpha = T.grad(lp_current_sum, z_alpha)
            gb_alpha = dp_dz_alpha.mean(0, dtype=floatX)  # C
            gV_alpha = T.dot(h.T, dp_dz_alpha) / B  # HxC

            dp_dz_mu = -Pi * (Mu - T.shape_padright(x_i, 1)) / T.sqr(Sigma)
            # dp_dz_mu = T.grad(lp_current_sum, z_mu)
            dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
            gb_mu = dp_dz_mu.mean(0, dtype=floatX)
            gV_mu = T.dot(h.T, dp_dz_mu) / B

            dp_dz_sigma = Pi * (
                T.sqr(T.shape_padright(x_i, 1) - Mu) / T.sqr(Sigma) - 1)
            # dp_dz_sigma = T.grad(lp_current_sum, z_sigma)
            gb_sigma = dp_dz_sigma.mean(0, dtype=floatX)
            gV_sigma = T.dot(h.T, dp_dz_sigma) / B

            dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(
                dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T)  # BxH
            if non_linearity_name == "sigmoid":
                dp_dpot = dp_dh * h * (1 - h)
            elif non_linearity_name == "RLU":
                dp_dpot = dp_dh * (pot > 0)

            gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=floatX)  # 1

            dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
            gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B

            return (a_i -
                    T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)),
                    lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha,
                    gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
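The hand-coded backward pass in Examples #4 and #7 mirrors the standard closed-form gradients of a Gaussian-mixture log-density with softmax logits z_alpha, means Mu = z_mu and scales Sigma = exp(z_sigma). With Pi playing the role of the component responsibilities,

\[
\pi_c = \frac{\alpha_c\,\mathcal{N}(x \mid \mu_c, \sigma_c^2)}{\sum_k \alpha_k\,\mathcal{N}(x \mid \mu_k, \sigma_k^2)},\qquad
\frac{\partial \log p}{\partial z^{\alpha}_c} = \pi_c - \alpha_c,\qquad
\frac{\partial \log p}{\partial \mu_c} = \pi_c\,\frac{x - \mu_c}{\sigma_c^2},\qquad
\frac{\partial \log p}{\partial z^{\sigma}_c} = \pi_c\!\left(\frac{(x - \mu_c)^2}{\sigma_c^2} - 1\right),
\]

which is exactly what dp_dz_alpha, dp_dz_mu (before the extra * Sigma, flagged in the code itself as a heuristic rescaling) and dp_dz_sigma compute.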
Example #5
 def density_given_previous_a_and_x(x, w, v, b, activations_factor,
                                    p_prev, a_prev, x_prev):
     a = a_prev + T.dot(T.shape_padright(x_prev, 1),
                        T.shape_padleft(w, 1))
     h = self.nonlinearity(a * activations_factor)  # BxH
     t = T.dot(h, v) + b
     p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(
         0.0001 * 0.5
     )  # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
     p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 -
                                                           p_xi_is_one)
     return (p, a, x)
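Step functions with this (p_prev, a_prev, x_prev) state are written to be driven by theano.scan, one visible dimension per step. A rough usage sketch follows; the attribute names (self.W, self.V, self.b, self.activation_rescaling, self.n_hidden) and the initial states are illustrative assumptions, not taken from the examples above:

    # x is DxB: one row per visible dimension, one column per batch element.
    a0 = T.zeros((x.shape[1], self.n_hidden), dtype=floatX)  # BxH running pre-activations
    p0 = T.zeros((x.shape[1],), dtype=floatX)                # B, running log-density
    x0 = T.zeros((x.shape[1],), dtype=floatX)                # B, dummy "previous" input

    (ps, a_seq, x_seq), updates = theano.scan(
        density_given_previous_a_and_x,
        sequences=[x, self.W, self.V, self.b, self.activation_rescaling],
        outputs_info=[p0, a0, x0],
    )
    log_density = ps[-1]  # log p(x) for each batch element after the final step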
Example #6
 def density_given_previous_a_and_x(x, w, wb, v, c, p_prev, a_prev,
                                    bias_prev):
     h = self.nonlinearity(a_prev + bias_prev)  # BxH
     t = T.dot(h, v) + c
     p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(
         0.0001 * 0.5
     )  # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
     p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 -
                                                           p_xi_is_one)
     a = a_prev + T.dot(T.shape_padright(x, 1),
                        T.shape_padleft(w - wb, 1))
     bias = bias_prev + T.log(1 + T.exp(wb)) - T.log(1 + T.exp(w))
     return (p, a, bias)
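The two log(1 + exp(.)) terms in this bias update are softplus functions; if overflow for large weights is a concern, Theano's built-in softplus evaluates the same quantity more safely. A suggested equivalent for that one line, not part of the original example:

    import theano.tensor as T

    # softplus(z) = log(1 + exp(z)), computed without overflowing for large z
    bias = bias_prev + T.nnet.softplus(wb) - T.nnet.softplus(w)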
Example #7
        def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activation_factor, a_i, lp_accum, dP_da_ip1):
            B = T.cast(x_i.shape[0], floatX)
            pot = a_i * activation_factor
            h = self.nonlinearity(pot)  # BxH

            z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
            z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
            z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)

            Alpha = T.nnet.softmax(z_alpha)  # BxC
            Mu = z_mu  # BxC
            Sigma = T.exp(z_sigma)  # BxC

            Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x_i, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))
            wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))

            lp_current = -log_sum_exp(wPhi)  # negative log likelihood
            # lp_current_sum = T.sum(lp_current)

            Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))  # BxC
            dp_dz_alpha = Pi - Alpha  # BxC
            # dp_dz_alpha = T.grad(lp_current_sum, z_alpha)
            gb_alpha = dp_dz_alpha.mean(0, dtype=floatX)  # C
            gV_alpha = T.dot(h.T, dp_dz_alpha) / B  # HxC

            dp_dz_mu = -Pi * (Mu - T.shape_padright(x_i, 1)) / T.sqr(Sigma)
            # dp_dz_mu = T.grad(lp_current_sum, z_mu)
            dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
            gb_mu = dp_dz_mu.mean(0, dtype=floatX)
            gV_mu = T.dot(h.T, dp_dz_mu) / B

            dp_dz_sigma = Pi * (T.sqr(T.shape_padright(x_i, 1) - Mu) / T.sqr(Sigma) - 1)
            # dp_dz_sigma = T.grad(lp_current_sum, z_sigma)
            gb_sigma = dp_dz_sigma.mean(0, dtype=floatX)
            gV_sigma = T.dot(h.T, dp_dz_sigma) / B

            dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T)  # BxH
            if non_linearity_name == "sigmoid":
                dp_dpot = dp_dh * h * (1 - h)
            elif non_linearity_name == "RLU":
                dp_dpot = dp_dh * (pot > 0)

            gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=floatX)  # 1

            dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
            gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B

            return (a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)),
                    lp_accum + lp_current,
                    dP_da_i,
                    gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
Example #8
        def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu,
                                           V_sigma, b_sigma,
                                           activations_factor, p_prev, a_prev,
                                           x_prev):
            a = a_prev + T.dot(T.shape_padright(x_prev, 1),
                               T.shape_padleft(w, 1))
            h = self.nonlinearity(a * activations_factor)  # BxH

            Alpha = T.nnet.softmax(
                T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
            Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
            Sigma = T.exp(
                (T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
            p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr(
                (Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) -
                                     constantX(0.5 * np.log(2 * np.pi)) +
                                     T.log(Alpha))
            return (p, a, x)
Example #9
        def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev):
            a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
            h = self.nonlinearity(a * activations_factor)  # BxH

            Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
            Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
            Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
            p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) + T.log(Alpha))
            return (p, a, x)
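In Examples #8 and #9, the quantity added to p at each step is the log-density of x under a C-component Gaussian mixture conditioned on the dimensions already processed, i.e. per batch element

\[
\log p(x_i \mid x_{<i}) = \log \sum_{c=1}^{C} \alpha_c\,\mathcal{N}\!\left(x_i \mid \mu_c, \sigma_c^2\right),
\]

so that after scanning over all D dimensions, p holds \(\sum_i \log p(x_i \mid x_{<i}) = \log p(\mathbf{x})\).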
Example #10
    def sym_mask_logdensity_estimator_intermediate(self, x, mask):
        non_linearity_name = self.parameters["nonlinearity"].get_name()
        assert (non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
        x = x.T  # BxD
        mask = mask.T  # BxD
        output_mask = constantX(1) - mask  # BxD
        D = constantX(self.n_visible)
        # d is the 1-based index of the dimension whose value to infer (not the
        # size of the context)
        d = mask.sum(1)
        masked_input = x * mask  # BxD
        h = self.nonlinearity(
            T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) +
            self.b1)  # BxH
        for l in xrange(self.n_layers - 1):
            h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
        z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(
            self.b_alpha)
        z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(
            self.b_mu)
        z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(
            self.b_sigma)
        temp = T.exp(z_alpha)  # + 1e-6
        # temp += T.shape_padright(temp.sum(2)/1e-3)
        Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
        Mu = z_mu  # BxDxC
        Sigma = T.exp(z_sigma)  # + 1e-6 #BxDxC

        # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Mu = Mu * T.shape_padright(output_mask)
        # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
        # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) /
        # Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi)) #BxDxC

        Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma) - \
            T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))  # BxDxC
        logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) *
                      output_mask).sum(1) * D / (D - d)
        return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
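Spelling out the value returned as logdensity in Examples #2 and #10: with m the mask row for a batch element (1 = observed), d = \(\sum_i m_i\), and per-dimension mixture parameters Alpha, Mu, Sigma,

\[
\text{logdensity} = \frac{D}{D - d} \sum_{i:\,m_i = 0} \log \sum_{c=1}^{C} \alpha_{ic}\,\mathcal{N}\!\left(x_i \mid \mu_{ic}, \sigma_{ic}^2\right),
\]

i.e. the mixture log-likelihood of the D - d unobserved dimensions, rescaled by D / (D - d).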
Example #11
 def sym_mask_logdensity_estimator(self, x, mask):
     """ x is a matrix of column datapoints (DxB) D = n_visible, B = batch size """
     # non_linearity_name = self.parameters["nonlinearity"].get_name()
     # assert(non_linearity_name == "sigmoid" or non_linearity_name=="RLU")
     x = x.T  # BxD
     mask = mask.T  # BxD
     output_mask = constantX(1) - mask  # BxD
     D = constantX(self.n_visible)
     d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
     masked_input = x * mask  # BxD
     h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
     for l in xrange(self.n_layers - 1):
         h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
     t = T.dot(h, self.V.T) + self.c  # BxD
     p_x_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)  # BxD
     lp = ((x * T.log(p_x_is_one) + (constantX(1) - x) * T.log(constantX(1) - p_x_is_one)) * output_mask).sum(1) * D / (D - d)  # B
     return lp
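Examples #11 and #12 are the binary counterpart of the masked estimator above, with per-dimension Bernoulli conditionals in place of Gaussian mixtures; the returned lp per batch element is

\[
lp = \frac{D}{D - d} \sum_{i:\,m_i = 0} \Big[ x_i \log p_i + (1 - x_i) \log(1 - p_i) \Big],\qquad
p_i = 0.9999\,\sigma(t_i) + 0.00005,
\]

with \(t = h V^\top + c\) and the small offsets keeping the Bernoulli probabilities away from 0 and 1.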
Example #12
 def sym_mask_logdensity_estimator(self, x, mask):
     """ x is a matrix of column datapoints (DxB) D = n_visible, B = batch size """
     # non_linearity_name = self.parameters["nonlinearity"].get_name()
     # assert(non_linearity_name == "sigmoid" or non_linearity_name=="RLU")
     x = x.T  # BxD
     mask = mask.T  # BxD
     output_mask = constantX(1) - mask  # BxD
     D = constantX(self.n_visible)
     d = mask.sum(
         1
     )  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
     masked_input = x * mask  # BxD
     h = self.nonlinearity(
         T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) +
         self.b1)  # BxH
     for l in xrange(self.n_layers - 1):
         h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
     t = T.dot(h, self.V.T) + self.c  # BxD
     p_x_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(
         0.0001 * 0.5)  # BxD
     lp = ((x * T.log(p_x_is_one) +
            (constantX(1) - x) * T.log(constantX(1) - p_x_is_one)) *
           output_mask).sum(1) * D / (D - d)  # B
     return lp