def sym_mask_logdensity_estimator_intermediate(self, x, mask):
    """Build the masked mixture-of-Gaussians log-density and intermediates.

    ``x`` and ``mask`` are transposed on entry; the shape notes below
    (B = batch, D = visible dimensions, H = hidden units, C = mixture
    components) are carried over from the original inline annotations.
    Entries of ``x`` are multiplied by ``mask`` before being fed to the
    network, and only entries weighted by ``1 - mask`` contribute to the
    returned log-density (presumably ``mask`` is binary -- confirm).

    Returns:
        The tuple ``(logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma,
        h)``: the rescaled log-density per row, the three pre-activation
        mixture parameters, the mixture weights / means / scales derived
        from them, and the last hidden layer.
    """
    nl_name = self.parameters["nonlinearity"].get_name()
    assert nl_name in ("sigmoid", "RLU")

    x = x.T  # BxD
    mask = mask.T  # BxD
    n_dims = constantX(self.n_visible)
    # Row-wise sum of the mask.
    # NOTE(review): the original comment called this "the 1-based index of
    # the dimension whose value to infer (not the size of the context)",
    # which does not obviously match a plain sum over the mask -- confirm.
    n_masked_in = mask.sum(1)
    predict_mask = constantX(1) - mask  # BxD

    # The first layer receives the masked values together with the mask.
    first_pot = T.dot(x * mask, self.W1) + T.dot(mask, self.Wflags) + self.b1
    h = self.nonlinearity(first_pot)  # BxH
    for layer in xrange(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[layer]) + self.bs[layer])  # BxH

    def _project(weights, bias):
        # Contract the hidden axis of h against axis 1 of the weights.
        return T.tensordot(h, weights, [[1], [1]]) + T.shape_padleft(bias)

    z_alpha = _project(self.V_alpha, self.b_alpha)
    z_mu = _project(self.V_mu, self.b_mu)
    z_sigma = _project(self.V_sigma, self.b_sigma)

    # Mixture weights: exponentiate, then normalise over the component axis.
    unnormalised = T.exp(z_alpha)
    Alpha = unnormalised / T.shape_padright(unnormalised.sum(2))  # BxDxC
    Mu = z_mu  # BxDxC
    Sigma = T.exp(z_sigma)  # BxDxC

    # Per-component Gaussian log-pdf evaluated at the unmasked x; entries
    # that should not count are removed afterwards via predict_mask.
    standardised = (Mu - T.shape_padright(x)) / Sigma
    Phi = (
        -constantX(0.5) * T.sqr(standardised)
        - T.log(Sigma)
        - constantX(0.5 * np.log(2 * np.pi))
    )  # BxDxC

    per_dim = log_sum_exp(Phi + T.log(Alpha), axis=2)
    # Sum the contributing dimensions and rescale by D / (D - d).
    logdensity = (
        (per_dim * predict_mask).sum(1) * n_dims / (n_dims - n_masked_in)
    )
    return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu,
                          V_sigma, b_sigma, activation_factor, a_i,
                          lp_accum, dP_da_ip1):
    """Score ``x_i`` under a Laplace mixture and form hand-written gradients.

    ``self`` and ``non_linearity_name`` are free variables resolved in the
    enclosing scope.  The signature looks like a step function for a
    scan-style loop that walks the dimensions backwards (the first return
    value removes the contribution of ``x_im1`` from ``a_i``) -- confirm
    against the caller.

    Returns:
        ``(a_im1, lp, dP_da_i, gW, gb_alpha, gV_alpha, gb_mu, gV_mu,
        gb_sigma, gV_sigma, gfact)`` where ``lp`` is ``lp_accum`` plus the
        log-density of this step, and every ``g*`` term is divided by (or
        averaged over) the leading dimension of ``x_i``.
    """
    float_type = theano.config.floatX
    batch_size = T.cast(x_i.shape[0], float_type)
    x_col = T.shape_padright(x_i, 1)

    pot = a_i * activation_factor
    h = self.nonlinearity(pot)  # BxH
    h_t = h.T

    z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
    z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
    z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)

    Alpha = T.nnet.softmax(z_alpha)  # BxC
    Mu = z_mu  # BxC
    Sigma = T.exp(z_sigma)  # BxC

    # Laplace log-pdf per component, then weighted and floored at -100.
    Phi = -T.log(2 * Sigma) - T.abs_(Mu - x_col) / Sigma
    wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))
    lp_current = log_sum_exp(wPhi)

    # exp(wPhi) rescaled by exp(lp_current); presumably the per-component
    # posterior weights, given that log_sum_exp is defined elsewhere.
    Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))

    # Closed-form derivatives w.r.t. the three pre-activations.
    dp_dz_alpha = Pi - Alpha  # BxC
    dp_dz_mu = Pi * T.sgn(x_col - Mu) / Sigma
    dp_dz_sigma = Pi * (T.abs_(x_col - Mu) / Sigma - 1)

    gb_alpha = dp_dz_alpha.mean(0, dtype=float_type)  # C
    gV_alpha = T.dot(h_t, dp_dz_alpha) / batch_size  # HxC
    gb_mu = dp_dz_mu.mean(0, dtype=float_type)
    gV_mu = T.dot(h_t, dp_dz_mu) / batch_size
    gb_sigma = dp_dz_sigma.mean(0, dtype=float_type)
    gV_sigma = T.dot(h_t, dp_dz_sigma) / batch_size

    # Back-propagate through the output projections to the hidden layer.
    dp_dh = (
        T.dot(dp_dz_alpha, V_alpha.T)
        + T.dot(dp_dz_mu, V_mu.T)
        + T.dot(dp_dz_sigma, V_sigma.T)
    )  # BxH
    # Derivative of the nonlinearity; only these two names are handled.
    if non_linearity_name == "sigmoid":
        dp_dpot = dp_dh * h * (1 - h)
    elif non_linearity_name == "RLU":
        dp_dpot = dp_dh * (pot > 0)

    gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=float_type)  # 1
    dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
    gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / batch_size

    a_im1 = a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1))
    return (a_im1, lp_accum + lp_current, dP_da_i, gW, gb_alpha, gV_alpha,
            gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu,
                                   V_sigma, b_sigma, activations_factor,
                                   p_prev, a_prev, x_prev):
    """Advance the activations by one input and accumulate its log-density.

    The activation ``a_prev`` is updated with the outer product of
    ``x_prev`` and ``w``; the resulting hidden layer parameterises a
    mixture of Laplace components under which ``x`` is scored.  ``self``
    is a free variable from the enclosing scope.

    Returns:
        ``(p, a, x)``: the running log-density ``p_prev`` plus this step's
        term, the updated activation, and ``x`` passed through unchanged
        (presumably so the caller's loop can feed it back as ``x_prev`` --
        confirm).
    """
    a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
    h = self.nonlinearity(a * activations_factor)  # BxH

    z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
    z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)

    Alpha = T.nnet.softmax(z_alpha)  # BxC
    Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
    Sigma = T.exp(z_sigma)  # BxC

    # log(weight) + Laplace log-pdf for every component.
    log_terms = (
        T.log(Alpha)
        - T.log(2 * Sigma)
        - T.abs_(Mu - T.shape_padright(x, 1)) / Sigma
    )
    p = p_prev + log_sum_exp(log_terms)
    return (p, a, x)
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu,
                          V_sigma, b_sigma, activation_factor, a_i,
                          lp_accum, dP_da_ip1):
    """Compute one step's Laplace-mixture log-density and manual gradients.

    Relies on ``self`` and ``non_linearity_name`` from the enclosing scope.
    The argument layout suggests a scan-style step function iterating over
    dimensions in reverse (``a_i`` has the ``x_im1`` contribution removed
    before being returned) -- confirm against the caller.

    Returns:
        An 11-tuple: the activation with ``x_im1 * w_i`` subtracted, the
        accumulated log-density, the accumulated activation gradient
        ``dP_da_i``, then ``gW, gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma,
        gV_sigma, gfact``.
    """
    B = T.cast(x_i.shape[0], theano.config.floatX)
    pot = a_i * activation_factor
    h = self.nonlinearity(pot)  # BxH

    def _affine(weights, bias):
        # Hidden layer projected onto one family of mixture parameters.
        return T.dot(h, weights) + T.shape_padleft(bias)

    def _param_grads(dp_dz):
        # Bias gradient (mean over rows) and weight gradient (divided by B).
        bias_grad = dp_dz.mean(0, dtype=theano.config.floatX)
        weight_grad = T.dot(h.T, dp_dz) / B
        return bias_grad, weight_grad

    z_alpha = _affine(V_alpha, b_alpha)
    z_mu = _affine(V_mu, b_mu)
    z_sigma = _affine(V_sigma, b_sigma)

    Alpha = T.nnet.softmax(z_alpha)  # BxC
    Mu = z_mu  # BxC
    Sigma = T.exp(z_sigma)  # BxC

    # Signed residual, reused by the mu and sigma derivatives below.
    residual = T.shape_padright(x_i, 1) - Mu

    # Weighted Laplace log-pdf per component, clipped from below at -100.
    Phi = -T.log(2 * Sigma) - T.abs_(Mu - T.shape_padright(x_i, 1)) / Sigma
    wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))
    lp_current = log_sum_exp(wPhi)
    Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))

    # Analytic derivatives of lp_current w.r.t. each pre-activation.
    dp_dz_alpha = Pi - Alpha  # BxC
    gb_alpha, gV_alpha = _param_grads(dp_dz_alpha)  # C, HxC

    dp_dz_mu = Pi * T.sgn(residual) / Sigma
    gb_mu, gV_mu = _param_grads(dp_dz_mu)

    dp_dz_sigma = Pi * (T.abs_(residual) / Sigma - 1)
    gb_sigma, gV_sigma = _param_grads(dp_dz_sigma)

    dp_dh = (
        T.dot(dp_dz_alpha, V_alpha.T)
        + T.dot(dp_dz_mu, V_mu.T)
        + T.dot(dp_dz_sigma, V_sigma.T)
    )  # BxH

    # Chain through the nonlinearity; no other names are handled here.
    if non_linearity_name == "sigmoid":
        dp_dpot = dp_dh * h * (1 - h)
    elif non_linearity_name == "RLU":
        dp_dpot = dp_dh * (pot > 0)

    gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=theano.config.floatX)  # 1
    dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
    gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B

    previous_a = a_i - T.dot(
        T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)
    )
    return (previous_a, lp_accum + lp_current, dP_da_i, gW, gb_alpha,
            gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu,
                                   V_sigma, b_sigma, activations_factor,
                                   p_prev, a_prev, x_prev):
    """Add one dimension's mixture-of-Laplace log-density to a running sum.

    ``a_prev`` is first incremented by the outer product of ``x_prev`` and
    ``w``; the scaled, squashed result drives the mixture weights,
    locations and scales used to score ``x``.  ``self`` comes from the
    enclosing scope.

    Returns:
        ``(p, a, x)`` -- updated running log-density, updated activation,
        and the untouched ``x``.
    """
    increment = T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
    a = a_prev + increment
    hidden = self.nonlinearity(a * activations_factor)  # BxH

    def _head(weights, bias):
        # Linear read-out of the hidden layer for one parameter family.
        return T.dot(hidden, weights) + T.shape_padleft(bias)

    Alpha = T.nnet.softmax(_head(V_alpha, b_alpha))  # BxC
    Mu = _head(V_mu, b_mu)  # BxC
    Sigma = T.exp(_head(V_sigma, b_sigma))  # BxC

    # Distance of x from each component location, in units of its scale.
    scaled_distance = T.abs_(Mu - T.shape_padright(x, 1)) / Sigma
    p = p_prev + log_sum_exp(
        T.log(Alpha) - T.log(2 * Sigma) - scaled_distance
    )
    return (p, a, x)
def sym_mask_logdensity_estimator_intermediate(self, x, mask):
    """Return the masked log-density estimate and the values it is built from.

    Both arguments are transposed first; after that the original shape
    annotations describe them as BxD.  The network input is ``x * mask``
    plus the mask itself, and the per-dimension mixture-of-Gaussians
    log-densities are weighted by ``1 - mask`` before being summed
    (presumably ``mask`` holds 0/1 flags -- confirm).

    Returns:
        ``(logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)``.
    """
    name = self.parameters["nonlinearity"].get_name()
    # Only these two nonlinearities are accepted.
    assert name == "sigmoid" or name == "RLU"

    x, mask = x.T, mask.T  # both BxD
    inverse_mask = constantX(1) - mask  # BxD
    D = constantX(self.n_visible)
    # NOTE(review): the original comment described d as a 1-based index of
    # the dimension to infer rather than the context size; as written it
    # is the row-wise sum of the mask -- confirm the intended meaning.
    d = mask.sum(1)

    # Input layer: masked values and mask flags each have their own weights.
    visible_part = T.dot(x * mask, self.W1)
    flag_part = T.dot(mask, self.Wflags)
    h = self.nonlinearity(visible_part + flag_part + self.b1)  # BxH
    # Remaining hidden layers.
    for depth in xrange(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[depth]) + self.bs[depth])  # BxH

    hidden_axis = [[1], [1]]
    z_alpha = (
        T.tensordot(h, self.V_alpha, hidden_axis)
        + T.shape_padleft(self.b_alpha)
    )
    z_mu = (
        T.tensordot(h, self.V_mu, hidden_axis)
        + T.shape_padleft(self.b_mu)
    )
    z_sigma = (
        T.tensordot(h, self.V_sigma, hidden_axis)
        + T.shape_padleft(self.b_sigma)
    )

    # Normalise exp(z_alpha) over the last (component) axis.
    exp_alpha = T.exp(z_alpha)
    Alpha = exp_alpha / T.shape_padright(exp_alpha.sum(2))  # BxDxC
    Mu = z_mu  # BxDxC
    Sigma = T.exp(z_sigma)  # BxDxC

    # Gaussian log-pdf of x under every component.  x is not masked here;
    # the unwanted dimensions are zeroed out by inverse_mask further down.
    quadratic = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma)
    log_norm_const = constantX(0.5 * np.log(2 * np.pi))
    Phi = quadratic - T.log(Sigma) - log_norm_const  # BxDxC

    mixture_logp = log_sum_exp(Phi + T.log(Alpha), axis=2)
    masked_total = (mixture_logp * inverse_mask).sum(1)
    # Rescale the partial sum by D / (D - d).
    logdensity = masked_total * D / (D - d)
    return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)