def density_given_previous_a_and_x(x, w, v, b, activations_factor, p_prev, a_prev, x_prev):
    a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
    h = self.nonlinearity(a * activations_factor)  # BxH
    t = T.dot(h, v) + b
    # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
    p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)
    p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
    return (p, a, x)
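# A standalone NumPy sketch (not part of the model code above) of the per-dimension
# Bernoulli log-likelihood update performed by density_given_previous_a_and_x.
# Shapes and names (B, H, x_prev, w, v, b, ...) mirror the Theano version, but the
# sigmoid nonlinearity and the 0.9999 / 0.00005 clamp are written out explicitly;
# treat it as an illustration under those assumptions, not as reference code.
import numpy as np

def bernoulli_step(x, w, v, b, act_factor, p_prev, a_prev, x_prev):
    """One autoregressive step: x_prev, x are (B,), w and v are (H,), b is scalar."""
    a = a_prev + np.outer(x_prev, w)              # (B, H): accumulate the W contribution
    h = 1.0 / (1.0 + np.exp(-a * act_factor))     # sigmoid hidden units, (B, H)
    t = h.dot(v) + b                              # (B,)
    sig = 1.0 / (1.0 + np.exp(-t))
    p_one = sig * 0.9999 + 0.00005                # clamp to [0.00005, 0.99995]
    p = p_prev + x * np.log(p_one) + (1 - x) * np.log(1 - p_one)
    return p, a, x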
def sym_mask_logdensity_estimator_intermediate(self, x, mask):
    non_linearity_name = self.parameters["nonlinearity"].get_name()
    assert non_linearity_name == "sigmoid" or non_linearity_name == "RLU"
    x = x.T  # BxD
    mask = mask.T  # BxD
    output_mask = constantX(1) - mask  # BxD
    D = constantX(self.n_visible)
    d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
    masked_input = x * mask  # BxD
    h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
    for l in xrange(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
    z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
    z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
    z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)
    temp = T.exp(z_alpha)  # + 1e-6
    # temp += T.shape_padright(temp.sum(2)/1e-3)
    Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
    Mu = z_mu  # BxDxC
    Sigma = T.exp(z_sigma)  # + 1e-6  # BxDxC
    # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
    # Mu = Mu * T.shape_padright(output_mask)
    # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
    # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi))  # BxDxC
    Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))  # BxDxC
    logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
    return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)
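# A standalone NumPy sketch (not part of the model code above) of the masking and
# reweighting done by sym_mask_logdensity_estimator_intermediate: the per-dimension
# mixture log-densities are summed only over the unobserved dimensions (output_mask)
# and rescaled by D / (D - d). The names (phi, log_alpha, B, D, C) are local to this
# example and stand in for the quantities computed by the network.
import numpy as np

def masked_mog_logdensity(phi, log_alpha, mask):
    """phi, log_alpha: (B, D, C) component log-densities and log mixture weights;
       mask: (B, D) with 1 for observed (conditioning) dimensions, 0 otherwise."""
    B, D, C = phi.shape
    output_mask = 1.0 - mask
    d = mask.sum(1)                                        # as in the Theano code above
    w = phi + log_alpha                                    # (B, D, C)
    m = w.max(axis=2, keepdims=True)
    per_dim = m[..., 0] + np.log(np.exp(w - m).sum(axis=2))  # log-sum-exp over components
    return (per_dim * output_mask).sum(1) * D / (D - d)      # (B,)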
def density_given_previous_a_and_x(x, w, v, c, p_prev, a_prev, x_prev, bias_prev):
    a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
    bias = bias_prev + constantX(np.log(2)) - T.log(1 + T.exp(w))
    h = self.nonlinearity(a + bias + self.b)  # BxH
    t = T.dot(h, v) + c
    # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
    p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)
    p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
    return (p, a, x, bias)
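# A side note on the bias recursion above (also relevant to the w/wb variant further
# below): T.log(1 + T.exp(w)) is softplus(w), which can overflow in float32 for large w
# unless Theano's stabilizing rewrites replace it. A standalone NumPy sketch of the same
# update written with a stable softplus via np.logaddexp; the names are local to this
# example, not part of the original code.
import numpy as np

def bias_update(bias_prev, w):
    # equivalent to bias_prev + log(2) - log(1 + exp(w)), but stable for large |w|
    return bias_prev + np.log(2.0) - np.logaddexp(0.0, w)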
def density_and_gradients(x_i, x_im1, w_i, V_alpha, b_alpha, V_mu, b_mu,
                          V_sigma, b_sigma, activation_factor, a_i, lp_accum, dP_da_ip1):
    B = T.cast(x_i.shape[0], floatX)
    pot = a_i * activation_factor
    h = self.nonlinearity(pot)  # BxH
    z_alpha = T.dot(h, V_alpha) + T.shape_padleft(b_alpha)
    z_mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)
    z_sigma = T.dot(h, V_sigma) + T.shape_padleft(b_sigma)
    Alpha = T.nnet.softmax(z_alpha)  # BxC
    Mu = z_mu  # BxC
    Sigma = T.exp(z_sigma)  # BxC
    Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x_i, 1)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))
    wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))
    lp_current = -log_sum_exp(wPhi)  # negative log likelihood
    # lp_current_sum = T.sum(lp_current)
    Pi = T.exp(wPhi - T.shape_padright(lp_current, 1))
    dp_dz_alpha = Pi - Alpha  # BxC
    # dp_dz_alpha = T.grad(lp_current_sum, z_alpha)
    gb_alpha = dp_dz_alpha.mean(0, dtype=floatX)  # C
    gV_alpha = T.dot(h.T, dp_dz_alpha) / B  # HxC
    dp_dz_mu = -Pi * (Mu - T.shape_padright(x_i, 1)) / T.sqr(Sigma)
    # dp_dz_mu = T.grad(lp_current_sum, z_mu)
    dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
    gb_mu = dp_dz_mu.mean(0, dtype=floatX)
    gV_mu = T.dot(h.T, dp_dz_mu) / B
    dp_dz_sigma = Pi * (T.sqr(T.shape_padright(x_i, 1) - Mu) / T.sqr(Sigma) - 1)
    # dp_dz_sigma = T.grad(lp_current_sum, z_sigma)
    gb_sigma = dp_dz_sigma.mean(0, dtype=floatX)
    gV_sigma = T.dot(h.T, dp_dz_sigma) / B
    dp_dh = T.dot(dp_dz_alpha, V_alpha.T) + T.dot(dp_dz_mu, V_mu.T) + T.dot(dp_dz_sigma, V_sigma.T)  # BxH
    if non_linearity_name == "sigmoid":
        dp_dpot = dp_dh * h * (1 - h)
    elif non_linearity_name == "RLU":
        dp_dpot = dp_dh * (pot > 0)
    gfact = (dp_dpot * a_i).sum(1).mean(0, dtype=floatX)  # 1
    dP_da_i = dP_da_ip1 + dp_dpot * activation_factor  # BxH
    gW = T.dot(T.shape_padleft(x_im1, 1), dP_da_i).flatten() / B
    return (a_i - T.dot(T.shape_padright(x_im1, 1), T.shape_padleft(w_i, 1)),
            lp_accum + lp_current, dP_da_i, gW,
            gb_alpha, gV_alpha, gb_mu, gV_mu, gb_sigma, gV_sigma, gfact)
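# A standalone NumPy sketch (not part of the model code above): a finite-difference
# check of the standard mixture-gradient identity d/dz_alpha logsumexp(Phi + log softmax(z_alpha))
# = Pi - Alpha, with Pi the component responsibilities, which the hand-coded dp_dz_alpha
# above appears to be based on. All names (C, z_alpha, phi, ...) are local to this example.
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def logsumexp(v):
    m = v.max()
    return m + np.log(np.exp(v - m).sum())

rng = np.random.RandomState(0)
C = 5
z_alpha = rng.randn(C)
phi = rng.randn(C)  # stands in for the per-component Gaussian log-densities

def logdensity(z):
    return logsumexp(phi + np.log(softmax(z)))

alpha = softmax(z_alpha)
pi = np.exp(phi + np.log(alpha) - logdensity(z_alpha))   # responsibilities
analytic = pi - alpha

eps = 1e-6
numeric = np.array([(logdensity(z_alpha + eps * np.eye(C)[k]) -
                     logdensity(z_alpha - eps * np.eye(C)[k])) / (2 * eps)
                    for k in range(C)])
print(np.allclose(analytic, numeric, atol=1e-6))  # should print True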
def density_given_previous_a_and_x(x, w, wb, v, c, p_prev, a_prev, bias_prev):
    h = self.nonlinearity(a_prev + bias_prev)  # BxH
    t = T.dot(h, v) + c
    # Make logistic regression more robust by having the sigmoid saturate at 0.00005 and 0.99995
    p_xi_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)
    p = p_prev + x * T.log(p_xi_is_one) + (1 - x) * T.log(1 - p_xi_is_one)
    a = a_prev + T.dot(T.shape_padright(x, 1), T.shape_padleft(w - wb, 1))
    bias = bias_prev + T.log(1 + T.exp(wb)) - T.log(1 + T.exp(w))
    return (p, a, bias)
def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma, activations_factor, p_prev, a_prev, x_prev):
    a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
    h = self.nonlinearity(a * activations_factor)  # BxH
    Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
    Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
    Sigma = T.exp(T.dot(h, V_sigma) + T.shape_padleft(b_sigma))  # BxC
    p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma)
                             - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi)) + T.log(Alpha))
    return (p, a, x)
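# A standalone NumPy sketch (not part of the model code above) checking that the
# log-sum-exp expression added to p in the return value equals the log of the
# mixture density written out with explicit Gaussian pdfs. All names are local to
# this example; log_sum_exp in the Theano code is assumed to be the usual
# max-shifted log-sum-exp reproduced here.
import numpy as np

rng = np.random.RandomState(1)
C = 4
x = 0.3
alpha = np.exp(rng.randn(C)); alpha /= alpha.sum()      # mixture weights
mu = rng.randn(C)
sigma = np.exp(0.1 * rng.randn(C))

log_comp = -0.5 * ((x - mu) / sigma) ** 2 - np.log(sigma) - 0.5 * np.log(2 * np.pi)
w = log_comp + np.log(alpha)
m = w.max()
lse = m + np.log(np.exp(w - m).sum())                   # what the step above computes
direct = np.log((alpha * np.exp(-0.5 * ((x - mu) / sigma) ** 2)
                 / (sigma * np.sqrt(2 * np.pi))).sum()) # explicit mixture density
print(np.allclose(lse, direct))  # should print True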
def sym_mask_logdensity_estimator(self, x, mask):
    """ x is a matrix of column datapoints (DxB)
        D = n_visible, B = batch size """
    # non_linearity_name = self.parameters["nonlinearity"].get_name()
    # assert(non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
    x = x.T  # BxD
    mask = mask.T  # BxD
    output_mask = constantX(1) - mask  # BxD
    D = constantX(self.n_visible)
    d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
    masked_input = x * mask  # BxD
    h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
    for l in xrange(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
    t = T.dot(h, self.V.T) + self.c  # BxD
    p_x_is_one = T.nnet.sigmoid(t) * constantX(0.9999) + constantX(0.0001 * 0.5)  # BxD
    lp = ((x * T.log(p_x_is_one) + (constantX(1) - x) * T.log(constantX(1) - p_x_is_one)) * output_mask).sum(1) * D / (D - d)  # B
    return lp
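# A standalone NumPy sketch (not part of the model code above) of the masked Bernoulli
# log-density computed by sym_mask_logdensity_estimator: cross-entropy terms are kept
# only for the unobserved dimensions and rescaled by D / (D - d). Here p_one stands in
# for the clamped sigmoid output of the network; all names are local to this example.
import numpy as np

def masked_bernoulli_logdensity(x, p_one, mask):
    """x, p_one, mask: (B, D); mask is 1 on observed (conditioning) dimensions."""
    B, D = x.shape
    output_mask = 1.0 - mask
    d = mask.sum(1)                       # as in the Theano code above
    ll = x * np.log(p_one) + (1.0 - x) * np.log(1.0 - p_one)
    return (ll * output_mask).sum(1) * D / (D - d)   # (B,)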