def gelu(input_tensor):
    """Apply the exact (erf-based) GELU activation.

    Computes ``x * Phi(x)``, where ``Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))``
    is the standard normal CDF. It behaves like a smoothed RELU.
    Reference: https://arxiv.org/abs/1606.08415

    Args:
      input_tensor: float Tensor to perform activation on.

    Returns:
      `input_tensor` multiplied by its standard normal CDF value.
    """
    standardized = input_tensor / tf.sqrt(2.0)
    normal_cdf = 0.5 * (1.0 + tf.erf(standardized))
    return input_tensor * normal_cdf
def KL(p, q, hypers=None, global_step=1.0E99):
    """Return the summed KL term between `p` and `q` for supported pairs.

    Only a `DiagonalGaussianVar` `p` is handled, paired with one of:

      * `DiagonalGaussianVar`: cross-entropy minus entropy, with
        `bu.EPSILON` added to `q.var` before it is logged / divided by.
      * `DiagonalLaplaceVar`: an expression in `p.mean`, `p.var` and `q.b`.
      * `InverseGammaVar`: delegated to `EBKL(p, q, hypers, global_step)`.

    Args:
      p: first distribution object; must be a `DiagonalGaussianVar` to be
        supported.
      q: second distribution object (see the list above).
      hypers: forwarded unchanged to `EBKL`; unused otherwise.
      global_step: forwarded unchanged to `EBKL`; unused otherwise.

    Returns:
      The `tf.reduce_sum` of the per-element terms, or the result of `EBKL`.
      For any other (p, q) pair, prints 'unsupported KL' and returns None.
    """
    # NOTE(review): the flattened source does not show the indentation of the
    # final print; this assumes it sits at function level, i.e. it runs for
    # every pair that did not return above.
    p_is_gaussian = isinstance(p, DiagonalGaussianVar)

    if p_is_gaussian and isinstance(q, DiagonalGaussianVar):
        q_var_safe = q.var + bu.EPSILON
        p_entropy = 0.5 * (1 + bu.log2pi + tf.log(p.var))
        cross_entropy = 0.5 * (
            bu.log2pi
            + tf.log(q_var_safe)
            + (p.var + (p.mean - q.mean)**2) / q_var_safe
        )
        return tf.reduce_sum(cross_entropy - p_entropy)

    if p_is_gaussian and isinstance(q, DiagonalLaplaceVar):
        p_std = tf.sqrt(p.var)
        z = p.mean / p_std
        abs_term = (2 * bu.standard_gaussian(z)
                    + z * tf.erf(z * bu.one_ovr_sqrt2))
        abs_term = abs_term * (p_std / q.b)
        log_term = 0.5 * tf.log(2 * q.b * q.b / (pi * p.var)) - 0.5
        return tf.reduce_sum(abs_term + log_term)

    if p_is_gaussian and isinstance(q, InverseGammaVar):
        return EBKL(p, q, hypers, global_step)

    print('unsupported KL')
def gaussian_cdf(x, radius):
    """Evaluate the CDF of a zero-mean Gaussian with scale `radius` at `x`.

    Computes ``0.5 * (1 + erf(x / (sqrt(2) * radius)))``.

    Args:
      x: value(s) at which to evaluate the CDF; passed through `tf.erf`.
      radius: scale (standard deviation) that `x` is divided by.

    Returns:
      The result of the expression above, as produced by TensorFlow ops.
    """
    scale = math.sqrt(2.) * radius
    standardized = x / scale
    return 0.5 * (1 + tf.erf(standardized))
def __init__(
        self,
        input_node,
        hidden_layers_node,
        output_node,
        learning_rate,
        batch_size,
        display_step,
        activation,
        seed=1,
        feature_selection=False,
        a=1,
        sigma=0.1,
        lam=0.5,
        param_search=False
):
    """Build the gated feed-forward graph, its loss/optimizer and a session.

    A private `tf.Graph` is created holding: input/label placeholders, a
    per-feature `alpha` gate variable passed through `self.feature_selector`,
    a stack of dense layers, a loss (squared error when `output_node == 1`,
    softmax cross-entropy otherwise), an optional gate regularizer, a
    gradient-descent train op and accuracy ops. Variables are initialized in
    `self.sess` before returning.

    Args:
      input_node: number of input features (width of `X`, length of `alpha`).
      hidden_layers_node: non-empty sequence of dense-layer widths. The
        output of the last layer is used as the logits in the
        classification branch.
      output_node: width of the label placeholder `y`; the value 1 selects
        the squared-error branch with one extra affine output layer.
      learning_rate: step size for `tf.train.GradientDescentOptimizer`.
      batch_size: stored on `self.batch_size`; not used here.
      display_step: stored on `self.display_step`; not used here.
      activation: one of 'relu', 'sigmoid', 'tanh', 'none'.
      seed: graph-level random seed, set before any op is created.
      feature_selection: if True, add the gate regularizer to the loss.
      a: gate hyperparameter; the regularizer shifts `alpha` by `1 / (2 * a)`.
      sigma: gate hyperparameter; scale inside the regularizer's erf.
      lam: weight of the gate regularizer in the loss.
      param_search: stored on `self.param_search`; not used here.

    Raises:
      ValueError: if `hidden_layers_node` is empty.
      NotImplementedError: if `activation` is not recognized.
    """
    # Note: a, sigma, lam should be set by params dict that will be passed
    # to this class.
    if len(hidden_layers_node) == 0:
        # Previously this surfaced later as an unbound-local error on
        # `layer_out`; fail early with an actionable message instead.
        raise ValueError('hidden_layers_node must contain at least one layer')

    self.param_search = param_search
    # Register hyperparameters for feature selection
    self.a = a
    self.sigma = sigma
    self.lam = lam
    # Register regular hyperparameters
    self.lr = learning_rate
    self.batch_size = batch_size
    self.display_step = display_step  # to print loss/acc information during training

    G = tf.Graph()
    with G.as_default():
        # Fix: the graph-level seed only influences random ops created AFTER
        # it is set, so it must come before the initializers (and whatever
        # random ops feature_selector builds). It used to be set last,
        # which left `seed` without effect.
        tf.set_random_seed(seed)

        self.sess = tf.Session(graph=G)
        # tf Graph Input
        X = tf.placeholder(
            tf.float32,
            [None, input_node])  # X.shape == [batch_size, feature_size]
        y = tf.placeholder(tf.float32, [None, output_node])
        train_gates = tf.placeholder(tf.float32, [1], name='train_gates')

        self.nnweights = []
        prev_node = input_node
        prev_x = X

        with tf.variable_scope('gates', reuse=tf.AUTO_REUSE):
            self.alpha = tf.get_variable(
                'alpha', [
                    prev_node,
                ],
                initializer=tf.truncated_normal_initializer(mean=0.0,
                                                            stddev=0.01))
            prev_x = self.feature_selector(prev_x, train_gates)

        for i, layer_width in enumerate(hidden_layers_node):
            layer_name = 'layer' + str(i + 1)
            with tf.variable_scope(layer_name, reuse=tf.AUTO_REUSE):
                weights = tf.get_variable(
                    'weights', [prev_node, layer_width],
                    initializer=tf.truncated_normal_initializer(stddev=0.1))
                self.nnweights.append(weights)
                biases = tf.get_variable(
                    'biases', [layer_width],
                    initializer=tf.constant_initializer(0.0))
                # Affine transform; the activation (if any) follows.
                layer_out = tf.matmul(prev_x, weights) + biases

                if activation == 'relu':
                    layer_out = tf.nn.relu(layer_out)
                elif activation == 'sigmoid':
                    layer_out = tf.nn.sigmoid(layer_out)
                elif activation == 'tanh':
                    layer_out = tf.nn.tanh(layer_out)
                elif activation == 'none':
                    pass  # linear layer
                else:
                    raise NotImplementedError('activation not recognized')

                prev_node = layer_width
                prev_x = layer_out

        # Output of model
        if output_node == 1:
            # Regression head: one extra affine layer + mean squared error.
            # The weight is sized from the last hidden width (it used to be
            # hard-coded to [1, 1], which only worked when that width was 1).
            weights = tf.get_variable(
                'weights', [prev_node, 1],
                initializer=tf.truncated_normal_initializer(stddev=0.1))
            self.nnweights.append(weights)
            biases = tf.get_variable(
                'biases', [1], initializer=tf.constant_initializer(0.0))
            pred = tf.matmul(layer_out, weights) + biases
            # Fix: pred_log was never assigned on this branch, so storing it
            # on self below raised UnboundLocalError. There is no softmax
            # here, so the raw model output is the prediction itself.
            pred_log = pred
            loss_fun = tf.reduce_mean(tf.squared_difference(pred, y))
        else:
            # Classification head: last layer output is used as logits;
            # minimize softmax cross-entropy.
            pred = tf.nn.softmax(layer_out)
            pred_log = layer_out
            loss_fun = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=y,
                                                        logits=layer_out))

        if feature_selection:
            # Gates regularization: Gaussian CDF of the shifted gate
            # parameters, averaged over features and weighted by lam.
            input2cdf = self.alpha
            reg = 0.5 - 0.5 * tf.erf(
                (-1 / (2 * self.a) - input2cdf) / (self.sigma * np.sqrt(2)))
            reg_gates = self.lam * tf.reduce_mean(reg)
            loss = loss_fun + reg_gates
            self.reg_gates = reg_gates  # for debugging
        else:
            loss = loss_fun
            self.reg_gates = 0

        # Get optimizer
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(loss)

        # For evaluation
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # Initialize the variables (i.e. assign their default value)
        init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver()

    # Save into class members
    self.X = X
    self.y = y
    self.pred = pred
    self.train_gates = train_gates
    self.loss = loss
    self.pred_log = pred_log
    self.train_step = train_step
    self.correct_prediction = correct_prediction
    self.accuracy = accuracy
    self.output_node = output_node
    # Weights/biases of the last layer that was created above.
    self.weights = weights
    self.biases = biases

    self.sess.run(init_op)
def gelu(input_tensor):
    """Gaussian Error Linear Unit, exact erf form.

    Args:
      input_tensor: float Tensor to perform activation on.

    Returns:
      ``input_tensor * 0.5 * (1 + erf(input_tensor / sqrt(2)))``.
    """
    sqrt_two = tf.sqrt(2.0)
    erf_term = tf.erf(input_tensor / sqrt_two)
    return input_tensor * (0.5 * (1.0 + erf_term))
def gelu(x):
    """Apply the erf-based GELU: ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

    Args:
      x: Tensor (or value accepted by `tf.erf`) to activate.

    Returns:
      The activated value, computed with TensorFlow ops.
    """
    half_x = x * 0.5
    erf_term = tf.erf(x / math.sqrt(2.0))
    return half_x * (1.0 + erf_term)
def gaussian_cdf(x):
    """Evaluate ``0.5 * (1 + erf(x * one_ovr_sqrt2))`` at `x`.

    With `one_ovr_sqrt2` equal to 1/sqrt(2) this is the standard normal CDF.
    NOTE(review): `one_ovr_sqrt2` is defined outside this block; its value is
    presumed from its name -- confirm.

    Args:
      x: value(s) at which to evaluate; passed through `tf.erf`.

    Returns:
      The result of the expression above, as produced by TensorFlow ops.
    """
    erf_arg = x * one_ovr_sqrt2
    return 0.5 * (1.0 + tf.erf(erf_arg))