def _tf_sparsemax_loss(self, z, q, dtype, use_gpu):
    """Build and evaluate the TensorFlow sparsemax loss.

    Args:
        z: Array of logits; cast to `dtype` before use.
        q: Array of labels; cast to `dtype` before use.
        dtype: NumPy dtype both inputs are converted to.
        use_gpu: Forwarded to `self.test_session`.

    Returns:
        A `(loss_op, loss_value)` tuple holding the loss op and the value
        obtained by evaluating it inside the test session.
    """
    logits = z.astype(dtype)
    labels = q.astype(dtype)

    with self.test_session(use_gpu=use_gpu):
        sparsemax_op = sparsemax(logits)
        loss_op = sparsemax_loss(logits, sparsemax_op, labels)
        loss_value = loss_op.eval()

    return loss_op, loss_value
def _test_gradient_against_estimate(self, dtype, random, use_gpu):
    """Check the sparsemax gradient (Rop) against the estimated gradient.

    The error reported by `gradient_checker.compute_gradient_error` must
    stay below 1e-4.

    Args:
        dtype: dtype of the random logits and of the placeholder.
        random: Random generator providing `uniform`.
        use_gpu: Forwarded to `self.test_session`.
    """
    init_value = random.uniform(low=-3, high=3, size=(test_obs, 10))
    init_value = init_value.astype(dtype)
    shape = init_value.shape

    logits = array_ops.placeholder(dtype, name='z')
    sparsemax_op = sparsemax(logits)

    with self.test_session(use_gpu=use_gpu):
        # The op maps logits to an output of the same shape, so the same
        # shape is passed for both x and y.
        gradient_error = gradient_checker.compute_gradient_error(
            logits, shape, sparsemax_op, shape,
            x_init_value=init_value, delta=1e-9)

    self.assertLess(gradient_error, 1e-4)
def _test_gradient_against_numpy(self, dtype, random, use_gpu):
    """Check the sparsemax gradient (Rop) against the NumPy gradient.

    Args:
        dtype: dtype the random logits are cast to.
        random: Random generator providing `uniform`.
        use_gpu: Forwarded to `self.test_session`.
    """
    logits_value = random.uniform(low=-3, high=3, size=(test_obs, 10))
    logits_value = logits_value.astype(dtype)

    # TensorFlow side: gradient of the sparsemax output w.r.t. its logits.
    logits = constant_op.constant(logits_value, name='z')
    output_op = sparsemax(logits)
    grad_op = gradients_impl.gradients(output_op, [logits])[0]

    with self.test_session(use_gpu=use_gpu):
        actual = grad_op.eval()

    # NumPy side: reference gradient computed by the test class helper.
    expected = self._np_sparsemax_grad(logits_value)

    self.assertAllCloseAccordingToType(expected, actual)
    self.assertShapeEqual(expected, grad_op)
def _test_gradient_against_numpy(self, dtype, random, use_gpu):
    """Compare TensorFlow's sparsemax gradient (Rop) with the NumPy one.

    Values are compared with `assertAllCloseAccordingToType` and the
    shapes with `assertShapeEqual`.
    """
    sample = random.uniform(
        low=-3, high=3, size=(test_obs, 10)).astype(dtype)

    z_const = constant_op.constant(sample, name='z')
    all_grads = gradients_impl.gradients(sparsemax(z_const), [z_const])
    tf_grad_op = all_grads[0]

    with self.test_session(use_gpu=use_gpu):
        tf_grad_value = tf_grad_op.eval()

    np_grad_value = self._np_sparsemax_grad(sample)

    self.assertAllCloseAccordingToType(np_grad_value, tf_grad_value)
    self.assertShapeEqual(np_grad_value, tf_grad_op)
def _test_gradient_against_estimate(self, dtype, random, use_gpu):
    """Check the sparsemax-loss gradient (Rop) against the estimated one.

    The error reported by `gradient_checker.compute_gradient_error` must
    stay below 1e-4.

    Args:
        dtype: dtype of the logits, the labels and the placeholder.
        random: Random generator providing `uniform` and `randint`. It is
            the only source of randomness here, so a seeded generator
            supplied by the caller fully determines the test inputs.
        use_gpu: Forwarded to `self.test_session`.
    """
    z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)

    # One-hot labels: a single randomly chosen class per observation.
    # The class indices come from the injected generator rather than the
    # global `np.random`, which would ignore the caller's seeding.
    q = np.zeros((test_obs, 10)).astype(dtype)
    q[np.arange(0, test_obs), random.randint(0, 10, size=test_obs)] = 1

    logits = array_ops.placeholder(dtype, name='z')
    sparsemax_op = sparsemax(logits)
    loss_op = sparsemax_loss(logits, sparsemax_op, q)

    with self.test_session(use_gpu=use_gpu):
        # The loss has one value per observation, hence the (test_obs,)
        # output shape.
        err = gradient_checker.compute_gradient_error(
            logits, z.shape, loss_op, (test_obs,),
            x_init_value=z, delta=1e-9)

    self.assertLess(err, 1e-4)
def _test_gradient_against_estimate(self, dtype, random, use_gpu):
    """Check sparsemax Rop against the estimated Rop.

    Fails when `gradient_checker.compute_gradient_error` reports an
    error of 1e-4 or more.
    """
    z_value = random.uniform(
        low=-3, high=3, size=(test_obs, 10)).astype(dtype)

    z_placeholder = array_ops.placeholder(dtype, name='z')
    output = sparsemax(z_placeholder)

    with self.test_session(use_gpu=use_gpu):
        max_error = gradient_checker.compute_gradient_error(
            z_placeholder,
            z_value.shape,
            output,
            z_value.shape,
            x_init_value=z_value,
            delta=1e-9)

    self.assertLess(max_error, 1e-4)
def sml(labels, logits):
    """Compute a sparsemax-based loss from `labels` and `logits`.

    Args:
        labels: Label values, combined element-wise with the shifted logits.
        logits: Input passed to `sparsemax`; it is indexed as 2-D and
            reduced along axis 1.

    Returns:
        The sum along axis 1 of the support term plus the label term.
    """
    probs = sparsemax(logits)

    # Shift every row of logits by its own mean along axis 1.
    row_mean = math_ops.reduce_mean(logits, axis=1)
    centered = logits - row_mean[:, array_ops.newaxis]

    # Sum over the support: the 0/1 mask keeps only the entries whose
    # sparsemax output is strictly positive.
    in_support = math_ops.cast(probs > 0, probs.dtype)
    support_term = in_support * probs * (centered - 0.5 * probs)

    # - z_k + ||q||^2
    label_term = labels * (0.5 * labels - centered)

    return math_ops.reduce_sum(support_term + label_term, axis=1)
def _test_gradient_against_numpy(self, dtype, random, use_gpu):
    """Check the sparsemax-loss gradient (Rop) against the NumPy gradient.

    Args:
        dtype: dtype used for the TensorFlow inputs; the NumPy gradient is
            computed on the uncast arrays and cast to `dtype` afterwards.
        random: Random generator providing `uniform` and `randint`. It is
            the only source of randomness here, so a seeded generator
            supplied by the caller fully determines the test inputs.
        use_gpu: Forwarded to `self.test_session`.
    """
    z = random.uniform(low=-3, high=3, size=(test_obs, 10))

    # One-hot labels: a single randomly chosen class per observation.
    # The class indices come from the injected generator rather than the
    # global `np.random`, which would ignore the caller's seeding.
    q = np.zeros((test_obs, 10))
    q[np.arange(0, test_obs), random.randint(0, 10, size=test_obs)] = 1

    logits = constant_op.constant(z.astype(dtype), name='z')
    sparsemax_op = sparsemax(logits)
    loss_op = sparsemax_loss(logits, sparsemax_op, q.astype(dtype))
    loss_grad_op = gradients_impl.gradients(loss_op, [logits])[0]

    with self.test_session(use_gpu=use_gpu):
        tf_grad = loss_grad_op.eval()

    np_grad = self._np_sparsemax_loss_grad(z, q).astype(dtype)

    # Explicit tolerances are supplied for the half-precision case only.
    self.assertAllCloseAccordingToType(
        np_grad, tf_grad, half_atol=1e-2, half_rtol=5e-3)
    self.assertShapeEqual(np_grad, loss_grad_op)
def _test_gradient_against_estimate(self, dtype, random, use_gpu):
    """Check sparsemax-loss Rop against the estimated-loss Rop.

    Fails when `gradient_checker.compute_gradient_error` reports an
    error of 1e-4 or more.

    Args:
        dtype: dtype of the logits, the labels and the placeholder.
        random: Random generator providing `uniform` and `randint`; all
            random inputs are drawn from it so that a seeded generator
            passed by the caller makes the test inputs repeatable.
        use_gpu: Forwarded to `self.test_session`.
    """
    z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)

    # Build one-hot labels. The hot column of each row is drawn from
    # `random` (not the global `np.random`) so the labels follow the
    # generator handed in by the caller.
    hot_columns = random.randint(0, 10, size=test_obs)
    q = np.zeros((test_obs, 10)).astype(dtype)
    q[np.arange(0, test_obs), hot_columns] = 1

    logits = array_ops.placeholder(dtype, name='z')
    sparsemax_op = sparsemax(logits)
    loss_op = sparsemax_loss(logits, sparsemax_op, q)

    with self.test_session(use_gpu=use_gpu):
        err = gradient_checker.compute_gradient_error(
            logits,
            z.shape,
            loss_op,
            (test_obs,),  # one loss value per observation
            x_init_value=z,
            delta=1e-9)

    self.assertLess(err, 1e-4)
def _test_gradient_against_numpy(self, dtype, random, use_gpu):
    """Check sparsemax-loss Rop against the NumPy Rop.

    Values are compared with `assertAllCloseAccordingToType` and the
    shapes with `assertShapeEqual`.

    Args:
        dtype: dtype used for the TensorFlow inputs and for the final cast
            of the NumPy gradient.
        random: Random generator providing `uniform` and `randint`; all
            random inputs are drawn from it so that a seeded generator
            passed by the caller makes the test inputs repeatable.
        use_gpu: Forwarded to `self.test_session`.
    """
    z = random.uniform(low=-3, high=3, size=(test_obs, 10))

    # Build one-hot labels. The hot column of each row is drawn from
    # `random` (not the global `np.random`) so the labels follow the
    # generator handed in by the caller.
    hot_columns = random.randint(0, 10, size=test_obs)
    q = np.zeros((test_obs, 10))
    q[np.arange(0, test_obs), hot_columns] = 1

    logits = constant_op.constant(z.astype(dtype), name='z')
    sparsemax_op = sparsemax(logits)
    loss_op = sparsemax_loss(logits, sparsemax_op, q.astype(dtype))
    loss_grad_op = gradients_impl.gradients(loss_op, [logits])[0]

    with self.test_session(use_gpu=use_gpu):
        tf_grad = loss_grad_op.eval()

    # The reference gradient is computed on the uncast inputs and only
    # then converted to the dtype under test.
    np_grad = self._np_sparsemax_loss_grad(z, q).astype(dtype)

    self.assertAllCloseAccordingToType(
        np_grad, tf_grad, half_atol=1e-2, half_rtol=5e-3)
    self.assertShapeEqual(np_grad, loss_grad_op)
def _tf_sparsemax(self, z, dtype, use_gpu):
    """Build and evaluate the TensorFlow sparsemax op on `z`.

    Args:
        z: Array of logits; cast to `dtype` before use.
        dtype: NumPy dtype the input is converted to.
        use_gpu: Forwarded to `self.test_session`.

    Returns:
        A `(sparsemax_op, sparsemax_value)` tuple holding the op and the
        value obtained by evaluating it inside the test session.
    """
    with self.test_session(use_gpu=use_gpu):
        cast_logits = z.astype(dtype)
        sparsemax_op = sparsemax(cast_logits)
        sparsemax_value = sparsemax_op.eval()

    return sparsemax_op, sparsemax_value
def encoder(self, data, is_training):
    """TabNet encoder model.

    Builds the encoder graph inside the "Encoder" variable scope (opened
    with `AUTO_REUSE`). The input features are batch-normalized and then
    processed for `self.num_decision_steps` steps. Every step runs a
    four-block feature transformer; every step but the first adds its
    ReLU'd decision output to the aggregate, and every step but the last
    computes a sparsemax feature mask that selects the input of the next
    step.

    Args:
        data: Input passed to `feature_column.input_layer` together with
            `self.columns`.
        is_training: Python truth value. Forwarded to batch normalization
            as `training`, and used to pick the virtual batch size
            (`self.virtual_batch_size` when true, 1 otherwise).

    Returns:
        A tuple `(output_aggregated, total_entropy)`:
        `output_aggregated` is the sum of the decision outputs of all
        steps after the first; `total_entropy` is the mask entropy
        averaged over the `num_decision_steps - 1` masking steps (it stays
        the Python int 0 when there is a single decision step).
    """
    with tf.compat.v1.variable_scope(
            "Encoder", reuse=tf.compat.v1.AUTO_REUSE):

        # Reads and normalizes input features.
        features = tf.compat.v1.feature_column.input_layer(
            data, self.columns)
        features = tf.layers.batch_normalization(
            features, training=is_training, momentum=self.batch_momentum)
        batch_size = tf.shape(features)[0]

        # Initializes decision-step dependent variables.
        output_aggregated = tf.zeros([batch_size, self.output_dim])
        masked_features = features
        mask_values = tf.zeros([batch_size, self.num_features])
        aggregated_mask_values = tf.zeros([batch_size, self.num_features])
        complementary_aggregated_mask_values = tf.ones(
            [batch_size, self.num_features])
        total_entropy = 0

        # Virtual batch size handed to every batch normalization inside
        # the decision steps.
        if is_training:
            v_b = self.virtual_batch_size
        else:
            v_b = 1

        for ni in range(self.num_decision_steps):

            # Feature transformer with two shared and two decision step
            # dependent blocks is used below. The shared blocks keep a
            # fixed layer name and reuse their weights after the first
            # step; the step dependent ones carry `ni` in their name.
            reuse_flag = (ni > 0)

            transform_f1 = tf.layers.dense(
                masked_features,
                self.feature_dim * 2,
                name="Transform_f1",
                reuse=reuse_flag,
                use_bias=False)
            transform_f1 = tf.layers.batch_normalization(
                transform_f1,
                training=is_training,
                momentum=self.batch_momentum,
                virtual_batch_size=v_b)
            transform_f1 = glu(transform_f1, self.feature_dim)

            transform_f2 = tf.layers.dense(
                transform_f1,
                self.feature_dim * 2,
                name="Transform_f2",
                reuse=reuse_flag,
                use_bias=False)
            transform_f2 = tf.layers.batch_normalization(
                transform_f2,
                training=is_training,
                momentum=self.batch_momentum,
                virtual_batch_size=v_b)
            # Residual connection, scaled by sqrt(0.5).
            transform_f2 = (glu(transform_f2, self.feature_dim) +
                            transform_f1) * np.sqrt(0.5)

            transform_f3 = tf.layers.dense(
                transform_f2,
                self.feature_dim * 2,
                name="Transform_f3" + str(ni),
                use_bias=False)
            transform_f3 = tf.layers.batch_normalization(
                transform_f3,
                training=is_training,
                momentum=self.batch_momentum,
                virtual_batch_size=v_b)
            transform_f3 = (glu(transform_f3, self.feature_dim) +
                            transform_f2) * np.sqrt(0.5)

            transform_f4 = tf.layers.dense(
                transform_f3,
                self.feature_dim * 2,
                name="Transform_f4" + str(ni),
                use_bias=False)
            transform_f4 = tf.layers.batch_normalization(
                transform_f4,
                training=is_training,
                momentum=self.batch_momentum,
                virtual_batch_size=v_b)
            transform_f4 = (glu(transform_f4, self.feature_dim) +
                            transform_f3) * np.sqrt(0.5)

            if ni > 0:
                decision_out = tf.nn.relu(
                    transform_f4[:, :self.output_dim])

                # Decision aggregation.
                output_aggregated += decision_out

                # Aggregated masks are used for visualization of the
                # feature importance attributes.
                # `keepdims` replaces the deprecated `keep_dims` alias.
                scale_agg = tf.reduce_sum(
                    decision_out, axis=1, keepdims=True) / (
                        self.num_decision_steps - 1)
                aggregated_mask_values += mask_values * scale_agg

            features_for_coef = (transform_f4[:, self.output_dim:])

            if ni < self.num_decision_steps - 1:

                # Determines the feature masks via linear and nonlinear
                # transformations, taking into account of aggregated
                # feature use.
                mask_values = tf.layers.dense(
                    features_for_coef,
                    self.num_features,
                    name="Transform_coef" + str(ni),
                    use_bias=False)
                mask_values = tf.layers.batch_normalization(
                    mask_values,
                    training=is_training,
                    momentum=self.batch_momentum,
                    virtual_batch_size=v_b)
                mask_values *= complementary_aggregated_mask_values
                mask_values = contrib_sparsemax.sparsemax(mask_values)

                # Relaxation factor controls the amount of reuse of
                # features between different decision blocks and updated
                # with the values of coefficients.
                complementary_aggregated_mask_values *= (
                    self.relaxation_factor - mask_values)

                # Entropy is used to penalize the amount of sparsity in
                # feature selection. `self.epsilon` is added inside the
                # log, where the mask can be exactly zero.
                total_entropy += tf.reduce_mean(
                    tf.reduce_sum(
                        -mask_values *
                        tf.math.log(mask_values + self.epsilon),
                        axis=1)) / (self.num_decision_steps - 1)

                # Feature selection.
                masked_features = tf.multiply(mask_values, features)

                # Visualization of the feature selection mask at decision
                # step ni
                tf.compat.v1.summary.image(
                    "Mask for step" + str(ni),
                    tf.expand_dims(tf.expand_dims(mask_values, 0), 3),
                    max_outputs=1)

        # Visualization of the aggregated feature importances
        tf.compat.v1.summary.image(
            "Aggregated mask",
            tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3),
            max_outputs=1)

        return output_aggregated, total_entropy