import pytest
import tensorflow as tf

from tabnet import custom_objects as custom_objs  # import path assumed


def test_glu():
    # Test with vector input.
    x = tf.random.uniform([10], -1., 1.)
    out = custom_objs.glu(x, n_units=None)
    assert tf.reduce_mean(out) < 0.

    # Test with matrix input.
    x = tf.random.uniform([10, 10], -1., 1.)
    out = custom_objs.glu(x, n_units=None)
    assert tf.reduce_mean(out) < 0.

    # Test with matrix input and explicit n_units: glu needs an equal
    # partition of the feature axis, so n_units=2 on 10 features fails.
    with pytest.raises(tf.errors.InvalidArgumentError):
        out2 = custom_objs.glu(x, n_units=2)
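# A minimal sketch of a `glu` implementation consistent with the tests above,
# assuming the usual gated-linear-unit convention (the real custom_objs.glu
# may differ in details): the last axis is split into a linear part and a
# sigmoid gate of equal size.
def glu(x, n_units=None):
    if n_units is None:
        # Default: split the last dimension into two equal halves.
        n_units = tf.shape(x)[-1] // 2
    # The two partitions must have matching shapes; an unequal split (such as
    # n_units=2 on 10 features above) raises an InvalidArgumentError.
    return x[..., :n_units] * tf.nn.sigmoid(x[..., n_units:])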
def call(self, inputs, training=None):
    if self.input_features is not None:
        features = self.input_features(inputs)
        features = self.input_bn(features, training=training)
    else:
        features = inputs

    batch_size = tf.shape(features)[0]
    self._step_feature_selection_masks = []
    self._step_aggregate_feature_selection_mask = None

    # Initializes decision-step dependent variables.
    output_aggregated = tf.zeros([batch_size, self.output_dim])
    masked_features = features
    mask_values = tf.zeros([batch_size, self.num_features])
    aggregated_mask_values = tf.zeros([batch_size, self.num_features])
    complementary_aggregated_mask_values = tf.ones(
        [batch_size, self.num_features])
    total_entropy = 0.0
    entropy_loss = 0.

    for ni in range(self.num_decision_steps):
        # Feature transformer with two shared and two decision-step-dependent
        # blocks is used below. Residual branches are scaled by sqrt(0.5) so
        # the variance stays roughly constant through the stack.
        transform_f1 = self.transform_f1(masked_features, training=training)
        transform_f1 = glu(transform_f1, self.feature_dim)

        transform_f2 = self.transform_f2(transform_f1, training=training)
        transform_f2 = (glu(transform_f2, self.feature_dim) +
                        transform_f1) * tf.math.sqrt(0.5)

        transform_f3 = self.transform_f3(transform_f2, training=training)
        transform_f3 = (glu(transform_f3, self.feature_dim) +
                        transform_f2) * tf.math.sqrt(0.5)

        transform_f4 = self.transform_f4(transform_f3, training=training)
        transform_f4 = (glu(transform_f4, self.feature_dim) +
                        transform_f3) * tf.math.sqrt(0.5)

        if ni > 0:
            decision_out = tf.nn.relu(transform_f4[:, :self.output_dim])

            # Decision aggregation.
            output_aggregated += decision_out

            # Aggregated masks are used for visualization of the
            # feature importance attributes.
            scale_agg = tf.reduce_sum(decision_out, axis=1, keepdims=True)
            scale_agg = scale_agg / tf.cast(self.num_decision_steps - 1,
                                            tf.float32)
            aggregated_mask_values += mask_values * scale_agg

        features_for_coef = transform_f4[:, self.output_dim:]

        if ni < (self.num_decision_steps - 1):
            # Determines the feature masks via linear and nonlinear
            # transformations, taking into account the aggregated feature
            # usage so far.
            mask_values = self.transform_coef(features_for_coef,
                                              training=training)
            mask_values *= complementary_aggregated_mask_values
            mask_values = sparsemax(mask_values, axis=-1)

            # The relaxation factor controls how much a feature can be
            # reused across decision steps; the complementary mask is
            # updated with the new coefficient values.
            complementary_aggregated_mask_values *= (
                self.relaxation_factor - mask_values)

            # Entropy is used to penalize the amount of sparsity in
            # feature selection.
            total_entropy += tf.reduce_mean(
                tf.reduce_sum(
                    -mask_values * tf.math.log(mask_values + self.epsilon),
                    axis=1)) / tf.cast(self.num_decision_steps - 1, tf.float32)

            # Add entropy loss.
            entropy_loss = total_entropy

            # Feature selection.
            masked_features = tf.multiply(mask_values, features)

            # Visualization of the feature selection mask at decision step ni.
            # tf.summary.image(
            #     "Mask for step" + str(ni),
            #     tf.expand_dims(tf.expand_dims(mask_values, 0), 3),
            #     max_outputs=1)
            mask_at_step_i = tf.expand_dims(tf.expand_dims(mask_values, 0), 3)
            self._step_feature_selection_masks.append(mask_at_step_i)
        else:
            # This branch is needed for correct compilation by tf.autograph.
            entropy_loss = 0.
    # Adds the loss automatically.
    self.add_loss(self.sparsity_coefficient * entropy_loss)

    # Visualization of the aggregated feature importances.
    # tf.summary.image(
    #     "Aggregated mask",
    #     tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3),
    #     max_outputs=1)
    agg_mask = tf.expand_dims(tf.expand_dims(aggregated_mask_values, 0), 3)
    self._step_aggregate_feature_selection_mask = agg_mask

    return output_aggregated
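# `sparsemax` above is assumed to be the sparse softmax alternative of
# Martins & Astudillo (2016), as provided by tfa.activations.sparsemax in
# TensorFlow Addons. A self-contained sketch of that projection (this sketch
# handles only the default last axis):
def sparsemax(logits, axis=-1):
    # Sort scores in decreasing order along the feature axis.
    z = tf.sort(logits, direction='DESCENDING', axis=-1)
    k = tf.cast(tf.range(1, tf.shape(logits)[-1] + 1), logits.dtype)
    z_cumsum = tf.cumsum(z, axis=-1)
    # An entry is in the support while 1 + k * z_(k) exceeds the cumulative
    # sum of the top-k sorted scores.
    support = tf.cast(1.0 + k * z > z_cumsum, logits.dtype)
    k_z = tf.reduce_sum(support, axis=-1, keepdims=True)
    # tau is the threshold that makes the surviving entries sum to one.
    tau = (tf.reduce_sum(z * support, axis=-1, keepdims=True) - 1.0) / k_z
    # Project onto the simplex: low-scoring features get exactly zero mass,
    # which is what makes the feature-selection masks sparse.
    return tf.maximum(logits - tau, 0.0)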
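# Hypothetical end-to-end usage: the class name `TabNet` and the constructor
# arguments below are assumptions inferred from the attributes call() reads
# (num_features, feature_dim, output_dim, num_decision_steps,
# relaxation_factor, sparsity_coefficient); check the actual signature.
model = TabNet(num_features=54,
               feature_dim=64,
               output_dim=32,
               num_decision_steps=5,
               relaxation_factor=1.5,
               sparsity_coefficient=1e-4)

x = tf.random.uniform([8, 54], -1., 1.)
out = model(x, training=True)  # aggregated decision output, shape [8, 32]
# The entropy penalty registered through self.add_loss() surfaces here and
# is added to the training objective automatically by Keras:
print(model.losses)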