def _select_columns(self, mode, features):
    """Scores every column by its cosine similarity with the question.

    Token embeddings are averaged per column id (id 0 holds the question
    tokens), L2-normalised, and each column average is then dotted with the
    question average.

    Args:
      mode: Not used by this implementation.
      features: Mapping from feature name to tensor. Reads "input_ids",
        "input_mask" and "column_ids", plus every token type feature when
        `self._use_positional_embeddings` is set.

    Returns:
      The "column_scores" tensor holding one similarity per column id in
      1..max_num_columns for each example (presumably
      <float>[batch_size, max_num_columns] -- confirm against
      segmented_tensor.reduce_mean).
    """
    mask = features["input_mask"]
    col_ids = features["column_ids"]
    with tf.variable_scope("bert"):
        with tf.variable_scope("embeddings", reuse=tf.compat.v1.AUTO_REUSE):
            embeddings, _ = modeling.embedding_lookup(
                input_ids=features["input_ids"],
                vocab_size=self._vocab_size,
                embedding_size=self._hidden_size,
                initializer_range=self._initializer_range,
                word_embedding_name="word_embeddings")
            # NOTE(review): this guard reads `_use_positional_embeddings`
            # while the call below passes `_use_position_embeddings` --
            # confirm both attributes exist and are meant to differ.
            if self._use_positional_embeddings:
                type_feature_names = (
                    "segment_ids", "column_ids", "row_ids", "prev_label_ids",
                    "column_ranks", "inv_column_ranks", "numeric_relations",
                )
                # Disabled features are replaced by all-zero ids so that the
                # list keeps one entry per token type.
                type_ids = [
                    tf.zeros_like(features[name])
                    if (self._disabled_features is not None
                        and name in self._disabled_features)
                    else features[name]
                    for name in type_feature_names
                ]
                embeddings = modeling.embedding_postprocessor(
                    input_tensor=embeddings,
                    use_token_type=True,
                    token_type_ids=type_ids,
                    token_type_vocab_size=self._type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=self._use_position_embeddings,
                    position_embedding_name="position_embeddings",
                    initializer_range=self._initializer_range,
                    max_position_embeddings=self._max_position_embeddings,
                    extra_embeddings=None,
                    dropout_prob=0.0)

            # Segment layout: 0 is the question, 1..max_num_columns are the
            # columns, and max_num_columns + 1 collects every token that the
            # input mask zeroes out.
            masked_out_segment = self._max_num_columns + 1
            segment_ids = col_ids * mask + (1 - mask) * masked_out_segment
            segments = segmented_tensor.IndexMap(
                indices=segment_ids,
                num_segments=masked_out_segment + 1,
                batch_dims=1)
            segment_means, _ = segmented_tensor.reduce_mean(
                embeddings, segments)

            # The last segment only averages masked-out tokens: drop it.
            segment_means = segment_means[:, :-1]
            unit_means = tf.math.l2_normalize(segment_means, axis=2)
            question_vec = unit_means[:, :1]
            column_vecs = unit_means[:, 1:]

            products = column_vecs * question_vec
            # NaN products are treated as a zero contribution.
            products = tf.where(
                tf.is_nan(products), tf.zeros_like(products), products)
            return tf.math.reduce_sum(
                products, axis=-1, name="column_scores")
def test_reduce_mean(self):
    """Checks reduce_mean over the row, column and cell index maps."""
    values, row_index, col_index = self._prepare_tables()
    cell_index = segmented_tensor.ProductIndexMap(row_index, col_index)

    # Build the reductions in row / column / cell order.
    means = []
    for index in (row_index, col_index, cell_index):
        mean, _ = segmented_tensor.reduce_mean(values, index)
        means.append(mean)

    # Expected values, written as sum / count where that is informative.
    expected_row_mean = [
        [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0],
        [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0],
    ]
    expected_col_mean = [
        [9.0 / 6.0, 8.0 / 3.0, 0.0],
        [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0],
    ]
    expected_cell_mean = [
        [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0],
        [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0],
    ]
    expectations = (expected_row_mean, expected_col_mean, expected_cell_mean)

    with self.session() as sess:
        for mean, expected in zip(means, expectations):
            self.assertAllClose(sess.run(mean), expected)
def compute_column_logits(output_layer, cell_index, cell_mask,
                          init_cell_selection_weights_to_zero,
                          allow_empty_column_selection):
    """Computes one logit per column from the encoder output.

    Tokens are projected to a scalar, averaged per cell, and the cell values
    are then averaged over the existing cells of each column.

    Args:
      output_layer: <float>[batch_size, seq_length, hidden_dim] Output of the
        encoder layer.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exist in the example and 0 for padding.
      init_cell_selection_weights_to_zero: Whether the projection weights
        start at 0. This is also applied to column logits, as they are used
        to select the cells, so that all columns start with the same prior
        probability.
      allow_empty_column_selection: Whether selecting no column is allowed.

    Returns:
      <float>[batch_size, max_num_cols] Logits per column. Logits are pushed
      to a very low value (such that the probability is 0) for columns that
      do not appear in the table and, unless
      `allow_empty_column_selection` is set, for the special id 0 (which
      means "outside the table").
    """
    hidden_dim = output_layer.shape.as_list()[-1]
    if init_cell_selection_weights_to_zero:
        weights_initializer = tf.zeros_initializer()
    else:
        weights_initializer = classification_initializer()
    weights = tf.get_variable(
        "column_output_weights", [hidden_dim],
        initializer=weights_initializer)
    bias = tf.get_variable(
        "column_output_bias", shape=(), initializer=tf.zeros_initializer())

    # Project first, average afterwards: by linearity the result equals
    # averaging the embeddings and projecting, but it is cheaper.
    per_token = tf.einsum("bsj,j->bs", output_layer, weights) + bias

    # [batch_size, max_num_cols * max_num_rows]
    per_cell, per_cell_index = segmented_tensor.reduce_mean(
        per_token, cell_index)
    columns = cell_index.project_inner(per_cell_index)

    # [batch_size, max_num_cols]
    summed, out_index = segmented_tensor.reduce_sum(
        per_cell * cell_mask, columns)
    cells_in_column, _ = segmented_tensor.reduce_sum(cell_mask, columns)
    column_logits = summed / (cells_in_column + EPSILON_ZERO_DIVISION)

    # Columns without any existing cell are padding, except for id 0 which
    # is handled separately below.
    empty_column = tf.logical_and(
        cells_in_column < 0.5, tf.not_equal(out_index.indices, 0))
    column_logits = column_logits + CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(
        empty_column, tf.float32)

    if allow_empty_column_selection:
        return column_logits
    return column_logits + CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(
        tf.equal(out_index.indices, 0), tf.float32)
def _compute_column_scores_from_token_scores(self, mode, output_layer,
                                             features):
    """Gets the column scores by averaging the token scores.

    Args:
      mode: Estimator mode; dropout is applied to `output_layer` only when
        it equals `tf_estimator.ModeKeys.TRAIN`.
      output_layer: Encoder output handed to `utils.compute_column_logits`.
      features: Mapping from feature name to tensor. Reads "input_mask",
        "row_ids" and "column_ids".

    Returns:
      The column logits without the entry for column id 0 (presumably
      <float32>[batch_size, max_num_columns]), wrapped in a finiteness
      assertion.
    """
    with tf.variable_scope(PRUNING_SCOPE, reuse=tf.AUTO_REUSE):
        if mode == tf_estimator.ModeKeys.TRAIN:
            output_layer = tf.nn.dropout(
                output_layer, keep_prob=_SEQUENCE_OUTPUT_KEEP_PROB)

        token_mask = features["input_mask"]
        token_rows = features["row_ids"]
        token_columns = features["column_ids"]

        # Table indices; ids beyond the configured maximum are clipped into
        # the last segment.
        rows = segmented_tensor.IndexMap(
            indices=tf.minimum(token_rows, self._max_num_rows - 1),
            num_segments=self._max_num_rows,
            batch_dims=1)
        columns = segmented_tensor.IndexMap(
            indices=tf.minimum(token_columns, self._max_num_columns),
            num_segments=self._max_num_columns + 1,
            batch_dims=1)
        cells = segmented_tensor.ProductIndexMap(rows, columns)

        # <float32>[batch_size, seq_length]
        token_mask_float = tf.cast(token_mask, tf.float32)
        # Mask for cells that exist in the table (i.e. are not padding).
        existing_cells, _ = segmented_tensor.reduce_mean(
            token_mask_float, cells)

        all_column_logits = utils.compute_column_logits(
            output_layer=output_layer,
            cell_index=cells,
            cell_mask=existing_cells,
            init_cell_selection_weights_to_zero=False,
            allow_empty_column_selection=False)
        # Drop the entry for column id 0.
        scores = all_column_logits[:, 1:]
        return tf.debugging.assert_all_finite(
            scores, "column_scores contains nan values.")
def call(self, inputs, cell_index, cell_mask):
    """Computes one logit per column from the encoder output.

    Args:
      inputs: <float>[batch_size, seq_length, hidden_dim] Output of the
        encoder layer.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exist in the example and 0 for padding.

    Returns:
      Logits per column ([batch_size, max_num_cols]). Columns without any
      existing cell, and column id 0 unless
      `self.allow_empty_column_selection` is set, are offset by
      CLOSE_ENOUGH_TO_LOG_ZERO.
    """
    # Project first, average afterwards: by linearity this matches averaging
    # the embeddings and projecting, but it is cheaper.
    projected = tf.einsum("bsj,j->bs", inputs, self.column_output_weights)
    per_token = projected + self.column_output_bias

    # [batch_size, max_num_cols * max_num_rows]
    per_cell, per_cell_index = segmented_tensor.reduce_mean(
        per_token, cell_index)
    by_column = cell_index.project_inner(per_cell_index)

    # [batch_size, max_num_cols]
    masked_sum, out_index = segmented_tensor.reduce_sum(
        per_cell * cell_mask, by_column)
    num_cells, _ = segmented_tensor.reduce_sum(cell_mask, by_column)
    result = masked_sum / (num_cells + EPSILON_ZERO_DIVISION)

    # Mask columns that do not appear in the example (id 0 is handled
    # separately below).
    is_column_zero = tf.equal(out_index.indices, 0)
    absent = tf.logical_and(
        num_cells < 0.5, tf.not_equal(out_index.indices, 0))
    result = result + CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(absent, tf.float32)

    if not self.allow_empty_column_selection:
        result = result + CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(
            is_column_zero, tf.float32)
    return result
def call(self, input_token_ids, input_mask, segment_ids, column_ids, row_ids,
         prev_label_ids, column_ranks, inv_column_ranks, numeric_relations,
         label_ids, **kwargs):
    """Runs the encoder and computes the cell selection outputs.

    Args:
      input_token_ids: Token ids, forwarded to `self.bert`.
      input_mask: Per-token mask; also used to mask the returned
        probabilities.
      segment_ids: Token type feature, forwarded to `self.bert`.
      column_ids: Per-token column ids; clipped to max_num_columns - 1 to
        build the column index.
      row_ids: Per-token row ids; clipped to max_num_rows - 1 to build the
        row index.
      prev_label_ids: Token type feature, forwarded to `self.bert`.
      column_ranks: Token type feature, forwarded to `self.bert`.
      inv_column_ranks: Token type feature, forwarded to `self.bert`.
      numeric_relations: Token type feature, forwarded to `self.bert`.
      label_ids: Per-token labels, forwarded to
        `single_column_cell_selection` when a single column is selected.
      **kwargs: Extra keyword arguments forwarded to `self.bert`.

    Returns:
      A tuple `(logits, probs, logits_aggregation, logits_cls)`.
      `logits_aggregation` is None unless `self.do_model_aggregation` and
      `logits_cls` is None unless `self.do_model_classification`.
    """
    config = self.tapas_classifier_config

    # Construct indices for the table.
    row_index = segmented_tensor.IndexMap(
        indices=tf.minimum(
            tf.cast(row_ids, tf.int32), config.max_num_rows - 1),
        num_segments=config.max_num_rows,
        batch_dims=1)
    col_index = segmented_tensor.IndexMap(
        indices=tf.minimum(
            tf.cast(column_ids, tf.int32), config.max_num_columns - 1),
        num_segments=config.max_num_columns,
        batch_dims=1)
    cell_index = segmented_tensor.ProductIndexMap(row_index, col_index)

    # <float32>[batch_size, seq_length]
    input_mask_float = tf.cast(input_mask, tf.float32)
    # Mask for cells that exist in the table (i.e. that are not padding).
    cell_mask, _ = segmented_tensor.reduce_mean(input_mask_float, cell_index)

    pooled_output, sequence_output = self.bert([
        input_token_ids, input_mask, segment_ids, column_ids, row_ids,
        prev_label_ids, column_ranks, inv_column_ranks, numeric_relations
    ], **kwargs)

    # Compute logits per token. These are used to select individual cells.
    logits = self.compute_token_logits(sequence_output)

    # Compute logits per column. These are used to select a column.
    if config.select_one_column:
        column_logits = self.compute_column_logits(
            sequence_output, cell_index, cell_mask)

    logits_cls = None
    if self.do_model_classification:
        logits_cls = self.compute_classification_logits(pooled_output)

    if config.average_logits_per_cell:
        # Every token receives the mean logit of its cell.
        logits_per_cell, _ = segmented_tensor.reduce_mean(logits, cell_index)
        logits = segmented_tensor.gather(logits_per_cell, cell_index)
    dist_per_token = tfp.distributions.Bernoulli(logits=logits)

    if config.select_one_column:
        # Restrict the selection to a single column and rebuild the
        # distribution from the restricted logits.
        logits = single_column_cell_selection(
            logits, column_logits, label_ids, cell_index, col_index,
            cell_mask)
        dist_per_token = tfp.distributions.Bernoulli(logits=logits)

    logits_aggregation = None
    if self.do_model_aggregation:
        logits_aggregation = self.calculate_aggregation_logits(pooled_output)

    probs = _get_probs(dist_per_token) * input_mask_float
    return logits, probs, logits_aggregation, logits_cls
def single_column_cell_selection(token_logits, column_logits, label_ids,
                                 cell_index, col_index, cell_mask):
    """Restricts the cell selection logits to a single column.

    The column with the highest value in `column_logits` is selected by the
    model. Cells outside that column, padding cells and cells with the
    special column id 0 are never selected.

    Args:
      token_logits: <float>[batch_size, seq_length] Logits per token.
      column_logits: <float>[batch_size, max_num_cols] Logits per column.
      label_ids: <int32>[batch_size, seq_length] Labels per token. Only used
        to obtain the per-cell output index of the reduction; the label
        values do not influence the result.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      col_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into columns. Not used; kept so that existing
        callers keep working.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exists in the example and 0 for padding.

    Returns:
      logits: <float>[batch_size, seq_length] New logits which are only
        allowed to select cells in a single column. Logits outside of the
        most likely column according to `column_logits` will be set to a
        very low value (such that the probabilities are 0).
    """
    del col_index  # Unused; part of the public signature.

    # Reduce the logits to per-cell from per-token.
    logits_per_cell, _ = segmented_tensor.reduce_mean(token_logits, cell_index)
    _, labels_index = segmented_tensor.reduce_max(
        tf.cast(label_ids, tf.int32), cell_index)

    # Column id of every cell.
    column_id_for_cells = cell_index.project_inner(labels_index).indices

    # Set the probs outside the selected column (selected by the *model*)
    # to 0. This ensures backwards compatibility with models that select
    # cells from multiple columns.
    selected_column_id = tf.argmax(
        column_logits, axis=-1, output_type=tf.int32)
    selected_column_mask = tf.cast(
        tf.equal(column_id_for_cells,
                 tf.expand_dims(selected_column_id, axis=-1)),
        tf.float32)

    # Never select cells with the special column id 0.
    selected_column_mask = tf.where(
        tf.equal(column_id_for_cells, 0),
        tf.zeros_like(selected_column_mask),
        selected_column_mask)

    logits_per_cell += CLOSE_ENOUGH_TO_LOG_ZERO * (
        1.0 - cell_mask * selected_column_mask)
    # Broadcast the per-cell logits back to the tokens of each cell.
    return segmented_tensor.gather(logits_per_cell, cell_index)
def _get_classification_outputs(
    config,
    is_training,
    output_layer,
    output_layer_aggregation,
    label_ids,
    input_mask,
    table_mask,
    aggregation_function_id,
    answer,
    numeric_values,
    numeric_values_scale,
    row_ids,
    column_ids,
    classification_class_index,
):
    """Creates a classification model.

    Builds the token, column, aggregation and classification logits and
    accumulates the losses enabled by `config` into a single total loss.

    Args:
      config: Configuration for Tapas model.
      is_training: Whether the model is training.
      output_layer: <float32>[batch_size, seq_length, hidden_size]
      output_layer_aggregation: <float32>[batch_size, hidden_size]
      label_ids: <int32>[batch_size, seq_length]
      input_mask: <int32>[batch_size, seq_length]
      table_mask: <int32>[batch_size, seq_length]
      aggregation_function_id: <int32>[batch_size]
      answer: <float32>[batch_size]
      numeric_values: <float32>[batch_size, seq_length]
      numeric_values_scale: <float32>[batch_size, seq_length]
      row_ids: <int32>[batch_size, seq_length]
      column_ids: <int32>[batch_size, seq_length]
      classification_class_index: <int32>[batch]

    Returns:
      Outputs with the total loss, the token logits and probabilities, and
      the optional aggregation, classification and span outputs (None when
      the corresponding part of the model is disabled).
    """
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # Construct indices for the table.
    row_index = segmented_tensor.IndexMap(
        indices=tf.minimum(row_ids, config.max_num_rows - 1),
        num_segments=config.max_num_rows,
        batch_dims=1)
    col_index = segmented_tensor.IndexMap(
        indices=tf.minimum(column_ids, config.max_num_columns - 1),
        num_segments=config.max_num_columns,
        batch_dims=1)
    cell_index = segmented_tensor.ProductIndexMap(row_index, col_index)

    # Masks.
    # <float32>[batch_size, seq_length]
    input_mask_float = tf.cast(input_mask, tf.float32)
    table_mask_float = tf.cast(table_mask, tf.float32)
    # Mask for cells that exist in the table (i.e. that are not padding).
    cell_mask, _ = segmented_tensor.reduce_mean(input_mask_float, cell_index)

    # Compute logits per token. These are used to select individual cells.
    logits = utils.compute_token_logits(
        output_layer=output_layer,
        temperature=config.temperature,
        init_cell_selection_weights_to_zero=(
            config.init_cell_selection_weights_to_zero))

    # Compute logits per column. These are used to select a column.
    if config.select_one_column:
        column_logits = utils.compute_column_logits(
            output_layer=output_layer,
            cell_index=cell_index,
            cell_mask=cell_mask,
            init_cell_selection_weights_to_zero=(
                config.init_cell_selection_weights_to_zero),
            allow_empty_column_selection=config.allow_empty_column_selection)

    # TODO(pawelnow): Extract this into a function.
    # Compute aggregation function logits.
    do_model_aggregation = config.num_aggregation_labels > 0
    if do_model_aggregation:
        # `as_list()` yields plain ints whether or not `shape[-1]` is a
        # `Dimension` object, so it does not depend on the TF shape mode.
        hidden_size_agg = output_layer_aggregation.shape.as_list()[-1]
        output_weights_agg = tf.get_variable(
            "output_weights_agg",
            shape=[config.num_aggregation_labels, hidden_size_agg],
            initializer=_classification_initializer())
        output_bias_agg = tf.get_variable(
            "output_bias_agg",
            shape=[config.num_aggregation_labels],
            initializer=tf.zeros_initializer())

    do_model_classification = config.num_classification_labels > 0
    logits_cls = None
    if do_model_classification:
        logits_cls = compute_classification_logits(
            config.num_classification_labels, output_layer_aggregation)

    with tf.variable_scope("loss"):
        total_loss = 0.0
        is_supervised = (not do_model_aggregation or
                         not config.use_answer_as_supervision)

        ### Semi-supervised cell selection in case of no aggregation
        #############################################################

        # If the answer (the denotation) appears directly in the table we
        # might select the answer without applying any aggregation function.
        # There are some ambiguous cases, see _calculate_aggregate_mask for
        # more info.
        # `aggregate_mask` is 1 for examples where we chose to aggregate and
        # 0 for examples where we chose to select the answer directly.
        # `label_ids` encodes the positions of the answer appearing in the
        # table.
        if is_supervised:
            aggregate_mask = None
        else:
            # <float32>[batch_size]
            aggregate_mask = _calculate_aggregate_mask(
                answer=answer,
                output_layer_aggregation=output_layer_aggregation,
                output_bias_agg=output_bias_agg,
                output_weights_agg=output_weights_agg,
                cell_select_pref=config.cell_select_pref,
                label_ids=label_ids)

        ### Cell selection log-likelihood
        ###################################

        if config.average_logits_per_cell:
            logits_per_cell, _ = segmented_tensor.reduce_mean(
                logits, cell_index)
            logits = segmented_tensor.gather(logits_per_cell, cell_index)
        dist_per_token = tfp.distributions.Bernoulli(logits=logits)

        selection_loss_per_example = None
        if config.select_one_column:
            selection_loss_per_example, logits = (
                _single_column_cell_selection_loss(
                    token_logits=logits,
                    column_logits=column_logits,
                    label_ids=label_ids,
                    cell_index=cell_index,
                    col_index=col_index,
                    cell_mask=cell_mask))
            dist_per_token = tfp.distributions.Bernoulli(logits=logits)
        else:
            # Use tf.equal rather than `==`: with tensor equality disabled
            # (TF1 behaviour) `label_ids == 0` is an identity check that
            # yields a Python bool, which would silently give every token
            # the positive weight.
            weight = tf.where(
                tf.equal(label_ids, 0),
                tf.ones_like(label_ids, dtype=tf.float32),
                config.positive_weight *
                tf.ones_like(label_ids, dtype=tf.float32))
            selection_loss_per_token = -dist_per_token.log_prob(
                label_ids) * weight
            selection_loss_per_example = (
                tf.reduce_sum(
                    selection_loss_per_token * input_mask_float, axis=1) /
                (tf.reduce_sum(input_mask_float, axis=1) +
                 _EPSILON_ZERO_DIVISION))

        ### Logits for the aggregation function
        #########################################

        logits_aggregation = None
        if do_model_aggregation:
            logits_aggregation = _calculate_aggregation_logits(
                output_layer_aggregation, output_weights_agg,
                output_bias_agg)

        ### Classification loss
        ###############################
        if do_model_classification:
            one_hot_labels = tf.one_hot(
                classification_class_index,
                depth=config.num_classification_labels,
                dtype=tf.float32)
            if config.classification_label_weight:
                # Classes without an explicit weight default to 1.0.
                label_weights = [
                    config.classification_label_weight.get(i, 1.0)
                    for i in range(config.num_classification_labels)
                ]
                one_hot_labels *= tf.constant(label_weights, dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits_cls, axis=-1)
            # <float32>[batch_size]
            per_example_classification_intermediate = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)

            cls_loss = tf.reduce_mean(per_example_classification_intermediate)
            total_loss += cls_loss

        ### Supervised cell selection
        ###############################

        span_indexes = None
        span_logits = None
        if config.span_prediction != SpanPredictionMode.NONE:
            (
                span_indexes,
                span_logits,
                span_loss,
            ) = span_prediction_utils.get_span_logits_by_mode(
                config.span_prediction,
                output_layer,
                label_ids,
                column_ids,
                row_ids,
                max_span_length=10,
            )
            total_loss += span_loss
        elif config.disable_per_token_loss:
            pass
        elif config.mask_examples_without_labels:
            total_loss += tf.reduce_mean(
                span_prediction_utils.compute_masked_example_loss(
                    label_ids,
                    selection_loss_per_example,
                ))
        elif is_supervised:
            total_loss += tf.reduce_mean(selection_loss_per_example)
        else:
            # For the not supervised case, do not assign loss for cell
            # selection on examples where we chose to aggregate.
            total_loss += tf.reduce_mean(
                selection_loss_per_example * (1.0 - aggregate_mask))

        ### Semi-supervised regression loss and supervised loss for
        ### aggregations
        #########################################################################

        if do_model_aggregation:
            # Note that `aggregate_mask` is None if the setting is supervised.
            per_example_additional_loss = _calculate_aggregation_loss(
                logits_aggregation, aggregate_mask, aggregation_function_id,
                config)

            if config.use_answer_as_supervision:
                # Add regression loss for numeric answers which require
                # aggregation.
                answer_loss, large_answer_loss_mask = (
                    _calculate_regression_loss(
                        answer, aggregate_mask, dist_per_token,
                        numeric_values, numeric_values_scale,
                        table_mask_float, logits_aggregation, config))
                per_example_additional_loss += answer_loss
                # Zero loss for examples with answer_loss > cutoff.
                per_example_additional_loss *= large_answer_loss_mask

            total_loss += tf.reduce_mean(per_example_additional_loss)

        return Outputs(
            total_loss=total_loss,
            logits=logits,
            probs=_get_probs(dist_per_token) * input_mask_float,
            logits_aggregation=logits_aggregation,
            logits_cls=logits_cls,
            span_indexes=span_indexes,
            span_logits=span_logits,
        )