Example #1
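Checks that TapasPruningSelector.apply_hard_selection converts a batch of soft column scores into the expected hard selection mask in TRAIN mode.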
 def test_tapas_with_hard_selection(self, config, expected_res):
     with self.cached_session() as sess:
         scores = tf.constant([[1.0, 1.0, 0.3, 0.3, 0.3, 0.2, 0.8],
                               [1.0, 0.3, 0.2, 0.2, 0.2, 0.3, 0.0]])
         bert_config_file = tempfile.mktemp()
         bert_config = modeling.BertConfig(
             type_vocab_size=[3, 256, 256, 2, 256, 256, 10],
             vocab_size=64,
             hidden_size=16,
             num_hidden_layers=4,
             num_attention_heads=2,
             intermediate_size=32,
             max_position_embeddings=64)
         bert_config.to_json_file(bert_config_file)
         config = _Tapas(bert_config_file=bert_config_file,
                         selection=_Tapas.Selection.COLUMNS,
                         loss=_Loss(
                             train=config,
                             eval=config,
                             unsupervised=_Unsupervised(
                                 regularization=_Regularization.NONE)))
         selector = table_pruning.TapasPruningSelector(config=config,
                                                       max_num_columns=4,
                                                       max_num_rows=4,
                                                       max_num_tokens=4)
         hard_selection_mask = selector.apply_hard_selection(
             scores=scores, mode=tf_estimator.ModeKeys.TRAIN)
         sess.run(tf.global_variables_initializer())
         hard_selection_mask = sess.run(hard_selection_mask)
         tf.logging.info("-------------hard_selection_mask---------------")
         tf.logging.info(hard_selection_mask)
         self.assertAllClose(hard_selection_mask, expected_res, atol=1e-5)
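For intuition, here is a minimal NumPy sketch of one plausible hard-selection rule, top-k masking over per-row scores; the function name and the 0/1 mask convention are assumptions for illustration, not the repo's implementation:

    import numpy as np

    def top_k_hard_selection(scores, k):
        # Keep the k highest-scoring entries of each row as 1.0, zero the rest.
        top_k_idx = np.argsort(scores, axis=-1)[:, -k:]
        mask = np.zeros_like(scores)
        np.put_along_axis(mask, top_k_idx, 1.0, axis=-1)
        return mask

    scores = np.array([[1.0, 1.0, 0.3, 0.3, 0.3, 0.2, 0.8],
                       [1.0, 0.3, 0.2, 0.2, 0.2, 0.3, 0.0]])
    print(top_k_hard_selection(scores, k=4))  # one 0/1 mask per row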
Example #2
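Validates the column-scoring path: all transformer parameters are created, and columns that never appear in column_ids receive a logit of -10000.0.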
    def test_tapas_column_logits(self):
        # Validate that all the transformer parameters are used and that the
        # output has the right shape and is properly masked.
        bert_config_file = tempfile.mktemp()
        num_hidden_layers = 4
        type_vocab_size = [3, 256, 256, 2, 256, 256, 10]
        bert_config = modeling.BertConfig(type_vocab_size=type_vocab_size,
                                          vocab_size=64,
                                          hidden_size=16,
                                          num_hidden_layers=num_hidden_layers,
                                          num_attention_heads=2,
                                          intermediate_size=32,
                                          max_position_embeddings=64)
        bert_config.to_json_file(bert_config_file)

        with self.cached_session() as sess:
            features = _create_features()
            max_num_columns = 6
            config = _Tapas(bert_config_file=bert_config_file,
                            selection=_Tapas.Selection.COLUMNS,
                            loss=_Loss(unsupervised=_Unsupervised(
                                regularization=_Regularization.L1)))
            max_num_tokens = 6
            model = table_pruning.TapasPruningSelector(
                config=config,
                max_num_columns=max_num_columns,
                max_num_rows=max_num_columns,
                max_num_tokens=max_num_tokens)

            column_scores = model.select_columns(tf_estimator.ModeKeys.TRAIN,
                                                 features)

            tf.logging.info("*** Features ***")
            for name in sorted(features):
                tf.logging.info("  name = %s, shape = %s", name,
                                features[name].shape)
            tf.logging.info("**** Trainable Variables ****")
            tvars = tf.trainable_variables()
            self.assertLen(tvars,
                           16 * num_hidden_layers + len(type_vocab_size) + 8)
            for var in tvars:
                tf.logging.info("  name = %s, shape = %s", var.name, var.shape)
            sess.run(tf.global_variables_initializer())
            column_scores = sess.run(column_scores)
            tf.logging.info("*** Scores ***")
            self.assertLen(column_scores, 2)
            tf.logging.info("*** Scores example 0 ***")
            scores_example_0 = column_scores[0]
            tf.logging.info(scores_example_0)
            # At this stage the sigmoid has not been applied yet, so the
            # column_scores of columns that do not exist in column_ids are
            # set to -10000.0.
            for i in range(2, max_num_columns):
                self.assertEqual(scores_example_0[i], -10000.0)
            tf.logging.info("*** Scores example 1 ***")
            scores_example_1 = column_scores[1]
            tf.logging.info(scores_example_1)
            for i in range(1, max_num_columns):
                self.assertEqual(scores_example_1[i], -10000.0)
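The -10000.0 masking asserted above can be reproduced with a small sketch; the id convention (0 for non-table tokens, column j stored as id j + 1) is an assumption inferred from this test's expectations:

    import numpy as np

    def mask_missing_columns(logits, column_ids, max_num_columns):
        # Columns that never occur in column_ids get a large negative logit,
        # so that sigmoid(logit) is effectively 0 after scoring.
        masked = logits.copy()
        for b in range(logits.shape[0]):
            present = set(int(i) for i in column_ids[b])
            for j in range(max_num_columns):
                if j + 1 not in present:
                    masked[b, j] = -10000.0
        return masked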
Example #3
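Exercises the gather_op returned by compute_scores: it re-gathers input_ids, input_mask, and column_ids down to the top-scoring tokens.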
 def test_gather_nd(self):
     bert_config_file = tempfile.mktemp()
     num_hidden_layers = 4
     type_vocab_size = [3, 256, 256, 2, 256, 256, 10]
     bert_config = modeling.BertConfig(type_vocab_size=type_vocab_size,
                                       vocab_size=64,
                                       hidden_size=16,
                                       num_hidden_layers=num_hidden_layers,
                                       num_attention_heads=2,
                                       intermediate_size=32,
                                       max_position_embeddings=64)
     bert_config.to_json_file(bert_config_file)
     with self.cached_session() as sess:
         features = _create_features()
         max_num_columns = 4
         max_num_tokens = 4
         config = _Tapas(bert_config_file=bert_config_file,
                         selection=_Tapas.Selection.COLUMNS,
                         loss=_Loss(
                             train=_HardSelection(selection_fn=_TOP_K),
                             eval=_HardSelection(selection_fn=_TOP_K),
                             add_classification_loss=True,
                             unsupervised=_Unsupervised(
                                 regularization=_Regularization.NONE)))
         selector = table_pruning.TapasPruningSelector(
             config=config,
             max_num_columns=max_num_columns,
             max_num_rows=max_num_columns,
             max_num_tokens=max_num_tokens)
         stats = selector.compute_scores(mode=tf_estimator.ModeKeys.TRAIN,
                                         features=features)
         token_scores = stats.token_scores
         gather_op = stats.gather_op
         new_features = gather_op(features=features, scores=token_scores)
         sess.run(tf.global_variables_initializer())
         new_features = sess.run(new_features)
         expected_column_ids = [[0, 0, 0, 1], [0, 0, 0, 0]]
         expected_input_mask = [[1, 1, 1, 1], [1, 1, 1, 1]]
         expected_input_ids = [[0, 1, 2, 3], [1, 1, 1, 1]]
         tf.logging.info("-------------features---------------")
         for k, v in new_features.items():
             tf.logging.info(f"-------------{k}---------------")
             tf.logging.info(v)
         self.assertAllClose(new_features["column_ids"],
                             expected_column_ids, atol=1e-5)
         self.assertAllClose(new_features["input_mask"],
                             expected_input_mask, atol=1e-5)
         self.assertAllClose(new_features["input_ids"],
                             expected_input_ids, atol=1e-5)
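A rough NumPy sketch of the kind of gather this test exercises; keeping the surviving tokens in their original left-to-right order matches the expectations above, everything else is an assumption:

    import numpy as np

    def gather_top_tokens(features, token_scores, max_num_tokens):
        # Pick the max_num_tokens highest-scoring tokens per example, then
        # re-sort the indices so the surviving tokens keep sequence order.
        top = np.argsort(-token_scores, axis=-1, kind="stable")[:, :max_num_tokens]
        top = np.sort(top, axis=-1)
        return {name: np.take_along_axis(value, top, axis=-1)
                for name, value in features.items()}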
Example #4
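Test helper that builds a BertModel from a BertConfig and returns its embedding, sequence, pooled, and per-layer outputs.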
        def create_model(self):
            input_ids = BertModelTest.ids_tensor(
                [self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = BertModelTest.ids_tensor(
                    [self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = BertModelTest.ids_tensor(
                    [self.batch_size, self.seq_length], self.type_vocab_size)

            config = modeling.BertConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                softmax_temperature=self.softmax_temperature)

            model = modeling.BertModel(
                config=config,
                is_training=self.is_training,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=token_type_ids,
                scope=self.scope,
                proj_value_length=self.proj_value_length)

            outputs = {
                "embedding_output": model.get_embedding_output(),
                "sequence_output": model.get_sequence_output(),
                "pooled_output": model.get_pooled_output(),
                "all_encoder_layers": model.get_all_encoder_layers(),
            }
            return outputs
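Judging from its use here, BertModelTest.ids_tensor produces a random integer tensor of the given shape with values in [0, vocab_size); a minimal sketch under that assumption:

    import numpy as np
    import tensorflow.compat.v1 as tf

    def ids_tensor(shape, vocab_size, seed=0):
        # Random integer ids in [0, vocab_size), as a constant int32 tensor.
        rng = np.random.RandomState(seed)
        return tf.constant(rng.randint(0, vocab_size, size=shape), dtype=tf.int32)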
Example #5
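Parameterized test of compute_scores: for a given selection strategy and hard-selection function, the computed token scores must match expected_scores.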
 def test_compute_scores(self, selection, train_selection_fn,
                         expected_scores):
     bert_config_file = tempfile.mktemp()
     num_hidden_layers = 4
     type_vocab_size = [3, 256, 256, 2, 256, 256, 10]
     bert_config = modeling.BertConfig(type_vocab_size=type_vocab_size,
                                       vocab_size=64,
                                       hidden_size=16,
                                       num_hidden_layers=num_hidden_layers,
                                       num_attention_heads=2,
                                       intermediate_size=32,
                                       max_position_embeddings=64)
     bert_config.to_json_file(bert_config_file)
     with self.cached_session() as sess:
         features = _create_features()
         max_num_columns = 4
         max_num_tokens = 6
         config = _Tapas(
             bert_config_file=bert_config_file,
             selection=selection,
             loss=_Loss(
                 train=_HardSelection(selection_fn=train_selection_fn),
                 eval=_HardSelection(selection_fn=train_selection_fn),
                 unsupervised=_Unsupervised(
                     regularization=_Regularization.NONE)))
         selector = table_pruning.TapasPruningSelector(
             config=config,
             max_num_columns=max_num_columns,
             max_num_rows=max_num_columns,
             max_num_tokens=max_num_tokens)
         stats = selector.compute_scores(mode=tf_estimator.ModeKeys.TRAIN,
                                         features=features)
         token_scores = stats.token_scores
         sess.run(tf.global_variables_initializer())
         token_scores = sess.run(token_scores)
         tf.logging.info("-------------token_scores---------------")
         tf.logging.info(token_scores)
         self.assertAllClose(token_scores, expected_scores, atol=1e-5)
Example #6
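Round-trips a BertConfig through to_json_string and checks individual fields.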
 def test_config_to_json_string(self):
     config = modeling.BertConfig(vocab_size=99, hidden_size=37)
     obj = json.loads(config.to_json_string())
     self.assertEqual(obj["vocab_size"], 99)
     self.assertEqual(obj["hidden_size"], 37)
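A minimal file round trip to complement the string round trip above, assuming modeling.BertConfig also provides the from_json_file classmethod present in the original BERT codebase:

    import tempfile

    config = modeling.BertConfig(vocab_size=99, hidden_size=37)
    path = tempfile.mktemp()
    config.to_json_file(path)
    restored = modeling.BertConfig.from_json_file(path)
    assert restored.vocab_size == 99
    assert restored.hidden_size == 37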
Example #7
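Computes the table-pruning loss end to end, covering both the unsupervised (L1-regularized) and the supervised configuration.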
 def test_tapas_loss(self, test_name, expected_loss):
     bert_config_file = tempfile.mktemp()
     num_hidden_layers = 4
     type_vocab_size = [3, 256, 256, 2, 256, 256, 10]
     bert_config = modeling.BertConfig(type_vocab_size=type_vocab_size,
                                       vocab_size=64,
                                       hidden_size=16,
                                       num_hidden_layers=num_hidden_layers,
                                       num_attention_heads=2,
                                       intermediate_size=32,
                                       max_position_embeddings=64)
     bert_config.to_json_file(bert_config_file)
     with self.cached_session() as sess:
         features = _create_features()
         max_num_columns = 4
         loss_config = _Loss(
             train=_HardSelection(selection_fn=_TOP_K),
             eval=_HardSelection(selection_fn=_MASK_TOP_K),
             unsupervised=_Unsupervised(regularization=_Regularization.L1))
         if test_name == "supervised":
             loss_config = _Loss(
                 train=_HardSelection(selection_fn=_TOP_K),
                 eval=_HardSelection(selection_fn=_MASK_TOP_K),
                 supervised=_Supervised(back_propagation=True))
         config = _Tapas(bert_config_file=bert_config_file,
                         selection=_Tapas.Selection.COLUMNS,
                         loss=loss_config)
         max_num_tokens = 6
         required_columns = tf.constant([[1., 0., 0., 0.],
                                         [1., 0., 0., 0.]])
         model = table_pruning.TapasPruningSelector(
             config=config,
             max_num_columns=max_num_columns,
             max_num_rows=max_num_columns,
             max_num_tokens=max_num_tokens)
         column_scores = model.select_columns(tf_estimator.ModeKeys.TRAIN,
                                              features)
         column_score_mask = table_pruning.get_mask_columns_scores(
             max_num_columns=max_num_columns,
             scores=column_scores,
             column_ids=features["column_ids"])
         token_scores = table_pruning.get_token_scores_from_column_scores(
             column_ids=features["column_ids"],
             column_probs=tf.sigmoid(column_scores),
             input_mask=features["input_mask"],
             max_num_columns=max_num_columns)
         loss = model.compute_loss(required_columns=required_columns,
                                   column_scores=column_scores,
                                   column_score_mask=column_score_mask,
                                   token_scores=token_scores)
         sess.run(tf.global_variables_initializer())
         required_columns = sess.run(required_columns)
         tf.logging.info("-------------required_columns---------------")
         tf.logging.info(required_columns)
         column_scores = sess.run(column_scores)
         tf.logging.info("-------------column_scores---------------")
         tf.logging.info(column_scores)
         column_score_mask = sess.run(column_score_mask)
         tf.logging.info("-------------column_score_mask---------------")
         tf.logging.info(column_score_mask)
         loss = sess.run(loss)
         tf.logging.info("-------------pruning_loss---------------")
         tf.logging.info(loss)
         self.assertAlmostEqual(loss, expected_loss, 5)
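For the supervised branch, a plausible shape for the loss is a masked sigmoid cross-entropy between column_scores and required_columns; the sketch below is an assumption for illustration, not the repo's compute_loss (only the argument names come from the test above):

    import numpy as np

    def supervised_pruning_loss(required_columns, column_scores, column_score_mask):
        # Element-wise sigmoid cross-entropy, averaged over unmasked columns.
        z = np.clip(column_scores, -30.0, 30.0)  # avoid overflow in exp
        probs = 1.0 / (1.0 + np.exp(-z))
        eps = 1e-8
        xent = -(required_columns * np.log(probs + eps) +
                 (1.0 - required_columns) * np.log(1.0 - probs + eps))
        return np.sum(xent * column_score_mask) / max(np.sum(column_score_mask), 1.0)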