def parse_file_full_embeddings_tapas(fname, outfilename):
    """Run TAPAS over every example of a JSON-lines file and dump the outputs.

    Each non-blank line of ``fname`` is parsed as a JSON object. A header-only
    table (column names, no rows) is built from the flattened table string,
    paired with the example's question and fed through
    ``TapasForQuestionAnswering``. The first element of the model output is
    converted to nested lists and stored under the example's ``table_id``;
    the resulting mapping is written to ``outfilename`` as JSON.

    Examples that share a ``table_id`` overwrite each other, so only the last
    one is kept.

    Args:
        fname: Path of the input file, one JSON object per line. Each object
            must provide ``table_id`` and ``question['words']`` in addition
            to whatever ``_tbl`` reads from it.
        outfilename: Path of the JSON file to write.
    """
    import torch  # local import: only needed for the no-grad inference context

    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    # The first positional argument of TapasConfig is ``vocab_size``, so a
    # checkpoint name must be resolved through ``from_pretrained`` instead.
    # NOTE(review): the original string was
    # 'google-base-finetuned-wikisql-supervised'; the hub id below is assumed
    # to be the intended checkpoint - confirm.
    config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    final_dict = {}
    with open(fname) as f:
        # Skip blank lines (e.g. a trailing newline) that json.loads rejects.
        data = [line for line in f if line.strip()]

    print("Num Examples: {}".format(len(data)))
    for i, line in enumerate(data):
        result = json.loads(line)
        tbl_id = result['table_id']

        # Flatten the table, then split it back into column names.
        table_string = ' '.join(_tbl(result))
        column_names = [token for token in table_string.split(SPLIT_WORD) if token != '']
        # Duplicate column names collapse into a single column here.
        # NOTE(review): the table has headers but no rows - confirm the
        # tokenizer accepts that.
        table = pd.DataFrame({name: [] for name in column_names})

        query = [' '.join(result['question']['words'])]
        # The model needs tensors on its own device, passed as keyword args.
        inputs = tokenizer(table=table, queries=query, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            out = model(**inputs)[0].tolist()
        final_dict[tbl_id] = out

        if i % 200 == 0:
            print("Num Examples Done: {}".format(i))

    with open(outfilename, 'w') as outfile:
        json.dump(final_dict, outfile)
Ejemplo n.º 2
0
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, tapas_config_file, pytorch_dump_path):
    """Convert a TAPAS TensorFlow checkpoint into a saved PyTorch state dict.

    A ``TapasForQuestionAnswering`` model is built from
    ``TapasConfig(task="SQA")``, populated through
    ``load_tf_weights_in_tapas``, and its ``state_dict()`` is written with
    ``torch.save``.

    Args:
        tf_checkpoint_path: Checkpoint location handed to
            ``load_tf_weights_in_tapas``.
        tapas_config_file: Not used by this function; the configuration is
            hard-coded to the "SQA" task.
        pytorch_dump_path: Destination passed to ``torch.save``.
    """
    sqa_config = TapasConfig(task="SQA")
    print(f"Building PyTorch model from configuration: {sqa_config!s}")
    qa_model = TapasForQuestionAnswering(sqa_config)

    # Load the weights from the TensorFlow checkpoint into the new model.
    load_tf_weights_in_tapas(qa_model, sqa_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the configuration.
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(qa_model.state_dict(), pytorch_dump_path)
Ejemplo n.º 3
0
 def get_config(self):
     """Build a ``TapasConfig`` from this object's hyper-parameter attributes.

     Every keyword passed to ``TapasConfig`` is read from the attribute of the
     same name on ``self``.
     """
     # Attribute names double as TapasConfig keyword names; order is preserved.
     field_names = (
         "vocab_size",
         "hidden_size",
         "num_hidden_layers",
         "num_attention_heads",
         "intermediate_size",
         "hidden_act",
         "hidden_dropout_prob",
         "attention_probs_dropout_prob",
         "max_position_embeddings",
         "type_vocab_sizes",
         "initializer_range",
         "positive_weight",
         "num_aggregation_labels",
         "num_labels",
         "aggregation_loss_importance",
         "use_answer_as_supervision",
         "answer_loss_importance",
         "use_normalized_answer_loss",
         "huber_loss_delta",
         "temperature",
         "agg_temperature",
         "use_gumbel_for_cells",
         "use_gumbel_for_agg",
         "average_approximation_function",
         "cell_selection_preference",
         "answer_loss_cutoff",
         "max_num_rows",
         "max_num_columns",
         "average_logits_per_cell",
         "select_one_column",
         "allow_empty_column_selection",
         "init_cell_selection_weights_to_zero",
         "reset_position_index_per_cell",
         "disable_per_token_loss",
     )
     config_kwargs = {name: getattr(self, name) for name in field_names}
     return TapasConfig(**config_kwargs)
Ejemplo n.º 4
0
    def prepare_config_and_inputs(self):
        """Create a ``TapasConfig`` plus the input and label tensors for a test.

        Tensors come from the ``ids_tensor``, ``floats_tensor`` and
        ``random_attention_mask`` helpers and are moved to ``torch_device``.
        ``input_mask`` is ``None`` unless ``self.use_input_mask`` is set, and
        all seven label-related entries are ``None`` unless
        ``self.use_labels`` is set.

        Returns:
            An 11-tuple ``(config, input_ids, input_mask, token_type_ids,
            sequence_labels, token_labels, labels, numeric_values,
            numeric_values_scale, float_answer, aggregation_labels)``.
        """
        batch, seq_len = self.batch_size, self.seq_length

        input_ids = ids_tensor([batch, seq_len], self.vocab_size).to(torch_device)

        if self.use_input_mask:
            input_mask = random_attention_mask([batch, seq_len]).to(torch_device)
        else:
            input_mask = None

        # One id tensor per token-type vocabulary, stacked along a new last axis.
        per_type_ids = [
            ids_tensor(shape=[batch, seq_len], vocab_size=type_vocab_size)
            for type_vocab_size in self.type_vocab_sizes
        ]
        token_type_ids = torch.stack(per_type_ids, dim=2).to(torch_device)

        if self.use_labels:
            sequence_labels = ids_tensor([batch], self.type_sequence_label_size).to(torch_device)
            token_labels = ids_tensor([batch, seq_len], self.num_labels).to(torch_device)
            labels = ids_tensor([batch, seq_len], vocab_size=2).to(torch_device)
            numeric_values = floats_tensor([batch, seq_len]).to(torch_device)
            numeric_values_scale = floats_tensor([batch, seq_len]).to(torch_device)
            float_answer = floats_tensor([batch]).to(torch_device)
            aggregation_labels = ids_tensor([batch], self.num_aggregation_labels).to(torch_device)
        else:
            sequence_labels = token_labels = labels = None
            numeric_values = numeric_values_scale = None
            float_answer = aggregation_labels = None

        # Attribute names double as TapasConfig keyword names; order is preserved.
        config_fields = (
            "vocab_size",
            "hidden_size",
            "num_hidden_layers",
            "num_attention_heads",
            "intermediate_size",
            "hidden_act",
            "hidden_dropout_prob",
            "attention_probs_dropout_prob",
            "max_position_embeddings",
            "type_vocab_sizes",
            "initializer_range",
            "positive_weight",
            "num_aggregation_labels",
            "num_labels",
            "aggregation_loss_importance",
            "use_answer_as_supervision",
            "answer_loss_importance",
            "use_normalized_answer_loss",
            "huber_loss_delta",
            "temperature",
            "agg_temperature",
            "use_gumbel_for_cells",
            "use_gumbel_for_agg",
            "average_approximation_function",
            "cell_selection_preference",
            "answer_loss_cutoff",
            "max_num_rows",
            "max_num_columns",
            "average_logits_per_cell",
            "select_one_column",
            "allow_empty_column_selection",
            "init_cell_selection_weights_to_zero",
            "reset_position_index_per_cell",
            "disable_per_token_loss",
        )
        config = TapasConfig(**{name: getattr(self, name) for name in config_fields})

        prepared = (
            config,
            input_ids,
            input_mask,
            token_type_ids,
            sequence_labels,
            token_labels,
            labels,
            numeric_values,
            numeric_values_scale,
            float_answer,
            aggregation_labels,
        )
        return prepared
Ejemplo n.º 5
0
def convert_tf_checkpoint_to_pytorch(task, reset_position_index_per_cell,
                                     tf_checkpoint_path, tapas_config_file,
                                     pytorch_dump_path):
    """Convert a TAPAS TensorFlow checkpoint into a saved PyTorch model.

    The configuration is read from ``tapas_config_file``, adjusted for the
    requested task, and used to build the matching model class. Weights are
    then loaded through ``load_tf_weights_in_tapas`` and the model and a
    ``TapasTokenizer`` are both saved to ``pytorch_dump_path``.

    Args:
        task: One of "SQA", "WTQ", "WIKISQL_SUPERVISED", "TABFACT", "MLM" or
            "INTERMEDIATE_PRETRAINING".
        reset_position_index_per_cell: Value stored on the config under the
            same name. To convert a checkpoint that uses absolute position
            embeddings, pass False.
        tf_checkpoint_path: Checkpoint location handed to
            ``load_tf_weights_in_tapas``; also used to locate ``vocab.txt``.
        tapas_config_file: JSON file the ``TapasConfig`` is loaded from.
        pytorch_dump_path: Target passed to both ``save_pretrained`` calls.

    Raises:
        ValueError: If ``task`` is not one of the supported names.
    """
    # Start from the JSON configuration, then choose absolute/relative positions.
    config = TapasConfig.from_json_file(tapas_config_file)
    config.reset_position_index_per_cell = reset_position_index_per_cell

    # (task name, model class, config overrides). Per the original comments the
    # first two overrides of each set come from run_task_main.py and the rest
    # from hparam_utils.py.
    task_recipes = (
        ("SQA", TapasForQuestionAnswering, {}),
        ("WTQ", TapasForQuestionAnswering, {
            "num_aggregation_labels": 4,
            "use_answer_as_supervision": True,
            "answer_loss_cutoff": 0.664694,
            "cell_selection_preference": 0.207951,
            "huber_loss_delta": 0.121194,
            "init_cell_selection_weights_to_zero": True,
            "select_one_column": True,
            "allow_empty_column_selection": False,
            "temperature": 0.0352513,
        }),
        ("WIKISQL_SUPERVISED", TapasForQuestionAnswering, {
            "num_aggregation_labels": 4,
            "use_answer_as_supervision": False,
            "answer_loss_cutoff": 36.4519,
            "cell_selection_preference": 0.903421,
            "huber_loss_delta": 222.088,
            "init_cell_selection_weights_to_zero": True,
            "select_one_column": True,
            "allow_empty_column_selection": True,
            "temperature": 0.763141,
        }),
        ("TABFACT", TapasForSequenceClassification, {}),
        ("MLM", TapasForMaskedLM, {}),
        ("INTERMEDIATE_PRETRAINING", TapasModel, {}),
    )

    for task_name, model_cls, overrides in task_recipes:
        if task == task_name:
            break
    else:
        raise ValueError(f"Task {task} not supported.")

    # Apply the task-specific hyper-parameters before instantiating the model.
    for attr_name, attr_value in overrides.items():
        setattr(config, attr_name, attr_value)
    model = model_cls(config=config)

    print(f"Building PyTorch model from configuration: {config}")
    # Load weights from the TensorFlow checkpoint.
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save the PyTorch model (weights and configuration).
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Save the tokenizer files next to the model.
    print(f"Save tokenizer files to {pytorch_dump_path}")
    # NOTE(review): dropping the last 10 characters presumably strips a
    # trailing "model.ckpt" so vocab.txt is looked up beside the checkpoint -
    # confirm for other checkpoint names.
    vocab_path = tf_checkpoint_path[:-10] + "vocab.txt"
    tokenizer = TapasTokenizer(vocab_file=vocab_path, model_max_length=512)
    tokenizer.save_pretrained(pytorch_dump_path)

    print("Used relative position embeddings:",
          model.config.reset_position_index_per_cell)