def parse_file_full_embeddings_tapas(fname, outfilename):
    """Run a TAPAS model over every example of a JSON-lines file and dump the outputs.

    Each line of ``fname`` is parsed as JSON. For every example, the text
    produced by ``_tbl(example)`` is joined with spaces and split on
    ``SPLIT_WORD``; the non-empty pieces become the column names of a
    row-less ``pandas.DataFrame``. The question words are joined into one
    query, the tokenizer is applied to the table and query, and the first
    element of the model output is stored (as nested lists) under the
    example's ``table_id``. A later example with the same ``table_id``
    overwrites the earlier entry. The final mapping is written to
    ``outfilename`` as JSON.

    Relies on the module-level names ``device``, ``_tbl`` and ``SPLIT_WORD``.

    Args:
        fname: Path of the input file, one JSON document per line.
        outfilename: Path of the JSON file to write.
    """
    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    # NOTE(review): this string is passed as TapasConfig's first positional
    # argument -- confirm that this is the parameter it is meant for.
    config = TapasConfig('google-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    with open(fname) as handle:
        examples = handle.readlines()
    print("Num Examples: {}".format(len(examples)))

    outputs_by_table = {}
    for index, raw_line in enumerate(examples):
        example = json.loads(raw_line)
        table_id = example['table_id']

        # Column names only: every column is mapped to an empty list.
        header_text = ' '.join(_tbl(example))
        columns = {
            name: []
            for name in header_text.split(SPLIT_WORD)
            if name != ''
        }
        table = pd.DataFrame(columns)

        question = [' '.join(example['question']['words'])]
        encoded = tokenizer(table=table, queries=question)
        # NOTE(review): the tokenizer output is handed to the model as a
        # single positional argument -- confirm the model accepts it this way.
        outputs_by_table[table_id] = model(encoded)[0].tolist()

        if index % 200 == 0:
            print("Num Examples Done: {}".format(index))

    with open(outfilename, 'w') as outfile:
        json.dump(outputs_by_table, outfile)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, tapas_config_file, pytorch_dump_path):
    """Load TensorFlow TAPAS weights into a PyTorch model and save its state dict.

    A ``TapasConfig`` is created with ``task="SQA"``, a
    ``TapasForQuestionAnswering`` model is built from it, the weights are
    loaded with ``load_tf_weights_in_tapas`` and the model's ``state_dict()``
    is written with ``torch.save``.

    Args:
        tf_checkpoint_path: Checkpoint location passed to
            ``load_tf_weights_in_tapas``.
        tapas_config_file: Not used by this function; the configuration is
            always built as ``TapasConfig(task="SQA")``.
        pytorch_dump_path: Destination passed to ``torch.save``.
    """
    # Initialise the PyTorch model from a fixed SQA configuration.
    sqa_config = TapasConfig(task="SQA")
    print("Building PyTorch model from configuration: {}".format(str(sqa_config)))
    qa_model = TapasForQuestionAnswering(sqa_config)

    # Copy the weights over from the TensorFlow checkpoint.
    load_tf_weights_in_tapas(qa_model, sqa_config, tf_checkpoint_path)

    # Persist only the weights (state dict) of the PyTorch model.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = qa_model.state_dict()
    torch.save(weights, pytorch_dump_path)
def get_config(self):
    """Build a ``TapasConfig`` from the settings stored on this object.

    Every keyword argument passed to ``TapasConfig`` is read from the
    attribute of the same name on ``self``.

    Returns:
        The newly constructed ``TapasConfig``.
    """
    # Keyword names forwarded to TapasConfig; each is also the name of the
    # attribute on ``self`` that supplies its value.
    field_names = (
        "vocab_size",
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "intermediate_size",
        "hidden_act",
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "max_position_embeddings",
        "type_vocab_sizes",
        "initializer_range",
        "positive_weight",
        "num_aggregation_labels",
        "num_labels",
        "aggregation_loss_importance",
        "use_answer_as_supervision",
        "answer_loss_importance",
        "use_normalized_answer_loss",
        "huber_loss_delta",
        "temperature",
        "agg_temperature",
        "use_gumbel_for_cells",
        "use_gumbel_for_agg",
        "average_approximation_function",
        "cell_selection_preference",
        "answer_loss_cutoff",
        "max_num_rows",
        "max_num_columns",
        "average_logits_per_cell",
        "select_one_column",
        "allow_empty_column_selection",
        "init_cell_selection_weights_to_zero",
        "reset_position_index_per_cell",
        "disable_per_token_loss",
    )
    settings = {name: getattr(self, name) for name in field_names}
    return TapasConfig(**settings)
def prepare_config_and_inputs(self):
    """Create a ``TapasConfig`` plus a set of model inputs and labels.

    Inputs are produced with the ``ids_tensor``, ``floats_tensor`` and
    ``random_attention_mask`` helpers and moved to ``torch_device``.
    ``input_mask`` is ``None`` unless ``self.use_input_mask`` is set, and
    all label-like outputs are ``None`` unless ``self.use_labels`` is set.
    ``token_type_ids`` holds one id tensor per entry of
    ``self.type_vocab_sizes``, stacked along dimension 2.

    Returns:
        tuple: ``(config, input_ids, input_mask, token_type_ids,
        sequence_labels, token_labels, labels, numeric_values,
        numeric_values_scale, float_answer, aggregation_labels)``.
    """
    batch, seq = self.batch_size, self.seq_length

    input_ids = ids_tensor([batch, seq], self.vocab_size).to(torch_device)

    if self.use_input_mask:
        input_mask = random_attention_mask([batch, seq]).to(torch_device)
    else:
        input_mask = None

    # One id tensor per token-type vocabulary, combined on a new last axis.
    per_type_ids = [
        ids_tensor(shape=[batch, seq], vocab_size=size)
        for size in self.type_vocab_sizes
    ]
    token_type_ids = torch.stack(per_type_ids, dim=2).to(torch_device)

    sequence_labels = token_labels = labels = None
    numeric_values = numeric_values_scale = None
    float_answer = aggregation_labels = None
    if self.use_labels:
        # Helper calls are kept in this exact order.
        sequence_labels = ids_tensor([batch], self.type_sequence_label_size).to(torch_device)
        token_labels = ids_tensor([batch, seq], self.num_labels).to(torch_device)
        labels = ids_tensor([batch, seq], vocab_size=2).to(torch_device)
        numeric_values = floats_tensor([batch, seq]).to(torch_device)
        numeric_values_scale = floats_tensor([batch, seq]).to(torch_device)
        float_answer = floats_tensor([batch]).to(torch_device)
        aggregation_labels = ids_tensor([batch], self.num_aggregation_labels).to(torch_device)

    # Keyword names forwarded to TapasConfig; each value comes from the
    # attribute of the same name on ``self``.
    config_fields = (
        "vocab_size",
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "intermediate_size",
        "hidden_act",
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "max_position_embeddings",
        "type_vocab_sizes",
        "initializer_range",
        "positive_weight",
        "num_aggregation_labels",
        "num_labels",
        "aggregation_loss_importance",
        "use_answer_as_supervision",
        "answer_loss_importance",
        "use_normalized_answer_loss",
        "huber_loss_delta",
        "temperature",
        "agg_temperature",
        "use_gumbel_for_cells",
        "use_gumbel_for_agg",
        "average_approximation_function",
        "cell_selection_preference",
        "answer_loss_cutoff",
        "max_num_rows",
        "max_num_columns",
        "average_logits_per_cell",
        "select_one_column",
        "allow_empty_column_selection",
        "init_cell_selection_weights_to_zero",
        "reset_position_index_per_cell",
        "disable_per_token_loss",
    )
    config = TapasConfig(**{name: getattr(self, name) for name in config_fields})

    return (
        config,
        input_ids,
        input_mask,
        token_type_ids,
        sequence_labels,
        token_labels,
        labels,
        numeric_values,
        numeric_values_scale,
        float_answer,
        aggregation_labels,
    )
def convert_tf_checkpoint_to_pytorch(
    task, reset_position_index_per_cell, tf_checkpoint_path, tapas_config_file, pytorch_dump_path
):
    """Convert a TensorFlow TAPAS checkpoint into a saved PyTorch model.

    The configuration is read from ``tapas_config_file``, adjusted for the
    requested ``task``, and used to build the matching model class. The
    TensorFlow weights are loaded into it, and the model and tokenizer are
    both saved to ``pytorch_dump_path`` with ``save_pretrained``.

    Args:
        task: One of ``"SQA"``, ``"WTQ"``, ``"WIKISQL_SUPERVISED"``,
            ``"TABFACT"``, ``"MLM"`` or ``"INTERMEDIATE_PRETRAINING"``.
        reset_position_index_per_cell: Value stored on the config's
            attribute of the same name. Set it to ``False`` when converting
            a checkpoint that uses absolute position embeddings.
        tf_checkpoint_path: Path of the TensorFlow checkpoint. ``vocab.txt``
            is looked up in the same directory as this path.
        tapas_config_file: JSON file the ``TapasConfig`` is loaded from.
        pytorch_dump_path: Directory passed to ``save_pretrained`` for both
            the model and the tokenizer.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """
    import os  # local import so the block does not rely on a file-level one

    # Initialise the configuration from the JSON file.
    config = TapasConfig.from_json_file(tapas_config_file)
    # Select absolute/relative position embeddings.
    config.reset_position_index_per_cell = reset_position_index_per_cell

    # Set the remaining TapasConfig parameters and pick the model class
    # based on the task.
    if task == "SQA":
        model = TapasForQuestionAnswering(config=config)
    elif task == "WTQ":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = True
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 0.664694
        config.cell_selection_preference = 0.207951
        config.huber_loss_delta = 0.121194
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = False
        config.temperature = 0.0352513
        model = TapasForQuestionAnswering(config=config)
    elif task == "WIKISQL_SUPERVISED":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = False
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 36.4519
        config.cell_selection_preference = 0.903421
        config.huber_loss_delta = 222.088
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = True
        config.temperature = 0.763141
        model = TapasForQuestionAnswering(config=config)
    elif task == "TABFACT":
        model = TapasForSequenceClassification(config=config)
    elif task == "MLM":
        model = TapasForMaskedLM(config=config)
    elif task == "INTERMEDIATE_PRETRAINING":
        model = TapasModel(config=config)
    else:
        raise ValueError(f"Task {task} not supported.")

    print(f"Building PyTorch model from configuration: {config}")

    # Load weights from the TensorFlow checkpoint.
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save the PyTorch model (weights and configuration).
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Save tokenizer files. The vocabulary is taken from the checkpoint's
    # directory. Stripping a fixed 10 characters from the path (the previous
    # approach) only works when the checkpoint file name is exactly 10
    # characters long, e.g. "model.ckpt"; for such paths the result below is
    # identical.
    print(f"Save tokenizer files to {pytorch_dump_path}")
    vocab_file = os.path.join(os.path.dirname(tf_checkpoint_path), "vocab.txt")
    tokenizer = TapasTokenizer(vocab_file=vocab_file, model_max_length=512)
    tokenizer.save_pretrained(pytorch_dump_path)

    print("Used relative position embeddings:", model.config.reset_position_index_per_cell)