def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    # Query encoder (BERT) and table encoder (TaBERT)
    self.Qmodel = BertModel.from_pretrained(self.hparams.bert_path)
    self.Tmodel = TableBertModel.from_pretrained(self.hparams.tabert_path)
    # Pairwise ranking loss over scores of (relevant, irrelevant) tables
    self.criterion = nn.MarginRankingLoss(margin=1)
    # Pools the sequence dimension of a (..., seq_len, 768) encoding down to (..., 1, 768)
    self.avg_pooler = nn.AdaptiveAvgPool2d([1, 768])
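# A minimal, self-contained sketch of how nn.MarginRankingLoss pairs two score tensors.
# pos_score / neg_score are hypothetical stand-ins for the similarity scores of a relevant
# and an irrelevant query-table pair; this is not the author's training loop.
import torch
from torch import nn

pos_score = torch.randn(4)     # scores for relevant pairs
neg_score = torch.randn(4)     # scores for irrelevant pairs
target = torch.ones(4)         # +1 means pos_score should exceed neg_score by the margin
loss = nn.MarginRankingLoss(margin=1)(pos_score, neg_score, target)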
def __init__(self, hparams):
    super(TaBERTTuner, self).__init__()
    self.hparams = hparams
    self.model = TableBertModel.from_pretrained('tabert_base_k1/model.bin')
    # Multi-class classification head: the column encoding goes
    # (bs, 15, 768) -> (bs, 768) -> (bs, 500) -> (bs, 15); the label is (bs, 1).
    # First layer: plain column encoding, or column encoding concatenated with context
    self.l1 = nn.Linear(768, 500)
    self.l1_cat = nn.Linear(1536, 500)
    # Second layer projects to the class logits
    self.l2 = nn.Linear(500, max_len)
    # Softmax and loss
    self.sm = nn.Softmax(dim=1)
    self.l = nn.CrossEntropyLoss()
    # Attention applied to the column encodings only: a weighted sum over columns.
    # Weights: (bs, 15, 768) -> (bs, 15, 768) -> (bs, 15, 1) -> (bs, 15);
    # weight * column_encoding: (bs, 1, 15) @ (bs, 15, 768) -> (bs, 1, 768)
    self.lin_bias = nn.Linear(768, 768)
    self.att_weight = nn.Parameter(torch.rand(768, 1))
    self.sm_att = nn.Softmax(dim=1)
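# A minimal sketch of the attention pooling described in the comments above, written as a
# standalone function with freshly created modules; in the model these would be
# self.lin_bias, self.att_weight and self.sm_att. This is a hypothetical reconstruction of
# the shapes in the comments, not the author's forward pass.
import torch
from torch import nn

def attention_pool(column_encoding, lin_bias, att_weight, sm_att):
    scores = lin_bias(column_encoding)                          # (bs, 15, 768)
    scores = scores.matmul(att_weight).squeeze(-1)              # (bs, 15)
    weights = sm_att(scores)                                    # softmax over the 15 columns
    pooled = torch.bmm(weights.unsqueeze(1), column_encoding)   # (bs, 1, 768)
    return pooled.squeeze(1)                                    # (bs, 768)

pooled = attention_pool(torch.randn(2, 15, 768),
                        nn.Linear(768, 768),
                        nn.Parameter(torch.rand(768, 1)),
                        nn.Softmax(dim=1))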
def __init__(self, params):
    super().__init__()
    self.data_dir = params.data_dir
    # Separate tokenizers for the query (BERT) and the table (TaBERT)
    self.query_tokenizer = BertTokenizer.from_pretrained(params.bert_path)
    table_model = TableBertModel.from_pretrained(params.tabert_path)
    self.table_tokenizer = table_model.tokenizer
    self.train_batch_size = params.train_batch_size
    self.valid_batch_size = params.valid_batch_size
    if hasattr(params, 'test_batch_size'):
        self.test_batch_size = params.test_batch_size
def __init__(self, hparams):
    super(TaBERTTuner, self).__init__()
    self.hparams = hparams
    self.model = TableBertModel.from_pretrained('tabert_base_k1/model.bin')
    # First layer: plain column encoding, or column encoding concatenated with context
    self.l1 = nn.Linear(768, 500)
    self.l1_cat = nn.Linear(1536, 500)
    # Second layer: per-column binary logits
    self.l2 = nn.Linear(500, 2)
    # Softmax over the class dimension
    # self.sm = nn.LogSoftmax(dim=1)
    self.sm = nn.Softmax(dim=2)
    # Loss: class weights compensate for label imbalance; label index 2 marks
    # padded columns and is ignored
    weight_try = torch.FloatTensor([1, 0.01])
    # weight_try = torch.FloatTensor([1, 0.167])
    self.l = nn.CrossEntropyLoss(ignore_index=2, weight=weight_try)
    # self.l = nn.CrossEntropyLoss(ignore_index=2)
    self.l = self.l.to('cuda')
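# A minimal sketch of how such a weighted CrossEntropyLoss can be applied per column
# (hypothetical shapes: batch of 4, 15 columns; label 2 marks padded columns and is
# ignored; this is not the author's training step).
import torch
from torch import nn

criterion = nn.CrossEntropyLoss(ignore_index=2, weight=torch.FloatTensor([1, 0.01]))
logits = torch.randn(4, 15, 2)           # (bs, num_columns, 2 classes), e.g. output of the second layer
labels = torch.randint(0, 3, (4, 15))    # 0/1 per column, 2 for padding
loss = criterion(logits.permute(0, 2, 1), labels)   # CrossEntropyLoss expects (bs, num_classes, num_columns)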
input_ids.append(q["input_ids"].squeeze())
token_type_ids.append(q["token_type_ids"].squeeze())
attention_mask.append(q["attention_mask"].squeeze())
query = {
    "input_ids": torch.stack(input_ids),
    "token_type_ids": torch.stack(token_type_ids),
    "attention_mask": torch.stack(attention_mask)
}
return query, column, caption, rel, qid, tid


if __name__ == "__main__":
    query_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    table_model = TableBertModel.from_pretrained('model/tabert_base_k3/model.bin')
    table_tokenizer = table_model.tokenizer

    dataset = QueryTableDataset(
        data_dir='data/1',
        data_type='train',
        query_tokenizer=query_tokenizer,
        table_tokenizer=table_tokenizer,
        prepare=True,
    )
    dataloader = DataLoader(dataset, batch_size=2, collate_fn=query_table_collate_fn)
    for _ in range(1):
        for d in dataloader:
            ...
from table_bert import TableBertModel

model = TableBertModel.from_pretrained(
    '/tabert/models/tabert_base_k1/model.bin',
)

from table_bert import Table, Column

table = Table(
    id='List of countries by GDP (PPP)',
    header=[
        Column('Nation', 'text', sample_value='United States'),
        Column('Gross Domestic Product', 'real', sample_value='21,439,453')
    ],
    data=[
        ['United States', '21,439,453'],
        ['China', '27,308,857'],
        ['European Union', '22,774,165'],
    ]
).tokenize(model.tokenizer)

# To visualize the table in an IPython notebook:
# display(table.to_data_frame(), detokenize=True)

context = 'show me countries ranked by GDP'

# The model takes batched, tokenized inputs
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[model.tokenizer.tokenize(context)],
    tables=[table]
)

print(context_encoding.shape)
print(column_encoding.shape)
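# For tabert_base_k1 the printed shapes should be roughly (1, num_context_tokens, 768)
# for context_encoding and (1, 2, 768) for column_encoding (one 768-d vector per column);
# the exact context length depends on how the tokenizer splits the sentence.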
from table_bert import TableBertModel
from table_bert import Table, Column
import torch

model = TableBertModel.from_pretrained(
    '/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k3/model.bin',
)

table = Table(
    id='List of countries by GDP (PPP)',
    header=[
        Column('Nation', 'text', sample_value='United States'),
        Column('Gross Domestic Product', 'real', sample_value='21,439,453')
    ],
    data=[
        ['United States', '21,439,453'],
        ['China', '27,308,857'],
        ['European Union', '22,774,165'],
    ]
).tokenize(model.tokenizer)

table2 = Table(
    id='List of countries by GDP (PPP)',
    header=[
        Column('Nation', 'text', sample_value='United States'),
        Column('Gross Domestic Product', 'real', sample_value='21,439,453'),
        Column('Continent', 'text', sample_value='North America')
    ],
    data=[
def __init__(self, hparams, decoder, enc_hid_dim, dec_hid_dim):
    super(TaBERTTuner, self).__init__()
    self.hparams = hparams
    # TaBERT encoder; a linear layer bridges the encoder and decoder hidden sizes
    self.encoder = TableBertModel.from_pretrained('bert-base-uncased')
    self.hidden = nn.Linear(enc_hid_dim, dec_hid_dim)
    self.decoder = decoder
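# A minimal sketch of the encoder-to-decoder bridge above, with hypothetical dimensions
# and tensors; mean pooling over the context tokens and the tanh are assumptions, the
# actual model may initialize the decoder differently.
import torch
from torch import nn

enc_hid_dim, dec_hid_dim = 768, 512
hidden = nn.Linear(enc_hid_dim, dec_hid_dim)
context_encoding = torch.randn(2, 10, enc_hid_dim)             # stand-in for encoder.encode(...) output
dec_init = torch.tanh(hidden(context_encoding.mean(dim=1)))    # (bs, dec_hid_dim) initial decoder state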
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    # Query encoder (BERT) and table encoder (TaBERT), followed by layer normalization
    self.Qmodel = BertModel.from_pretrained(self.hparams.bert_path)
    self.Tmodel = TableBertModel.from_pretrained(self.hparams.tabert_path)
    self.norm = nn.LayerNorm(768)