def _build(self):
    """Tokenize every example in ``self.data`` and cache the pieces.

    For each row, the question/context and the table (header + rows) are
    tokenized with the TaBERT tokenizer and appended to ``self.context``
    and ``self.tabs``; the first answer string is encoded to fixed-length
    input ids (length 35) and appended to ``self.answers``.
    """
    for idx in tqdm(range(len(self.data))):
        qs = self.data.loc[idx, 'context']
        ans = self.data.loc[idx, 'answer']
        heads = self.data.loc[idx, 'header']
        tit = self.data.loc[idx, 'title']
        rs = self.data.loc[idx, 'rows']

        # Each header entry looks like (name, type, sample_value) — TODO confirm.
        col = [Column(z[0], z[1], sample_value=z[2]) for z in heads]
        table = Table(
            id=tit,
            header=col,
            data=rs,
        ).tokenize(self.model.tokenizer)
        self.tabs.append(table)
        self.context.append(self.model.tokenizer.tokenize(qs))

        # Only the first answer is used as the target.
        ans_tokenized = self.model.tokenizer.tokenize(str(ans[0]))
        # BUG FIX: the original called a bare `tokenizer`, which is not defined
        # in this scope; every other call here goes through self.model.tokenizer.
        encoded_dict = self.model.tokenizer.encode_plus(
            ans_tokenized,                 # sequence to encode
            add_special_tokens=True,       # add '[CLS]' and '[SEP]'
            truncation=True,
            max_length=35,                 # pad & truncate all sentences
            pad_to_max_length=True,
            return_attention_mask=True,    # construct attention masks
            return_tensors='pt',           # return PyTorch tensors
        )
        self.answers.append(encoded_dict['input_ids'])
def _build(self):
    """Tokenize questions and tables and collect column-selection labels.

    Unlike the answer-generation variant, the training target here is
    ``select_column``: for multi-class classification it is an integer
    class index in [0, 14]. Tokenized tables, tokenized questions and the
    labels are appended to ``self.tabs``, ``self.context`` and
    ``self.label`` respectively.
    """
    n_examples = len(self.data)
    for i in tqdm(range(n_examples)):
        question = self.data.loc[i, 'context']
        headers = self.data.loc[i, 'header']
        title = self.data.loc[i, 'title']
        rows = self.data.loc[i, 'rows']
        # Index of the column to select (0..14 for multi-class).
        target = self.data.loc[i, 'select_column']

        columns = [Column(h[0], h[1], sample_value=h[2]) for h in headers]
        tokenized_table = Table(
            id=title,
            header=columns,
            data=rows,
        ).tokenize(self.model.tokenizer)

        self.tabs.append(tokenized_table)
        self.context.append(self.model.tokenizer.tokenize(question))
        self.label.append(target)
def prepare(self, data_dir, data_type, query_tokenizer, table_tokenizer,
            max_query_length):
    """Build (query, positive-table, negative-table) training triples.

    Reads ``<data_dir>/<data_type>.jsonl``, tokenizes each unique query and
    each table, pairs every positive table with every negative table for
    the same query, and saves the resulting list to
    ``processed_folder/ids_file`` via ``torch.save``.

    Args:
        data_dir: directory holding the ``.jsonl`` split files.
        data_type: split name (file stem), e.g. ``"train"``.
        query_tokenizer: HuggingFace-style tokenizer with ``encode_plus``.
        table_tokenizer: tokenizer passed to ``Table.tokenize``.
        max_query_length: pad/truncate length for query encodings.

    Raises:
        RuntimeError: if either tokenizer is missing.
    """
    if self._check_exists():
        return

    processed_dir = Path(self.processed_folder)
    processed_dir.mkdir(exist_ok=True)

    if not (query_tokenizer and table_tokenizer):
        raise RuntimeError(
            'Tokenizers are not found.' +
            ' You must set query_tokenizer and table_tokenizer')

    print('Processing...')
    # Plain dict suffices here; the original factory-less defaultdict()
    # behaved identically (raises KeyError on missing keys).
    query_dict = {}
    pos_tables, neg_tables = defaultdict(list), defaultdict(list)
    data = []

    path = Path(data_dir) / f'{data_type}.jsonl'
    with open(path) as f:
        # Stream the file instead of materializing it with readlines().
        for line in f:
            if not line.strip():
                # NOTE(review): stops at the first blank line rather than
                # skipping it — confirm this is intended.
                break

            # Parse the basic per-example metadata.
            json_str = json.loads(line)
            query = json_str['query']
            qid = json_str['qid']
            rel = json_str['rel']

            if qid not in query_dict:
                # BERT inputs: input_ids, token_type_ids, attention_mask.
                query_dict[qid] = query_tokenizer.encode_plus(
                    query,
                    max_length=max_query_length,
                    padding='max_length',
                    truncation=True,
                    return_tensors="pt")

            # Parse the raw table JSON payload.
            raw_json = json.loads(json_str['table']['raw_json'])
            title = raw_json['pgTitle']
            secTitle = raw_json['secondTitle']
            row = raw_json['numDataRows']
            col = raw_json['numCols']
            caption = raw_json['caption']
            heading = raw_json['title']
            body = raw_json['data']

            if col == 0 or row == 0:
                continue  # skip degenerate (empty) tables

            column_rep = Table(
                id=title,
                header=[Column(h.strip(), 'text') for h in heading],
                data=body).tokenize(table_tokenizer)

            # TODO: the caption composition is an experiment knob —
            # compare alternative combinations.
            caption = " ".join(
                heading) + " " + title + " " + secTitle + " " + caption
            caption_rep = table_tokenizer.tokenize(caption)

            if rel == '0':
                neg_tables[qid].append((column_rep, caption_rep))
            else:
                pos_tables[qid].append((column_rep, caption_rep))

    # Cross every positive table with every negative table per query.
    for qid in query_dict:
        if not pos_tables[qid]:
            continue
        for t in itertools.product(pos_tables[qid], neg_tables[qid]):
            data.append([query_dict[qid]] +
                        list(itertools.chain.from_iterable(t)))

    # Save
    with open(os.path.join(processed_dir, self.ids_file), 'wb') as f:
        torch.save(data, f)
    print('Done!')
from table_bert import TableBertModel
from table_bert import Table, Column

# Load a pretrained TaBERT (base, K=1) checkpoint.
model = TableBertModel.from_pretrained(
    '/tabert/models/tabert_base_k1/model.bin',
)

# Build and tokenize an example table: countries ranked by GDP (PPP).
gdp_table = Table(
    id='List of countries by GDP (PPP)',
    header=[
        Column('Nation', 'text', sample_value='United States'),
        Column('Gross Domestic Product', 'real', sample_value='21,439,453'),
    ],
    data=[
        ['United States', '21,439,453'],
        ['China', '27,308,857'],
        ['European Union', '22,774,165'],
    ],
).tokenize(model.tokenizer)

# To visualize table in an IPython notebook:
# display(gdp_table.to_data_frame(), detokenize=True)

question = 'show me countries ranked by GDP'

# model takes batched, tokenized inputs
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[model.tokenizer.tokenize(question)],
    tables=[gdp_table],
)

print(context_encoding.shape)
print(column_encoding.shape)
def prepare(self, data_dir, data_type, query_tokenizer, table_tokenizer,
            max_query_length):
    """Tokenize queries and tables and cache them as two torch files.

    Reads ``<data_dir>/<data_type>.jsonl`` and saves:

    * ``processed_folder/query_file`` — list of ``(qid, encoded_query)``
      pairs;
    * ``processed_folder/table_file`` — ``(pos_tables, neg_tables)``, each
      a dict keyed by qid whose values are lists of
      ``(tableId, column_rep, caption_rep)`` triples.

    Args:
        data_dir: directory holding the ``.jsonl`` split files.
        data_type: split name (file stem), e.g. ``"train"``.
        query_tokenizer: HuggingFace-style tokenizer with ``encode_plus``.
        table_tokenizer: tokenizer passed to ``Table.tokenize``.
        max_query_length: pad/truncate length for query encodings.

    Raises:
        RuntimeError: if either tokenizer is missing.
    """
    if self._check_exists():
        return

    processed_dir = Path(self.processed_folder)
    processed_dir.mkdir(exist_ok=True)

    if not (query_tokenizer and table_tokenizer):
        raise RuntimeError(
            'Tokenizers are not found.' +
            ' You must set query_tokenizer and table_tokenizer')

    print('Processing...')
    # Plain dict suffices; the original factory-less defaultdict()
    # behaved identically (raises KeyError on missing keys).
    query_dict = {}
    pos_tables, neg_tables = defaultdict(list), defaultdict(list)

    path = Path(data_dir) / f'{data_type}.jsonl'
    with open(path) as f:
        # Stream the file instead of materializing it with readlines().
        for line in f:
            if not line.strip():
                # NOTE(review): stops at the first blank line rather than
                # skipping it — confirm this is intended.
                break

            # Parse the basic per-example metadata.
            json_str = json.loads(line)
            tableId = json_str['docid']
            query = json_str['query']
            qid = json_str['qid']
            rel = json_str['rel']

            if qid not in query_dict:
                query_dict[qid] = query_tokenizer.encode_plus(
                    query,
                    max_length=max_query_length,
                    padding='max_length',
                    truncation=True,
                    return_tensors="pt")

            # Parse the raw table JSON payload.
            raw_json = json.loads(json_str['table']['raw_json'])
            title = raw_json['pgTitle']
            row = raw_json['numDataRows']
            col = raw_json['numCols']
            caption = raw_json['caption']
            heading = raw_json['title']
            body = raw_json['data']

            if col == 0 or row == 0:
                continue  # skip degenerate (empty) tables

            column_rep = Table(
                id=title,
                header=[Column(h.strip(), 'text') for h in heading],
                data=body).tokenize(table_tokenizer)
            caption_rep = table_tokenizer.tokenize(caption)

            if rel == '0':
                neg_tables[qid].append((tableId, column_rep, caption_rep))
            else:
                pos_tables[qid].append((tableId, column_rep, caption_rep))

    # list(d.items()) replaces the identity comprehension [(k, v) ...].
    queries = list(query_dict.items())
    tables = (pos_tables, neg_tables)
    with open(os.path.join(processed_dir, self.query_file), 'wb') as f:
        torch.save(queries, f)
    with open(os.path.join(processed_dir, self.table_file), 'wb') as f:
        torch.save(tables, f)
    print('Done!')
from table_bert import TableBertModel from table_bert import Table, Column import torch model = TableBertModel.from_pretrained( '/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k3/model.bin', ) table = Table(id='List of countries by GDP (PPP)', header=[ Column('Nation', 'text', sample_value='United States'), Column('Gross Domestic Product', 'real', sample_value='21,439,453') ], data=[ ['United States', '21,439,453'], ['China', '27,308,857'], ['European Union', '22,774,165'], ]).tokenize(model.tokenizer) table2 = Table(id='List of countries by GDP (PPP)', header=[ Column('Nation', 'text', sample_value='United States'), Column('Gross Domestic Product', 'real', sample_value='21,439,453'), Column('Continent', 'text', sample_value='North America') ], data=[