Example 1
    def _build(self):
        for idx in tqdm(range(len(self.data))):
            qs = self.data.loc[idx, 'context']
            ans = self.data.loc[idx, 'answer']
            heads = self.data.loc[idx, 'header']
            tit = self.data.loc[idx, 'title']
            rs = self.data.loc[idx, 'rows']

            # Build the TaBERT table: each header entry is (name, type, sample_value)
            col = [Column(z[0], z[1], sample_value=z[2]) for z in heads]
            table = Table(
                id=tit,
                header=col,
                data=rs
            ).tokenize(self.model.tokenizer)
            self.tabs.append(table)

            self.context.append(self.model.tokenizer.tokenize(qs))

            # Encode the first answer string to fixed-length input ids
            ans_tokenized = self.model.tokenizer.tokenize(str(ans[0]))
            encoded_dict = self.model.tokenizer.encode_plus(
                ans_tokenized,
                add_special_tokens=True,    # add '[CLS]' and '[SEP]'
                truncation=True,
                max_length=35,              # pad & truncate all answers to 35 tokens
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',        # return PyTorch tensors
            )
            self.answers.append(encoded_dict['input_ids'])
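A quick sanity check after _build has run is to decode one stored answer back to text. A minimal sketch, assuming a built instance ds of this dataset class and a HuggingFace-style tokenizer on ds.model (both names are assumptions, not part of the snippet above):

# ds is a hypothetical instance of the dataset class above, after ds._build()
ids = ds.answers[0].squeeze(0)  # (35,) tensor of token ids for the first answer
text = ds.model.tokenizer.decode(ids, skip_special_tokens=True)
print('decoded answer:', text)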
Example 2
    def _build(self):
        for idx in tqdm(range(len(self.data))):
            qs = self.data.loc[idx, 'context']
            heads = self.data.loc[idx, 'header']
            tit = self.data.loc[idx, 'title']
            rs = self.data.loc[idx, 'rows']
            # jz: with multi-class classification this is an integer in [0, 14]
            label = self.data.loc[idx, 'select_column']

            col = [Column(z[0], z[1], sample_value=z[2]) for z in heads]

            table = Table(id=tit, header=col,
                          data=rs).tokenize(self.model.tokenizer)

            self.tabs.append(table)
            self.context.append(self.model.tokenizer.tokenize(qs))
            self.label.append(label)  # jz
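Since the label is the index of the column to select, a small classification head over TaBERT's per-column encodings fits downstream. A minimal sketch, assuming a 768-dimensional hidden size (BERT-base) and at most 15 candidate columns; both are assumptions, not part of the snippet above:

import torch.nn as nn

class ColumnSelector(nn.Module):
    # Scores every column from its TaBERT encoding; argmax picks the column.
    def __init__(self, hidden_size=768):  # 768 assumed from a BERT-base backbone
        super().__init__()
        self.scorer = nn.Linear(hidden_size, 1)

    def forward(self, column_encoding):
        # column_encoding: (batch, num_columns, hidden_size) from model.encode()
        return self.scorer(column_encoding).squeeze(-1)  # (batch, num_columns)

# Training target: the stored labels, integers in [0, 14], e.g.
# loss = nn.CrossEntropyLoss()(head(column_encoding), labels)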
Example 3
    def prepare(self, data_dir, data_type, query_tokenizer, table_tokenizer,
                max_query_length):
        if self._check_exists():
            return

        processed_dir = Path(self.processed_folder)
        processed_dir.mkdir(exist_ok=True)
        if not (query_tokenizer and table_tokenizer):
            raise RuntimeError(
                'Tokenizers are not found.' +
                ' You must set query_tokenizer and table_tokenizer')
        print('Processing...')

        query_dict = {}
        pos_tables, neg_tables = defaultdict(list), defaultdict(list)
        data = []
        path = Path(data_dir + '/' + data_type + '.jsonl')

        with open(path) as f:
            for line in f.readlines():
                if not line.strip():
                    break
                # Parse the table's basic metadata
                jsonStr = json.loads(line)
                tableId = jsonStr['docid']
                query = jsonStr['query']
                qid = jsonStr['qid']
                rel = jsonStr['rel']

                if qid not in query_dict:
                    query_tokenized = query_tokenizer.encode_plus(
                        query,
                        max_length=max_query_length,
                        padding='max_length',
                        truncation=True,
                        return_tensors="pt")
                    query_dict[qid] = query_tokenized  # BERT inputs: input_ids, token_type_ids, attention_mask

                # Parse the table's raw JSON
                raw_json = json.loads(jsonStr['table']['raw_json'])
                title = raw_json['pgTitle']
                secTitle = raw_json['secondTitle']
                hRow = raw_json['numHeaderRows']
                row = raw_json['numDataRows']
                col = raw_json['numCols']
                caption = raw_json['caption']
                heading = raw_json['title']
                body = raw_json['data']

                if col == 0 or row == 0:
                    continue

                column_rep = Table(
                    id=title,
                    header=[Column(h.strip(), 'text') for h in heading],
                    data=body).tokenize(table_tokenizer)
                # TODO: the caption string can be composed in different ways; worth a comparison experiment
                caption = " ".join(
                    heading) + " " + title + " " + secTitle + " " + caption
                caption_rep = table_tokenizer.tokenize(caption)

                if rel == '0':
                    neg_tables[qid].append((column_rep, caption_rep))
                else:
                    pos_tables[qid].append((column_rep, caption_rep))

        for qid in query_dict:
            if not pos_tables[qid]:
                continue

            for t in itertools.product(pos_tables[qid], neg_tables[qid]):
                data.append([query_dict[qid]] +
                            list(itertools.chain.from_iterable(t)))

        # Save
        with open(os.path.join(processed_dir, self.ids_file), 'wb') as f:
            torch.save(data, f)
        print('Done!')
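The saved list can be read back with torch.load when building a DataLoader. A minimal sketch (the path is illustrative; self.ids_file above determines the real name):

import torch

# Each item is [query_encoding, pos_column_rep, pos_caption_rep,
#               neg_column_rep, neg_caption_rep], as assembled above.
data = torch.load('processed/train.pt')  # illustrative path
print(len(data), 'positive/negative training pairs')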
Example 4
from table_bert import TableBertModel

model = TableBertModel.from_pretrained(
    '/tabert/models/tabert_base_k1/model.bin')

from table_bert import Table, Column

table = Table(id='List of countries by GDP (PPP)',
              header=[
                  Column('Nation', 'text', sample_value='United States'),
                  Column('Gross Domestic Product',
                         'real',
                         sample_value='21,439,453')
              ],
              data=[
                  ['United States', '21,439,453'],
                  ['China', '27,308,857'],
                  ['European Union', '22,774,165'],
              ]).tokenize(model.tokenizer)

# To visualize table in an IPython notebook:
# display(table.to_data_frame(), detokenize=True)

context = 'show me countries ranked by GDP'

# model takes batched, tokenized inputs
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[model.tokenizer.tokenize(context)], tables=[table])

print(context_encoding.shape)
print(column_encoding.shape)
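Both encodings are batched tensors: context_encoding is (batch, context_length, hidden_size) and column_encoding is (batch, num_columns, hidden_size). One way to use them for column scoring; the mean-pooling and dot-product choices here are illustrative assumptions, not part of TaBERT's API:

import torch

# Mean-pool context tokens into one query vector (illustrative choice).
query_vec = context_encoding.mean(dim=1)               # (batch, hidden)
# Dot-product each column encoding against the query vector.
scores = torch.einsum('bh,bch->bc', query_vec, column_encoding)
print(scores.argmax(dim=-1))                           # best-scoring column per example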
Example 5
    def prepare(self, data_dir, data_type, query_tokenizer, table_tokenizer,
                max_query_length):
        if self._check_exists():
            return

        processed_dir = Path(self.processed_folder)
        processed_dir.mkdir(exist_ok=True)

        if not (query_tokenizer and table_tokenizer):
            raise RuntimeError(
                'Tokenizers are not found.' +
                ' You must set query_tokenizer and table_tokenizer')

        print('Processing...')
        query_dict = {}
        pos_tables, neg_tables = defaultdict(list), defaultdict(list)

        path = Path(data_dir + '/' + data_type + '.jsonl')
        with open(path) as f:
            for line in f.readlines():
                if not line.strip():
                    break

                # Parse the table's basic metadata
                jsonStr = json.loads(line)
                tableId = jsonStr['docid']
                query = jsonStr['query']
                qid = jsonStr['qid']
                rel = jsonStr['rel']

                if qid not in query_dict:
                    query_tokenized = query_tokenizer.encode_plus(
                        query,
                        max_length=max_query_length,
                        padding='max_length',
                        truncation=True,
                        return_tensors="pt")
                    query_dict[qid] = query_tokenized

                # Parse the table's raw JSON
                raw_json = json.loads(jsonStr['table']['raw_json'])
                title = raw_json['pgTitle']
                secTitle = raw_json['secondTitle']
                hRow = raw_json['numHeaderRows']
                row = raw_json['numDataRows']
                col = raw_json['numCols']
                caption = raw_json['caption']
                heading = raw_json['title']
                body = raw_json['data']

                if col == 0 or row == 0:
                    continue

                column_rep = Table(
                    id=title,
                    header=[Column(h.strip(), 'text') for h in heading],
                    data=body).tokenize(table_tokenizer)
                caption_rep = table_tokenizer.tokenize(caption)
                if rel == '0':
                    neg_tables[qid].append((tableId, column_rep, caption_rep))
                else:
                    pos_tables[qid].append((tableId, column_rep, caption_rep))

        queries = list(query_dict.items())
        tables = (pos_tables, neg_tables)
        with open(os.path.join(processed_dir, self.query_file), 'wb') as f:
            torch.save(queries, f)
        with open(os.path.join(processed_dir, self.table_file), 'wb') as f:
            torch.save(tables, f)

        print('Done!')
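Unlike the previous variant, this one saves queries and tables separately, so the positive/negative pairs can be rebuilt at load time. A minimal sketch with illustrative paths:

import itertools
import torch

queries = torch.load('processed/queries.pt')           # list of (qid, encoded_query)
pos_tables, neg_tables = torch.load('processed/tables.pt')

data = []
for qid, q in queries:
    # Pair every relevant table with every irrelevant one, as in Example 3.
    for pos, neg in itertools.product(pos_tables[qid], neg_tables[qid]):
        data.append((q, pos, neg))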
Example 6
from table_bert import TableBertModel
from table_bert import Table, Column

import torch

model = TableBertModel.from_pretrained(
    '/Users/mac/Desktop/syt/Deep-Learning/Repos/TaBERT/pretrained-models/tabert_base_k3/model.bin',
)

table = Table(id='List of countries by GDP (PPP)',
              header=[
                  Column('Nation', 'text', sample_value='United States'),
                  Column('Gross Domestic Product',
                         'real',
                         sample_value='21,439,453')
              ],
              data=[
                  ['United States', '21,439,453'],
                  ['China', '27,308,857'],
                  ['European Union', '22,774,165'],
              ]).tokenize(model.tokenizer)

table2 = Table(id='List of countries by GDP (PPP)',
               header=[
                   Column('Nation', 'text', sample_value='United States'),
                   Column('Gross Domestic Product',
                          'real',
                          sample_value='21,439,453'),
                   Column('Continent', 'text', sample_value='North America')
               ],
               data=[
                   ['United States', '21,439,453', 'North America'],
                   ['China', '27,308,857', 'Asia'],
                   ['European Union', '22,774,165', 'Europe'],
               ]).tokenize(model.tokenizer)
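Tables with different column counts can then be encoded together; column_encoding is padded to the widest table in the batch (the padding behavior is my reading of TaBERT's batching, so treat it as an assumption):

context = 'show me countries ranked by GDP'
context_encoding, column_encoding, info_dict = model.encode(
    contexts=[model.tokenizer.tokenize(context)] * 2,
    tables=[table, table2])
print(column_encoding.shape)  # (2, max_columns, hidden_size)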