@classmethod
def parse(cls, header, content, nlp_model, caption=None) -> 'Table':
    columns = []
    sampled_values = []
    for col_id, col_name in enumerate(header):
        # Use the first non-empty cell in the column as its sample value.
        sample_value = None
        for row in content:
            cell_val = row[col_id]
            if len(cell_val.strip()) > 0:
                sample_value = cell_val
                break

        assert sample_value is not None
        sampled_values.append(sample_value)

    # Annotate all sample values in a single batch for efficiency.
    parsed_values = nlp_model.pipe(sampled_values)
    for col_id, sampled_value_annot in enumerate(parsed_values):
        tokenized_value = [token.text for token in sampled_value_annot]
        ner_tags = [token.ent_type_ for token in sampled_value_annot]
        pos_tags = [token.pos_ for token in sampled_value_annot]  # extracted but not stored below

        sample_value_entry = {
            'value': sampled_value_annot.text,
            'tokens': tokenized_value,
            'ner_tags': ner_tags
        }

        col_name = header[col_id]
        col_type = data_utils.infer_column_type_from_sampled_value(sample_value_entry)

        columns.append(Column(col_name, col_type, sample_value=sample_value_entry))

    return cls(columns, content, caption=caption)
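# Usage sketch, not part of the original source: `parse` expects a spaCy
# pipeline for `nlp_model`, since it calls `.pipe()` and reads `ent_type_`
# and `pos_` off the resulting tokens. The model name and table contents
# below are illustrative only.
import spacy

nlp = spacy.load('en_core_web_sm')
table = Table.parse(
    header=['Player', 'Goals'],
    content=[
        ['Lionel Messi', '91'],
        ['', '69']  # empty cells are skipped when picking sample values
    ],
    nlp_model=nlp,
    caption='Goals scored in 2012'
)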
    del instance['tokens']
    del instance['masked_lm_labels']
    del instance['info']


if __name__ == '__main__':
    # Smoke test: build an oversized table and check that `get_row_input`
    # trims it down to the model's length budget.
    config = TableBertConfig()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_formatter = VanillaTableBertInputFormatter(config, tokenizer)

    header = []
    for i in range(1000):
        header.append(
            Column(
                name='test',
                type='text',
                name_tokens=['test'] * 3,
                sample_value='ha ha ha yay',
                sample_value_tokens=['ha', 'ha', 'ha', 'yay']
            )
        )

    print(
        input_formatter.get_row_input(
            context='12 213 5 345 23 234'.split(),
            header=header,
            row_data=[col.sample_value_tokens for col in header],
            trim_long_table=True
        )
    )
@classmethod
def from_dict(cls, entry: Dict, tokenizer: Optional[BertTokenizer], suffix) -> 'Example':
    # `suffix` is accepted but currently unused.
    def _get_data_source():
        return 'wiki' if 'wiki' in entry['uuid'] else 'common_crawl'

    source = _get_data_source()

    header_entry = entry['header'] if source == 'wiki' else entry['table']['header']
    header = []
    column_data = []
    for col in header_entry:
        sample_value = col['sample_value']['value']
        if tokenizer:
            name_tokens = tokenizer.tokenize(col['name'])
        else:
            name_tokens = None

        column = Column(col['name'], col['type'], sample_value, name_tokens=name_tokens)
        header.append(column)

    if source == 'wiki':
        # Wiki tables store each cell as a (tag, value) pair; row 0 is the header row.
        for row in entry['data'][1:]:
            for col_id, (tag, cell_val) in enumerate(row):
                if col_id >= len(column_data):
                    column_data.append([])
                column_data[col_id].append(cell_val)
    else:
        for row in entry['table']['rows']:
            for col_id, cell_val in enumerate(row):
                if col_id >= len(column_data):
                    column_data.append([])
                column_data[col_id].append(cell_val)

    context_before = []
    context_after = []

    if source == 'wiki':
        # Wiki context is a list of paragraphs, each a list of sentences.
        for para in entry['context_before']:
            for sent in para:
                if tokenizer:
                    sent = tokenizer.tokenize(sent)
                context_before.append(sent)

        caption = entry['caption']
        if caption:
            if tokenizer:
                caption = tokenizer.tokenize(entry['caption'])
            context_before.append(caption)
    else:
        for sent in entry['context_before']:
            if tokenizer:
                sent = tokenizer.tokenize(sent)
            context_before.append(sent)

        for sent in entry['context_after']:
            if tokenizer:
                sent = tokenizer.tokenize(sent)
            context_after.append(sent)

    uuid = entry['uuid']

    return cls(uuid, header, [context_before, context_after],
               column_data=column_data,
               source=source)
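# Illustrative input for `Example.from_dict`, reconstructed from the field
# accesses above (all values are made up): a 'wiki' entry keeps its header
# and rows at the top level, stores each cell as a (tag, value) pair, and
# treats row 0 as the header row; common-crawl entries nest these under
# entry['table'] instead.
wiki_entry = {
    'uuid': 'wiki_0001',
    'header': [
        {'name': 'Player', 'type': 'text',
         'sample_value': {'value': 'Lionel Messi'}}
    ],
    'data': [
        [('th', 'Player')],        # header row, skipped by the [1:] slice
        [('td', 'Lionel Messi')]
    ],
    'context_before': [['The table lists top scorers.']],
    'caption': 'Top scorers'
}
example = Example.from_dict(wiki_entry, tokenizer=None, suffix=None)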
@classmethod
def from_serialized(cls, data) -> 'Example':
    # Rebuild Column objects from their serialized dict form.
    header = [Column(**x) for x in data['header']]
    data['header'] = header
    return cls(**data)
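# Hypothetical round trip for `from_serialized`: the non-header keys must
# match `Example.__init__`'s parameter names, which are assumed here; only
# the 'header' handling is taken from the method above.
example = Example.from_serialized({
    'uuid': 'wiki_0001',
    'header': [{'name': 'Player', 'type': 'text', 'sample_value': 'Lionel Messi'}],
    'context': [[], []],          # assumed key for [context_before, context_after]
    'column_data': [['Lionel Messi']],
    'source': 'wiki'
})
assert isinstance(example.header[0], Column)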
def get_table_bert_input_from_context(
    env_context: List[Dict],
    bert_model: TableBertModel,
    is_training: bool,
    **kwargs
) -> Tuple[List[Any], List[Table]]:
    contexts = []
    tables = []

    content_snapshot_strategy = kwargs.get('content_snapshot_strategy', None)
    if content_snapshot_strategy:
        assert content_snapshot_strategy in ('sampled_rows', 'synthetic_row')

    for e in env_context:
        contexts.append(e['question_tokens'])

        if model_use_vertical_attention(bert_model):
            # Vertical-attention models consume multiple rows, so sample
            # `sample_row_num` rows instead of a single content snapshot.
            sample_row_num = bert_model.config.sample_row_num

            if content_snapshot_strategy == 'sampled_rows':
                if 'sampled_rows' not in e:
                    sampled_rows = get_question_biased_sampled_rows(
                        e['question_tokens'], e['table'],
                        num_rows=sample_row_num
                    )
                    e['sampled_rows'] = sampled_rows

                sampled_rows = e['sampled_rows']
            else:
                if is_training:
                    # Randomly sample rows, keeping their original order.
                    sampled_rows = [
                        e['table'].data[idx]
                        for idx in sorted(
                            np.random.choice(
                                list(range(len(e['table']))),
                                replace=False,
                                size=sample_row_num
                            )
                        )
                    ]
                else:
                    sampled_rows = e['table'].data[:sample_row_num]

            table = e['table'].with_rows(sampled_rows)
        else:
            table = e['table']

            if content_snapshot_strategy:
                if 'sampled_rows' not in e:
                    if content_snapshot_strategy == 'sampled_rows':
                        # Pick the single row that best matches the question.
                        sampled_rows = get_question_biased_sampled_rows(
                            e['question_tokens'], e['table'],
                            num_rows=1
                        )
                        e['sampled_rows'] = sampled_rows
                    elif content_snapshot_strategy == 'synthetic_row':
                        # Assemble a synthetic row from the question-relevant
                        # cell of each column.
                        sampled_cells = get_question_biased_sampled_cells(
                            e['question_tokens'], e['table']
                        )
                        e['sampled_rows'] = [sampled_cells]

                sampled_row = e['sampled_rows'][0]

                new_header = []
                for idx, column in enumerate(e['table'].header):
                    cell_value = (sampled_row[idx]
                                  if isinstance(sampled_row, list)
                                  else sampled_row[column.name])

                    new_column = Column(
                        name=column.name,
                        name_tokens=column.name_tokens,
                        type=column.type,
                        sample_value=cell_value,
                        sample_value_tokens=cell_value
                    )
                    new_header.append(new_column)

                # Replace the table with a one-row view whose sample values
                # come from the content snapshot.
                table = Table(
                    id=table.id,
                    header=new_header,
                    data=[{column.name: column.sample_value_tokens
                           for column in new_header}]
                )

        tables.append(table)

    return contexts, tables
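# Usage sketch (illustrative; the environment-context layout follows the
# accesses above, and `model` / `my_table` stand in for a loaded
# `TableBertModel` and a `Table` built elsewhere):
contexts, tables = get_table_bert_input_from_context(
    env_context=[{
        'question_tokens': 'who scored the most goals ?'.split(),
        'table': my_table
    }],
    bert_model=model,
    is_training=False,
    content_snapshot_strategy='sampled_rows'  # or 'synthetic_row'
)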