Example #1
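A classmethod that builds a Table from a raw header and cell grid: it picks the first non-empty value in each column as a sample, annotates all samples in one batch with an NLP pipeline (the token attributes suggest spaCy), and infers each column's type from the annotated sample.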
    @classmethod
    def parse(cls, header, content, nlp_model, caption=None) -> 'Table':
        columns = []
        sampled_values = []
        # Pick the first non-empty cell of each column as its sample value.
        for col_id, col_name in enumerate(header):
            sample_value = None
            for row in content:
                cell_val = row[col_id]
                if len(cell_val.strip()) > 0:
                    sample_value = cell_val
                    break

            assert sample_value is not None
            sampled_values.append(sample_value)

        # Annotate all sample values in a single batch (spaCy-style pipeline).
        parsed_values = nlp_model.pipe(sampled_values)
        for col_id, sampled_value_annot in enumerate(parsed_values):
            tokenized_value = [token.text for token in sampled_value_annot]
            ner_tags = [token.ent_type_ for token in sampled_value_annot]
            pos_tags = [token.pos_ for token in sampled_value_annot]  # currently unused

            sample_value_entry = {
                'value': sampled_value_annot.text,
                'tokens': tokenized_value,
                'ner_tags': ner_tags
            }

            col_name = header[col_id]
            # Infer the column type (e.g. text vs. real) from the annotated sample.
            col_type = data_utils.infer_column_type_from_sampled_value(sample_value_entry)

            columns.append(Column(col_name, col_type, sample_value=sample_value_entry))

        return cls(columns, content, caption=caption)
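A minimal usage sketch for the method above, assuming a spaCy pipeline; the model name, header, and cell values are illustrative, not from the original example.

import spacy

# Hypothetical inputs: a spaCy pipeline plus a small header/content grid.
nlp_model = spacy.load('en_core_web_sm')
header = ['Player', 'Country', 'Points']
content = [
    ['Nadal', 'Spain', '9850'],
    ['Federer', 'Switzerland', '9125'],
]
table = Table.parse(header, content, nlp_model, caption='ATP rankings')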
Example #2
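A __main__ smoke test for the vanilla input formatter: it builds an intentionally oversized header of 1,000 identical columns and prints the formatted row input, exercising the trim_long_table truncation path.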
        # Tail of a helper (cut off by this excerpt) that presumably strips
        # bulky fields from a serialized training instance:
        del instance['tokens']
        del instance['masked_lm_labels']
        del instance['info']


# Assumed imports for the test below (module paths follow the TaBERT
# source layout and may differ in your checkout):
# from table_bert.config import TableBertConfig
# from table_bert.input_formatter import VanillaTableBertInputFormatter
# from table_bert.table import Column
# from transformers import BertTokenizer

if __name__ == '__main__':
    config = TableBertConfig()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_formatter = VanillaTableBertInputFormatter(config, tokenizer)

    # Build 1,000 identical dummy columns to force truncation.
    header = []
    for _ in range(1000):
        header.append(
            Column(
                name='test',
                type='text',
                name_tokens=['test'] * 3,
                sample_value='ha ha ha yay',
                sample_value_tokens=['ha', 'ha', 'ha', 'yay']
            )
        )

    print(
        input_formatter.get_row_input(
            context='12 213 5 345 23 234'.split(),
            header=header,
            row_data=[col.sample_value_tokens for col in header],
            trim_long_table=True
        )
    )
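With 1,000 columns the linearized table is all but guaranteed to blow past BERT's 512-token input limit, so this run presumably exercises the truncation path rather than the pass-through one.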
Example #3
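A classmethod that deserializes a preprocessed example from a dict, handling two schemas: Wikipedia entries keep the header at the top level and store rows as (tag, cell) pairs, while Common Crawl entries nest header and rows under 'table' and provide context on both sides of the table.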
    @classmethod
    def from_dict(cls, entry: Dict, tokenizer: Optional[BertTokenizer],
                  suffix) -> 'Example':
        def _get_data_source():
            return 'wiki' if 'wiki' in entry['uuid'] else 'common_crawl'

        source = _get_data_source()

        # Wiki entries store the header at the top level; common-crawl
        # entries nest it under 'table'.
        header_entry = entry['header'] if source == 'wiki' else entry['table']['header']
        header = []
        column_data = []
        for col in header_entry:
            sample_value = col['sample_value']['value']
            if tokenizer:
                name_tokens = tokenizer.tokenize(col['name'])
            else:
                name_tokens = None
            column = Column(col['name'],
                            col['type'],
                            sample_value,
                            name_tokens=name_tokens)
            header.append(column)

        # Transpose row-major cell data into per-column value lists.
        if source == 'wiki':
            # Wiki cells are (tag, value) pairs; the first row is skipped.
            for row in entry['data'][1:]:
                for col_id, (tag, cell_val) in enumerate(row):
                    if col_id >= len(column_data):
                        column_data.append([])

                    column_data[col_id].append(cell_val)
        else:
            for row in entry['table']['rows']:
                for col_id, cell_val in enumerate(row):
                    if col_id >= len(column_data):
                        column_data.append([])

                    column_data[col_id].append(cell_val)

        context_before = []
        context_after = []

        # Collect the surrounding text, tokenizing sentences when a tokenizer
        # is given; for wiki entries the caption, if any, is appended to the
        # preceding context.
        if source == 'wiki':
            for para in entry['context_before']:
                for sent in para:
                    if tokenizer:
                        sent = tokenizer.tokenize(sent)

                    context_before.append(sent)

            caption = entry['caption']
            if caption:
                if tokenizer:
                    caption = tokenizer.tokenize(entry['caption'])

                context_before.append(caption)
        else:
            for sent in entry['context_before']:
                if tokenizer:
                    sent = tokenizer.tokenize(sent)
                context_before.append(sent)

            for sent in entry['context_after']:
                if tokenizer:
                    sent = tokenizer.tokenize(sent)
                context_after.append(sent)

        uuid = entry['uuid']

        return cls(uuid,
                   header, [context_before, context_after],
                   column_data=column_data,
                   source=source)
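A hedged sketch of the entry layout the wiki branch above expects, reconstructed from its field accesses; every concrete value is illustrative.

# Minimal wiki-style entry reconstructed from the field accesses above;
# all concrete values are made up for illustration.
entry = {
    'uuid': 'wiki-0001',
    'header': [
        {'name': 'Player', 'type': 'text', 'sample_value': {'value': 'Nadal'}},
    ],
    # Each cell is a (tag, value) pair; the first row is skipped.
    'data': [
        [('th', 'Player')],
        [('td', 'Nadal')],
    ],
    'context_before': [['The 2019 year-end rankings are shown below.']],
    'caption': 'ATP rankings',
}
example = Example.from_dict(entry, tokenizer=None, suffix=None)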
Example #4
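Two related helpers: from_serialized rebuilds an Example whose header was stored as a list of Column keyword-argument dicts, while get_table_bert_input_from_context converts question/table environments into the (contexts, tables) pair a TableBERT model consumes, optionally condensing each table to sampled rows or a synthetic snapshot row.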
    @classmethod
    def from_serialized(cls, data) -> 'Example':
        # Rehydrate Column objects from their stored keyword-argument dicts.
        header = [Column(**x) for x in data['header']]
        data['header'] = header
        return Example(**data)

# Assumed imports for this excerpt (not shown in the original snippet):
# import numpy as np
# from typing import Any, Dict, List, Tuple
# from table_bert import Column, Table, TableBertModel

def get_table_bert_input_from_context(
    env_context: List[Dict],
    bert_model: TableBertModel,
    is_training: bool,
    **kwargs
) -> Tuple[List[Any], List[Table]]:
    """Convert question/table environments into (contexts, tables) model inputs."""
    contexts = []
    tables = []

    content_snapshot_strategy = kwargs.get('content_snapshot_strategy', None)
    if content_snapshot_strategy:
        assert content_snapshot_strategy in ('sampled_rows', 'synthetic_row')

    for e in env_context:
        contexts.append(e['question_tokens'])

        # Vertical-attention models attend over several table rows, so
        # sample `sample_row_num` rows for each example.
        if model_use_vertical_attention(bert_model):
            sample_row_num = bert_model.config.sample_row_num
            if content_snapshot_strategy == 'sampled_rows':
                if 'sampled_rows' not in e:
                    sampled_rows = get_question_biased_sampled_rows(
                        e['question_tokens'], e['table'],
                        num_rows=sample_row_num
                    )
                    e['sampled_rows'] = sampled_rows

                sampled_rows = e['sampled_rows']
            else:
                # During training, sample distinct rows uniformly at random
                # (kept in their original order); at test time take the
                # leading rows instead.
                if is_training:
                    sampled_rows = [
                        e['table'].data[idx]
                        for idx
                        in sorted(
                            np.random.choice(
                                list(range(len(e['table']))),
                                replace=False,
                                size=sample_row_num
                            )
                        )
                    ]
                else:
                    sampled_rows = e['table'].data[:sample_row_num]

            table = e['table'].with_rows(sampled_rows)
        else:
            # Without vertical attention, optionally condense the table into
            # a one-row content snapshot: either a question-biased sampled
            # row, or a synthetic row of question-biased sampled cells.
            table = e['table']
            if content_snapshot_strategy:
                if 'sampled_rows' not in e:
                    if content_snapshot_strategy == 'sampled_rows':
                        sampled_rows = get_question_biased_sampled_rows(
                            e['question_tokens'], e['table'],
                            num_rows=1
                        )
                        e['sampled_rows'] = sampled_rows
                    elif content_snapshot_strategy == 'synthetic_row':
                        sampled_cells = get_question_biased_sampled_cells(
                            e['question_tokens'], e['table']
                        )
                        e['sampled_rows'] = [sampled_cells]

                sampled_row = e['sampled_rows'][0]
                # Rebuild the header so every column carries the snapshot
                # cell as its sample value (cell values are assumed to be
                # pre-tokenized here).
                new_header = []
                for idx, column in enumerate(e['table'].header):
                    cell_value = (sampled_row[idx] if isinstance(sampled_row, list)
                                  else sampled_row[column.name])
                    new_column = Column(
                        name=column.name, name_tokens=column.name_tokens, type=column.type,
                        sample_value=cell_value, sample_value_tokens=cell_value
                    )
                    new_header.append(new_column)

                table = Table(
                    id=table.id, header=new_header,
                    data=[{column.name: column.sample_value_tokens for column in new_header}]
                )

        tables.append(table)

    return contexts, tables
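A hedged call sketch for the function above; `model` is assumed to be a loaded TableBertModel and `my_table` a Table whose rows back the snapshot sampling.

# Hypothetical call; `model` and `my_table` are assumed to exist already.
contexts, tables = get_table_bert_input_from_context(
    env_context=[{
        'question_tokens': ['which', 'player', 'scored', 'most', 'points'],
        'table': my_table,
    }],
    bert_model=model,
    is_training=False,
    content_snapshot_strategy='sampled_rows',
)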