Beispiel #1
0
 def test_grid_indexing(self):
     """Check 2-D indexing of the easy table with ints, slices and both."""
     table = self.easy_table
     a, b, c = self.a, self.b, self.c
     d, e, f = self.d, self.e, self.f

     # (int, int) selects one cell; negative indices count from the end
     self.assertEqual(table[0, 0], a)
     self.assertEqual(table[-1, -1], f)

     # (int, slice) is compared as a plain list of cells from one row
     self.assertListEqual(table[0, :], [a, b, c])
     self.assertListEqual(table[1, :], [d, e, f])
     self.assertListEqual(table[0, 1:], [b, c])
     self.assertListEqual(table[0, :2], [a, b])
     self.assertListEqual(table[0, 1:2], [b])

     # (slice, int) is compared as a plain list of cells from one column
     self.assertListEqual(table[:, 0], [a, d])
     self.assertListEqual(table[1:, 0], [d])
     self.assertListEqual(table[:1, 0], [a])
     self.assertListEqual(table[1:2, 0], [d])

     # (slice, slice) is compared against tables; the full slice must
     # equal the table it was taken from
     self.assertEqual(table, table[:, :])
     self.assertEqual(table[1:2, 1:2],
                      Table.create_from_grid(grid=[[e]]))
     self.assertEqual(table[1:, 1:],
                      Table.create_from_grid(grid=[[e, f]]))
     self.assertEqual(table[:2, :2],
                      Table.create_from_grid(grid=[[a, b], [d, e]]))
Beispiel #2
0
    def setUp(self):
        """Create six 1x1 cells, a 2x3 grid table and a table with spans."""

        def make_cell(text, rowspan=1, colspan=1):
            # every fixture cell holds exactly one token
            return Cell(tokens=[Token(text=text)],
                        rowspan=rowspan,
                        colspan=colspan)

        (self.a, self.b, self.c,
         self.d, self.e, self.f) = [make_cell(text) for text in 'abcdef']

        # the easy table gets its grid assigned directly
        self.easy_table = Table(caption='hi this is caption')
        self.easy_table.grid = np.array([[self.a, self.b, self.c],
                                         [self.d, self.e, self.f]])

        # (text, rowspan, colspan) in the order handed to create_from_cells
        hard_cell_specs = [
            ('', 2, 2),
            ('C', 1, 2),
            ('C:1', 1, 1),
            ('C:2', 1, 1),
            ('R', 3, 1),
            ('R:1', 1, 1),
            ('a', 1, 1),
            ('b', 1, 1),
            ('R:2', 1, 1),
            ('c', 1, 1),
            ('d', 1, 1),
            ('R:3', 1, 1),
            ('e', 1, 1),
            ('f', 1, 1),
        ]
        hard_cells = [
            make_cell(text, rowspan=rowspan, colspan=colspan)
            for text, rowspan, colspan in hard_cell_specs
        ]
        self.hard_table = Table.create_from_cells(
            cells=hard_cells,
            nrow=5,
            ncol=4,
            paper_id='abc',
            page_num=0,
            caption='hi this is caption')
Beispiel #3
0
 def setUp(self):
     """Create two default tables and a mapping linking them."""
     first, second = Table(), Table()
     self.table1 = first
     self.table2 = second
     # score 1.0 and no column mappings
     self.pairwise_mapping = PairwiseMapping(first,
                                             second,
                                             score=1.0,
                                             column_mappings=[])
Beispiel #4
0
 def test_append_bottom(self):
     """Rows of `other` must appear beneath the easy table's own rows."""
     appended = Table.create_from_grid(
         grid=[[self.f, self.b, self.d], [self.c, self.e, self.a]])
     stacked = self.easy_table.append_bottom(other=appended)

     expected = Table.create_from_grid(grid=[
         [self.a, self.b, self.c],
         [self.d, self.e, self.f],
         [self.f, self.b, self.d],
         [self.c, self.e, self.a],
     ])
     self.assertEqual(stacked, expected)
Beispiel #5
0
    def setUp(self):
        """Create three tables sharing a header but differing in body rows."""

        def table_from_rows(rows):
            # one single-token 1x1 cell per text, flattened row by row
            cells = [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for row in rows for text in row
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=len(rows),
                                           ncol=len(rows[0]))

        # body rows in z, y, x order
        self.table_permute_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['z', '5', '6'],
            ['y', '3', '4'],
            ['x', '1', '2'],
        ])

        # body rows x, y, z plus an additional w row
        self.table_extra_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '2'],
            ['y', '3', '4'],
            ['z', '5', '6'],
            ['w', '7', '8'],
        ])

        # body rows x and y only
        self.table_missing_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '2'],
            ['y', '3', '4'],
        ])
Beispiel #6
0
 def test_insert_row(self):
     """Insert a row at index 0 and 1; reject a row of the wrong width.

     The second expectation still lists only the original two rows plus
     the new one, so it relies on `insert_row` not altering
     `self.easy_table` itself.
     """
     x, y, z = [
         Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
         for text in 'xyz'
     ]

     inserted_on_top = self.easy_table.insert_row(index=0, row=[x, y, z])
     self.assertEqual(
         inserted_on_top,
         Table.create_from_grid(grid=[
             [x, y, z],
             [self.a, self.b, self.c],
             [self.d, self.e, self.f],
         ]))

     inserted_in_middle = self.easy_table.insert_row(index=1, row=[x, y, z])
     self.assertEqual(
         inserted_in_middle,
         Table.create_from_grid(grid=[
             [self.a, self.b, self.c],
             [x, y, z],
             [self.d, self.e, self.f],
         ]))

     # a two-cell row does not fit the three-column table
     with self.assertRaises(Exception):
         self.easy_table.insert_row(index=1, row=[x, y])
Beispiel #7
0
    def _create_table_from_omnipage_xml(self, table_tag: Tag, caption: str,
                                        paper_id: str) -> Table:
        """Convert one OmniPage table element into a `Table`.

        The grid size is the number of `gridcol` / `gridrow` elements under
        the table's `gridtable` child. Every `cellzone` becomes a `Cell`
        whose tokens are the stripped texts of its `wd` elements and whose
        spans come from the inclusive `grid*from` / `grid*till` attributes.

        :param table_tag: parsed XML element of the table
        :param caption: caption passed through to the table
        :param paper_id: paper identifier passed through to the table
        :return: table built by `Table.create_from_cells`; `page_num` is
            always set to 0 here
        """
        grid_tag = table_tag.find('gridtable')
        ncol = len(grid_tag.find_all('gridcol'))
        nrow = len(grid_tag.find_all('gridrow'))

        cells = []
        for zone_tag in table_tag.find_all('cellzone'):
            # one token per recognised word
            words = [
                Token(text=wd_tag.get_text(strip=True))
                for wd_tag in zone_tag.find_all('wd')
            ]

            # from/till are inclusive grid indices, hence the +1
            row_extent = (int(zone_tag.get('gridrowtill')) -
                          int(zone_tag.get('gridrowfrom')) + 1)
            col_extent = (int(zone_tag.get('gridcoltill')) -
                          int(zone_tag.get('gridcolfrom')) + 1)

            cells.append(
                Cell(tokens=words, rowspan=row_extent, colspan=col_extent))

        return Table.create_from_cells(cells=cells,
                                       nrow=nrow,
                                       ncol=ncol,
                                       paper_id=paper_id,
                                       page_num=0,
                                       caption=caption)
Beispiel #8
0
    def test_aggregate_tables(self):
        """Aggregate one source table into a schema with swapped columns.

        The mapping sends source column 1 to target column 2 and source
        column 2 to target column 1, so every expected body row has its two
        value columns exchanged relative to the source (this presumes
        `self.table_source` is the x/y/z fixture with values 1..6 -- it is
        created outside this method).
        """
        schema_matcher = SchemaMatcher()

        # The second row of the target schema is filler; the expected
        # result below contains none of it.
        target_schema = Table.create_from_cells(cells=[
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='not_copied')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='not_copied')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='not_copied')], rowspan=1, colspan=1)
        ],
                                                nrow=2,
                                                ncol=3)

        pred_aggregate_table = schema_matcher.aggregate_tables(
            pairwise_mappings=[
                PairwiseMapping(self.table_source,
                                target_schema,
                                score=-999,
                                column_mappings=[(1, 2), (2, 1)])
            ],
            target_schema=target_schema)

        gold_aggregate_table = Table.create_from_cells([
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='6')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1)
        ],
                                                       nrow=4,
                                                       ncol=3)

        # `assertEquals` was a deprecated alias and no longer exists in
        # Python 3.12+; `assertEqual` is the supported spelling.
        self.assertEqual(pred_aggregate_table, gold_aggregate_table)
Beispiel #9
0
    def test_compute_metrics(self):
        """`compute_metrics` must raise on a missing or permuted header row;
        recall of the empty prediction against the gold table is 0.0."""

        def table_from_rows(rows):
            # one single-token 1x1 cell per text, flattened row by row
            cells = [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for row in rows for text in row
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=len(rows),
                                           ncol=len(rows[0]))

        # prediction whose first row is already a body row
        pred_table_missing_header = table_from_rows([
            ['x', '1', '2'],
            ['y', '3', '4'],
            ['z', '5', '6'],
        ])
        with self.assertRaises(Exception):
            compute_metrics(gold_table=self.gold_table,
                            pred_table=pred_table_missing_header)

        recall_of_empty = cell_level_recall(gold_table=self.gold_table,
                                            pred_table=self.pred_table_empty)
        self.assertEqual(recall_of_empty, 0.0)

        # prediction with header1/header2 (and their values) exchanged
        pred_table_permuted_header = table_from_rows([
            ['subject', 'header2', 'header1'],
            ['x', '2', '1'],
            ['y', '4', '3'],
            ['z', '6', '5'],
        ])
        with self.assertRaises(Exception):
            compute_metrics(gold_table=self.gold_table,
                            pred_table=pred_table_permuted_header)
Beispiel #10
0
    def _create_table_from_tetml(self, table_id: int, table_tag: Tag,
                                 paper_id: str, caption: str) -> Table:
        """Convert one TETML table element into a `Table`.

        Each `row` element contributes its `cell` elements in document order.
        A cell's tokens come from the `box` child of every `word` element:
        the box text, the font of the box's first `glyph`, and the box's
        `llx`/`lly`/`urx`/`ury` attributes as the bounding box. Cells always
        get `rowspan=1`; `colspan` is taken from the cell's `colspan`
        attribute when present, otherwise 1.

        :param table_id: identifier used only in error messages
        :param table_tag: parsed XML element of the table
        :param paper_id: paper identifier passed through to the table
        :param caption: caption passed through to the table
        :return: table built by `Table.create_from_cells`; `page_num` is
            always set to 0 here
        :raises TetmlXMLToTablesParserException: if the table has no rows,
            or if its rows do not all span the same number of columns
        """
        cells = []
        ncol_per_row = []
        for row_tag in table_tag.find_all('row'):

            ncol_in_row = 0
            for cell_tag in row_tag.find_all('cell'):

                # BUILD LIST OF TOKENS
                tokens = []
                for word_tag in cell_tag.find_all('word'):
                    word_box_tag = word_tag.find('box')
                    token = Token(
                        text=word_box_tag.get_text(strip=True),
                        # `find_all` gets font per character,
                        # but use `find` because assume font
                        # is constant within same word
                        font=word_box_tag.find('glyph').get('font'),
                        bounding_box=Box(llx=float(word_box_tag.get('llx')),
                                         lly=float(word_box_tag.get('lly')),
                                         urx=float(word_box_tag.get('urx')),
                                         ury=float(word_box_tag.get('ury'))))
                    tokens.append(token)

                # BUILD CELL FROM LIST OF TOKENS
                colspan_attr = cell_tag.get('colspan')
                cell = Cell(tokens=tokens,
                            rowspan=1,
                            colspan=int(colspan_attr) if colspan_attr else 1)
                cells.append(cell)
                ncol_in_row += cell.colspan

            ncol_per_row.append(ncol_in_row)

        # A table without any row used to slip through the equal-width
        # check and then fail on `ncol_per_row[0]` with a bare IndexError;
        # report it through the parser's own exception instead.
        if not ncol_per_row:
            raise TetmlXMLToTablesParserException(
                'Table {} has no rows. Skipping...'.format(table_id))

        # TODO: add more filters here if necessary
        if any(ncol != ncol_per_row[0] for ncol in ncol_per_row):
            raise TetmlXMLToTablesParserException(
                'Table {} has unequal columns per row. Skipping...'.format(
                    table_id))

        # TODO: `page_num` and `paper_id` fields
        # BUILD TABLE FROM LIST OF CELLS
        table = Table.create_from_cells(cells=cells,
                                        nrow=len(ncol_per_row),
                                        ncol=ncol_per_row[0],
                                        paper_id=paper_id,
                                        page_num=0,
                                        caption=caption)
        return table
Beispiel #11
0
 def setUp(self):
     """Create the 4x3 source table: a header row plus rows x, y and z."""
     rows = [
         ['subject', 'header1', 'header2'],
         ['x', '1', '2'],
         ['y', '3', '4'],
         ['z', '5', '6'],
     ]
     # one single-token 1x1 cell per text, flattened row by row
     source_cells = [
         Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
         for row in rows for text in row
     ]
     self.table_source = Table.create_from_cells(source_cells,
                                                 nrow=4,
                                                 ncol=3)
Beispiel #12
0
 def test_compute_bounding_box(self):
     """The table's box must span from the smallest lower-left to the
     largest upper-right coordinate of its two cells."""
     cell_boxes = [
         Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0),
         Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5),
     ]
     cells = [
         Cell(tokens=[Token(text='e')],
              rowspan=1,
              colspan=1,
              bounding_box=cell_box) for cell_box in cell_boxes
     ]
     table = Table.create_from_cells(cells=cells,
                                     nrow=1,
                                     ncol=2,
                                     paper_id='abc',
                                     page_num=0,
                                     caption='hi this is caption')

     enclosing = table.bounding_box
     # lower-left corner
     self.assertEqual(enclosing.ll.x, -1.0)
     self.assertEqual(enclosing.ll.y, -0.5)
     # upper-right corner
     self.assertEqual(enclosing.ur.x, 2.5)
     self.assertEqual(enclosing.ur.y, 1.5)
Beispiel #13
0
    def aggregate_tables(self,
                         pairwise_mappings: List[PairwiseMapping],
                         target_schema: Table) -> Table:
        """Stack the body rows of every mapped source table under the
        target schema's header row.

        The result starts as a grid of `None` with one row per non-header
        source row and `target_schema.ncol` columns, with the first row of
        `target_schema` inserted on top. For each mapping, taken in
        `sorted()` order, every source row from index 1 onwards fills one
        aggregate row: column 0 is copied as is, and each
        `(source_col, target_col)` pair in `column_mappings` copies one
        further cell. Target columns no mapping refers to stay `None`.

        :param pairwise_mappings: mappings whose `table1` is the source
        :param target_schema: table supplying the header row and the width
        :return: the aggregated table
        """
        # every source table contributes all of its rows except the header
        body_row_count = sum(mapping.table1.nrow - 1
                             for mapping in pairwise_mappings)

        blank_grid = np.array([
            [None for _ in range(target_schema.ncol)]
            for _ in range(body_row_count)
        ])
        aggregate_table = Table.create_from_grid(grid=blank_grid)
        aggregate_table = aggregate_table.insert_row(index=0,
                                                     row=target_schema[0, :])

        # row 0 now holds the header, so filling starts at row 1
        next_row = 1
        # TODO: `table1` is always the table that needs to be aggregated to `table2`=target
        for mapping in sorted(pairwise_mappings):
            for source_row in range(1, mapping.table1.nrow):
                # the subject column is carried over unconditionally
                aggregate_table.grid[next_row, 0] = \
                    mapping.table1[source_row, 0]

                # remaining cells follow the column mappings
                for source_col, target_col in mapping.column_mappings:
                    aggregate_table.grid[next_row, target_col] = \
                        mapping.table1[source_row, source_col]

                next_row += 1

        return aggregate_table
Beispiel #14
0
    def test_map_tables(self):
        """Map source tables onto header-only schemas and compare the
        resulting scores and column mappings with the expected ones."""

        def schema_with(*headers):
            # header-only table: one row, one single-token cell per name
            cells = [
                Cell(tokens=[Token(text=header)], rowspan=1, colspan=1)
                for header in headers
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=1,
                                           ncol=len(headers))

        target_schema_easy = schema_with('subject', 'header1', 'header2')
        target_schema_less = schema_with('subject', 'header2')
        target_schema_more = schema_with('subject', 'header0', 'header1',
                                         'header2')
        target_schema_permuted = schema_with('subject', 'header2', 'header1')

        schema_matcher = ColNameSchemaMatcher()

        # (target schema, expected score, expected column mappings) for
        # mapping `self.table_source` alone
        single_source_cases = [
            (target_schema_easy, 2.0, [(1, 1), (2, 2)]),
            (target_schema_permuted, 2.0, [(1, 2), (2, 1)]),
            (target_schema_more, 2.0, [(1, 2), (2, 3)]),
            (target_schema_less, 1.0, [(2, 1)]),
        ]
        for target_schema, score, column_mappings in single_source_cases:
            predicted = schema_matcher.map_tables(
                tables=[self.table_source], target_schema=target_schema)
            self.assertListEqual(predicted, [
                PairwiseMapping(self.table_source,
                                target_schema,
                                score=score,
                                column_mappings=column_mappings)
            ])

        # several source tables mapped onto the same schema at once
        predicted = schema_matcher.map_tables(
            tables=[
                self.table_source, self.table_less_header,
                self.table_more_header
            ],
            target_schema=target_schema_permuted)
        self.assertListEqual(predicted, [
            PairwiseMapping(self.table_source,
                            target_schema_permuted,
                            score=2.0,
                            column_mappings=[(1, 2), (2, 1)]),
            PairwiseMapping(self.table_less_header,
                            target_schema_permuted,
                            score=1.0,
                            column_mappings=[(1, 1)]),
            PairwiseMapping(self.table_more_header,
                            target_schema_permuted,
                            score=2.0,
                            column_mappings=[(1, 1), (2, 2)]),
        ])
Beispiel #15
0
    def setUp(self):
        """
        Create the gold tables and the predicted-table variants.

        gold:
            subject, header1, header2
            x, 1, 2
            y, 3, 4
            z, 5, 6
        """

        def table_from_rows(rows):
            # one single-token 1x1 cell per text, flattened row by row
            cells = [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
                for row in rows for text in row
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=len(rows),
                                           ncol=len(rows[0]))

        self.gold_table = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '2'],
            ['y', '3', '4'],
            ['z', '5', '6'],
        ])

        # header row only
        self.gold_table_empty = table_from_rows([
            ['subject', 'header1', 'header2'],
        ])

        # the perfect prediction is the gold table object itself
        self.pred_table_perfect = self.gold_table

        # header row only
        self.pred_table_empty = table_from_rows([
            ['subject', 'header1', 'header2'],
        ])

        # gold body rows in z, y, x order
        self.pred_table_permute_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['z', '5', '6'],
            ['y', '3', '4'],
            ['x', '1', '2'],
        ])

        # gold rows plus an additional w row
        self.pred_table_extra_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '2'],
            ['y', '3', '4'],
            ['z', '5', '6'],
            ['w', '7', '8'],
        ])

        # gold rows without the z row
        self.pred_table_missing_rows = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '2'],
            ['y', '3', '4'],
        ])

        # same subjects as gold, but several values differ
        self.pred_table_partial_credit = table_from_rows([
            ['subject', 'header1', 'header2'],
            ['x', '1', '1'],
            ['y', '4', '4'],
            ['z', '3', '3'],
        ])
Beispiel #16
0
    def setUp(self):
        self.table_source = Table.create_from_cells([
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='6')], rowspan=1, colspan=1)
        ],
                                                    nrow=4,
                                                    ncol=3)

        self.table_less_header = Table.create_from_cells([
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1)
        ],
                                                         nrow=4,
                                                         ncol=2)

        self.table_more_header = Table.create_from_cells([
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1)
        ],
                                                         nrow=4,
                                                         ncol=4)

        self.table_permute_header = Table.create_from_cells([
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='6')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1)
        ],
                                                            nrow=4,
                                                            ncol=3)

        self.table_no_header = Table.create_from_cells([
            Cell(tokens=[Token(text='x')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='z')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='5')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='6')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='y')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='4')], rowspan=1, colspan=1)
        ],
                                                       nrow=3,
                                                       ncol=3)

        self.table_only_header = Table.create_from_cells(cells=[
            Cell(tokens=[Token(text='subject')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='header2')], rowspan=1, colspan=1)
        ],
                                                         nrow=1,
                                                         ncol=3)
Beispiel #17
0
    def test_improper_table(self):
        """Check that ``Table.create_from_cells`` rejects inconsistent input.

        Each case pairs a list of cells with an ``nrow``/``ncol`` that the
        cells cannot tile, and expects the constructor to raise.
        """

        def build(texts, nrow, ncol, colspan=1):
            # Called from inside each ``assertRaises`` context so that the
            # cell construction and the table construction are both covered
            # by the context, as in a fully spelled-out call.
            cells = [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=colspan)
                for text in texts
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=nrow,
                                           ncol=ncol,
                                           paper_id='',
                                           page_num=0,
                                           caption='')

        # misspecified nrow or ncol: four cells do not fit a 2x1 or 1x2 grid
        with self.assertRaises(Exception):
            build(['a', 'b', 'c', 'd'], nrow=2, ncol=1)

        with self.assertRaises(Exception):
            build(['a', 'b', 'c', 'd'], nrow=1, ncol=2)

        # not enough cells to fill out a 2x2 table
        with self.assertRaises(Exception):
            build(['a', 'b', 'c'], nrow=2, ncol=2)

        with self.assertRaises(Exception):
            build(['a', 'b'], nrow=2, ncol=2)

        # cell juts out of table boundaries: colspan=2 inside a 1x1 table
        with self.assertRaises(Exception):
            build(['a'], nrow=1, ncol=1, colspan=2)
Beispiel #18
0
 def test_create_from_grid(self):
     """A table built from the fixture cells must equal ``easy_table``."""
     top_row = [self.a, self.b, self.c]
     bottom_row = [self.d, self.e, self.f]
     rebuilt = Table.create_from_grid(grid=[top_row, bottom_row])
     self.assertEqual(rebuilt, self.easy_table)
Beispiel #19
0
class TestTable(unittest.TestCase):
    """Unit tests for ``Table``: construction, indexing, rendering, editing.

    ``setUp`` rebuilds two fixtures before every test:

    * ``easy_table`` -- a 2x3 grid of single-span cells ``a`` .. ``f``.
    * ``hard_table`` -- a 5x4 table built from cells with row/column spans.
    """

    def setUp(self):
        """Create the shared cells and the ``easy_table``/``hard_table`` fixtures."""
        self.a = Cell(tokens=[Token(text='a')], rowspan=1, colspan=1)
        self.b = Cell(tokens=[Token(text='b')], rowspan=1, colspan=1)
        self.c = Cell(tokens=[Token(text='c')], rowspan=1, colspan=1)
        self.d = Cell(tokens=[Token(text='d')], rowspan=1, colspan=1)
        self.e = Cell(tokens=[Token(text='e')], rowspan=1, colspan=1)
        self.f = Cell(tokens=[Token(text='f')], rowspan=1, colspan=1)

        # The easy table bypasses the factory methods: its grid is assigned
        # directly so that ``create_from_grid`` can be tested against it.
        self.easy_table = Table(caption='hi this is caption')
        self.easy_table.grid = np.array([[self.a, self.b, self.c],
                                         [self.d, self.e, self.f]])

        # The hard table has a 2x2 empty corner, a column header 'C' spanning
        # two columns and a row header 'R' spanning three rows.
        self.hard_table = Table.create_from_cells(cells=[
            Cell(tokens=[Token(text='')], rowspan=2, colspan=2),
            Cell(tokens=[Token(text='C')], rowspan=1, colspan=2),
            Cell(tokens=[Token(text='C:1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='C:2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='R')], rowspan=3, colspan=1),
            Cell(tokens=[Token(text='R:1')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='a')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='b')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='R:2')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='c')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='d')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='R:3')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='e')], rowspan=1, colspan=1),
            Cell(tokens=[Token(text='f')], rowspan=1, colspan=1)
        ],
                                                  nrow=5,
                                                  ncol=4,
                                                  paper_id='abc',
                                                  page_num=0,
                                                  caption='hi this is caption')

    def test_create_from_grid(self):
        """A table built from the fixture cells must equal ``easy_table``."""
        self.assertEqual(
            Table.create_from_grid(
                grid=[[self.a, self.b, self.c], [self.d, self.e, self.f]]),
            self.easy_table)

    # Placeholder: skipped explicitly so the missing coverage shows up in the
    # test report instead of being counted as a passing test.
    @unittest.skip('TODO: positive-path test for Table.create_from_cells '
                   'is not written yet')
    def test_create_from_cells(self):
        pass

    def test_improper_table(self):
        """Check that ``Table.create_from_cells`` rejects inconsistent input.

        Each case pairs a list of cells with an ``nrow``/``ncol`` that the
        cells cannot tile, and expects the constructor to raise.
        """

        def build(texts, nrow, ncol, colspan=1):
            # Called from inside each ``assertRaises`` context so that the
            # cell construction and the table construction are both covered
            # by the context.
            cells = [
                Cell(tokens=[Token(text=text)], rowspan=1, colspan=colspan)
                for text in texts
            ]
            return Table.create_from_cells(cells=cells,
                                           nrow=nrow,
                                           ncol=ncol,
                                           paper_id='',
                                           page_num=0,
                                           caption='')

        # misspecified nrow or ncol: four cells do not fit a 2x1 or 1x2 grid
        with self.assertRaises(Exception):
            build(['a', 'b', 'c', 'd'], nrow=2, ncol=1)

        with self.assertRaises(Exception):
            build(['a', 'b', 'c', 'd'], nrow=1, ncol=2)

        # not enough cells to fill out a 2x2 table
        with self.assertRaises(Exception):
            build(['a', 'b', 'c'], nrow=2, ncol=2)

        with self.assertRaises(Exception):
            build(['a', 'b'], nrow=2, ncol=2)

        # cell juts out of table boundaries: colspan=2 inside a 1x1 table
        with self.assertRaises(Exception):
            build(['a'], nrow=1, ncol=1, colspan=2)

    def test_shape_properties(self):
        """``nrow``, ``ncol`` and ``dim`` must report the grid dimensions."""
        self.assertEqual(self.easy_table.nrow, 2)
        self.assertEqual(self.easy_table.ncol, 3)
        self.assertEqual(self.easy_table.dim, (2, 3))
        self.assertEqual(self.hard_table.nrow, 5)
        self.assertEqual(self.hard_table.ncol, 4)
        self.assertEqual(self.hard_table.dim, (5, 4))

    def test_grid_indexing(self):
        """Exercise ``table[row, col]`` with integers and slices.

        The expectations below are: an (int, int) index gives a cell, an
        index mixing an int with a slice gives a list of cells, and a
        (slice, slice) index gives a ``Table``.
        """
        # single elements
        self.assertEqual(self.easy_table[0, 0], self.a)
        self.assertEqual(self.easy_table[-1, -1], self.f)
        # full row
        self.assertListEqual(self.easy_table[0, :], [self.a, self.b, self.c])
        self.assertListEqual(self.easy_table[1, :], [self.d, self.e, self.f])
        # partial row
        self.assertListEqual(self.easy_table[0, 1:], [self.b, self.c])
        self.assertListEqual(self.easy_table[0, :2], [self.a, self.b])
        self.assertListEqual(self.easy_table[0, 1:2], [self.b])
        # full column
        self.assertListEqual(self.easy_table[:, 0], [self.a, self.d])
        # partial column
        self.assertListEqual(self.easy_table[1:, 0], [self.d])
        self.assertListEqual(self.easy_table[:1, 0], [self.a])
        self.assertListEqual(self.easy_table[1:2, 0], [self.d])
        # full subgrid
        self.assertEqual(self.easy_table, self.easy_table[:, :])
        # partial subgrid
        self.assertEqual(self.easy_table[1:2, 1:2],
                         Table.create_from_grid(grid=[[self.e]]))
        self.assertEqual(self.easy_table[1:, 1:],
                         Table.create_from_grid(grid=[[self.e, self.f]]))
        self.assertEqual(
            self.easy_table[:2, :2],
            Table.create_from_grid(grid=[[self.a, self.b], [self.d, self.e]]))

    def test_str(self):
        """``str(table)`` must render the grid followed by the caption.

        Expected layout: cells joined by tabs, rows joined by newlines, then
        the caption on its own line.
        """
        self.assertEqual(str(self.easy_table),
                         'a\tb\tc\nd\te\tf' + '\n' + 'hi this is caption')
        # For the spanning table every grid slot covered by a multi-span cell
        # is expected to repeat that cell's text ('C' twice, 'R' three times).
        # Spaces are removed from the rendering and from the expected caption
        # before comparing, so this check ignores any space padding.
        t = '\t\tC\tC\n\t\tC:1\tC:2\nR\tR:1\ta\tb\nR\tR:2\tc\td\nR\tR:3\te\tf'
        c = 'hithisiscaption'
        self.assertEqual(str(self.hard_table).replace(' ', ''), t + '\n' + c)

    def test_insert_row(self):
        """``insert_row`` must place the new row at ``index`` and validate its length."""
        x = Cell(tokens=[Token(text='x')], rowspan=1, colspan=1)
        y = Cell(tokens=[Token(text='y')], rowspan=1, colspan=1)
        z = Cell(tokens=[Token(text='z')], rowspan=1, colspan=1)
        self.assertEqual(
            self.easy_table.insert_row(index=0, row=[x, y, z]),
            Table.create_from_grid(grid=[[x, y, z], [self.a, self.b, self.c],
                                         [self.d, self.e, self.f]]))
        # The expected grid here starts again from the original two rows, so
        # this assertion relies on the previous call not having modified
        # ``easy_table`` in place.
        self.assertEqual(
            self.easy_table.insert_row(index=1, row=[x, y, z]),
            Table.create_from_grid(grid=[[self.a, self.b, self.c], [x, y, z],
                                         [self.d, self.e, self.f]]))
        # a row with two cells does not match the three-column table
        with self.assertRaises(Exception):
            self.easy_table.insert_row(index=1, row=[x, y])

    def test_insert_column(self):
        """``insert_column`` must place the new column at ``index`` and validate its length."""
        x = Cell(tokens=[Token(text='x')], rowspan=1, colspan=1)
        y = Cell(tokens=[Token(text='y')], rowspan=1, colspan=1)
        self.assertEqual(
            self.easy_table.insert_column(index=1, column=[x, y]),
            Table.create_from_grid(grid=[[self.a, x, self.b, self.c],
                                         [self.d, y, self.e, self.f]]))
        # a column with three cells does not match the two-row table
        with self.assertRaises(Exception):
            self.easy_table.insert_column(index=1, column=[x, y, y])

    def test_delete_row(self):
        """Deleting row 1 must leave only the first row."""
        self.assertEqual(
            self.easy_table.delete_row(index=1),
            Table.create_from_grid(grid=[[self.a, self.b, self.c]]))

    def test_delete_column(self):
        """Deleting column 1 must leave columns 0 and 2."""
        self.assertEqual(
            self.easy_table.delete_column(index=1),
            Table.create_from_grid(grid=[[self.a, self.c], [self.d, self.f]]))

    def test_append_left(self):
        """``append_left`` must put the other table's columns before this table's."""
        self.assertEqual(
            self.easy_table.append_left(other=Table.create_from_grid(
                grid=[[self.f, self.b, self.d], [self.c, self.e, self.a]])),
            Table.create_from_grid(
                grid=[[self.f, self.b, self.d, self.a, self.b, self.c],
                      [self.c, self.e, self.a, self.d, self.e, self.f]]))

    def test_append_right(self):
        """``append_right`` must put the other table's columns after this table's."""
        self.assertEqual(
            self.easy_table.append_right(other=Table.create_from_grid(
                grid=[[self.f, self.b, self.d], [self.c, self.e, self.a]])),
            Table.create_from_grid(
                grid=[[self.a, self.b, self.c, self.f, self.b, self.d],
                      [self.d, self.e, self.f, self.c, self.e, self.a]]))

    def test_append_top(self):
        """``append_top`` must put the other table's rows above this table's."""
        self.assertEqual(
            self.easy_table.append_top(other=Table.create_from_grid(
                grid=[[self.f, self.b, self.d], [self.c, self.e, self.a]])),
            Table.create_from_grid(
                grid=[[self.f, self.b, self.d], [self.c, self.e, self.a],
                      [self.a, self.b, self.c], [self.d, self.e, self.f]]))

    def test_append_bottom(self):
        """``append_bottom`` must put the other table's rows below this table's."""
        self.assertEqual(
            self.easy_table.append_bottom(other=Table.create_from_grid(
                grid=[[self.f, self.b, self.d], [self.c, self.e, self.a]])),
            Table.create_from_grid(
                grid=[[self.a, self.b, self.c], [self.d, self.e, self.f],
                      [self.f, self.b, self.d], [self.c, self.e, self.a]]))

    def test_compute_bounding_box(self):
        """The table's bounding box must enclose the boxes of all its cells."""
        table = Table.create_from_cells(cells=[
            Cell(tokens=[Token(text='e')],
                 rowspan=1,
                 colspan=1,
                 bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0)),
            Cell(tokens=[Token(text='e')],
                 rowspan=1,
                 colspan=1,
                 bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
        ],
                                        nrow=1,
                                        ncol=2,
                                        paper_id='abc',
                                        page_num=0,
                                        caption='hi this is caption')
        box = table.bounding_box
        # lower-left is the minimum llx/lly, upper-right the maximum urx/ury
        self.assertEqual(box.ll.x, -1.0)
        self.assertEqual(box.ll.y, -0.5)
        self.assertEqual(box.ur.x, 2.5)
        self.assertEqual(box.ur.y, 1.5)

    # Placeholder: skipped explicitly so the missing coverage shows up in the
    # test report instead of being counted as a passing test.
    @unittest.skip('TODO: Table equality test is not written yet')
    def test_eq(self):
        pass
Beispiel #20
0
 def __init__(self, table: Table):
     """Classify the cells of ``table`` and store a normalized copy of it.

     The table is transposed first when ``_should_transpose`` says so, and
     the result of ``_normalize`` is kept as ``self.table``.
     """
     # NOTE(review): the cell classes are computed on the table as passed
     # in, i.e. before any transpose -- confirm that consumers of
     # ``cell_classes`` expect pre-transpose coordinates.
     self.cell_classes = self._classify_cells(table)
     oriented = table.transpose() if self._should_transpose(table) else table
     self.table = self._normalize(oriented)
Beispiel #21
0
 def test_delete_column(self):
     """Deleting column 1 of ``easy_table`` must leave columns 0 and 2."""
     pruned = self.easy_table.delete_column(index=1)
     remaining = Table.create_from_grid(
         grid=[[self.a, self.c], [self.d, self.f]])
     self.assertEqual(pruned, remaining)
Beispiel #22
0
 def test_delete_row(self):
     """Deleting row 1 of ``easy_table`` must leave only the first row."""
     pruned = self.easy_table.delete_row(index=1)
     first_row_only = Table.create_from_grid(
         grid=[[self.a, self.b, self.c]])
     self.assertEqual(pruned, first_row_only)