def test_grid_indexing(self):
    """Exercise scalar, row, column and sub-grid indexing on ``easy_table``.

    Indexing with one integer and one slice is compared against a plain
    list of cells; indexing with two slices is compared against a `Table`.
    """
    table = self.easy_table
    a, b, c = self.a, self.b, self.c
    d, e, f = self.d, self.e, self.f

    # single elements (including negative indices)
    self.assertEqual(table[0, 0], a)
    self.assertEqual(table[-1, -1], f)

    # full row
    self.assertListEqual(table[0, :], [a, b, c])
    self.assertListEqual(table[1, :], [d, e, f])

    # partial row
    self.assertListEqual(table[0, 1:], [b, c])
    self.assertListEqual(table[0, :2], [a, b])
    self.assertListEqual(table[0, 1:2], [b])

    # full column
    self.assertListEqual(table[:, 0], [a, d])

    # partial column
    self.assertListEqual(table[1:, 0], [d])
    self.assertListEqual(table[:1, 0], [a])
    self.assertListEqual(table[1:2, 0], [d])

    # full subgrid
    self.assertEqual(table, table[:, :])

    # partial subgrid
    bottom_middle = Table.create_from_grid(grid=[[e]])
    self.assertEqual(table[1:2, 1:2], bottom_middle)

    bottom_right = Table.create_from_grid(grid=[[e, f]])
    self.assertEqual(table[1:, 1:], bottom_right)

    left_block = Table.create_from_grid(grid=[[a, b], [d, e]])
    self.assertEqual(table[:2, :2], left_block)
def setUp(self):
    """Create the fixtures shared by the table tests.

    ``easy_table`` is a 2x3 grid of single-span cells ``a``..``f`` assigned
    directly to ``grid``; ``hard_table`` is a 5x4 table built from a flat
    cell list that contains multi-row and multi-column cells.
    """

    def make_cell(text, rowspan=1, colspan=1):
        # One single-token cell; spans default to 1x1.
        return Cell(tokens=[Token(text=text)],
                    rowspan=rowspan,
                    colspan=colspan)

    (self.a, self.b, self.c,
     self.d, self.e, self.f) = (make_cell(text) for text in 'abcdef')

    self.easy_table = Table(caption='hi this is caption')
    self.easy_table.grid = np.array([[self.a, self.b, self.c],
                                     [self.d, self.e, self.f]])

    # (text, rowspan, colspan) in the order the cells are handed to
    # `Table.create_from_cells`.
    hard_specs = [
        ('', 2, 2), ('C', 1, 2),
        ('C:1', 1, 1), ('C:2', 1, 1),
        ('R', 3, 1), ('R:1', 1, 1), ('a', 1, 1), ('b', 1, 1),
        ('R:2', 1, 1), ('c', 1, 1), ('d', 1, 1),
        ('R:3', 1, 1), ('e', 1, 1), ('f', 1, 1),
    ]
    hard_cells = [make_cell(*spec) for spec in hard_specs]
    self.hard_table = Table.create_from_cells(cells=hard_cells,
                                              nrow=5,
                                              ncol=4,
                                              paper_id='abc',
                                              page_num=0,
                                              caption='hi this is caption')
def setUp(self):
    """Create two default-constructed tables and a mapping between them.

    The mapping has score 1.0 and an empty list of column mappings.
    """
    self.table1, self.table2 = Table(), Table()
    self.pairwise_mapping = PairwiseMapping(self.table1,
                                            self.table2,
                                            score=1.0,
                                            column_mappings=[])
def test_append_bottom(self):
    """``append_bottom`` should place the other table's rows after ours."""
    other = Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                         [self.c, self.e, self.a]])
    stacked = self.easy_table.append_bottom(other=other)

    expected = Table.create_from_grid(grid=[[self.a, self.b, self.c],
                                            [self.d, self.e, self.f],
                                            [self.f, self.b, self.d],
                                            [self.c, self.e, self.a]])
    self.assertEqual(stacked, expected)
def setUp(self):
    """Build three tables sharing one header but differing in data rows.

    ``table_permute_rows`` lists rows z, y, x; ``table_extra_rows`` lists
    x, y, z plus an extra w row; ``table_missing_rows`` lists only x, y.
    """

    def table_from_rows(rows):
        # Flatten row-major text rows into 1x1 single-token cells.
        flat_cells = [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for row in rows
            for text in row
        ]
        return Table.create_from_cells(cells=flat_cells,
                                       nrow=len(rows),
                                       ncol=len(rows[0]))

    header = ['subject', 'header1', 'header2']

    self.table_permute_rows = table_from_rows([
        header,
        ['z', '5', '6'],
        ['y', '3', '4'],
        ['x', '1', '2'],
    ])

    self.table_extra_rows = table_from_rows([
        header,
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
        ['w', '7', '8'],
    ])

    self.table_missing_rows = table_from_rows([
        header,
        ['x', '1', '2'],
        ['y', '3', '4'],
    ])
def test_insert_row(self):
    """``insert_row`` should splice a row in at the index and reject bad widths.

    Both insertions are made on ``easy_table`` itself and compared against
    a table containing the two original rows plus the new one.
    """
    x, y, z = (Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
               for text in 'xyz')
    first_row = [self.a, self.b, self.c]
    second_row = [self.d, self.e, self.f]

    # insert before the first row
    inserted_on_top = self.easy_table.insert_row(index=0, row=[x, y, z])
    self.assertEqual(
        inserted_on_top,
        Table.create_from_grid(grid=[[x, y, z], first_row, second_row]))

    # insert between the two existing rows
    inserted_in_middle = self.easy_table.insert_row(index=1, row=[x, y, z])
    self.assertEqual(
        inserted_in_middle,
        Table.create_from_grid(grid=[first_row, [x, y, z], second_row]))

    # a two-cell row does not fit the three-column table
    with self.assertRaises(Exception):
        self.easy_table.insert_row(index=1, row=[x, y])
def _create_table_from_omnipage_xml(self, table_tag: Tag, caption: str,
                                    paper_id: str) -> Table:
    """Convert one OmniPage table element into a `Table`.

    The grid size is the number of ``gridcol`` / ``gridrow`` elements under
    the ``gridtable`` child.  Every ``cellzone`` becomes one `Cell` whose
    tokens are the stripped texts of its ``wd`` elements and whose spans
    come from the inclusive ``grid*from`` / ``grid*till`` attributes.

    Args:
        table_tag: element holding the ``gridtable`` and ``cellzone`` tags.
        caption: caption passed through to the table.
        paper_id: paper identifier passed through to the table.

    Returns:
        The table built by `Table.create_from_cells`, with ``page_num=0``.
    """
    grid_tag = table_tag.find('gridtable')
    ncol = len(grid_tag.find_all('gridcol'))
    nrow = len(grid_tag.find_all('gridrow'))

    cells = []
    for zone_tag in table_tag.find_all('cellzone'):
        # one token per recognized word in the zone
        zone_tokens = [
            Token(text=wd_tag.get_text(strip=True))
            for wd_tag in zone_tag.find_all('wd')
        ]

        # from/till indices are inclusive, hence the +1
        row_extent = (int(zone_tag.get('gridrowtill')) -
                      int(zone_tag.get('gridrowfrom')) + 1)
        col_extent = (int(zone_tag.get('gridcoltill')) -
                      int(zone_tag.get('gridcolfrom')) + 1)

        cells.append(
            Cell(tokens=zone_tokens, rowspan=row_extent, colspan=col_extent))

    return Table.create_from_cells(cells=cells,
                                   nrow=nrow,
                                   ncol=ncol,
                                   paper_id=paper_id,
                                   page_num=0,
                                   caption=caption)
def test_aggregate_tables(self):
    """Aggregate ``table_source`` into a target schema with swapped columns.

    The mapping sends source column 1 to target column 2 and source column
    2 to target column 1, so each data row is expected to come out with
    its two value cells swapped.  The second row of ``target_schema``
    (``not_copied`` cells) must not appear in the aggregate.
    """

    def build_cells(rows):
        # Flatten row-major text rows into 1x1 single-token cells.
        return [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for row in rows
            for text in row
        ]

    schema_matcher = SchemaMatcher()

    target_schema = Table.create_from_cells(cells=build_cells([
        ['subject', 'header1', 'header2'],
        ['not_copied', 'not_copied', 'not_copied'],
    ]), nrow=2, ncol=3)

    pred_aggregate_table = schema_matcher.aggregate_tables(
        pairwise_mappings=[
            PairwiseMapping(self.table_source,
                            target_schema,
                            score=-999,
                            column_mappings=[(1, 2), (2, 1)])
        ],
        target_schema=target_schema)

    gold_aggregate_table = Table.create_from_cells(build_cells([
        ['subject', 'header1', 'header2'],
        ['x', '2', '1'],
        ['y', '4', '3'],
        ['z', '6', '5'],
    ]), nrow=4, ncol=3)

    # `assertEquals` is a deprecated alias (removed in Python 3.12); the
    # debug prints are dropped because a failing assertEqual already
    # reports both operands.
    self.assertEqual(pred_aggregate_table, gold_aggregate_table)
def test_compute_metrics(self):
    """``compute_metrics`` should reject predictions with a bad header row.

    Two bad predictions are tried against ``gold_table``: one with the
    header row missing and one with the two value headers swapped.  In
    between, cell-level recall of the header-only prediction is checked
    to be 0.0.
    """

    def table_from_rows(rows):
        # Flatten row-major text rows into 1x1 single-token cells.
        flat_cells = [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for row in rows
            for text in row
        ]
        return Table.create_from_cells(cells=flat_cells,
                                       nrow=len(rows),
                                       ncol=len(rows[0]))

    pred_table_missing_header = table_from_rows([
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
    ])
    with self.assertRaises(Exception):
        compute_metrics(gold_table=self.gold_table,
                        pred_table=pred_table_missing_header)

    recall_on_empty = cell_level_recall(gold_table=self.gold_table,
                                        pred_table=self.pred_table_empty)
    self.assertEqual(recall_on_empty, 0.0)

    pred_table_permuted_header = table_from_rows([
        ['subject', 'header2', 'header1'],
        ['x', '2', '1'],
        ['y', '4', '3'],
        ['z', '6', '5'],
    ])
    with self.assertRaises(Exception):
        compute_metrics(gold_table=self.gold_table,
                        pred_table=pred_table_permuted_header)
def _create_table_from_tetml(self, table_id: int, table_tag: Tag,
                             paper_id: str, caption: str) -> Table:
    """Convert one TETML table element into a `Table`.

    Every ``cell`` element of every ``row`` becomes a `Cell` with
    ``rowspan=1`` and the ``colspan`` attribute of the element (1 when the
    attribute is absent or empty).  Each ``word`` in a cell becomes a
    `Token` whose text, font and bounding box are read from the word's
    ``box`` child.

    Args:
        table_id: identifier used only in error messages.
        table_tag: element containing the ``row`` elements.
        paper_id: paper identifier passed through to the table.
        caption: caption passed through to the table.

    Returns:
        The table built by `Table.create_from_cells`, with ``page_num=0``.

    Raises:
        TetmlXMLToTablesParserException: if the table has no rows, or if
            the summed colspans differ between rows.
    """
    cells = []
    ncol_per_row = []
    for row_tag in table_tag.find_all('row'):
        ncol_in_row = 0
        for cell_tag in row_tag.find_all('cell'):
            # BUILD LIST OF TOKENS
            tokens = []
            for word_tag in cell_tag.find_all('word'):
                word_box_tag = word_tag.find('box')
                token = Token(
                    text=word_box_tag.get_text(strip=True),
                    # `find_all` gets font per character,
                    # but use `find` because assume font
                    # is constant within same word
                    font=word_box_tag.find('glyph').get('font'),
                    bounding_box=Box(llx=float(word_box_tag.get('llx')),
                                     lly=float(word_box_tag.get('lly')),
                                     urx=float(word_box_tag.get('urx')),
                                     ury=float(word_box_tag.get('ury'))))
                tokens.append(token)

            # BUILD CELL FROM LIST OF TOKENS
            colspan_attr = cell_tag.get('colspan')
            cell = Cell(tokens=tokens,
                        rowspan=1,
                        colspan=int(colspan_attr) if colspan_attr else 1)
            cells.append(cell)
            ncol_in_row += cell.colspan
        ncol_per_row.append(ncol_in_row)

    # TODO: add more filters here if necessary
    # A table without rows used to slip through the equality check below
    # and then fail with a bare IndexError on `ncol_per_row[0]`.
    if not ncol_per_row:
        raise TetmlXMLToTablesParserException(
            'Table {} has no rows. Skipping...'.format(table_id))

    if len(set(ncol_per_row)) > 1:
        raise TetmlXMLToTablesParserException(
            'Table {} has unequal columns per row. Skipping...'.format(
                table_id))

    # TODO: `page_num` and `paper_id` fields
    # BUILD TABLE FROM LIST OF CELLS
    table = Table.create_from_cells(cells=cells,
                                    nrow=len(ncol_per_row),
                                    ncol=ncol_per_row[0],
                                    paper_id=paper_id,
                                    page_num=0,
                                    caption=caption)
    return table
def setUp(self):
    """Create ``table_source``: a header row plus data rows x, y and z."""
    rows = [
        ['subject', 'header1', 'header2'],
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
    ]
    source_cells = [
        Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
        for row in rows
        for text in row
    ]
    self.table_source = Table.create_from_cells(source_cells,
                                                nrow=4,
                                                ncol=3)
def test_compute_bounding_box(self):
    """The table's bounding box should span the boxes of both of its cells."""
    left_cell = Cell(tokens=[Token(text='e')],
                     rowspan=1,
                     colspan=1,
                     bounding_box=Box(llx=-1.0, lly=-0.5, urx=1.0, ury=1.0))
    right_cell = Cell(tokens=[Token(text='e')],
                      rowspan=1,
                      colspan=1,
                      bounding_box=Box(llx=1.5, lly=-0.5, urx=2.5, ury=1.5))
    table = Table.create_from_cells(cells=[left_cell, right_cell],
                                    nrow=1,
                                    ncol=2,
                                    paper_id='abc',
                                    page_num=0,
                                    caption='hi this is caption')

    box = table.bounding_box

    # lower-left corner: smallest llx / lly over the two cells
    self.assertEqual(box.ll.x, -1.0)
    self.assertEqual(box.ll.y, -0.5)
    # upper-right corner: largest urx / ury over the two cells
    self.assertEqual(box.ur.x, 2.5)
    self.assertEqual(box.ur.y, 1.5)
def aggregate_tables(self, pairwise_mappings: List[PairwiseMapping],
                     target_schema: Table) -> Table:
    """Stack the data rows of all mapped source tables under the target header.

    The result starts with row 0 of ``target_schema``.  For every mapping
    (iterated in ``sorted`` order) each row of ``table1`` except row 0 is
    appended: column 0 is copied as-is, and every ``(source_col,
    target_col)`` pair in ``column_mappings`` copies one more cell.  Target
    columns that no pair refers to stay ``None``.

    Args:
        pairwise_mappings: mappings whose ``table1`` is the source table.
        target_schema: table whose first row supplies the header.

    Returns:
        A table with one header row plus one row per source data row.
    """
    # The fill loop below writes max(nrow - 1, 0) rows per source, so size
    # the grid the same way; a zero-row source must not shrink it.
    num_rows_agg_table = sum(
        max(pairwise_mapping.table1.nrow - 1, 0)
        for pairwise_mapping in pairwise_mappings)

    # `np.full` keeps the grid 2-D with object dtype even when there are no
    # data rows; `np.array([])` would collapse to a 1-D float array.
    empty_grid = np.full((num_rows_agg_table, target_schema.ncol),
                         None,
                         dtype=object)
    aggregate_table = Table.create_from_grid(grid=empty_grid)
    aggregate_table = aggregate_table.insert_row(index=0,
                                                 row=target_schema[0, :])

    # row 0 now holds the header, so data starts at row 1
    index_agg_table_insert = 1

    # TODO: `table1` is always the table that needs to be aggregated to `table2`=target
    for pairwise_mapping in sorted(pairwise_mappings):
        source_table = pairwise_mapping.table1
        for idx_source_row in range(1, source_table.nrow):
            # copy subject for this row
            aggregate_table.grid[index_agg_table_insert, 0] = \
                source_table[idx_source_row, 0]

            # fill cells with source table values according to column mappings
            for index_source_col, index_target_col in \
                    pairwise_mapping.column_mappings:
                aggregate_table.grid[index_agg_table_insert,
                                     index_target_col] = \
                    source_table[idx_source_row, index_source_col]

            index_agg_table_insert += 1

    return aggregate_table
def test_map_tables(self):
    """``ColNameSchemaMatcher.map_tables`` should pair columns by header text.

    Each case compares the returned list against the expected
    `PairwiseMapping` objects (source table, target schema, score and
    ``(source_col, target_col)`` pairs).
    """

    def header_only_table(headers):
        # A single-row schema table with one 1x1 cell per header text.
        return Table.create_from_cells(cells=[
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for text in headers
        ], nrow=1, ncol=len(headers))

    target_schema_easy = header_only_table(
        ['subject', 'header1', 'header2'])
    target_schema_less = header_only_table(['subject', 'header2'])
    target_schema_more = header_only_table(
        ['subject', 'header0', 'header1', 'header2'])
    target_schema_permuted = header_only_table(
        ['subject', 'header2', 'header1'])

    schema_matcher = ColNameSchemaMatcher()

    # same header order in source and target
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_easy)
    self.assertListEqual(actual, [
        PairwiseMapping(self.table_source,
                        target_schema_easy,
                        score=2.0,
                        column_mappings=[(1, 1), (2, 2)])
    ])

    # target has the two value headers swapped
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_permuted)
    self.assertListEqual(actual, [
        PairwiseMapping(self.table_source,
                        target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 1)])
    ])

    # target has an extra header the source lacks
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_more)
    self.assertListEqual(actual, [
        PairwiseMapping(self.table_source,
                        target_schema_more,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 3)])
    ])

    # target lacks one of the source headers
    actual = schema_matcher.map_tables(tables=[self.table_source],
                                       target_schema=target_schema_less)
    self.assertListEqual(actual, [
        PairwiseMapping(self.table_source,
                        target_schema_less,
                        score=1.0,
                        column_mappings=[(2, 1)])
    ])

    # several source tables against one target
    actual = schema_matcher.map_tables(tables=[
        self.table_source, self.table_less_header, self.table_more_header
    ], target_schema=target_schema_permuted)
    self.assertListEqual(actual, [
        PairwiseMapping(self.table_source,
                        target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 2), (2, 1)]),
        PairwiseMapping(self.table_less_header,
                        target_schema_permuted,
                        score=1.0,
                        column_mappings=[(1, 1)]),
        PairwiseMapping(self.table_more_header,
                        target_schema_permuted,
                        score=2.0,
                        column_mappings=[(1, 1), (2, 2)]),
    ])
def setUp(self):
    """Build the gold table and the predicted-table variants.

    gold:
        subject, header1, header2
        x, 1, 2
        y, 3, 4
        z, 5, 6

    The predicted tables share the gold header and differ only in their
    data rows: none, permuted, one extra, one missing, or partly wrong.
    """

    def table_from_rows(rows):
        # Flatten row-major text rows into 1x1 single-token cells.
        flat_cells = [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for row in rows
            for text in row
        ]
        return Table.create_from_cells(cells=flat_cells,
                                       nrow=len(rows),
                                       ncol=len(rows[0]))

    header = ['subject', 'header1', 'header2']

    self.gold_table = table_from_rows([
        header,
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
    ])
    self.gold_table_empty = table_from_rows([header])

    # the perfect prediction is the gold table object itself
    self.pred_table_perfect = self.gold_table

    self.pred_table_empty = table_from_rows([header])

    self.pred_table_permute_rows = table_from_rows([
        header,
        ['z', '5', '6'],
        ['y', '3', '4'],
        ['x', '1', '2'],
    ])

    self.pred_table_extra_rows = table_from_rows([
        header,
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
        ['w', '7', '8'],
    ])

    self.pred_table_missing_rows = table_from_rows([
        header,
        ['x', '1', '2'],
        ['y', '3', '4'],
    ])

    self.pred_table_partial_credit = table_from_rows([
        header,
        ['x', '1', '1'],
        ['y', '4', '4'],
        ['z', '3', '3'],
    ])
def setUp(self):
    """Build source tables whose header rows vary.

    Variants: the reference header, one header fewer, one header more,
    permuted headers, no header row at all, and a header row only.
    """

    def cells_for(rows):
        # Flatten row-major text rows into 1x1 single-token cells.
        return [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for row in rows
            for text in row
        ]

    self.table_source = Table.create_from_cells(cells_for([
        ['subject', 'header1', 'header2'],
        ['x', '1', '2'],
        ['y', '3', '4'],
        ['z', '5', '6'],
    ]), nrow=4, ncol=3)

    self.table_less_header = Table.create_from_cells(cells_for([
        ['subject', 'header2'],
        ['x', '1'],
        ['z', '5'],
        ['y', '4'],
    ]), nrow=4, ncol=2)

    self.table_more_header = Table.create_from_cells(cells_for([
        ['subject', 'header2', 'header1', 'header3'],
        ['x', '1', '1', '1'],
        ['z', '5', '5', '5'],
        ['y', '4', '4', '4'],
    ]), nrow=4, ncol=4)

    self.table_permute_header = Table.create_from_cells(cells_for([
        ['subject', 'header2', 'header1'],
        ['x', '1', '2'],
        ['z', '5', '6'],
        ['y', '3', '4'],
    ]), nrow=4, ncol=3)

    self.table_no_header = Table.create_from_cells(cells_for([
        ['x', '1', '2'],
        ['z', '5', '6'],
        ['y', '3', '4'],
    ]), nrow=3, ncol=3)

    self.table_only_header = Table.create_from_cells(cells=cells_for([
        ['subject', 'header1', 'header2'],
    ]), nrow=1, ncol=3)
def test_improper_table(self):
    """``Table.create_from_cells`` should raise when cells do not fit the grid."""

    def unit_cells(texts):
        # One 1x1 single-token cell per character of `texts`.
        return [
            Cell(tokens=[Token(text=text)], rowspan=1, colspan=1)
            for text in texts
        ]

    mismatched_specs = [
        # misspecified nrow or ncol: four cells, two slots
        ('abcd', 2, 1),
        ('abcd', 1, 2),
        # not enough cells to fill out table
        ('abc', 2, 2),
        ('ab', 2, 2),
    ]
    for texts, nrow, ncol in mismatched_specs:
        with self.assertRaises(Exception):
            Table.create_from_cells(cells=unit_cells(texts),
                                    nrow=nrow,
                                    ncol=ncol,
                                    paper_id='',
                                    page_num=0,
                                    caption='')

    # cell juts out of table boundaries
    with self.assertRaises(Exception):
        too_wide = Cell(tokens=[Token(text='a')], rowspan=1, colspan=2)
        Table.create_from_cells(cells=[too_wide],
                                nrow=1,
                                ncol=1,
                                paper_id='',
                                page_num=0,
                                caption='')
def test_create_from_grid(self):
    """A table built from the same 2x3 grid should equal ``easy_table``."""
    rebuilt = Table.create_from_grid(grid=[[self.a, self.b, self.c],
                                           [self.d, self.e, self.f]])
    self.assertEqual(rebuilt, self.easy_table)
class TestTable(unittest.TestCase):
    """Tests for `Table` construction, indexing, printing and editing."""

    @staticmethod
    def _cell(text, rowspan=1, colspan=1):
        # One single-token cell; spans default to 1x1.
        return Cell(tokens=[Token(text=text)],
                    rowspan=rowspan,
                    colspan=colspan)

    def setUp(self):
        """Create a plain 2x3 table and a 5x4 table with multi-span cells."""
        (self.a, self.b, self.c,
         self.d, self.e, self.f) = (self._cell(text) for text in 'abcdef')

        self.easy_table = Table(caption='hi this is caption')
        self.easy_table.grid = np.array([[self.a, self.b, self.c],
                                         [self.d, self.e, self.f]])

        # (text, rowspan, colspan) in the order handed to create_from_cells
        hard_specs = [
            ('', 2, 2), ('C', 1, 2),
            ('C:1', 1, 1), ('C:2', 1, 1),
            ('R', 3, 1), ('R:1', 1, 1), ('a', 1, 1), ('b', 1, 1),
            ('R:2', 1, 1), ('c', 1, 1), ('d', 1, 1),
            ('R:3', 1, 1), ('e', 1, 1), ('f', 1, 1),
        ]
        self.hard_table = Table.create_from_cells(
            cells=[self._cell(*spec) for spec in hard_specs],
            nrow=5,
            ncol=4,
            paper_id='abc',
            page_num=0,
            caption='hi this is caption')

    def test_create_from_grid(self):
        """A table built from the same grid should equal ``easy_table``."""
        rebuilt = Table.create_from_grid(grid=[[self.a, self.b, self.c],
                                               [self.d, self.e, self.f]])
        self.assertEqual(rebuilt, self.easy_table)

    # TODO
    def test_create_from_cells(self):
        pass

    def test_improper_table(self):
        """``create_from_cells`` should raise when cells do not fit the grid."""
        mismatched_specs = [
            # misspecified nrow or ncol
            ('abcd', 2, 1),
            ('abcd', 1, 2),
            # not enough cells to fill out table
            ('abc', 2, 2),
            ('ab', 2, 2),
        ]
        for texts, nrow, ncol in mismatched_specs:
            with self.assertRaises(Exception):
                Table.create_from_cells(
                    cells=[self._cell(text) for text in texts],
                    nrow=nrow,
                    ncol=ncol,
                    paper_id='',
                    page_num=0,
                    caption='')

        # cell juts out of table boundaries
        with self.assertRaises(Exception):
            Table.create_from_cells(cells=[self._cell('a', colspan=2)],
                                    nrow=1,
                                    ncol=1,
                                    paper_id='',
                                    page_num=0,
                                    caption='')

    def test_shape_properties(self):
        """``nrow``, ``ncol`` and ``dim`` should report the grid shape."""
        easy, hard = self.easy_table, self.hard_table

        self.assertEqual(easy.nrow, 2)
        self.assertEqual(easy.ncol, 3)
        self.assertEqual(easy.dim, (2, 3))

        self.assertEqual(hard.nrow, 5)
        self.assertEqual(hard.ncol, 4)
        self.assertEqual(hard.dim, (5, 4))

    def test_grid_indexing(self):
        """Exercise scalar, row, column and sub-grid indexing."""
        table = self.easy_table
        a, b, c = self.a, self.b, self.c
        d, e, f = self.d, self.e, self.f

        # single elements
        self.assertEqual(table[0, 0], a)
        self.assertEqual(table[-1, -1], f)

        # full row
        self.assertListEqual(table[0, :], [a, b, c])
        self.assertListEqual(table[1, :], [d, e, f])

        # partial row
        self.assertListEqual(table[0, 1:], [b, c])
        self.assertListEqual(table[0, :2], [a, b])
        self.assertListEqual(table[0, 1:2], [b])

        # full column
        self.assertListEqual(table[:, 0], [a, d])

        # partial column
        self.assertListEqual(table[1:, 0], [d])
        self.assertListEqual(table[:1, 0], [a])
        self.assertListEqual(table[1:2, 0], [d])

        # full subgrid
        self.assertEqual(table, table[:, :])

        # partial subgrid
        self.assertEqual(table[1:2, 1:2],
                         Table.create_from_grid(grid=[[e]]))
        self.assertEqual(table[1:, 1:],
                         Table.create_from_grid(grid=[[e, f]]))
        self.assertEqual(table[:2, :2],
                         Table.create_from_grid(grid=[[a, b], [d, e]]))

    def test_str(self):
        """``str(table)`` should be tab/newline-separated cells, then the caption."""
        easy_text = str(self.easy_table)
        self.assertEqual(easy_text,
                         'a\tb\tc\nd\te\tf' + '\n' + 'hi this is caption')

        # spanning cells repeat their text in every grid slot they cover;
        # spaces are stripped from the actual string before comparing
        expected_grid = '\t\tC\tC\n\t\tC:1\tC:2\nR\tR:1\ta\tb\nR\tR:2\tc\td\nR\tR:3\te\tf'
        expected_caption = 'hithisiscaption'
        hard_text = str(self.hard_table).replace(' ', '')
        self.assertEqual(hard_text, expected_grid + '\n' + expected_caption)

    def test_insert_row(self):
        """``insert_row`` should splice a row in and reject a wrong width."""
        x, y, z = (self._cell(text) for text in 'xyz')
        first_row = [self.a, self.b, self.c]
        second_row = [self.d, self.e, self.f]

        inserted_on_top = self.easy_table.insert_row(index=0, row=[x, y, z])
        self.assertEqual(
            inserted_on_top,
            Table.create_from_grid(grid=[[x, y, z], first_row, second_row]))

        inserted_in_middle = self.easy_table.insert_row(index=1,
                                                        row=[x, y, z])
        self.assertEqual(
            inserted_in_middle,
            Table.create_from_grid(grid=[first_row, [x, y, z], second_row]))

        # a two-cell row does not fit the three-column table
        with self.assertRaises(Exception):
            self.easy_table.insert_row(index=1, row=[x, y])

    def test_insert_column(self):
        """``insert_column`` should splice a column in and reject a wrong height."""
        x, y = (self._cell(text) for text in 'xy')

        widened = self.easy_table.insert_column(index=1, column=[x, y])
        self.assertEqual(
            widened,
            Table.create_from_grid(grid=[[self.a, x, self.b, self.c],
                                         [self.d, y, self.e, self.f]]))

        # a three-cell column does not fit the two-row table
        with self.assertRaises(Exception):
            self.easy_table.insert_column(index=1, column=[x, y, y])

    def test_delete_row(self):
        """Deleting row 1 should leave only the first row."""
        remaining = self.easy_table.delete_row(index=1)
        self.assertEqual(
            remaining,
            Table.create_from_grid(grid=[[self.a, self.b, self.c]]))

    def test_delete_column(self):
        """Deleting column 1 should leave the outer two columns."""
        remaining = self.easy_table.delete_column(index=1)
        self.assertEqual(
            remaining,
            Table.create_from_grid(grid=[[self.a, self.c],
                                         [self.d, self.f]]))

    def test_append_left(self):
        """``append_left`` should put the other table's columns first."""
        other = Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                             [self.c, self.e, self.a]])
        joined = self.easy_table.append_left(other=other)
        self.assertEqual(
            joined,
            Table.create_from_grid(
                grid=[[self.f, self.b, self.d, self.a, self.b, self.c],
                      [self.c, self.e, self.a, self.d, self.e, self.f]]))

    def test_append_right(self):
        """``append_right`` should put the other table's columns last."""
        other = Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                             [self.c, self.e, self.a]])
        joined = self.easy_table.append_right(other=other)
        self.assertEqual(
            joined,
            Table.create_from_grid(
                grid=[[self.a, self.b, self.c, self.f, self.b, self.d],
                      [self.d, self.e, self.f, self.c, self.e, self.a]]))

    def test_append_top(self):
        """``append_top`` should put the other table's rows first."""
        other = Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                             [self.c, self.e, self.a]])
        stacked = self.easy_table.append_top(other=other)
        self.assertEqual(
            stacked,
            Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                         [self.c, self.e, self.a],
                                         [self.a, self.b, self.c],
                                         [self.d, self.e, self.f]]))

    def test_append_bottom(self):
        """``append_bottom`` should put the other table's rows last."""
        other = Table.create_from_grid(grid=[[self.f, self.b, self.d],
                                             [self.c, self.e, self.a]])
        stacked = self.easy_table.append_bottom(other=other)
        self.assertEqual(
            stacked,
            Table.create_from_grid(grid=[[self.a, self.b, self.c],
                                         [self.d, self.e, self.f],
                                         [self.f, self.b, self.d],
                                         [self.c, self.e, self.a]]))

    def test_compute_bounding_box(self):
        """The table's bounding box should span the boxes of both cells."""
        left_cell = Cell(tokens=[Token(text='e')],
                         rowspan=1,
                         colspan=1,
                         bounding_box=Box(llx=-1.0,
                                          lly=-0.5,
                                          urx=1.0,
                                          ury=1.0))
        right_cell = Cell(tokens=[Token(text='e')],
                          rowspan=1,
                          colspan=1,
                          bounding_box=Box(llx=1.5,
                                           lly=-0.5,
                                           urx=2.5,
                                           ury=1.5))
        table = Table.create_from_cells(cells=[left_cell, right_cell],
                                        nrow=1,
                                        ncol=2,
                                        paper_id='abc',
                                        page_num=0,
                                        caption='hi this is caption')

        box = table.bounding_box
        self.assertEqual(box.ll.x, -1.0)
        self.assertEqual(box.ll.y, -0.5)
        self.assertEqual(box.ur.x, 2.5)
        self.assertEqual(box.ur.y, 1.5)

    # TODO: implement this later
    def test_eq(self):
        pass
def __init__(self, table: Table):
    """Classify, optionally transpose, then normalize ``table``.

    ``cell_classes`` is computed from the table exactly as passed in,
    before any transposition.  ``table`` ends up holding the normalized
    (and possibly transposed) table.
    """
    # Classification runs first and is stored before the transpose check.
    self.cell_classes = self._classify_cells(table)

    oriented = table.transpose() if self._should_transpose(table) else table
    self.table = self._normalize(oriented)
def test_delete_column(self):
    """Deleting column 1 should leave the first and last columns."""
    remaining = self.easy_table.delete_column(index=1)
    expected = Table.create_from_grid(grid=[[self.a, self.c],
                                            [self.d, self.f]])
    self.assertEqual(remaining, expected)
def test_delete_row(self):
    """Deleting row 1 should leave only the first row."""
    remaining = self.easy_table.delete_row(index=1)
    expected = Table.create_from_grid(grid=[[self.a, self.b, self.c]])
    self.assertEqual(remaining, expected)