def test_create_from_cells(self):
    """Tables built from explicit cell lists reproduce the fixture grids."""
    # a table made of the single fixture cell `a`
    single = Table(cells=[self.a], nrow=2, ncol=2)
    assert_array_equal(single.grid, self.single_cell_table.grid)

    # the complete fixture table, cells listed in the order used by the
    # original test
    ordered_cells = [
        self.a, self.b, self.c, self.d, self.e, self.f, self.i,
        self.j, self.g, self.k, self.l, self.h, self.m, self.n,
    ]
    full = Table(cells=ordered_cells, nrow=5, ncol=4)
    assert_array_equal(full.grid, self.full_table.grid)
def test_empty_table(self):
    """Each way of requesting an empty table must fail an assertion."""
    empty_constructions = [
        dict(cells=[], nrow=0, ncol=0),
        dict(),
        dict(grid=[[]]),
        dict(grid=[]),
    ]
    for kwargs in empty_constructions:
        with self.assertRaises(AssertionError):
            Table(**kwargs)
def test_create_from_grid(self):
    """Tables built from a grid expose the expected list of cells."""
    # grid given as a numpy array, every position holding the same cell
    single_grid = np.array([[self.a, self.a], [self.a, self.a]])
    self.assertListEqual(Table(grid=single_grid).cells, [self.a])

    # grid given as nested lists
    full_grid = [
        [self.a, self.a, self.b, self.b],
        [self.a, self.a, self.c, self.d],
        [self.e, self.f, self.i, self.j],
        [self.e, self.g, self.k, self.l],
        [self.e, self.h, self.m, self.n],
    ]
    full_cells = Table(grid=full_grid).cells
    expected_cells = [
        self.a, self.b, self.c, self.d, self.e, self.f, self.i,
        self.j, self.g, self.k, self.l, self.h, self.m, self.n,
    ]
    self.assertListEqual(full_cells, expected_cells)
def _merge_label_cells(self, table: Table, index_topmost_value_row: int,
                       index_leftmost_value_col: int) -> Table:
    """
    Collapse the label columns left of `index_leftmost_value_col` into a
    single column, then the label rows above `index_topmost_value_row`
    into a single row, and return the resulting table.

    For each collapse, the tokens of the merged cells are concatenated
    (with `+`) into the cell that survives (column 0, respectively row 0),
    the remaining collapsed columns/rows are deleted from the grid, and
    the row/col indices of the cells beyond them are decremented.

    A collapse only happens when the corresponding index is greater
    than 1; otherwise that axis is left untouched.

    NOTE(review): `new_table` starts as an alias of `table`, so the index
    and token updates below are written to the cells of the table that was
    passed in (assuming `table[i, j]` hands back the stored cell rather
    than a copy -- confirm against `Table.__getitem__`).

    Some thoughts:
    Typically, the top-left quadrant cells are a super-label to describe
    the label columns in the bottom-left quadrant.  Hence, when merging
    'LABEL' cells, we'll prioritize merging rows over merging columns.
    """
    new_table = table

    if index_leftmost_value_col > 1:
        # decrement col indices of all cells to the right of collapsed cols
        # (the first value column ends up at col index 1)
        for i in range(new_table.nrow):
            for j in range(index_leftmost_value_col, new_table.ncol):
                new_table[i, j].index_topleft_col -= \
                    (index_leftmost_value_col - 1)

        # overwrite leftmost col cell tokens with the concatenation of the
        # tokens of every label column in that row
        for i in range(new_table.nrow):
            new_table[i, 0].tokens = reduce(lambda l1, l2: l1 + l2, [
                c.tokens for c in new_table[i, :index_leftmost_value_col]
            ])

        # delete collapsed cols (except for leftmost)
        new_grid = np.delete(new_table.grid,
                             list(range(1, index_leftmost_value_col)),
                             axis=1)
        new_table = Table(grid=new_grid.tolist())

    if index_topmost_value_row > 1:
        # decrement row indices of all cells below the collapsed rows
        # (the first value row ends up at row index 1)
        for i in range(index_topmost_value_row, new_table.nrow):
            for j in range(new_table.ncol):
                new_table[i, j].index_topleft_row -= \
                    (index_topmost_value_row - 1)

        # overwrite topmost row cell tokens with the concatenation of the
        # tokens of every label row in that column
        for j in range(new_table.ncol):
            new_table[0, j].tokens = reduce(
                lambda l1, l2: l1 + l2,
                [c.tokens for c in new_table[:index_topmost_value_row, j]])

        # delete collapsed rows (except for topmost)
        new_grid = np.delete(new_table.grid,
                             list(range(1, index_topmost_value_row)),
                             axis=0)
        new_table = Table(grid=new_grid.tolist())

    return new_table
def insert_row(self, index: int, row: List[Cell]):
    """Insert `row` into `self.normalized_table` so it becomes row `index`.

    Args:
        index: Position the new row occupies in the resulting grid.
        row: The cells of the new row; its length must equal `self.ncol`.

    The cells of the inserted row and of every row below it get their
    `index_topleft_row` set to their position in the new grid, and
    `self.normalized_table` is rebuilt from that grid.

    Previously the loop added 1 to every cell from position `index`
    downwards, which also shifted the freshly inserted row one position
    too far.  Assigning the grid position gives the same result for the
    shifted rows and a correct index for the new row, whatever row index
    the caller put on the new cells.

    NOTE(review): assumes every cell of the normalized table occupies a
    single grid position (rowspan 1) -- confirm.
    """
    assert len(row) == self.ncol
    new_grid = np.insert(arr=self.normalized_table.grid,
                         obj=index,
                         values=row,
                         axis=0)
    # positions index..nrow of the new grid hold the inserted row followed
    # by the rows that moved down by one
    for i in range(index, self.normalized_table.nrow + 1):
        for j in range(self.normalized_table.ncol):
            new_grid[i, j].index_topleft_row = i
    self.normalized_table = Table(grid=new_grid.tolist())
def insert_column(self, index: int, column: List[Cell]):
    """Insert `column` into `self.normalized_table` as column `index`.

    Args:
        index: Position the new column occupies in the resulting grid.
        column: The cells of the new column; its length must equal
            `self.nrow`.

    The cells of the inserted column and of every column to its right get
    their `index_topleft_col` set to their position in the new grid, and
    `self.normalized_table` is rebuilt from that grid.

    Previously the loop added 1 to every cell from position `index`
    rightwards, which also shifted the freshly inserted column one
    position too far.  Assigning the grid position gives the same result
    for the shifted columns and a correct index for the new column,
    whatever column index the caller put on the new cells.

    NOTE(review): assumes every cell of the normalized table occupies a
    single grid position (colspan 1) -- confirm.
    """
    assert len(column) == self.nrow
    new_grid = np.insert(arr=self.normalized_table.grid,
                         obj=index,
                         values=column,
                         axis=1)
    # positions index..ncol of the new grid hold the inserted column
    # followed by the columns that moved right by one
    for i in range(self.normalized_table.nrow):
        for j in range(index, self.normalized_table.ncol + 1):
            new_grid[i, j].index_topleft_col = j
    self.normalized_table = Table(grid=new_grid.tolist())
def normalize_table(self, table: Table) -> Table:
    """Rebuild `table` out of single-span cells.

    For every cell of `table` and every (row, col) pair listed in that
    cell's `indices`, a new 1x1 cell is created at that position.  The new
    cells reference the same `tokens` object as the cell they came from.
    The result has the same `nrow` and `ncol` as the input.
    """
    unit_cells = [
        Cell(tokens=source_cell.tokens,
             index_topleft_row=row,
             index_topleft_col=col,
             rowspan=1,
             colspan=1)
        for source_cell in table.cells
        for row, col in source_cell.indices
    ]
    return Table(cells=unit_cells, nrow=table.nrow, ncol=table.ncol)
def _standardize_cell_sizes(self, table: Table) -> Table:
    """Replace each cell of `table` by one 1x1 cell per entry of its
    `indices`; the new cells share the `tokens` of the cell they replace.
    """
    expanded = []
    for multispan_cell in table.cells:
        expanded.extend(
            Cell(tokens=multispan_cell.tokens,
                 index_topleft_row=row,
                 index_topleft_col=col,
                 rowspan=1,
                 colspan=1)
            for row, col in multispan_cell.indices)
    return Table(cells=expanded, nrow=table.nrow, ncol=table.ncol)
def _add_empty_header(self, table: Table) -> Table:
    """Return a table equal to `table` with a row of empty-token cells
    inserted at the top.

    The cells of the input `table` are modified in place: each one has its
    `index_topleft_row` increased by one to make room for the new row.
    """
    # move every existing cell one row down
    for existing in table.cells:
        existing.index_topleft_row += 1

    shifted_grid = table.grid
    blank_row = [
        Cell(tokens=[],
             index_topleft_row=0,
             index_topleft_col=col,
             rowspan=1,
             colspan=1) for col in range(table.ncol)
    ]
    return Table(grid=np.insert(shifted_grid, 0, values=blank_row, axis=0))
def _add_empty_subject(self, table: Table) -> Table:
    """Return a table equal to `table` with a column of empty-token cells
    inserted on the left.

    The cells of the input `table` are modified in place: each one has its
    `index_topleft_col` increased by one to make room for the new column.
    """
    # move every existing cell one column to the right
    for existing in table.cells:
        existing.index_topleft_col += 1

    shifted_grid = table.grid
    blank_col = [
        Cell(tokens=[],
             index_topleft_row=row,
             index_topleft_col=0,
             rowspan=1,
             colspan=1) for row in range(table.nrow)
    ]
    return Table(grid=np.insert(shifted_grid, 0, values=blank_col, axis=1))
def predict(self, tables: List[Table], target_schema: List[str]) -> Table:
    """Fold `tables` one after another into a table whose single initial
    row holds the names in `target_schema`.

    Each table is aligned to the running result by column name and then
    merged into it; the alignment score is not used.
    """
    header_cells = [
        Cell(tokens=[name],
             index_topleft_row=0,
             index_topleft_col=col,
             rowspan=1,
             colspan=1) for col, name in enumerate(target_schema)
    ]
    merged = Table(cells=header_cells, nrow=1, ncol=len(target_schema))

    # match each table to the schema (order doesnt matter)
    for candidate in tables:
        _score, alignments = \
            self.compute_column_alignments_by_column_names(merged, candidate)
        merged = self.merge_two_tables(target=merged,
                                       source=candidate,
                                       column_alignments=alignments)
    return merged
def merge_two_tables(self,
                     target: Table,
                     source: Table,
                     column_alignments: List[Tuple[int, int]],
                     pad: str = 'NONE') -> Table:
    """Append the non-header rows of `source` below `target`.

    `column_alignments` is a list of (target column, source column) index
    pairs.  For each target column, the aligned source column supplies the
    new values; target columns without an alignment are filled with `pad`.
    All cells of the result are built from the `str()` of the input cells.
    """
    n_new_rows = source.nrow - 1

    target_text = np.array([[str(cell) for cell in row]
                            for row in target.grid],
                           dtype=object)
    # first row of source is its header and is left out
    source_text = np.array([[str(cell) for cell in row]
                            for row in source.grid[1:]],
                           dtype=object)

    aligned_target_cols = [t_col for t_col, _ in column_alignments]
    aligned_source_cols = [s_col for _, s_col in column_alignments]

    # zero-width block first, so the result is well-shaped even when the
    # target has no columns
    pieces = [np.array([], dtype=object).reshape(n_new_rows, 0)]
    for t_col in range(target.ncol):
        if t_col in aligned_target_cols:
            # first alignment listed for this target column wins
            s_col = aligned_source_cols[aligned_target_cols.index(t_col)]
            piece = source_text[:, s_col].reshape(n_new_rows, 1)
        else:
            piece = np.array([[pad]] * n_new_rows, dtype=object)
        pieces.append(piece)
    appended_rows = np.concatenate(pieces, axis=1)

    # permuted source rows (excluding header) go below the target rows
    stacked = np.concatenate((target_text, appended_rows), axis=0)

    return Table(grid=[[Cell([text], i, j) for j, text in enumerate(row)]
                       for i, row in enumerate(stacked)])
def predict_oracle(source_tables: List[Table], gold_table: Table) -> Table:
    """Greedily assemble a table from rows of `source_tables` that match
    rows of `gold_table`.

    The prediction starts with the first row of `gold_table`.  While gold
    rows and sources remain, the source whose columns align best with the
    remaining gold rows is removed from the pool, its columns are permuted
    according to that alignment, its rows are aligned to the gold rows, and
    the matched source rows are appended to the prediction while the
    matched gold rows are dropped.  The loop stops early when a row
    alignment scores 0.

    All comparisons are made on the `str()` of the cells.
    """
    # convert tables into numpy arrays for easier management
    # - strip header row & subject col
    # - pad sources w/ Nones s.t. they have at least as many columns as gold
    gold = np.array([[str(cell) for cell in row]
                     for row in gold_table.grid[1:, 1:]],
                    dtype=object)

    sources = []
    for source_table in source_tables:
        s = {
            'subject':
            np.array([str(cell) for cell in source_table.grid[1:, 0]],
                     dtype=object),
            'source':
            np.array([[str(cell) for cell in row]
                      for row in source_table.grid[1:, 1:]],
                     dtype=object)
        }
        n_pad_cols = gold.shape[1] - s['source'].shape[1]
        if n_pad_cols > 0:
            padding = np.empty(shape=[s['source'].shape[0], n_pad_cols],
                               dtype=object)
            s['source'] = np.append(s['source'], padding, axis=1)
        sources.append(s)

    # initialize predicted output with the first (header) row of gold,
    # including its first column
    pred = np.array([[str(cell) for cell in gold_table.grid[0, :]]],
                    dtype=object)

    # continue until every gold row is matched and/or run out of sources
    while gold.shape[0] > 0 and len(sources) > 0:
        #
        # (1) which source table has most similar columns to gold?
        #
        scores = []
        all_column_mappings = []
        for s in sources:
            # represent each column j as a list [ cell_1j, cell_2j, ... ]
            # gold & source can have differing-length columns
            gold_cols = [list(col) for col in zip(*gold)]
            source_cols = [list(col) for col in zip(*s['source'])]

            # align columns between gold & source; similarity of two
            # columns is the size of what `compute_intersection` returns
            score, column_mappings = compute_best_alignments(
                x=gold_cols,
                y=source_cols,
                sim=lambda gold_col, source_col: len(
                    compute_intersection(x=gold_col, y=source_col)))
            scores.append(score)
            all_column_mappings.append(column_mappings)

        # pick best match among sources & permute its cols to match gold
        # also, pop this source from the list of sources
        index_best_score = np.argmax(scores)
        best_column_mappings = all_column_mappings[index_best_score]
        s = sources.pop(index_best_score)
        permute_source_cols = [
            source_col for gold_col, source_col in best_column_mappings
        ]
        source = s['source'][:, permute_source_cols]
        subject = s['subject']

        #
        # (2) which rows of (col-permuted) source table match best to gold rows?
        #
        # represent each row i as a tuple = ( cell_i1, cell_i2, ..., cell_ik )
        # where k = ncol(gold)
        gold_rows = [tuple(cell for cell in row) for row in gold]
        source_rows = [tuple(cell for cell in row) for row in source]

        # align rows between gold & source; similarity of two rows is the
        # number of positions holding equal values
        # if score is 0, then break because no more matching is possible
        score, row_mappings = compute_best_alignments_with_threshold(
            x=gold_rows,
            y=source_rows,
            sim=lambda gold_row, source_row: sum(
                [g_i == s_i for g_i, s_i in zip(gold_row, source_row)]),
            threshold=0)
        if score == 0:
            break

        index_gold_rows = []
        index_source_rows = []
        for index_gold_row, index_source_row in row_mappings:
            index_gold_rows.append(index_gold_row)
            index_source_rows.append(index_source_row)

        #
        # (3) append matched source rows to pred
        #
        # NOTE(review): the prefix is the position of the source in the
        # current, already shortened `sources` list (taken before the pop),
        # not its position in `source_tables` -- confirm this is intended.
        new_rows = [[str(index_best_score) + '__' + subject[i]] +
                    list(source_rows[i]) for i in index_source_rows]
        pred = np.append(pred, new_rows, axis=0)

        #
        # (4) remove gold rows that matched
        #
        gold = np.delete(gold, index_gold_rows, axis=0)

    # NOTE(review): the two trailing 0 arguments to `Cell` are presumably
    # rowspan/colspan; other call sites use 1 or omit them -- verify.
    return Table(
        grid=[[Cell([cell], i, j, 0, 0) for j, cell in enumerate(row)]
              for i, row in enumerate(pred)])
gold = np.delete(gold, index_gold_rows, axis=0) return Table( grid=[[Cell([cell], i, j, 0, 0) for j, cell in enumerate(row)] for i, row in enumerate(pred)]) if __name__ == '__main__': source_table1 = Table(cells=[ Cell([''], 0, 0), Cell(['x'], 0, 1), Cell(['y'], 0, 2), Cell(['z'], 0, 3), Cell(['s:m1'], 1, 0), Cell(['a'], 1, 1), Cell(['?'], 1, 2), Cell(['2'], 1, 3), Cell(['s:m2'], 2, 0), Cell(['b'], 2, 1), Cell(['?'], 2, 2), Cell(['1'], 2, 3), ], nrow=3, ncol=4) source_table2 = Table(cells=[ Cell([''], 0, 0), Cell(['w'], 0, 1), Cell(['s:m3'], 1, 0), Cell(['a'], 1, 1), Cell(['s:m4'], 2, 0), Cell(['b'], 2, 1),
def setUp(self):
    """
    Fixture cells `a` .. `n` and two tables built from them.

    >   |     |  C        |
    >   |     | C:1 | C:2 |
    > R | R:1 |  a  |  b  |
    > R | R:2 |  c  |  d  |
    > R | R:3 |  e  |  f  |
    """
    # attribute name -> (tokens, top-left row, top-left col, rowspan, colspan)
    layout = {
        'a': ([''], 0, 0, 2, 2),
        'b': (['C'], 0, 2, 1, 2),
        'c': (['C:1'], 1, 2, 1, 1),
        'd': (['C:2'], 1, 3, 1, 1),
        'e': (['R'], 2, 0, 3, 1),
        'f': (['R:1'], 2, 1, 1, 1),
        'g': (['R:2'], 3, 1, 1, 1),
        'h': (['R:3'], 4, 1, 1, 1),
        'i': (['a'], 2, 2, 1, 1),
        'j': (['b'], 2, 3, 1, 1),
        'k': (['c'], 3, 2, 1, 1),
        'l': (['d'], 3, 3, 1, 1),
        'm': (['e'], 4, 2, 1, 1),
        'n': (['f'], 4, 3, 1, 1),
    }
    for attr, (tokens, row, col, rowspan, colspan) in layout.items():
        setattr(
            self, attr,
            Cell(tokens=tokens,
                 index_topleft_row=row,
                 index_topleft_col=col,
                 rowspan=rowspan,
                 colspan=colspan))

    single_grid = [[self.a, self.a],
                   [self.a, self.a]]
    self.single_cell_table = Table(grid=single_grid)

    full_grid = [
        [self.a, self.a, self.b, self.b],
        [self.a, self.a, self.c, self.d],
        [self.e, self.f, self.i, self.j],
        [self.e, self.g, self.k, self.l],
        [self.e, self.h, self.m, self.n],
    ]
    self.full_table = Table(grid=full_grid)
def test_improper_table(self):
    """Inconsistent cell lists / dimensions are rejected by `Table`."""

    def unit(token, row, col, colspan=1):
        # one-row cell holding a single token
        return Cell(tokens=[token],
                    index_topleft_row=row,
                    index_topleft_col=col,
                    rowspan=1,
                    colspan=colspan)

    a, b, c, d = ('a', 0, 0), ('b', 0, 1), ('c', 1, 0), ('d', 1, 1)

    # (expected error, cell specs, nrow, ncol)
    cases = [
        # misspecified nrow or ncol raises IndexError
        (IndexError, [a, b, c, d], 2, 1),
        (IndexError, [a, b, c, d], 1, 2),
        # not enough cells to fill out table
        (ValueError, [a, b, c], 2, 2),
        (ValueError, [a, b], 2, 2),
        # cell protrudes out of table boundaries
        (IndexError, [('a', 0, 0, 2)], 1, 1),
    ]
    for expected_error, specs, nrow, ncol in cases:
        with self.assertRaises(expected_error):
            Table(cells=[unit(*spec) for spec in specs],
                  nrow=nrow,
                  ncol=ncol)
def test_multiple_create_inputs(self):
    """Passing both `grid` and `cells` must fail an assertion."""
    self.assertRaises(AssertionError, Table, grid=[[]], cells=[])
def delete_column(self, index: int):
    """Drop column `index` from `self.normalized_table`.

    The cells in the columns that followed the deleted one have their
    `index_topleft_col` reduced by one, and `self.normalized_table` is
    rebuilt from the shortened grid.
    """
    current = self.normalized_table
    n_rows = current.nrow
    n_cols_after = current.ncol - 1

    remaining = np.delete(arr=current.grid, obj=index, axis=1)
    # positions index.. of the shortened grid hold the cells that moved left
    for row in range(n_rows):
        for col in range(index, n_cols_after):
            remaining[row, col].index_topleft_col -= 1

    self.normalized_table = Table(grid=remaining.tolist())